sched-multi-instance-regression.sh 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. #!/usr/bin/env bash
  2. # S8-SCHED-MULTI-INSTANCE-1 多实例 lease 抢锁回归(dev/test only, aidopdev)。
  3. #
  4. # === 重要:本脚本当前为「轻量补证」,不能作为多实例 PASS 凭证 ===
  5. #
  6. # 阻碍项:
  7. # 尝试在端口 5006 启动第二实例 B 时,SqlSugar.CodeFirstProvider.InitTables 抛出:
  8. # SqlSugar.SqlSugarException: Incorrect datetime value: '0000-00-00 00:00:00'
  9. # for column 'create_time' at row 1
  10. # 当前 dev MySQL 8.0.31 严格 SQL_MODE 拒绝 zero-date;某张共享表(疑为 Sys* 类)
  11. # 存在历史 zero-date 行,CodeFirst ALTER 校验失败。5005 实例先于该 SQL_MODE 启动
  12. # 故未触发;新实例无法 bootstrap。
  13. #
  14. # 按 CTO 八条款:不改代码硬绕;本轮无法完成"两个后端实例同时运行"严格意义上的多
  15. # 实例压测;脚本退化为对 lease 原语的纯 DB 行为验证(race UPDATE / expired reset
  16. # / token guard),可作补证不作多实例 PASS。
  17. #
  18. # === 验证不变量(DB 层 lease 原语)===
  19. # I1. 同时两路 UPDATE 抢同一 ready rule,恰好 1 路 affectedRows=1。
  20. # I2. lock_until 过期后 ResetExpiredLeasesAsync 等价 SQL 释放,新 token 可抢到。
  21. # I3. OnRuleCompletedAsync 等价 SQL(WHERE id + lock_token)旧 token 回写
  22. # affectedRows=0,新 lease 状态不被覆盖。
  23. # I4. 同一 dedup_key 不重复建单:复用 sched-exec-regression.sh 中 antiflap=3 的覆盖。
  24. #
  25. # === 严格遵守 ===
  26. # - 不修改 demo rule 10/11/12 的最终态。
  27. # - TEMP rule 测后 enabled=0;TEMP exception(若产生)soft delete。
  28. # - 不清理 detection_log / rule_detection_state。
  29. # - 不在生产库执行写操作。
  30. set -uo pipefail
  31. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  32. # shellcheck source=./s8-regression-common.sh
  33. source "${SCRIPT_DIR}/s8-regression-common.sh"
  34. TEMP_RULE='TEMP_SCHED_MULTI_LOCK'
  35. TEMP_DEDUP_OBJ='TEMP-MULTI-LOCK-001'
  36. baseline_before=$(read_baseline)
  37. echo "==== sched-multi-instance-regression baseline_before=${baseline_before} ===="
  38. echo "BLOCKER: instance B startup fails on SqlSugar zero-date schema check;"
  39. echo " this run is LIGHTWEIGHT-SIMULATION ONLY, not multi-instance PASS."
  40. echo
  41. # demo rule pre-snapshot(仅观测,不写)
  42. echo "---- demo rule 10/11/12 pre-snapshot ----"
  43. mysql_run "SELECT id, rule_code, enabled, IFNULL(paused_until,'NULL'), trigger_count_required, recover_count_required, IFNULL(last_status,'NULL') FROM ado_s8_watch_rule WHERE id IN (10,11,12) ORDER BY id;" \
  44. | awk '{printf " rule %s: %s\n", $1, $0}'
  45. # ---------------------------------------------------------------------------
  46. # Setup TEMP rule (idempotent, ready-state, no lease)
  47. # ---------------------------------------------------------------------------
  48. echo
  49. echo "---- Setup TEMP_SCHED_MULTI_LOCK ----"
  50. TEMP_PARAMS='{"dueAtField":"due_at","statusField":"status","completedStates":["CLOSED","DONE","COMPLETED"],"objectCodeField":"related_object_code","objectIdField":"source_object_id","graceMinutes":0,"exceptionTypeCode":"DELIVERY_DELAY"}'
  51. TEMP_EXPR="SELECT '${TEMP_DEDUP_OBJ}' AS related_object_code, '${TEMP_DEDUP_OBJ}' AS source_object_id, DATE_SUB(NOW(), INTERVAL 1 HOUR) AS due_at, 'PENDING' AS status"
  52. existing=$(mysql_run "SELECT COUNT(*) FROM ado_s8_watch_rule WHERE rule_code='${TEMP_RULE}' AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};")
  53. if [[ "${existing}" == "0" ]]; then
  54. mysql_run_strict "INSERT INTO ado_s8_watch_rule
  55. (tenant_id, factory_id, rule_code, scene_code, data_source_id, watch_object_type, expression, severity,
  56. poll_interval_seconds, enabled, created_at, rule_type, source_object_type, params_json,
  57. consecutive_failure_count, trigger_count_required, recover_count_required, next_run_at, lock_token, locked_by, lock_until)
  58. VALUES (${TENANT_ID}, ${FACTORY_ID}, '${TEMP_RULE}', 'S7', 1, 'ORDER',
  59. \"${TEMP_EXPR}\", 'HIGH', 60, 1, NOW(), 'TIMEOUT', 'ORDER', '${TEMP_PARAMS}',
  60. 0, 1, 1, NOW(), NULL, NULL, NULL);" >/dev/null
  61. echo " inserted TEMP rule"
  62. else
  63. mysql_run_strict "UPDATE ado_s8_watch_rule SET enabled=1, expression=\"${TEMP_EXPR}\", params_json='${TEMP_PARAMS}', trigger_count_required=1, recover_count_required=1, paused_until=NULL, pause_reason=NULL, lock_token=NULL, locked_by=NULL, lock_until=NULL, running_started_at=NULL, consecutive_failure_count=0, last_status=NULL, last_error=NULL, next_run_at=NOW(), updated_at=NOW() WHERE rule_code='${TEMP_RULE}' AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};" >/dev/null
  64. echo " re-armed TEMP rule"
  65. fi
  66. TEMP_RULE_ID=$(get_rule_id_by_code "${TEMP_RULE}")
  67. echo " TEMP_RULE_ID=${TEMP_RULE_ID}"
  68. # 清残留 detection_state 与 TEMP exception
  69. mysql_run_strict "DELETE FROM ado_s8_rule_detection_state WHERE rule_code='${TEMP_RULE}' AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};" >/dev/null
  70. mysql_run_strict "UPDATE ado_s8_exception SET is_deleted=1, updated_at=NOW() WHERE source_rule_code='${TEMP_RULE}' AND is_deleted=0;" >/dev/null
  71. # ---------------------------------------------------------------------------
  72. # I1. Lease uniqueness on race (两路并发 UPDATE 抢同一 lease)
  73. # ---------------------------------------------------------------------------
  74. echo
  75. echo "---- I1. Lease uniqueness on race ----"
  76. # 复刻 PickReadyRulesAsync 的乐观抢锁 SQL(.NET 端 SetColumns + Where),单条 rule 模式。
  77. # 通过两个并发 mysql_run,各使用独立 token,统计 ROW_COUNT()。
  78. # 重置成 ready 态。
  79. mysql_run_strict "UPDATE ado_s8_watch_rule SET lock_token=NULL, locked_by=NULL, lock_until=NULL, running_started_at=NULL, next_run_at=NOW() WHERE id=${TEMP_RULE_ID};" >/dev/null
  80. TOKEN_A="SIM_A_$(date +%s%N)"
  81. TOKEN_B="SIM_B_$(date +%s%N)"
  82. race_update() {
  83. local token="$1"
  84. local locked_by="$2"
  85. # 用复合语句一次性 UPDATE + 输出 ROW_COUNT()。
  86. MYSQL_PWD="${DB_PASS}" mysql -h "${DB_HOST}" -P "${DB_PORT}" -u "${DB_USER}" "${DB_NAME}" \
  87. --default-character-set=utf8mb4 --connect-timeout=8 -N -B -e "
  88. UPDATE ado_s8_watch_rule
  89. SET lock_token='${token}',
  90. locked_by='${locked_by}',
  91. lock_until=DATE_ADD(NOW(), INTERVAL 5 MINUTE),
  92. running_started_at=NOW(),
  93. last_run_id='${token:0:16}',
  94. updated_at=NOW()
  95. WHERE id=${TEMP_RULE_ID}
  96. AND enabled=1
  97. AND (paused_until IS NULL OR paused_until <= NOW())
  98. AND (next_run_at IS NULL OR next_run_at <= NOW())
  99. AND (lock_until IS NULL OR lock_until <= NOW());
  100. SELECT ROW_COUNT() AS affected;
  101. " 2>/dev/null | tail -1
  102. }
  103. # 并发执行
  104. A_RESULT=$(race_update "${TOKEN_A}" "SIM_A_PID_A") &
  105. A_PID=$!
  106. B_RESULT=$(race_update "${TOKEN_B}" "SIM_B_PID_B") &
  107. B_PID=$!
  108. # 这里 bash 不直接捕获子 shell stdout 到父变量,改用临时文件
  109. unset A_RESULT B_RESULT
  110. TMP_A=$(mktemp); TMP_B=$(mktemp)
  111. ( race_update "${TOKEN_A}" "SIM_A_PID_A" > "${TMP_A}" ) &
  112. PID_A=$!
  113. ( race_update "${TOKEN_B}" "SIM_B_PID_B" > "${TMP_B}" ) &
  114. PID_B=$!
  115. wait ${PID_A} ${PID_B}
  116. ROWS_A=$(cat "${TMP_A}")
  117. ROWS_B=$(cat "${TMP_B}")
  118. rm -f "${TMP_A}" "${TMP_B}"
  119. echo " affected_rows: A=${ROWS_A} B=${ROWS_B}"
  120. # 校验:恰好一个为 1,另一个为 0
  121. if [[ "${ROWS_A}" == "1" && "${ROWS_B}" == "0" ]]; then
  122. WINNER="A"; WINNER_TOKEN="${TOKEN_A}"
  123. elif [[ "${ROWS_A}" == "0" && "${ROWS_B}" == "1" ]]; then
  124. WINNER="B"; WINNER_TOKEN="${TOKEN_B}"
  125. else
  126. WINNER=""
  127. fi
  128. if [[ -n "${WINNER}" ]]; then
  129. record_pass "I1: lease race produced exactly one winner (${WINNER})"
  130. else
  131. record_fail "I1: lease race anomaly — both rows affected (A=${ROWS_A} B=${ROWS_B}); lease primitive broken"
  132. fi
  133. actual_token=$(get_rule_field "${TEMP_RULE_ID}" lock_token)
  134. actual_lockedby=$(get_rule_field "${TEMP_RULE_ID}" locked_by)
  135. echo " DB recorded: lock_token=${actual_token} locked_by=${actual_lockedby}"
  136. if [[ "${actual_token}" == "${WINNER_TOKEN}" ]]; then
  137. record_pass "I1: DB lock_token matches winner token (${WINNER})"
  138. else
  139. record_fail "I1: DB lock_token=${actual_token} does NOT match winner token=${WINNER_TOKEN}"
  140. fi
  141. # ---------------------------------------------------------------------------
  142. # I3. OnRuleCompletedAsync 旧 token 防覆盖
  143. # 进度安排:先做 I3(依赖 I1 已抢到的 lease),再做 I2(强制过期 reset)。
  144. # ---------------------------------------------------------------------------
  145. echo
  146. echo "---- I3. OnRuleCompletedAsync token guard (old token must not overwrite) ----"
  147. # 假设 I1 winner 持有 WINNER_TOKEN。模拟"旧进程"试图用一个 stale 假 token 完成本次 run。
  148. OLD_FAKE_TOKEN="STALE_TOKEN_THAT_NEVER_WON"
  149. guard_rows=$(MYSQL_PWD="${DB_PASS}" mysql -h "${DB_HOST}" -P "${DB_PORT}" -u "${DB_USER}" "${DB_NAME}" \
  150. --default-character-set=utf8mb4 --connect-timeout=8 -N -B -e "
  151. UPDATE ado_s8_watch_rule
  152. SET lock_token=NULL, locked_by=NULL, lock_until=NULL, running_started_at=NULL,
  153. last_status='SUCCESS', last_run_at=NOW(), next_run_at=DATE_ADD(NOW(), INTERVAL 60 SECOND),
  154. updated_at=NOW()
  155. WHERE id=${TEMP_RULE_ID} AND lock_token='${OLD_FAKE_TOKEN}';
  156. SELECT ROW_COUNT();
  157. " 2>/dev/null | tail -1)
  158. echo " stale-token completion attempt affected_rows=${guard_rows}"
  159. if [[ "${guard_rows}" == "0" ]]; then
  160. record_pass "I3: old token completion did not overwrite (affected=0)"
  161. else
  162. record_fail "I3: old token completion overwrote new lease (affected=${guard_rows})"
  163. fi
  164. # 校验 lock_token / lock_until 仍是 winner 的
  165. post_token=$(get_rule_field "${TEMP_RULE_ID}" lock_token)
  166. post_lockuntil=$(get_rule_field "${TEMP_RULE_ID}" lock_until)
  167. echo " post-guard: lock_token=${post_token} lock_until=${post_lockuntil}"
  168. if [[ "${post_token}" == "${WINNER_TOKEN}" ]]; then
  169. record_pass "I3: lock_token still belongs to winner"
  170. else
  171. record_fail "I3: lock_token drifted: ${post_token} != ${WINNER_TOKEN}"
  172. fi
  173. # ---------------------------------------------------------------------------
  174. # I2. Expired lease release & re-acquisition
  175. # ---------------------------------------------------------------------------
  176. echo
  177. echo "---- I2. Expired lease release & re-acquisition ----"
  178. # 把 lock_until 强制写为过去时间,模拟 lease 超时(例如旧进程 crash 未释放)。
  179. mysql_run_strict "UPDATE ado_s8_watch_rule SET lock_until=DATE_SUB(NOW(), INTERVAL 1 MINUTE), next_run_at=NOW() WHERE id=${TEMP_RULE_ID};" >/dev/null
  180. # 复刻 ResetExpiredLeasesAsync 的 SQL:
  181. released=$(MYSQL_PWD="${DB_PASS}" mysql -h "${DB_HOST}" -P "${DB_PORT}" -u "${DB_USER}" "${DB_NAME}" \
  182. --default-character-set=utf8mb4 --connect-timeout=8 -N -B -e "
  183. UPDATE ado_s8_watch_rule
  184. SET lock_token=NULL, locked_by=NULL, lock_until=NULL, running_started_at=NULL, updated_at=NOW()
  185. WHERE tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID}
  186. AND lock_until IS NOT NULL AND lock_until < NOW()
  187. AND id=${TEMP_RULE_ID};
  188. SELECT ROW_COUNT();
  189. " 2>/dev/null | tail -1)
  190. echo " ResetExpiredLeases-equivalent affected=${released}"
  191. if [[ "${released}" == "1" ]]; then
  192. record_pass "I2: expired lease released (affected=1)"
  193. else
  194. record_fail "I2: expired lease not released (affected=${released})"
  195. fi
  196. reset_token=$(get_rule_field "${TEMP_RULE_ID}" lock_token)
  197. [[ "${reset_token}" == "NULL" ]] && record_pass "I2: lock_token is NULL after reset" || record_fail "I2: lock_token not cleared (${reset_token})"
  198. # 再次抢锁(模拟新实例)
  199. NEW_TOKEN="SIM_NEW_INSTANCE_$(date +%s%N)"
  200. new_rows=$(MYSQL_PWD="${DB_PASS}" mysql -h "${DB_HOST}" -P "${DB_PORT}" -u "${DB_USER}" "${DB_NAME}" \
  201. --default-character-set=utf8mb4 --connect-timeout=8 -N -B -e "
  202. UPDATE ado_s8_watch_rule
  203. SET lock_token='${NEW_TOKEN}', locked_by='SIM_NEW_PID', lock_until=DATE_ADD(NOW(), INTERVAL 5 MINUTE),
  204. running_started_at=NOW(), updated_at=NOW()
  205. WHERE id=${TEMP_RULE_ID}
  206. AND enabled=1
  207. AND (lock_until IS NULL OR lock_until <= NOW());
  208. SELECT ROW_COUNT();
  209. " 2>/dev/null | tail -1)
  210. echo " new-instance re-acquire affected=${new_rows}"
  211. if [[ "${new_rows}" == "1" ]]; then
  212. record_pass "I2: new instance re-acquired lease post-reset"
  213. else
  214. record_fail "I2: new instance failed to re-acquire (affected=${new_rows})"
  215. fi
  216. new_token_db=$(get_rule_field "${TEMP_RULE_ID}" lock_token)
  217. [[ "${new_token_db}" == "${NEW_TOKEN}" ]] && record_pass "I2: new lock_token recorded" || record_fail "I2: lock_token mismatch (${new_token_db})"
  218. # ---------------------------------------------------------------------------
  219. # Cleanup
  220. # ---------------------------------------------------------------------------
  221. echo
  222. echo "---- Cleanup ----"
  223. mysql_run_strict "UPDATE ado_s8_watch_rule SET enabled=0, paused_until=NULL, pause_reason=NULL, lock_token=NULL, locked_by=NULL, lock_until=NULL, running_started_at=NULL, updated_at=NOW() WHERE rule_code='${TEMP_RULE}' AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};" >/dev/null
  224. TEMP_EXC_IDS=$(mysql_run "SELECT IFNULL(GROUP_CONCAT(id),'none') FROM ado_s8_exception WHERE source_rule_code='${TEMP_RULE}' AND is_deleted=0;")
  225. mysql_run_strict "UPDATE ado_s8_exception SET is_deleted=1, updated_at=NOW() WHERE source_rule_code='${TEMP_RULE}' AND is_deleted=0;" >/dev/null
  226. echo " TEMP rule_id=${TEMP_RULE_ID} disabled; soft-deleted exception ids=[${TEMP_EXC_IDS}]"
  227. final_temp_enabled=$(get_rule_field "${TEMP_RULE_ID}" enabled)
  228. final_temp_visible=$(mysql_run "SELECT COUNT(*) FROM ado_s8_exception WHERE source_rule_code='${TEMP_RULE}' AND is_deleted=0;")
  229. [[ "${final_temp_enabled}" == "0" ]] && record_pass "cleanup: TEMP rule enabled=0" || record_fail "cleanup: TEMP rule still enabled=${final_temp_enabled}"
  230. [[ "${final_temp_visible}" == "0" ]] && record_pass "cleanup: TEMP exception not in default list" || record_fail "cleanup: TEMP exception still in default list (${final_temp_visible})"
  231. # demo rule 10/11/12 守恒
  232. demo_state=$(mysql_run "SELECT GROUP_CONCAT(CONCAT(id,':',enabled,':',IFNULL(paused_until,'NULL'),':',trigger_count_required,':',recover_count_required) ORDER BY id) FROM ado_s8_watch_rule WHERE id IN (10,11,12);")
  233. expected="10:1:NULL:1:1,11:1:NULL:1:1,12:1:NULL:1:1"
  234. [[ "${demo_state}" == "${expected}" ]] \
  235. && record_pass "demo rule 10/11/12 conserved" \
  236. || record_fail "demo rule drifted: ${demo_state}"
  237. assert_baseline_unchanged "${baseline_before}"
  238. echo
  239. print_summary
  240. echo
  241. echo "==== STATUS: LIGHTWEIGHT-SIMULATION ${S8REG_FAIL_COUNT} FAIL — multi-instance PASS BLOCKED on instance B SqlSugar startup ===="
  242. echo "TEMP_RULE=${TEMP_RULE} TEMP_RULE_ID=${TEMP_RULE_ID} TEMP_EXC_IDS=[${TEMP_EXC_IDS}]"
  243. exit_by_summary