r6-detection-log-edge-regression.sh 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. #!/usr/bin/env bash
  2. # R6 detection_log edge regression — S8-DETECTION-LOG-EDGE-FIXTURE-1(dev/test only, aidopdev)。
  3. #
  4. # 演变史:
  5. # - 旧版基于 G01_TEST_TIMEOUT_NO_HIT / G01_TEST_TIMEOUT_FAILED;这两条 fixture 已从 dev 移除。
  6. # - 上一轮 SCAN-1 改造为 SKIP(CREATED/REFRESHED 由 sched-exec 覆盖)。
  7. # - 本轮 EDGE-FIXTURE-1 引入 TEMP_SCHED_TIMEOUT_NO_HIT / TEMP_SCHED_TIMEOUT_EVALUATE_FAILED
  8. # 真实覆盖 NO_HIT 与 EVALUATE_FAILED 两条 detection_log 边缘路径。
  9. #
  10. # 验证机制:
  11. # - 通过 watch_rule.next_run_at=NOW 触发 Job tick 调度,最多 90s 轮询 last_run_at 推进。
  12. # Job 路径(RunDispatchTickAsync→ProcessSingleRuleAsync)写 detection_log + 更新
  13. # watch_rule.last_status / last_error / consecutive_failure_count,CTO 任务约束 五.B 要求。
  14. # - NO_HIT:valid params + 表达式返回未到期行 → evaluator 0 hits → NO_HIT 日志。
  15. # - EVALUATE_FAILED:params_json 缺 dueAtField → evaluator 抛 S8RuleEvaluatorException
  16. # (reason=params_schema_invalid) → EVALUATE_FAILED 日志 + last_status=FAILED。
  17. #
  18. # 严守:
  19. # - 不影响 demo rule 10/11/12。
  20. # - TEMP rule 测后 enabled=0;TEMP exception 若有 soft delete(理论上两条都不应建单)。
  21. # - 不清 detection_log / rule_detection_state / 演示 exception。
  22. set -uo pipefail
  23. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  24. # shellcheck source=./s8-regression-common.sh
  25. source "${SCRIPT_DIR}/s8-regression-common.sh"
  26. auth_load
  27. NO_HIT_RULE='TEMP_SCHED_TIMEOUT_NO_HIT'
  28. FAILED_RULE='TEMP_SCHED_TIMEOUT_EVALUATE_FAILED'
  29. # NO_HIT:合法 params + 表达式返回 1 行 due_at=NOW+2h(未到期),evaluator 应判定 0 hits
  30. NO_HIT_PARAMS='{"dueAtField":"due_at","statusField":"status","completedStates":["CLOSED","DONE","COMPLETED"],"objectCodeField":"related_object_code","objectIdField":"source_object_id","graceMinutes":0,"exceptionTypeCode":"DELIVERY_DELAY"}'
  31. NO_HIT_EXPR="SELECT 'TEMP-NOHIT-001' AS related_object_code, 'TEMP-NOHIT-001' AS source_object_id, DATE_ADD(NOW(), INTERVAL 2 HOUR) AS due_at, 'PENDING' AS status"
  32. # EVALUATE_FAILED:params 缺 dueAtField → evaluator 抛 params_schema_invalid
  33. FAILED_PARAMS='{"statusField":"status","completedStates":["CLOSED","DONE","COMPLETED"],"objectCodeField":"related_object_code","objectIdField":"source_object_id","graceMinutes":0,"exceptionTypeCode":"DELIVERY_DELAY"}'
  34. FAILED_EXPR="SELECT 'TEMP-FAILED-001' AS related_object_code, 'TEMP-FAILED-001' AS source_object_id, DATE_SUB(NOW(), INTERVAL 1 HOUR) AS due_at, 'PENDING' AS status"
  35. baseline_before=$(read_baseline)
  36. DEMO_SNAPSHOT=$(snapshot_demo_rule_state)
  37. echo "==== r6-detection-log-edge-regression baseline_before=${baseline_before} demo_snapshot=${DEMO_SNAPSHOT} ===="
  38. # ---------------------------------------------------------------------------
  39. # 幂等 upsert TEMP rules(直接 DB 写入绕过 service-level 校验,进入 evaluator 才暴露 schema 错)
  40. # ---------------------------------------------------------------------------
  41. upsert_temp_rule() {
  42. local code="$1" expr="$2" params="$3"
  43. local exists
  44. exists=$(mysql_run "SELECT COUNT(*) FROM ado_s8_watch_rule WHERE rule_code='${code}' AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};")
  45. if [[ "${exists}" == "0" ]]; then
  46. mysql_run_strict "INSERT INTO ado_s8_watch_rule
  47. (tenant_id, factory_id, rule_code, scene_code, data_source_id, watch_object_type, expression, severity,
  48. poll_interval_seconds, enabled, created_at, rule_type, source_object_type, params_json,
  49. consecutive_failure_count, trigger_count_required, recover_count_required, next_run_at)
  50. VALUES (${TENANT_ID}, ${FACTORY_ID}, '${code}', 'S7', 1, 'ORDER',
  51. \"${expr}\", 'HIGH', 60, 1, NOW(), 'TIMEOUT', 'ORDER', '${params}',
  52. 0, 1, 1, NOW());" >/dev/null
  53. echo " inserted ${code}"
  54. else
  55. mysql_run_strict "UPDATE ado_s8_watch_rule SET enabled=1, expression=\"${expr}\", params_json='${params}',
  56. trigger_count_required=1, recover_count_required=1,
  57. paused_until=NULL, pause_reason=NULL,
  58. lock_token=NULL, locked_by=NULL, lock_until=NULL, running_started_at=NULL,
  59. consecutive_failure_count=0, last_status=NULL, last_error=NULL,
  60. next_run_at=NOW(), updated_at=NOW()
  61. WHERE rule_code='${code}' AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};" >/dev/null
  62. echo " re-armed ${code}"
  63. fi
  64. }
  65. echo "---- Setup TEMP rules ----"
  66. upsert_temp_rule "${NO_HIT_RULE}" "${NO_HIT_EXPR}" "${NO_HIT_PARAMS}"
  67. upsert_temp_rule "${FAILED_RULE}" "${FAILED_EXPR}" "${FAILED_PARAMS}"
  68. NO_HIT_ID=$(get_rule_id_by_code "${NO_HIT_RULE}")
  69. FAILED_ID=$(get_rule_id_by_code "${FAILED_RULE}")
  70. echo " NO_HIT_ID=${NO_HIT_ID} FAILED_ID=${FAILED_ID}"
  71. # 清理可能残留的 detection_state / TEMP exception(理论上两条都不应建单)
  72. mysql_run_strict "DELETE FROM ado_s8_rule_detection_state WHERE rule_code IN ('${NO_HIT_RULE}','${FAILED_RULE}') AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};" >/dev/null
  73. mysql_run_strict "UPDATE ado_s8_exception SET is_deleted=1, updated_at=NOW() WHERE source_rule_code IN ('${NO_HIT_RULE}','${FAILED_RULE}') AND is_deleted=0;" >/dev/null
  74. # ---------------------------------------------------------------------------
  75. # 触发 Job tick:等待两条 TEMP rule 的 last_run_at 推进(Job 间隔 60s + 余量)
  76. # ---------------------------------------------------------------------------
  77. echo "---- Wait Job tick (poll up to 90s for both TEMP rules to be processed) ----"
  78. marker=$(mysql_run "SELECT NOW();")
  79. sleep 1
  80. echo " marker=${marker}"
  81. # 触发:next_run_at=NOW(已在 upsert 时设过,但再确保一次以防 Job 已抢锁释放)
  82. mysql_run_strict "UPDATE ado_s8_watch_rule SET next_run_at=NOW(), lock_token=NULL, locked_by=NULL, lock_until=NULL WHERE id IN (${NO_HIT_ID}, ${FAILED_ID});" >/dev/null
  83. picked_up=0
  84. for ((i=1; i<=18; i++)); do
  85. sleep 5
  86. nh_run_at=$(get_rule_field "${NO_HIT_ID}" last_run_at)
  87. fl_run_at=$(get_rule_field "${FAILED_ID}" last_run_at)
  88. if [[ "${nh_run_at}" != "NULL" && "${fl_run_at}" != "NULL" \
  89. && "${nh_run_at}" > "${marker}" && "${fl_run_at}" > "${marker}" ]]; then
  90. picked_up=1
  91. echo " iter ${i}: both TEMP rules picked up (nh_run_at=${nh_run_at}, fl_run_at=${fl_run_at})"
  92. break
  93. fi
  94. echo " iter ${i}: waiting… nh_run_at=${nh_run_at} fl_run_at=${fl_run_at}"
  95. done
  96. if (( picked_up == 0 )); then
  97. record_skip "Job did not pick up both TEMP rules within 90s — backend cron likely paused or saturated"
  98. # 仍走收尾流程
  99. fi
  100. # ---------------------------------------------------------------------------
  101. # A. NO_HIT 验证
  102. # ---------------------------------------------------------------------------
  103. echo "---- A. NO_HIT verification ----"
  104. if (( picked_up == 1 )); then
  105. nh_no_hit_logs=$(mysql_run "SELECT COUNT(*) FROM ado_s8_detection_log WHERE rule_code='${NO_HIT_RULE}' AND detect_result='NO_HIT' AND detected_at >= '${marker}';")
  106. nh_other_logs=$(mysql_run "SELECT COUNT(*) FROM ado_s8_detection_log WHERE rule_code='${NO_HIT_RULE}' AND detect_result IN ('CREATED','REFRESHED','EVALUATE_FAILED') AND detected_at >= '${marker}';")
  107. nh_excs=$(mysql_run "SELECT COUNT(*) FROM ado_s8_exception WHERE source_rule_code='${NO_HIT_RULE}' AND is_deleted=0;")
  108. nh_status=$(get_rule_field "${NO_HIT_ID}" last_status)
  109. echo " NO_HIT_logs=${nh_no_hit_logs} other_logs=${nh_other_logs} active_excs=${nh_excs} last_status=${nh_status}"
  110. [[ "${nh_no_hit_logs}" -ge 1 ]] && record_pass "A: NO_HIT detection_log present (${nh_no_hit_logs})" || record_fail "A: expected NO_HIT log >=1, got ${nh_no_hit_logs}"
  111. [[ "${nh_other_logs}" == "0" ]] && record_pass "A: NO_HIT rule produced no CREATED/REFRESHED/FAILED" || record_fail "A: NO_HIT rule unexpected ${nh_other_logs} other logs"
  112. [[ "${nh_excs}" == "0" ]] && record_pass "A: NO_HIT rule did not create exception" || record_fail "A: NO_HIT rule wrongly created ${nh_excs} exception(s)"
  113. [[ "${nh_status}" == "SUCCESS" ]] && record_pass "A: NO_HIT rule last_status=SUCCESS (evaluator OK with 0 hits)" || record_fail "A: NO_HIT rule last_status expected SUCCESS, got ${nh_status}"
  114. else
  115. record_skip "A: NO_HIT — Job tick timeout, cannot verify"
  116. fi
  117. # ---------------------------------------------------------------------------
  118. # B. EVALUATE_FAILED 验证
  119. # ---------------------------------------------------------------------------
  120. echo "---- B. EVALUATE_FAILED verification ----"
  121. if (( picked_up == 1 )); then
  122. fl_failed_logs=$(mysql_run "SELECT COUNT(*) FROM ado_s8_detection_log WHERE rule_code='${FAILED_RULE}' AND detect_result='EVALUATE_FAILED' AND detected_at >= '${marker}';")
  123. fl_failed_reason=$(mysql_run "SELECT failure_reason FROM ado_s8_detection_log WHERE rule_code='${FAILED_RULE}' AND detect_result='EVALUATE_FAILED' AND detected_at >= '${marker}' ORDER BY id DESC LIMIT 1;")
  124. fl_failed_msg_len=$(mysql_run "SELECT CHAR_LENGTH(IFNULL(failure_message,'')) FROM ado_s8_detection_log WHERE rule_code='${FAILED_RULE}' AND detect_result='EVALUATE_FAILED' AND detected_at >= '${marker}' ORDER BY id DESC LIMIT 1;")
  125. fl_excs=$(mysql_run "SELECT COUNT(*) FROM ado_s8_exception WHERE source_rule_code='${FAILED_RULE}' AND is_deleted=0;")
  126. fl_status=$(get_rule_field "${FAILED_ID}" last_status)
  127. fl_error=$(get_rule_field "${FAILED_ID}" last_error)
  128. fl_failcount=$(get_rule_field "${FAILED_ID}" consecutive_failure_count)
  129. echo " EVALUATE_FAILED_logs=${fl_failed_logs} reason='${fl_failed_reason}' msg_len=${fl_failed_msg_len} active_excs=${fl_excs}"
  130. echo " last_status=${fl_status} last_error=${fl_error} consecutive_failure_count=${fl_failcount}"
  131. [[ "${fl_failed_logs}" -ge 1 ]] && record_pass "B: EVALUATE_FAILED detection_log present (${fl_failed_logs})" || record_fail "B: expected EVALUATE_FAILED log >=1, got ${fl_failed_logs}"
  132. [[ -n "${fl_failed_reason}" && "${fl_failed_reason}" != "NULL" ]] && record_pass "B: failure_reason populated ('${fl_failed_reason}')" || record_fail "B: failure_reason missing"
  133. [[ "${fl_failed_msg_len}" -gt 0 ]] && record_pass "B: failure_message non-empty (${fl_failed_msg_len} chars)" || record_fail "B: failure_message empty"
  134. [[ "${fl_excs}" == "0" ]] && record_pass "B: FAILED rule did not create exception" || record_fail "B: FAILED rule wrongly created ${fl_excs} exception(s)"
  135. [[ "${fl_status}" == "FAILED" ]] && record_pass "B: FAILED rule last_status=FAILED" || record_fail "B: FAILED rule last_status expected FAILED, got ${fl_status}"
  136. [[ "${fl_error}" != "NULL" && -n "${fl_error}" ]] && record_pass "B: last_error populated" || record_fail "B: last_error missing"
  137. [[ "${fl_failcount}" -ge 1 ]] && record_pass "B: consecutive_failure_count=${fl_failcount} (>=1)" || record_fail "B: consecutive_failure_count not bumped (${fl_failcount})"
  138. else
  139. record_skip "B: EVALUATE_FAILED — Job tick timeout, cannot verify"
  140. fi
  141. # ---------------------------------------------------------------------------
  142. # Cleanup TEMP rules
  143. # ---------------------------------------------------------------------------
  144. echo "---- Cleanup TEMP rules ----"
  145. mysql_run_strict "UPDATE ado_s8_watch_rule SET enabled=0, paused_until=NULL, pause_reason=NULL, lock_token=NULL, locked_by=NULL, lock_until=NULL, running_started_at=NULL, updated_at=NOW() WHERE rule_code IN ('${NO_HIT_RULE}','${FAILED_RULE}') AND tenant_id=${TENANT_ID} AND factory_id=${FACTORY_ID};" >/dev/null
  146. TEMP_EXC_IDS=$(mysql_run "SELECT IFNULL(GROUP_CONCAT(id),'none') FROM ado_s8_exception WHERE source_rule_code IN ('${NO_HIT_RULE}','${FAILED_RULE}') AND is_deleted=0;")
  147. mysql_run_strict "UPDATE ado_s8_exception SET is_deleted=1, updated_at=NOW() WHERE source_rule_code IN ('${NO_HIT_RULE}','${FAILED_RULE}') AND is_deleted=0;" >/dev/null
  148. cleanup_temp_sched_approval_ghost_tasks
  149. echo " disabled ${NO_HIT_RULE} (id=${NO_HIT_ID}) / ${FAILED_RULE} (id=${FAILED_ID}); soft-deleted exception ids=[${TEMP_EXC_IDS}]; ghost approval cancelled"
  150. nh_enabled=$(get_rule_field "${NO_HIT_ID}" enabled)
  151. fl_enabled=$(get_rule_field "${FAILED_ID}" enabled)
  152. [[ "${nh_enabled}" == "0" ]] && record_pass "cleanup: ${NO_HIT_RULE} enabled=0" || record_fail "cleanup: ${NO_HIT_RULE} still enabled=${nh_enabled}"
  153. [[ "${fl_enabled}" == "0" ]] && record_pass "cleanup: ${FAILED_RULE} enabled=0" || record_fail "cleanup: ${FAILED_RULE} still enabled=${fl_enabled}"
  154. temp_visible=$(mysql_run "SELECT COUNT(*) FROM ado_s8_exception WHERE source_rule_code IN ('${NO_HIT_RULE}','${FAILED_RULE}') AND is_deleted=0;")
  155. [[ "${temp_visible}" == "0" ]] && record_pass "cleanup: TEMP exceptions not in default list" || record_fail "cleanup: ${temp_visible} TEMP exception(s) still in default list"
  156. # ---------------------------------------------------------------------------
  157. # demo rule 10/11/12 漂移检测 + baseline 守恒
  158. # ---------------------------------------------------------------------------
  159. assert_demo_rule_state_unchanged "${DEMO_SNAPSHOT}"
  160. assert_baseline_unchanged "${baseline_before}"
  161. print_summary
  162. echo "TEMP rules: ${NO_HIT_RULE} (id=${NO_HIT_ID}), ${FAILED_RULE} (id=${FAILED_ID}); TEMP exception ids=[${TEMP_EXC_IDS}]"
  163. exit_by_summary