20260618 修复远端231发布验证流程

This commit is contained in:
yuyr 2026-06-18 11:31:55 +08:00
parent b6344074ce
commit 261d652123

View File

@ -86,6 +86,7 @@ PACKAGE_BASENAME="$(basename "$PACKAGE_ARCHIVE")"
REMOTE_ARCHIVE="$REMOTE_STAGE_PARENT/$PACKAGE_BASENAME" REMOTE_ARCHIVE="$REMOTE_STAGE_PARENT/$PACKAGE_BASENAME"
LOCAL_GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')" LOCAL_GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')"
LOCAL_GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)" LOCAL_GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)"
FIRST_PUBLISHED_RUN_TIMEOUT_SECS="${FIRST_PUBLISHED_RUN_TIMEOUT_SECS:-7200}"
echo "publish mode=$MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE git=$LOCAL_GIT_SHA" echo "publish mode=$MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE git=$LOCAL_GIT_SHA"
if [[ -n "$LOCAL_GIT_STATUS" ]]; then if [[ -n "$LOCAL_GIT_STATUS" ]]; then
@ -103,6 +104,7 @@ remote_archive="$2"
mode="$3" mode="$3"
restart_query_service="$4" restart_query_service="$4"
query_pattern="$5" query_pattern="$5"
first_published_run_timeout_secs="$6"
log() { log() {
printf '[publish] %s\n' "$*" printf '[publish] %s\n' "$*"
@ -129,6 +131,28 @@ except Exception:
PY PY
} }
# Print the value stored at a dot-separated key path inside a JSON file.
# Arguments:
#   $1 - path to the JSON file
#   $2 - dot-separated key path, e.g. "sync_mode" or "outer.inner"
# Outputs:
#   The value as rendered by Python print() on stdout. An empty line is
#   printed when the file cannot be read or parsed, when a key is missing,
#   when an intermediate value is not a JSON object, or when the value is null.
json_get_value() {
  local path="$1"
  local key_path="$2"
  python3 - "$path" "$key_path" <<'PY'
import json, sys

path, key_path = sys.argv[1], sys.argv[2]
try:
    # The context manager closes the file even when parsing fails.
    with open(path, encoding="utf-8") as handle:
        value = json.load(handle)
except Exception:
    # Unreadable or malformed input is reported as an empty value.
    print("")
    raise SystemExit(0)
for part in key_path.split("."):
    if not isinstance(value, dict):
        # Cannot descend into a scalar or a list: treat as missing.
        value = None
        break
    value = value.get(part)
    if value is None:
        break
print("" if value is None else value)
PY
}
max_run_name() { max_run_name() {
find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null | sort -V | tail -1 find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null | sort -V | tail -1
} }
@ -178,6 +202,73 @@ terminate_matching() {
fi fi
} }
# Print the first lines of the run_soak publish stdout and stderr logs to
# stderr. Best effort: a missing log never fails the caller.
# Globals:   remote_root, timestamp (read)
# Arguments: $1 - number of stdout lines, $2 - number of stderr lines
_dump_first_published_run_logs() {
  local stdout_lines="$1"
  local stderr_lines="$2"
  local log_prefix="$remote_root/logs/run_soak.publish-${timestamp}"
  sed -n "1,${stdout_lines}p" "${log_prefix}.stdout" >&2 || true
  sed -n "1,${stderr_lines}p" "${log_prefix}.stderr" >&2 || true
}

# Poll a run directory until the run reports a final status, the controller
# process is gone, or the timeout elapses.
# Globals:
#   remote_root                       (read) root holding runs/, logs/, state/
#   timestamp                         (read) suffix of the publish log files
#   first_published_run_timeout_secs  (read) timeout in whole seconds
# Arguments:
#   $1 - run id (directory name under $remote_root/runs)
# Outputs:
#   A one-line summary on stdout on success; diagnostics and log excerpts on
#   stderr otherwise.
# Returns:
#   0 when both run-summary.json and run-meta.json report "success" and
#     sync_mode is "snapshot"
#   2 when the configured timeout is not a non-negative integer
#   6 on failed/error status, non-snapshot sync_mode, controller exit, or
#     timeout
wait_for_first_published_run() {
  local run_id="$1"
  local timeout_secs="${first_published_run_timeout_secs:-}"
  # Reject anything that is not plain digits: inside $(( )) a word such as
  # "abc" would silently evaluate to 0 and look like an instant timeout.
  if [[ ! "$timeout_secs" =~ ^[0-9]+$ ]]; then
    echo "invalid first published run timeout (expected whole seconds): $timeout_secs" >&2
    return 2
  fi
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  local deadline=$((SECONDS + 10#$timeout_secs))
  local run_dir="$remote_root/runs/$run_id"
  local meta_path="$run_dir/run-meta.json"
  local summary_path="$run_dir/run-summary.json"
  local controller_pid
  controller_pid="$(cat "$remote_root/state/meta/run_soak-pid" 2>/dev/null || true)"
  local controller_exited=0
  local summary_status meta_status sync_mode
  # Once the controller is seen gone the loop runs one more time regardless
  # of the deadline, so the final verdict is based on a fresh status poll.
  while (( controller_exited || SECONDS < deadline )); do
    if [[ -f "$summary_path" && -f "$meta_path" ]]; then
      summary_status="$(json_get_status "$summary_path")"
      meta_status="$(json_get_status "$meta_path")"
      sync_mode="$(json_get_value "$meta_path" "sync_mode")"
      if [[ "$summary_status" == "success" && "$meta_status" == "success" ]]; then
        if [[ "$sync_mode" != "snapshot" ]]; then
          echo "first published run $run_id completed but was not a snapshot: sync_mode=$sync_mode" >&2
          return 6
        fi
        python3 - "$summary_path" "$meta_path" <<'PY'
import json, sys
summary = json.load(open(sys.argv[1], encoding="utf-8"))
meta = json.load(open(sys.argv[2], encoding="utf-8"))
stage = summary.get("stageTiming") or {}
report = summary.get("reportCounts") or {}
roa = stage.get("roa_validation_cache") or {}
print(
    "first published run success "
    f"run_id={meta.get('run_id')} "
    f"wall_ms={summary.get('wallMs')} "
    f"validation_ms={stage.get('validation_ms')} "
    f"repo_sync_ms_total={stage.get('repo_sync_ms_total')} "
    f"rrdp_ms={stage.get('rrdp_download_ms_total')} "
    f"rsync_ms={stage.get('rsync_download_ms_total')} "
    f"pp_cache={stage.get('enable_publication_point_validation_cache')} "
    f"roa_cache={stage.get('enable_roa_validation_cache')} "
    f"roa_hit={roa.get('hit_roas')} "
    f"roa_fresh={roa.get('fresh_roas')} "
    f"vrps={report.get('vrps')} "
    f"aspas={report.get('aspas')} "
    f"publication_points={report.get('publicationPoints')} "
    f"warnings={report.get('warnings')}"
)
PY
        return 0
      fi
      if [[ "$summary_status" == "failed" || "$summary_status" == "error" || "$meta_status" == "failed" || "$meta_status" == "error" ]]; then
        echo "first published run $run_id failed early: meta_status=$meta_status summary_status=$summary_status" >&2
        _dump_first_published_run_logs 80 120
        return 6
      fi
    fi
    if (( controller_exited )); then
      # The status files were re-read after the controller was seen gone and
      # still show no final result, so the exit really was premature.
      echo "run_soak controller exited before first published run $run_id completed" >&2
      _dump_first_published_run_logs 80 120
      return 6
    fi
    if [[ -n "$controller_pid" ]] && ! kill -0 "$controller_pid" 2>/dev/null; then
      # The run may have finished between the status poll above and this
      # liveness check; poll once more before declaring a failure.
      controller_exited=1
      continue
    fi
    sleep 5
  done
  echo "timeout waiting for first published run $run_id to complete" >&2
  _dump_first_published_run_logs 120 120
  return 6
}
[[ -d "$remote_root" ]] || { echo "remote root not found: $remote_root" >&2; exit 2; } [[ -d "$remote_root" ]] || { echo "remote root not found: $remote_root" >&2; exit 2; }
[[ -f "$remote_archive" ]] || { echo "archive not found: $remote_archive" >&2; exit 2; } [[ -f "$remote_archive" ]] || { echo "archive not found: $remote_archive" >&2; exit 2; }
@ -328,6 +419,8 @@ if [[ "$mode" == "execute" ]]; then
sed -n '1,40p' "$remote_root/logs/run_soak.publish-${timestamp}.stdout" >&2 || true sed -n '1,40p' "$remote_root/logs/run_soak.publish-${timestamp}.stdout" >&2 || true
exit 6 exit 6
fi fi
log "waiting for first published run to complete"
wait_for_first_published_run "$next_run"
else else
log "would start: nohup bash $remote_root/run_soak.sh > $remote_root/logs/run_soak.publish-${timestamp}.stdout 2> $remote_root/logs/run_soak.publish-${timestamp}.stderr &" log "would start: nohup bash $remote_root/run_soak.sh > $remote_root/logs/run_soak.publish-${timestamp}.stdout 2> $remote_root/logs/run_soak.publish-${timestamp}.stderr &"
log "expected first line: run_soak mode=continuous max_existing_run_index=${last_run#run_} next_run=$next_run" log "expected first line: run_soak mode=continuous max_existing_run_index=${last_run#run_} next_run=$next_run"
@ -347,6 +440,7 @@ if [[ "$restart_query_service" == "1" ]]; then
--listen 0.0.0.0:9560 \ --listen 0.0.0.0:9560 \
--watch-run-root "$remote_root" \ --watch-run-root "$remote_root" \
--watch-interval-secs 60 \ --watch-interval-secs 60 \
--watch-min-run-seq "$next_index" \
--retain-indexed-runs 10 \ --retain-indexed-runs 10 \
--indexer-bin /root/rpki_20260616_query_service_deploy/bin/rpki_query_indexer \ --indexer-bin /root/rpki_20260616_query_service_deploy/bin/rpki_query_indexer \
> /root/rpki_20260616_query_service_deploy/query-service.publish-${timestamp}.log 2>&1 & > /root/rpki_20260616_query_service_deploy/query-service.publish-${timestamp}.log 2>&1 &
@ -365,4 +459,4 @@ df -h / /root 2>/dev/null | sort -u || true
REMOTE REMOTE
)" )"
ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE' '$RESTART_QUERY_SERVICE' '$QUERY_SERVICE_PID_PATTERN'" <<< "$REMOTE_SCRIPT" ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE' '$RESTART_QUERY_SERVICE' '$QUERY_SERVICE_PID_PATTERN' '$FIRST_PUBLISHED_RUN_TIMEOUT_SECS'" <<< "$REMOTE_SCRIPT"