#######################################
# Print one value from a JSON file, addressed by a dot-separated key path.
# Arguments:
#   $1 - path to the JSON file
#   $2 - dot-separated key path, for example "stageTiming.validation_ms"
# Outputs:
#   One line on stdout:
#     - strings are printed raw, without quotes
#     - every other non-null value is printed as JSON text, so booleans
#       are true/false and lists/objects are valid JSON rather than
#       Python repr
#     - an empty line when the file cannot be read or parsed, when a key
#       is missing, when an intermediate value is not an object, or when
#       the value is null
# Returns:
#   Exit status of python3, which is 0 for every case listed above.
#######################################
json_get_value() {
  local path="$1"
  local key_path="$2"
  python3 - "$path" "$key_path" <<'PY'
import json, sys

path, key_path = sys.argv[1], sys.argv[2]
try:
    # Context manager so the handle is closed even when parsing fails.
    with open(path, encoding="utf-8") as handle:
        value = json.load(handle)
except Exception:
    # Unreadable or malformed input is reported as an empty value.
    print("")
    raise SystemExit(0)

for part in key_path.split("."):
    if not isinstance(value, dict):
        # Cannot descend into a scalar or a list.
        value = None
        break
    value = value.get(part)
    if value is None:
        break

if value is None:
    print("")
elif isinstance(value, str):
    print(value)
else:
    # Emit JSON text so shell callers never see Python repr.
    print(json.dumps(value, ensure_ascii=False))
PY
}
#######################################
# Copy the head of the publish-time run_soak logs to stderr.
# Globals:
#   remote_root (read), timestamp (read)
# Arguments:
#   $1 - number of leading stdout-log lines to show
#   $2 - number of leading stderr-log lines to show
# Returns:
#   Always 0; a missing log file is tolerated.
#######################################
first_published_run_dump_logs() {
  local stdout_lines="$1"
  local stderr_lines="$2"
  local log_base="$remote_root/logs/run_soak.publish-${timestamp}"
  sed -n "1,${stdout_lines}p" "${log_base}.stdout" >&2 || true
  sed -n "1,${stderr_lines}p" "${log_base}.stderr" >&2 || true
}

#######################################
# Print a one-line timing and count summary of a finished run.
# Arguments:
#   $1 - path to run-summary.json
#   $2 - path to run-meta.json
# Outputs:
#   A single "first published run success ..." line on stdout.
# Returns:
#   Exit status of python3.
#######################################
first_published_run_print_summary() {
  python3 - "$1" "$2" <<'PY'
import json, sys

with open(sys.argv[1], encoding="utf-8") as handle:
    summary = json.load(handle)
with open(sys.argv[2], encoding="utf-8") as handle:
    meta = json.load(handle)
stage = summary.get("stageTiming") or {}
report = summary.get("reportCounts") or {}
roa = stage.get("roa_validation_cache") or {}
print(
    "first published run success "
    f"run_id={meta.get('run_id')} "
    f"wall_ms={summary.get('wallMs')} "
    f"validation_ms={stage.get('validation_ms')} "
    f"repo_sync_ms_total={stage.get('repo_sync_ms_total')} "
    f"rrdp_ms={stage.get('rrdp_download_ms_total')} "
    f"rsync_ms={stage.get('rsync_download_ms_total')} "
    f"pp_cache={stage.get('enable_publication_point_validation_cache')} "
    f"roa_cache={stage.get('enable_roa_validation_cache')} "
    f"roa_hit={roa.get('hit_roas')} "
    f"roa_fresh={roa.get('fresh_roas')} "
    f"vrps={report.get('vrps')} "
    f"aspas={report.get('aspas')} "
    f"publication_points={report.get('publicationPoints')} "
    f"warnings={report.get('warnings')}"
)
PY
}

#######################################
# Inspect the artifacts of one run a single time.
# Globals:
#   remote_root (read)
# Arguments:
#   $1 - run id, the directory name under $remote_root/runs
# Outputs:
#   Summary line on stdout on success; diagnostics on stderr on failure.
# Returns:
#   0 - summary and meta both report "success" and sync_mode is "snapshot"
#   1 - no verdict yet: a file is missing or no terminal status is reported
#   6 - terminal failure: a status is "failed" or "error", or the run
#       succeeded with a sync_mode other than "snapshot"
#######################################
first_published_run_poll() {
  local run_id="$1"
  local run_dir="$remote_root/runs/$run_id"
  local meta_path="$run_dir/run-meta.json"
  local summary_path="$run_dir/run-summary.json"
  local summary_status meta_status sync_mode

  [[ -f "$summary_path" && -f "$meta_path" ]] || return 1

  summary_status="$(json_get_status "$summary_path")"
  meta_status="$(json_get_status "$meta_path")"

  if [[ "$summary_status" == "success" && "$meta_status" == "success" ]]; then
    # sync_mode only matters for a successful run, so it is read here
    # rather than on every poll.
    sync_mode="$(json_get_value "$meta_path" "sync_mode")"
    if [[ "$sync_mode" != "snapshot" ]]; then
      echo "first published run $run_id completed but was not a snapshot: sync_mode=$sync_mode" >&2
      return 6
    fi
    # The run is already verified at this point; the summary line is
    # informational, so a rendering failure only produces a warning.
    first_published_run_print_summary "$summary_path" "$meta_path" \
      || echo "warning: could not render summary for first published run $run_id" >&2
    return 0
  fi

  if [[ "$summary_status" == "failed" || "$summary_status" == "error" \
    || "$meta_status" == "failed" || "$meta_status" == "error" ]]; then
    echo "first published run $run_id failed early: meta_status=$meta_status summary_status=$summary_status" >&2
    first_published_run_dump_logs 80 120
    return 6
  fi

  return 1
}

#######################################
# Block until the first run started after a publish reaches a verdict.
# Polls the run artifacts every 5 seconds until the run succeeds as a
# snapshot, fails, the run_soak controller exits, or the timeout elapses.
# Globals:
#   remote_root (read)
#   timestamp (read, selects the publish log files)
#   first_published_run_timeout_secs (read, non-negative decimal seconds;
#     7200 is used when it is unset or empty)
# Arguments:
#   $1 - run id to wait for
# Outputs:
#   Summary line on stdout on success; diagnostics and log heads on stderr.
# Returns:
#   0 - run succeeded and was a snapshot
#   2 - the configured timeout is not a non-negative decimal integer
#   6 - run failed, was not a snapshot, controller exited, or timeout
#######################################
wait_for_first_published_run() {
  local run_id="$1"
  local timeout_secs="${first_published_run_timeout_secs:-7200}"
  local deadline controller_pid rc

  # Reject anything but plain digits before it reaches arithmetic
  # expansion, where arbitrary text would be evaluated as an expression.
  if [[ ! "$timeout_secs" =~ ^[0-9]+$ ]]; then
    echo "invalid first published run timeout: $timeout_secs" >&2
    return 2
  fi
  # Force base 10 so a value such as 0900 is not parsed as octal.
  deadline=$((SECONDS + 10#$timeout_secs))

  controller_pid="$(cat "$remote_root/state/meta/run_soak-pid" 2>/dev/null || true)"

  while (( SECONDS < deadline )); do
    rc=0
    first_published_run_poll "$run_id" || rc=$?
    (( rc == 1 )) || return "$rc"

    if [[ -n "$controller_pid" ]] && ! kill -0 "$controller_pid" 2>/dev/null; then
      # The controller may have written its final artifacts and exited
      # after the poll above, so look once more before declaring failure.
      rc=0
      first_published_run_poll "$run_id" || rc=$?
      (( rc == 1 )) || return "$rc"
      echo "run_soak controller exited before first published run $run_id completed" >&2
      first_published_run_dump_logs 80 120
      return 6
    fi

    sleep 5
  done

  echo "timeout waiting for first published run $run_id to complete" >&2
  first_published_run_dump_logs 120 120
  return 6
}
/root/rpki_20260616_query_service_deploy/bin/rpki_query_indexer \ > /root/rpki_20260616_query_service_deploy/query-service.publish-${timestamp}.log 2>&1 & @@ -365,4 +459,4 @@ df -h / /root 2>/dev/null | sort -u || true REMOTE )" -ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE' '$RESTART_QUERY_SERVICE' '$QUERY_SERVICE_PID_PATTERN'" <<< "$REMOTE_SCRIPT" +ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE' '$RESTART_QUERY_SERVICE' '$QUERY_SERVICE_PID_PATTERN' '$FIRST_PUBLISHED_RUN_TIMEOUT_SECS'" <<< "$REMOTE_SCRIPT"