20260618 修复远端231发布验证流程
This commit is contained in:
parent
b6344074ce
commit
261d652123
@ -86,6 +86,7 @@ PACKAGE_BASENAME="$(basename "$PACKAGE_ARCHIVE")"
|
|||||||
REMOTE_ARCHIVE="$REMOTE_STAGE_PARENT/$PACKAGE_BASENAME"
|
REMOTE_ARCHIVE="$REMOTE_STAGE_PARENT/$PACKAGE_BASENAME"
|
||||||
LOCAL_GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')"
|
LOCAL_GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')"
|
||||||
LOCAL_GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)"
|
LOCAL_GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)"
|
||||||
|
FIRST_PUBLISHED_RUN_TIMEOUT_SECS="${FIRST_PUBLISHED_RUN_TIMEOUT_SECS:-7200}"
|
||||||
|
|
||||||
echo "publish mode=$MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE git=$LOCAL_GIT_SHA"
|
echo "publish mode=$MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE git=$LOCAL_GIT_SHA"
|
||||||
if [[ -n "$LOCAL_GIT_STATUS" ]]; then
|
if [[ -n "$LOCAL_GIT_STATUS" ]]; then
|
||||||
@ -103,6 +104,7 @@ remote_archive="$2"
|
|||||||
mode="$3"
|
mode="$3"
|
||||||
restart_query_service="$4"
|
restart_query_service="$4"
|
||||||
query_pattern="$5"
|
query_pattern="$5"
|
||||||
|
first_published_run_timeout_secs="$6"
|
||||||
|
|
||||||
log() {
|
log() {
|
||||||
printf '[publish] %s\n' "$*"
|
printf '[publish] %s\n' "$*"
|
||||||
@ -129,6 +131,28 @@ except Exception:
|
|||||||
PY
|
PY
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Print the value found at a dotted key path inside a JSON file.
# Arguments:
#   $1 - path to the JSON file
#   $2 - dotted key path through nested objects, e.g. "a.b.c"
# Outputs:
#   The value on stdout. Strings and numbers are printed as-is; booleans,
#   lists and objects are printed as JSON so the shell never sees Python
#   repr such as "True". An empty line is printed when the file cannot be
#   read or parsed, when any path component is missing, when an
#   intermediate value is not an object, or when the value is null.
# Returns:
#   Exit status of python3 (0 in all of the cases above).
#######################################
json_get_value() {
  local path="$1"
  local key_path="$2"
  python3 - "$path" "$key_path" <<'PY'
import json, sys

path, key_path = sys.argv[1], sys.argv[2]
try:
    # Context manager so the descriptor is closed deterministically.
    with open(path, encoding="utf-8") as handle:
        value = json.load(handle)
except Exception:
    # Missing, unreadable or half-written file: report "no value".
    print("")
    raise SystemExit(0)

for part in key_path.split("."):
    if not isinstance(value, dict):
        value = None
        break
    value = value.get(part)
    if value is None:
        break

if value is None:
    print("")
elif isinstance(value, (bool, list, dict)):
    # Emit JSON rather than Python repr for non-scalar / boolean leaves.
    print(json.dumps(value, ensure_ascii=False))
else:
    print(value)
PY
}
|
||||||
|
|
||||||
max_run_name() {
|
max_run_name() {
|
||||||
find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null | sort -V | tail -1
|
find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null | sort -V | tail -1
|
||||||
}
|
}
|
||||||
@ -178,6 +202,73 @@ terminate_matching() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Print the head of this publish's run_soak stdout/stderr logs to stderr.
# Best-effort: a missing log file is not an error.
# Globals:   remote_root (read), timestamp (read)
# Arguments: $1 - number of stdout lines, $2 - number of stderr lines
# Returns:   always 0
#######################################
_dump_run_soak_publish_logs() {
  local stdout_lines="$1"
  local stderr_lines="$2"
  local log_prefix="$remote_root/logs/run_soak.publish-${timestamp}"
  sed -n "1,${stdout_lines}p" "${log_prefix}.stdout" >&2 || true
  sed -n "1,${stderr_lines}p" "${log_prefix}.stderr" >&2 || true
}

#######################################
# Poll until the given run reports a terminal status, then verify that it
# was a snapshot sync and print a one-line timing/count summary.
# Globals:
#   remote_root                       (read) root containing runs/, logs/, state/
#   first_published_run_timeout_secs  (read) max wait, whole seconds
#   timestamp                         (read) suffix of the run_soak log files
# Arguments:
#   $1 - run id (directory name under "$remote_root/runs")
# Outputs:
#   Summary line on stdout on success; diagnostics and log heads on stderr
#   on failure.
# Returns:
#   0 when run-summary.json and run-meta.json both report "success" and
#     sync_mode is "snapshot";
#   6 on invalid arguments, non-snapshot run, failed/error status,
#     controller exit before completion, or timeout.
#######################################
wait_for_first_published_run() {
  local run_id="${1:-}"
  local run_dir="$remote_root/runs/$run_id"
  local meta_path="$run_dir/run-meta.json"
  local summary_path="$run_dir/run-summary.json"
  local controller_pid controller_alive deadline
  local summary_status meta_status sync_mode

  # An empty id would make us poll "$remote_root/runs//..." until timeout.
  if [[ -z "$run_id" ]]; then
    echo "wait_for_first_published_run: missing run id" >&2
    return 6
  fi

  # Validate before doing arithmetic: an empty or non-numeric value would
  # either be a syntax error or silently evaluate to 0 (instant "timeout").
  if [[ ! "${first_published_run_timeout_secs:-}" =~ ^[0-9]+$ ]]; then
    echo "invalid first published run timeout (expected whole seconds): '${first_published_run_timeout_secs:-}'" >&2
    return 6
  fi
  # 10# forces base 10 so values with leading zeros are not read as octal.
  deadline=$((SECONDS + 10#$first_published_run_timeout_secs))

  controller_pid="$(cat "$remote_root/state/meta/run_soak-pid" 2>/dev/null || true)"

  while (( SECONDS < deadline )); do
    # Sample liveness BEFORE reading the status files. Anything the
    # controller wrote before exiting is then guaranteed to be seen below,
    # so a controller that finished and exited is not misreported.
    controller_alive=1
    if [[ -n "$controller_pid" ]] && ! kill -0 "$controller_pid" 2>/dev/null; then
      controller_alive=0
    fi

    if [[ -f "$summary_path" && -f "$meta_path" ]]; then
      summary_status="$(json_get_status "$summary_path")"
      meta_status="$(json_get_status "$meta_path")"
      sync_mode="$(json_get_value "$meta_path" "sync_mode")"

      if [[ "$summary_status" == "success" && "$meta_status" == "success" ]]; then
        if [[ "$sync_mode" != "snapshot" ]]; then
          echo "first published run $run_id completed but was not a snapshot: sync_mode=$sync_mode" >&2
          return 6
        fi
        python3 - "$summary_path" "$meta_path" <<'PY'
import json, sys
summary = json.load(open(sys.argv[1], encoding="utf-8"))
meta = json.load(open(sys.argv[2], encoding="utf-8"))
stage = summary.get("stageTiming") or {}
report = summary.get("reportCounts") or {}
roa = stage.get("roa_validation_cache") or {}
print(
    "first published run success "
    f"run_id={meta.get('run_id')} "
    f"wall_ms={summary.get('wallMs')} "
    f"validation_ms={stage.get('validation_ms')} "
    f"repo_sync_ms_total={stage.get('repo_sync_ms_total')} "
    f"rrdp_ms={stage.get('rrdp_download_ms_total')} "
    f"rsync_ms={stage.get('rsync_download_ms_total')} "
    f"pp_cache={stage.get('enable_publication_point_validation_cache')} "
    f"roa_cache={stage.get('enable_roa_validation_cache')} "
    f"roa_hit={roa.get('hit_roas')} "
    f"roa_fresh={roa.get('fresh_roas')} "
    f"vrps={report.get('vrps')} "
    f"aspas={report.get('aspas')} "
    f"publication_points={report.get('publicationPoints')} "
    f"warnings={report.get('warnings')}"
)
PY
        return 0
      fi

      if [[ "$summary_status" == "failed" || "$summary_status" == "error" || "$meta_status" == "failed" || "$meta_status" == "error" ]]; then
        echo "first published run $run_id failed early: meta_status=$meta_status summary_status=$summary_status" >&2
        _dump_run_soak_publish_logs 80 120
        return 6
      fi
    fi

    if (( ! controller_alive )); then
      echo "run_soak controller exited before first published run $run_id completed" >&2
      _dump_run_soak_publish_logs 80 120
      return 6
    fi

    sleep 5
  done

  echo "timeout waiting for first published run $run_id to complete" >&2
  _dump_run_soak_publish_logs 120 120
  return 6
}
|
||||||
|
|
||||||
[[ -d "$remote_root" ]] || { echo "remote root not found: $remote_root" >&2; exit 2; }
|
[[ -d "$remote_root" ]] || { echo "remote root not found: $remote_root" >&2; exit 2; }
|
||||||
[[ -f "$remote_archive" ]] || { echo "archive not found: $remote_archive" >&2; exit 2; }
|
[[ -f "$remote_archive" ]] || { echo "archive not found: $remote_archive" >&2; exit 2; }
|
||||||
|
|
||||||
@ -328,6 +419,8 @@ if [[ "$mode" == "execute" ]]; then
|
|||||||
sed -n '1,40p' "$remote_root/logs/run_soak.publish-${timestamp}.stdout" >&2 || true
|
sed -n '1,40p' "$remote_root/logs/run_soak.publish-${timestamp}.stdout" >&2 || true
|
||||||
exit 6
|
exit 6
|
||||||
fi
|
fi
|
||||||
|
log "waiting for first published run to complete"
|
||||||
|
wait_for_first_published_run "$next_run"
|
||||||
else
|
else
|
||||||
log "would start: nohup bash $remote_root/run_soak.sh > $remote_root/logs/run_soak.publish-${timestamp}.stdout 2> $remote_root/logs/run_soak.publish-${timestamp}.stderr &"
|
log "would start: nohup bash $remote_root/run_soak.sh > $remote_root/logs/run_soak.publish-${timestamp}.stdout 2> $remote_root/logs/run_soak.publish-${timestamp}.stderr &"
|
||||||
log "expected first line: run_soak mode=continuous max_existing_run_index=${last_run#run_} next_run=$next_run"
|
log "expected first line: run_soak mode=continuous max_existing_run_index=${last_run#run_} next_run=$next_run"
|
||||||
@ -347,6 +440,7 @@ if [[ "$restart_query_service" == "1" ]]; then
|
|||||||
--listen 0.0.0.0:9560 \
|
--listen 0.0.0.0:9560 \
|
||||||
--watch-run-root "$remote_root" \
|
--watch-run-root "$remote_root" \
|
||||||
--watch-interval-secs 60 \
|
--watch-interval-secs 60 \
|
||||||
|
--watch-min-run-seq "$next_index" \
|
||||||
--retain-indexed-runs 10 \
|
--retain-indexed-runs 10 \
|
||||||
--indexer-bin /root/rpki_20260616_query_service_deploy/bin/rpki_query_indexer \
|
--indexer-bin /root/rpki_20260616_query_service_deploy/bin/rpki_query_indexer \
|
||||||
> /root/rpki_20260616_query_service_deploy/query-service.publish-${timestamp}.log 2>&1 &
|
> /root/rpki_20260616_query_service_deploy/query-service.publish-${timestamp}.log 2>&1 &
|
||||||
@ -365,4 +459,4 @@ df -h / /root 2>/dev/null | sort -u || true
|
|||||||
REMOTE
|
REMOTE
|
||||||
)"
|
)"
|
||||||
|
|
||||||
ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE' '$RESTART_QUERY_SERVICE' '$QUERY_SERVICE_PID_PATTERN'" <<< "$REMOTE_SCRIPT"
|
ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE' '$RESTART_QUERY_SERVICE' '$QUERY_SERVICE_PID_PATTERN' '$FIRST_PUBLISHED_RUN_TIMEOUT_SECS'" <<< "$REMOTE_SCRIPT"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user