#!/usr/bin/env bash
#
# Publish a portable soak package to the remote soak host, in place.
#
# The script runs in two stages:
#   1. Locally: parse and validate arguments, upload the archive to the
#      remote staging directory (this happens in dry-run mode too).
#   2. Remotely: stream REMOTE_SCRIPT to `bash -s` over ssh. That script stops
#      the soak processes under REMOTE_ROOT, backs up state, installs the new
#      package, restarts run_soak.sh and waits for the first run to succeed.
#
# Exit codes:
#   2  usage error or missing precondition (package, remote root, archive,
#      no successful run)
#   3  last run is not successful
#   4  soak processes could not be stopped
#   5  extracted package has no executable bin/rpki
#   6  first published run did not start as a snapshot, failed, or timed out
#   7  no successful run remains after stopping the soak
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

REMOTE_HOST="${REMOTE_HOST:-root@47.251.127.231}"
REMOTE_ROOT="${REMOTE_ROOT:-/root/rpki_20260608_2_feature062_24h_20260608T075547Z/portable-soak}"
PACKAGE_ARCHIVE="${PACKAGE_ARCHIVE:-}"
MODE="${MODE:-dry-run}"
RESTART_QUERY_SERVICE="${RESTART_QUERY_SERVICE:-0}"
QUERY_SERVICE_PID_PATTERN="${QUERY_SERVICE_PID_PATTERN:-rpki_query_service --query-db /root/rpki_20260616_query_service_deploy/query-db}"

# Print the command-line help to stdout.
usage() {
  cat <<'USAGE'
Usage:
  scripts/soak/publish_remote231.sh --package <archive.tar.gz> [--execute]
      [--remote-host <user@host>] [--remote-root <path>] [--restart-query-service]

Publishes a new portable soak package to remote231 in place:
  - stops only the current soak controller/daemon/rpki child under REMOTE_ROOT;
  - preserves runs/ so run numbering continues;
  - backs up state/db before replacing binaries/scripts;
  - moves state/db away and creates a new empty state/db so the next run is snapshot;
  - leaves metrics/query/prometheus/grafana configuration untouched.

Default mode is dry-run. Use --execute to apply changes.
The package archive is uploaded to the remote staging directory in both modes.

Options:
  --package <archive.tar.gz>   package archive to publish (required)
  --execute                    apply changes on the remote host
  --dry-run                    print the remote changes that would be made (default)
  --remote-host <user@host>    override REMOTE_HOST
  --remote-root <path>         override REMOTE_ROOT
  --restart-query-service      restart the query service after publishing
  -h, --help                   show this help

Environment overrides:
  REMOTE_HOST=root@47.251.127.231
  REMOTE_ROOT=/root/rpki_20260608_2_feature062_24h_20260608T075547Z/portable-soak
  RESTART_QUERY_SERVICE=0|1
  FIRST_PUBLISHED_RUN_TIMEOUT_SECS=7200
USAGE
}

# Print an error message to stderr and exit with status 2.
die() {
  echo "error: $*" >&2
  exit 2
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --package)
      shift
      PACKAGE_ARCHIVE="${1:?--package requires a value}"
      ;;
    --remote-host)
      shift
      REMOTE_HOST="${1:?--remote-host requires a value}"
      ;;
    --remote-root)
      shift
      REMOTE_ROOT="${1:?--remote-root requires a value}"
      ;;
    --execute)
      MODE="execute"
      ;;
    --dry-run)
      MODE="dry-run"
      ;;
    --restart-query-service)
      RESTART_QUERY_SERVICE=1
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    *)
      die "unknown argument: $1"
      ;;
  esac
  shift
done

[[ -n "$PACKAGE_ARCHIVE" ]] || die "--package is required"
[[ -f "$PACKAGE_ARCHIVE" ]] || die "package not found: $PACKAGE_ARCHIVE"
case "$MODE" in
  dry-run|execute) ;;
  *) die "MODE must be dry-run or execute: $MODE" ;;
esac
command -v ssh >/dev/null 2>&1 || die "ssh is required"
command -v scp >/dev/null 2>&1 || die "scp is required"

REMOTE_STAGE_PARENT="/root/rpki_publish_packages"
PACKAGE_BASENAME="$(basename "$PACKAGE_ARCHIVE")"
REMOTE_ARCHIVE="$REMOTE_STAGE_PARENT/$PACKAGE_BASENAME"
LOCAL_GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')"
LOCAL_GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)"
FIRST_PUBLISHED_RUN_TIMEOUT_SECS="${FIRST_PUBLISHED_RUN_TIMEOUT_SECS:-7200}"

# The timeout is used in remote shell arithmetic only after the soak has been
# stopped and restarted, so reject bad values here, before touching the remote.
# Leading zeros are rejected because bash arithmetic would read them as octal.
[[ "$FIRST_PUBLISHED_RUN_TIMEOUT_SECS" =~ ^(0|[1-9][0-9]*)$ ]] \
  || die "FIRST_PUBLISHED_RUN_TIMEOUT_SECS must be a non-negative integer: $FIRST_PUBLISHED_RUN_TIMEOUT_SECS"

echo "publish mode=$MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE git=$LOCAL_GIT_SHA"
if [[ -n "$LOCAL_GIT_STATUS" ]]; then
  echo "warning: local git worktree is dirty; package manifest should record provenance" >&2
fi

# Staging runs in dry-run mode as well: the remote script refuses to continue
# unless the archive is already present on the remote host.
ssh "$REMOTE_HOST" "mkdir -p '$REMOTE_STAGE_PARENT'"
scp "$PACKAGE_ARCHIVE" "$REMOTE_HOST:$REMOTE_ARCHIVE"

# Script executed on the remote host via `bash -s`. Positional arguments:
#   $1 remote root   $2 remote archive   $3 mode (dry-run|execute)
#   $4 restart query service (1 = yes)   $5 query service pgrep pattern
#   $6 timeout in seconds for the first published run
REMOTE_SCRIPT="$(cat <<'REMOTE'
set -euo pipefail

remote_root="$1"
remote_archive="$2"
mode="$3"
restart_query_service="$4"
query_pattern="$5"
first_published_run_timeout_secs="$6"

# pgrep -f pattern matching the soak controller, daemon and rpki child.
soak_process_pattern="$remote_root/(run_soak.sh|bin/rpki_daemon|bin/rpki)( |$)"
# How long to wait for run_soak.sh to log that the first run started.
startup_verify_timeout_secs=30

log() {
  printf '[publish] %s\n' "$*"
}

# Run the given command in execute mode; otherwise print it shell-quoted.
run_or_echo() {
  if [[ "$mode" == "execute" ]]; then
    "$@"
  else
    printf '[dry-run] '
    printf '%q ' "$@"
    printf '\n'
  fi
}

# Print the top-level "status" field of a JSON file, or "missing" when the
# file cannot be read/parsed or has no such field.
json_get_status() {
  local path="$1"
  python3 - "$path" <<'PY'
import json, sys
try:
    print(json.load(open(sys.argv[1], encoding="utf-8")).get("status", "missing"))
except Exception:
    print("missing")
PY
}

# Print the value at a dotted key path inside a JSON file, or an empty line
# when the file is unreadable or the path does not resolve.
json_get_value() {
  local path="$1"
  local key_path="$2"
  python3 - "$path" "$key_path" <<'PY'
import json, sys
path, key_path = sys.argv[1], sys.argv[2]
try:
    value = json.load(open(path, encoding="utf-8"))
except Exception:
    print("")
    raise SystemExit(0)
for part in key_path.split("."):
    if not isinstance(value, dict):
        value = None
        break
    value = value.get(part)
    if value is None:
        break
print("" if value is None else value)
PY
}

# Print the highest run_* directory name under runs/ (version sort).
max_run_name() {
  find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
    | sort -V \
    | tail -1
}

# Print the highest run_* directory whose run-meta.json and run-summary.json
# both report status "success".
max_successful_run_name() {
  local candidate
  find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
    | sort -V \
    | while read -r candidate; do
        [[ -n "$candidate" ]] || continue
        if [[ "$(json_get_status "$remote_root/runs/$candidate/run-meta.json")" == "success" \
          && "$(json_get_status "$remote_root/runs/$candidate/run-summary.json")" == "success" ]]; then
          printf '%s\n' "$candidate"
        fi
      done \
    | tail -1
}

# Print the unique PIDs whose command line matches the pgrep -f pattern,
# excluding this shell, its parent, and the ssh-launched `bash -s --` wrapper
# (whose argv contains remote_root and would otherwise match).
matching_pids() {
  local pattern="$1"
  pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
    local pid cmd
    pid="${line%% *}"
    cmd="${line#* }"
    [[ "$pid" =~ ^[0-9]+$ ]] || continue
    [[ "$pid" == "$$" || "$pid" == "$BASHPID" || "$pid" == "${PPID:-}" ]] && continue
    [[ "$cmd" == *"bash -s --"* && "$cmd" == *"$remote_root"* ]] && continue
    printf '%s\n' "$pid"
  done | sort -u
}

# Send a signal (e.g. -TERM) to every process matching the pattern.
# Delivery failures are ignored on purpose: a process may exit in between.
terminate_matching() {
  local signal="$1"
  local pattern="$2"
  local -a pids=()
  mapfile -t pids < <(matching_pids "$pattern")
  if (( ${#pids[@]} > 0 )); then
    kill "$signal" "${pids[@]}" >/dev/null 2>&1 || true
  fi
}

# Signal the rpki child, the daemon and the controller under remote_root.
signal_soak_processes() {
  local signal="$1"
  terminate_matching "$signal" "$remote_root/bin/rpki "
  terminate_matching "$signal" "$remote_root/bin/rpki_daemon "
  terminate_matching "$signal" "$remote_root/run_soak.sh"
}

# Wait up to 120 seconds for all soak processes to disappear.
# Returns 0 once none remain, 1 on timeout.
wait_no_soak_children() {
  local deadline=$((SECONDS + 120))
  while (( SECONDS < deadline )); do
    if [[ -z "$(matching_pids "$soak_process_pattern" | head -1)" ]]; then
      return 0
    fi
    sleep 2
  done
  return 1
}

# Dump the head of this publish's run_soak stdout/stderr logs to stderr.
# Best effort: missing logs must not mask the real failure.
dump_publish_logs() {
  local stdout_lines="$1"
  local stderr_lines="$2"
  sed -n "1,${stdout_lines}p" "$remote_root/logs/run_soak.publish-${timestamp}.stdout" >&2 || true
  sed -n "1,${stderr_lines}p" "$remote_root/logs/run_soak.publish-${timestamp}.stderr" >&2 || true
}

# Poll until the given run has succeeded as a snapshot run.
# Returns 0 on success; returns 6 if the run is not a snapshot, reports
# failed/error, the controller exits early, or the timeout elapses.
wait_for_first_published_run() {
  local run_id="$1"
  local deadline=$((SECONDS + first_published_run_timeout_secs))
  local run_dir="$remote_root/runs/$run_id"
  local meta_path="$run_dir/run-meta.json"
  local summary_path="$run_dir/run-summary.json"
  local controller_pid
  controller_pid="$(cat "$remote_root/state/meta/run_soak-pid" 2>/dev/null || true)"

  while (( SECONDS < deadline )); do
    if [[ -f "$summary_path" && -f "$meta_path" ]]; then
      local summary_status meta_status sync_mode
      summary_status="$(json_get_status "$summary_path")"
      meta_status="$(json_get_status "$meta_path")"
      sync_mode="$(json_get_value "$meta_path" "sync_mode")"

      if [[ "$summary_status" == "success" && "$meta_status" == "success" ]]; then
        if [[ "$sync_mode" != "snapshot" ]]; then
          echo "first published run $run_id completed but was not a snapshot: sync_mode=$sync_mode" >&2
          return 6
        fi
        python3 - "$summary_path" "$meta_path" <<'PY'
import json, sys
summary = json.load(open(sys.argv[1], encoding="utf-8"))
meta = json.load(open(sys.argv[2], encoding="utf-8"))
stage = summary.get("stageTiming") or {}
report = summary.get("reportCounts") or {}
roa = stage.get("roa_validation_cache") or {}
print(
    "first published run success "
    f"run_id={meta.get('run_id')} "
    f"wall_ms={summary.get('wallMs')} "
    f"validation_ms={stage.get('validation_ms')} "
    f"repo_sync_ms_total={stage.get('repo_sync_ms_total')} "
    f"rrdp_ms={stage.get('rrdp_download_ms_total')} "
    f"rsync_ms={stage.get('rsync_download_ms_total')} "
    f"pp_cache={stage.get('enable_publication_point_validation_cache')} "
    f"roa_cache={stage.get('enable_roa_validation_cache')} "
    f"roa_hit={roa.get('hit_roas')} "
    f"roa_fresh={roa.get('fresh_roas')} "
    f"vrps={report.get('vrps')} "
    f"aspas={report.get('aspas')} "
    f"publication_points={report.get('publicationPoints')} "
    f"warnings={report.get('warnings')}"
)
PY
        return 0
      fi

      if [[ "$summary_status" == "failed" || "$summary_status" == "error" \
        || "$meta_status" == "failed" || "$meta_status" == "error" ]]; then
        echo "first published run $run_id failed early: meta_status=$meta_status summary_status=$summary_status" >&2
        dump_publish_logs 80 120
        return 6
      fi
    fi

    if [[ -n "$controller_pid" ]] && ! kill -0 "$controller_pid" 2>/dev/null; then
      echo "run_soak controller exited before first published run $run_id completed" >&2
      dump_publish_logs 80 120
      return 6
    fi
    sleep 5
  done

  echo "timeout waiting for first published run $run_id to complete" >&2
  dump_publish_logs 120 120
  return 6
}

# Derive the run statuses, next run id and backup directory from $last_run.
# Reads: remote_root, timestamp, last_run.
# Writes: last_status, last_summary_status, next_index, next_run, backup_root.
derive_publish_plan() {
  last_status="$(json_get_status "$remote_root/runs/$last_run/run-meta.json")"
  last_summary_status="$(json_get_status "$remote_root/runs/$last_run/run-summary.json")"
  # 10# forces base 10 so zero-padded run numbers are not parsed as octal.
  next_index=$((10#${last_run#run_} + 1))
  next_run="$(printf 'run_%04d' "$next_index")"
  backup_root="$remote_root/state/backups/pre_publish_${timestamp}_after_${last_run}"
}

[[ -d "$remote_root" ]] || { echo "remote root not found: $remote_root" >&2; exit 2; }
[[ -f "$remote_archive" ]] || { echo "archive not found: $remote_archive" >&2; exit 2; }

timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
latest_run="$(max_run_name || true)"
last_successful_run="$(max_successful_run_name || true)"
if [[ -z "$last_successful_run" ]]; then
  echo "no successful runs found under $remote_root/runs (latest_run=${latest_run:-none})" >&2
  exit 2
fi

last_run="$last_successful_run"
derive_publish_plan
extract_root="$remote_root/state/publish-staging/$timestamp"
new_pkg="$extract_root/portable-soak"

log "latest_run=${latest_run:-none} last_successful_run=$last_run run_meta_status=$last_status run_summary_status=$last_summary_status next_run=$next_run"
log "backup_root=$backup_root"
log "extract_root=$extract_root"
log "mode=$mode"

# Defensive re-check: the status files may have changed since the scan above.
if [[ "$last_status" != "success" || "$last_summary_status" != "success" ]]; then
  echo "last run is not successful; refusing publish: $last_run meta=$last_status summary=$last_summary_status" >&2
  exit 3
fi

log "current monitored sidecars"
pgrep -af 'rpki_artifact_metrics|rpki_query_service|rpki_inter_rp_metrics|prometheus|grafana' || true
log "current soak processes under root"
pgrep -af "$soak_process_pattern" || true

# Stop the soak: TERM first, escalate to KILL only if they do not exit.
if [[ "$mode" == "execute" ]]; then
  signal_soak_processes -TERM
  if ! wait_no_soak_children; then
    echo "soak processes did not stop cleanly; forcing kill" >&2
    signal_soak_processes -KILL
    wait_no_soak_children || { echo "failed to stop soak processes" >&2; exit 4; }
  fi
else
  log "would stop soak processes under $remote_root only"
fi

# A run may have completed while the soak was being stopped; re-plan if so.
latest_run_after_stop="$(max_run_name || true)"
last_successful_run_after_stop="$(max_successful_run_name || true)"
if [[ -z "$last_successful_run_after_stop" ]]; then
  echo "no successful run remains after stopping soak" >&2
  exit 7
fi
if [[ "$last_successful_run_after_stop" != "$last_run" ]]; then
  last_run="$last_successful_run_after_stop"
  derive_publish_plan
  log "recomputed last_successful_run=$last_run next_run=$next_run after stopping soak"
fi

# Move run directories newer than the last successful one out of runs/ so the
# restarted controller continues numbering from next_run.
if [[ -n "${latest_run_after_stop:-}" && "$latest_run_after_stop" != "$last_run" ]]; then
  log "latest run after stop is incomplete: $latest_run_after_stop; preserving it outside runs/ before publishing"
  incomplete_dir="$backup_root/incomplete-runs"
  run_or_echo mkdir -p "$incomplete_dir"
  latest_index=$((10#${latest_run_after_stop#run_}))
  stable_index=$((10#${last_run#run_}))
  for ((idx = stable_index + 1; idx <= latest_index; idx++)); do
    candidate="$(printf 'run_%04d' "$idx")"
    if [[ -d "$remote_root/runs/$candidate" ]]; then
      run_or_echo mv "$remote_root/runs/$candidate" "$incomplete_dir/$candidate"
    fi
  done
fi

run_or_echo mkdir -p "$backup_root" "$extract_root"
if [[ "$mode" == "execute" ]]; then
  tar -C "$extract_root" -xzf "$remote_archive"
  [[ -x "$new_pkg/bin/rpki" ]] || { echo "extracted package missing bin/rpki" >&2; exit 5; }
else
  log "would extract archive to $extract_root"
fi

if [[ "$mode" == "execute" ]]; then
  {
    echo "timestamp_utc=$timestamp"
    echo "remote_root=$remote_root"
    echo "remote_archive=$remote_archive"
    echo "last_run=$last_run"
    echo "next_run=$next_run"
    echo "last_status=$last_status"
    echo "last_summary_status=$last_summary_status"
    echo "mode=$mode"
  } > "$backup_root/publish-meta.txt"
fi

# Back up state. state/db and bin are moved away; meta and .env are copied.
if [[ -d "$remote_root/state/db" ]]; then
  run_or_echo mv "$remote_root/state/db" "$backup_root/db"
fi
if [[ -d "$remote_root/state/meta" ]]; then
  run_or_echo cp -a "$remote_root/state/meta" "$backup_root/meta-copy"
fi
if [[ -f "$remote_root/.env" ]]; then
  run_or_echo cp -a "$remote_root/.env" "$backup_root/env.before"
fi
if [[ -d "$remote_root/bin" ]]; then
  run_or_echo mv "$remote_root/bin" "$backup_root/bin.before"
fi

# Replace only the entries the new package actually ships.
package_paths=(
  run_soak.sh
  run_24h_soak_with_metrics.sh
  scripts
  monitor
  fixtures
  copied-binaries.txt
  missing-optional-binaries.txt
  fixtures.txt
  scripts.txt
  manifest.json
  portable-soak.env.example
)
for path in "${package_paths[@]}"; do
  if [[ -e "$new_pkg/$path" ]]; then
    if [[ -e "$remote_root/$path" ]]; then
      run_or_echo rm -rf "${remote_root:?}/$path"
    fi
    run_or_echo cp -a "$new_pkg/$path" "$remote_root/$path"
  fi
done
run_or_echo cp -a "$new_pkg/bin" "$remote_root/bin"

if [[ -f "$remote_root/.env" ]]; then
  run_or_echo cp -a "$remote_root/.env" "$backup_root/env.generated_from_package"
fi
if [[ -f "$backup_root/env.before" ]]; then
  run_or_echo cp -a "$backup_root/env.before" "$remote_root/.env"
fi

run_or_echo mkdir -p "$remote_root/state/db" "$remote_root/state/meta" "$remote_root/tmp" "$remote_root/logs" "$remote_root/state/invalid"
if [[ -f "$backup_root/meta-copy/last-run-id" ]]; then
  run_or_echo cp -a "$backup_root/meta-copy/last-run-id" "$remote_root/state/meta/last-run-id"
fi

soak_stdout="$remote_root/logs/run_soak.publish-${timestamp}.stdout"
soak_stderr="$remote_root/logs/run_soak.publish-${timestamp}.stderr"
if [[ "$mode" == "execute" ]]; then
  chmod +x "$remote_root/run_soak.sh" "$remote_root/run_24h_soak_with_metrics.sh" "$remote_root/bin/"* 2>/dev/null || true
  nohup bash "$remote_root/run_soak.sh" > "$soak_stdout" 2> "$soak_stderr" &
  soak_pid=$!
  echo "$soak_pid" > "$remote_root/state/meta/run_soak-pid"
  log "started run_soak pid=$soak_pid"

  # Poll for the start marker rather than sleeping a fixed time: a slow
  # controller start must not be reported as a failed publish. Stop early if
  # the controller has already exited.
  startup_marker="starting run ${next_run} sync_mode=snapshot"
  startup_deadline=$((SECONDS + startup_verify_timeout_secs))
  while (( SECONDS < startup_deadline )); do
    if grep -qF -- "$startup_marker" "$soak_stdout"; then
      break
    fi
    kill -0 "$soak_pid" 2>/dev/null || break
    sleep 1
  done

  log "startup log"
  sed -n '1,20p' "$soak_stdout" || true
  if grep -qF -- "$startup_marker" "$soak_stdout"; then
    log "verified first published run starts as snapshot: $next_run"
  else
    echo "failed to verify snapshot startup for $next_run" >&2
    sed -n '1,40p' "$soak_stdout" >&2 || true
    exit 6
  fi
  log "waiting for first published run to complete"
  wait_for_first_published_run "$next_run"
else
  log "would start: nohup bash $remote_root/run_soak.sh > $soak_stdout 2> $soak_stderr &"
  log "expected first line: run_soak mode=continuous max_existing_run_index=${last_run#run_} next_run=$next_run"
  log "expected start: starting run $next_run sync_mode=snapshot"
fi

if [[ "$restart_query_service" == "1" ]]; then
  if [[ "$mode" == "execute" ]]; then
    if [[ -n "$(matching_pids "$query_pattern" | head -1)" ]]; then
      terminate_matching -TERM "$query_pattern"
      sleep 2
    fi
    nohup /root/rpki_20260616_query_service_deploy/bin/rpki_query_service \
      --query-db /root/rpki_20260616_query_service_deploy/query-db \
      --repo-bytes-db "$remote_root/state/db/repo-bytes.db" \
      --export-root /root/rpki_20260616_query_service_deploy/query-exports \
      --listen 0.0.0.0:9560 \
      --watch-run-root "$remote_root" \
      --watch-interval-secs 60 \
      --watch-min-run-seq "$next_index" \
      --retain-indexed-runs 10 \
      --indexer-bin /root/rpki_20260616_query_service_deploy/bin/rpki_query_indexer \
      > "/root/rpki_20260616_query_service_deploy/query-service.publish-${timestamp}.log" 2>&1 &
    log "restarted query service"
  else
    log "would restart query service to reopen repo-bytes db"
  fi
else
  log "query service left unchanged"
fi

log "post-publish sidecars"
pgrep -af 'rpki_artifact_metrics|rpki_query_service|rpki_inter_rp_metrics|prometheus|grafana' || true
log "df"
df -h / /root 2>/dev/null | sort -u || true
REMOTE
)"

# ssh joins its command arguments into one string for the remote shell, so
# each value is shell-quoted with %q. Plain '...' wrapping would break on, or
# be injectable through, values that contain a single quote.
remote_args=(
  "$REMOTE_ROOT"
  "$REMOTE_ARCHIVE"
  "$MODE"
  "$RESTART_QUERY_SERVICE"
  "$QUERY_SERVICE_PID_PATTERN"
  "$FIRST_PUBLISHED_RUN_TIMEOUT_SECS"
)
printf -v remote_command '%q ' bash -s -- "${remote_args[@]}"
ssh "$REMOTE_HOST" "$remote_command" <<< "$REMOTE_SCRIPT"