#!/usr/bin/env bash
# Fail fast: abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail

# Directory holding this script, and the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Operator-tunable settings: a value already present in the environment wins,
# otherwise the default below is assigned.
: "${REMOTE_HOST:=root@47.251.127.231}"
: "${REMOTE_ROOT:=/root/rpki_20260608_2_feature062_24h_20260608T075547Z/portable-soak}"
: "${PACKAGE_ARCHIVE:=}"
: "${MODE:=dry-run}"
: "${RESTART_QUERY_SERVICE:=0}"
: "${QUERY_SERVICE_PID_PATTERN:=rpki_query_service --query-db /root/rpki_20260616_query_service_deploy/query-db}"
# Print the command-line help text to stdout.
# The text lists every option accepted by the argument parser and every
# environment variable the script reads.
usage() {
  cat <<'USAGE'
Usage:
  scripts/soak/publish_remote231.sh --package <portable-soak.tar.gz> [--execute] [--remote-root <path>]

Publishes a new portable soak package to remote231 in place:
  - stops only the current soak controller/daemon/rpki child under REMOTE_ROOT;
  - preserves runs/ so run numbering continues;
  - backs up state/db before replacing binaries/scripts;
  - moves state/db away and creates a new empty state/db so the next run is snapshot;
  - leaves metrics/query/prometheus/grafana configuration untouched.

Default mode is dry-run. Use --execute to apply changes.
Note: the package archive is uploaded to the remote staging directory in both modes.

Options:
  --package <archive>        package archive to publish (required)
  --remote-host <user@host>  ssh target (overrides REMOTE_HOST)
  --remote-root <path>       portable-soak directory on the remote (overrides REMOTE_ROOT)
  --execute                  apply changes
  --dry-run                  print the planned actions only (default)
  --restart-query-service    restart the query service after publishing
  -h, --help                 show this help

Environment overrides:
  REMOTE_HOST=root@47.251.127.231
  REMOTE_ROOT=/root/rpki_20260608_2_feature062_24h_20260608T075547Z/portable-soak
  RESTART_QUERY_SERVICE=0|1
  PACKAGE_ARCHIVE=<portable-soak.tar.gz>
  MODE=dry-run|execute
  QUERY_SERVICE_PID_PATTERN=<pgrep -f pattern of the running query service>
  FIRST_PUBLISHED_RUN_TIMEOUT_SECS=7200
USAGE
}
# Report a fatal error on stderr and terminate the script with exit status 2.
# Arguments: the message words; they are joined with spaces.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 2
}
# Parse command-line options; each one overrides the matching environment
# default. Options that take a value consume the following word.
while (( $# > 0 )); do
  case "$1" in
    --package)
      shift
      PACKAGE_ARCHIVE="${1:?--package requires a value}"
      ;;
    --remote-host)
      shift
      REMOTE_HOST="${1:?--remote-host requires a value}"
      ;;
    --remote-root)
      shift
      REMOTE_ROOT="${1:?--remote-root requires a value}"
      ;;
    --execute) MODE="execute" ;;
    --dry-run) MODE="dry-run" ;;
    --restart-query-service) RESTART_QUERY_SERVICE=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *) die "unknown argument: $1" ;;
  esac
  shift
done
# Validate the inputs before anything touches the remote host.
if [[ -z "$PACKAGE_ARCHIVE" ]]; then
  die "--package is required"
fi
if [[ ! -f "$PACKAGE_ARCHIVE" ]]; then
  die "package not found: $PACKAGE_ARCHIVE"
fi
if [[ "$MODE" != "dry-run" && "$MODE" != "execute" ]]; then
  die "MODE must be dry-run or execute: $MODE"
fi

# Both transport tools must be available locally.
for required_tool in ssh scp; do
  command -v "$required_tool" >/dev/null 2>&1 || die "$required_tool is required"
done
# Remote staging location for the uploaded archive.
REMOTE_STAGE_PARENT="/root/rpki_publish_packages"
# `--` keeps an archive name that starts with '-' from being read as an option.
PACKAGE_BASENAME="$(basename -- "$PACKAGE_ARCHIVE")"
REMOTE_ARCHIVE="$REMOTE_STAGE_PARENT/$PACKAGE_BASENAME"

# Provenance of the local checkout; both lookups are best effort so the script
# also works outside a git worktree.
LOCAL_GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')"
LOCAL_GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)"

# Upper bound, in seconds, on waiting for the first run of the new package.
FIRST_PUBLISHED_RUN_TIMEOUT_SECS="${FIRST_PUBLISHED_RUN_TIMEOUT_SECS:-7200}"
# The remote script uses this value in shell arithmetic only after the old soak
# has been stopped and replaced, so reject anything that is not a plain decimal
# integer now (a leading zero would be read as octal there).
[[ "$FIRST_PUBLISHED_RUN_TIMEOUT_SECS" =~ ^(0|[1-9][0-9]*)$ ]] \
  || die "FIRST_PUBLISHED_RUN_TIMEOUT_SECS must be a non-negative integer: $FIRST_PUBLISHED_RUN_TIMEOUT_SECS"
# Announce what is about to happen.
printf '%s\n' "publish mode=$MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE git=$LOCAL_GIT_SHA"
if [[ -n "$LOCAL_GIT_STATUS" ]]; then
  printf '%s\n' "warning: local git worktree is dirty; package manifest should record provenance" >&2
fi

# Stage the archive on the remote host. This runs in dry-run mode as well.
upload_target="$REMOTE_HOST:$REMOTE_ARCHIVE"
ssh "$REMOTE_HOST" "mkdir -p '$REMOTE_STAGE_PARENT'"
scp "$PACKAGE_ARCHIVE" "$upload_target"
# Remote half of the publish. It is captured verbatim (quoted heredoc, so
# nothing is expanded locally) and later fed to `bash -s` on the remote host.
REMOTE_SCRIPT="$(cat <<'REMOTE'
set -euo pipefail

# Positional arguments supplied by the local ssh invocation, in this order.
remote_root="${1}"
remote_archive="${2}"
mode="${3}"
restart_query_service="${4}"
query_pattern="${5}"
first_published_run_timeout_secs="${6}"
# Write one progress line to stdout, prefixed with the "[publish]" tag.
# Arguments: the message words; they are joined with spaces.
log() {
  local message="$*"
  printf '[publish] %s\n' "$message"
}
# Run a command in execute mode; in any other mode only print it.
# Globals:   mode (read)
# Arguments: the command and its arguments
# Outputs:   in dry-run, one "[dry-run] ..." line with shell-quoted words
# Returns:   the command's status in execute mode, otherwise printf's status
run_or_echo() {
  if [[ "$mode" == "execute" ]]; then
    "$@"
    return
  fi

  local rendered
  # Each word is shell-quoted and followed by a single space.
  rendered="$(printf '%q ' "$@")"
  printf '[dry-run] %s\n' "$rendered"
}
# Print the top-level "status" field of a JSON file.
# Arguments: $1 - path to the JSON file
# Outputs:   the status value on stdout; the literal "missing" when the file
#            cannot be read or parsed, is not a JSON object, or has no
#            "status" key
json_get_status() {
  local path="$1"
  # The Python program arrives on stdin (the heredoc); the path is argv[1].
  python3 - "$path" <<'PY'
import json, sys

try:
    with open(sys.argv[1], encoding="utf-8") as handle:
        print(json.load(handle).get("status", "missing"))
except Exception:
    print("missing")
PY
}
# Print the value found at a dotted key path inside a JSON file.
# Arguments: $1 - path to the JSON file
#            $2 - dot-separated key path, e.g. "a.b"
# Outputs:   the value as Python prints it; an empty line when the file cannot
#            be read or parsed, a key is absent, the value is null, or an
#            intermediate value is not a JSON object
json_get_value() {
  local path="$1"
  local key_path="$2"
  # The Python program arrives on stdin (the heredoc); arguments are argv[1:].
  python3 - "$path" "$key_path" <<'PY'
import json, sys

path, key_path = sys.argv[1], sys.argv[2]
try:
    with open(path, encoding="utf-8") as handle:
        value = json.load(handle)
except Exception:
    print("")
    raise SystemExit(0)

# Walk down one key at a time; stop as soon as the path cannot be followed.
for part in key_path.split("."):
    if not isinstance(value, dict):
        value = None
        break
    value = value.get(part)
    if value is None:
        break

print("" if value is None else value)
PY
}
# Print the name of the highest-numbered run_* directory directly under
# $remote_root/runs (version-sorted); prints nothing when there is none.
# Globals: remote_root (read)
max_run_name() {
  local runs_dir="$remote_root/runs"
  find "$runs_dir" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
    | sort -V \
    | tail -1
}
# Print the highest-numbered run whose run-meta.json and run-summary.json both
# report status "success"; prints nothing when no run qualifies.
# Globals: remote_root (read)
max_successful_run_name() {
  local candidate run_dir best=""
  find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null | sort -V | {
    # Runs arrive in ascending order, so the last qualifying one is the answer.
    while read -r candidate; do
      [[ -n "$candidate" ]] || continue
      run_dir="$remote_root/runs/$candidate"
      [[ "$(json_get_status "$run_dir/run-meta.json")" == "success" ]] || continue
      [[ "$(json_get_status "$run_dir/run-summary.json")" == "success" ]] || continue
      best="$candidate"
    done
    [[ -z "$best" ]] || printf '%s\n' "$best"
  }
}
# Wait up to 120 seconds for every soak process under $remote_root to exit,
# checking every 2 seconds.
# Globals: remote_root (read)
# Returns: 0 once no matching process is left, 1 on timeout
wait_no_soak_children() {
  local soak_pattern="$remote_root/(run_soak.sh|bin/rpki_daemon|bin/rpki)( |$)"
  local deadline=$((SECONDS + 120))
  while (( SECONDS < deadline )); do
    [[ -n "$(matching_pids "$soak_pattern" | head -1)" ]] || return 0
    sleep 2
  done
  return 1
}
# List the PIDs of processes whose full command line matches a pgrep pattern.
# Globals:   remote_root (read)
# Arguments: $1 - pattern handed to `pgrep -af`
# Outputs:   unique PIDs, one per line
matching_pids() {
  local pattern="$1"
  local line pid cmd
  pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
    # pgrep -a prints "<pid> <command line>".
    pid="${line%% *}"
    cmd="${line#* }"
    if ! [[ "$pid" =~ ^[0-9]+$ ]]; then
      continue
    fi
    # Never report this shell, the current subshell, or the parent process.
    case "$pid" in
      "$$" | "$BASHPID" | "${PPID:-}") continue ;;
    esac
    # Skip `bash -s --` command lines that mention remote_root: the publish
    # session itself carries the root (and patterns) in its arguments.
    if [[ "$cmd" == *"bash -s --"* && "$cmd" == *"$remote_root"* ]]; then
      continue
    fi
    printf '%s\n' "$pid"
  done | sort -u
}
# Send a signal to every process reported by matching_pids for a pattern.
# Arguments: $1 - signal option for kill (e.g. -TERM)
#            $2 - pattern forwarded to matching_pids
# Returns:   always 0; delivery failures are deliberately ignored
terminate_matching() {
  local signal="$1"
  local pattern="$2"
  local -a targets=()
  mapfile -t targets < <(matching_pids "$pattern")
  (( ${#targets[@]} > 0 )) || return 0
  kill "$signal" "${targets[@]}" >/dev/null 2>&1 || true
}
# Print the head of the publish-time run_soak logs on stderr (best effort).
# Globals:   remote_root, timestamp (read)
# Arguments: $1 - number of stdout-log lines, $2 - number of stderr-log lines
_dump_publish_logs() {
  local stdout_lines="$1"
  local stderr_lines="$2"
  local log_base="$remote_root/logs/run_soak.publish-${timestamp}"
  sed -n "1,${stdout_lines}p" "$log_base.stdout" >&2 || true
  sed -n "1,${stderr_lines}p" "$log_base.stderr" >&2 || true
}

# Poll every 5 seconds until the given run has finished, then report on it.
# Globals:   remote_root, timestamp, first_published_run_timeout_secs (read)
# Arguments: $1 - run id, e.g. run_0012
# Outputs:   a one-line summary on stdout on success; diagnostics on stderr
# Returns:   0 when run-meta.json and run-summary.json both report "success"
#            and run-meta.json records sync_mode "snapshot"; 6 when the run was
#            not a snapshot, reported failed/error, the controller recorded in
#            state/meta/run_soak-pid exited first, or the timeout elapsed
wait_for_first_published_run() {
  local run_id="$1"
  local deadline=$((SECONDS + first_published_run_timeout_secs))
  local run_dir="$remote_root/runs/$run_id"
  local meta_path="$run_dir/run-meta.json"
  local summary_path="$run_dir/run-summary.json"
  local controller_pid summary_status meta_status sync_mode
  controller_pid="$(cat "$remote_root/state/meta/run_soak-pid" 2>/dev/null || true)"

  while (( SECONDS < deadline )); do
    if [[ -f "$summary_path" && -f "$meta_path" ]]; then
      summary_status="$(json_get_status "$summary_path")"
      meta_status="$(json_get_status "$meta_path")"
      sync_mode="$(json_get_value "$meta_path" "sync_mode")"

      if [[ "$summary_status" == "success" && "$meta_status" == "success" ]]; then
        if [[ "$sync_mode" != "snapshot" ]]; then
          echo "first published run $run_id completed but was not a snapshot: sync_mode=$sync_mode" >&2
          return 6
        fi
        python3 - "$summary_path" "$meta_path" <<'PY'
import json, sys

with open(sys.argv[1], encoding="utf-8") as handle:
    summary = json.load(handle)
with open(sys.argv[2], encoding="utf-8") as handle:
    meta = json.load(handle)
stage = summary.get("stageTiming") or {}
report = summary.get("reportCounts") or {}
roa = stage.get("roa_validation_cache") or {}
print(
    "first published run success "
    f"run_id={meta.get('run_id')} "
    f"wall_ms={summary.get('wallMs')} "
    f"validation_ms={stage.get('validation_ms')} "
    f"repo_sync_ms_total={stage.get('repo_sync_ms_total')} "
    f"rrdp_ms={stage.get('rrdp_download_ms_total')} "
    f"rsync_ms={stage.get('rsync_download_ms_total')} "
    f"pp_cache={stage.get('enable_publication_point_validation_cache')} "
    f"roa_cache={stage.get('enable_roa_validation_cache')} "
    f"roa_hit={roa.get('hit_roas')} "
    f"roa_fresh={roa.get('fresh_roas')} "
    f"vrps={report.get('vrps')} "
    f"aspas={report.get('aspas')} "
    f"publication_points={report.get('publicationPoints')} "
    f"warnings={report.get('warnings')}"
)
PY
        return 0
      fi

      if [[ "$summary_status" == "failed" || "$summary_status" == "error" || "$meta_status" == "failed" || "$meta_status" == "error" ]]; then
        echo "first published run $run_id failed early: meta_status=$meta_status summary_status=$summary_status" >&2
        _dump_publish_logs 80 120
        return 6
      fi
    fi

    # kill -0 only probes whether the controller process still exists.
    if [[ -n "$controller_pid" ]] && ! kill -0 "$controller_pid" 2>/dev/null; then
      echo "run_soak controller exited before first published run $run_id completed" >&2
      _dump_publish_logs 80 120
      return 6
    fi
    sleep 5
  done

  echo "timeout waiting for first published run $run_id to complete" >&2
  _dump_publish_logs 120 120
  return 6
}
# Preconditions: the soak root and the uploaded archive must both exist.
[[ -d "$remote_root" ]] || { echo "remote root not found: $remote_root" >&2; exit 2; }
[[ -f "$remote_archive" ]] || { echo "archive not found: $remote_archive" >&2; exit 2; }

# One UTC timestamp names every artifact of this publish (backup, staging, logs).
timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
latest_run="$(max_run_name || true)"
last_successful_run="$(max_successful_run_name || true)"
if [[ -z "$last_successful_run" ]]; then
  # Runs may exist without any of them being successful, so say which case it is.
  echo "no successful runs found under $remote_root/runs (latest_run=${latest_run:-none})" >&2
  exit 2
fi
last_run="$last_successful_run"
last_status="$(json_get_status "$remote_root/runs/$last_run/run-meta.json")"
last_summary_status="$(json_get_status "$remote_root/runs/$last_run/run-summary.json")"

# The next run index is derived from the numeric suffix; refuse names that do
# not have one instead of failing inside the arithmetic expansion.
last_run_number="${last_run#run_}"
[[ "$last_run_number" =~ ^[0-9]+$ ]] || { echo "cannot derive run index from run name: $last_run" >&2; exit 2; }
# 10# forces base 10 so zero-padded numbers are not read as octal.
next_index=$((10#$last_run_number + 1))
next_run="$(printf 'run_%04d' "$next_index")"

backup_root="$remote_root/state/backups/pre_publish_${timestamp}_after_${last_run}"
extract_root="$remote_root/state/publish-staging/$timestamp"
new_pkg="$extract_root/portable-soak"

log "latest_run=${latest_run:-none} last_successful_run=$last_run run_meta_status=$last_status run_summary_status=$last_summary_status next_run=$next_run"
log "backup_root=$backup_root"
log "extract_root=$extract_root"
log "mode=$mode"
# Refuse to publish unless the reference run is successful in both files.
if ! [[ "$last_status" == "success" && "$last_summary_status" == "success" ]]; then
  echo "last run is not successful; refusing publish: $last_run meta=$last_status summary=$last_summary_status" >&2
  exit 3
fi

# Informational listings; pgrep exits non-zero when nothing matches.
log "current monitored sidecars"
pgrep -af 'rpki_artifact_metrics|rpki_query_service|rpki_inter_rp_metrics|prometheus|grafana' || true

log "current soak processes under root"
pgrep -af "$remote_root/(run_soak.sh|bin/rpki_daemon|bin/rpki)( |$)" || true
# Stop the soak processes under remote_root: TERM first, KILL only if they are
# still present once wait_no_soak_children gives up.
if [[ "$mode" != "execute" ]]; then
  log "would stop soak processes under $remote_root only"
else
  soak_stop_patterns=(
    "$remote_root/bin/rpki "
    "$remote_root/bin/rpki_daemon "
    "$remote_root/run_soak.sh"
  )
  for stop_pattern in "${soak_stop_patterns[@]}"; do
    terminate_matching -TERM "$stop_pattern"
  done
  if ! wait_no_soak_children; then
    echo "soak processes did not stop cleanly; forcing kill" >&2
    for stop_pattern in "${soak_stop_patterns[@]}"; do
      terminate_matching -KILL "$stop_pattern"
    done
    wait_no_soak_children || { echo "failed to stop soak processes" >&2; exit 4; }
  fi
fi
# Look at the runs again now that the soak has been stopped (or, in dry-run,
# left alone); the set of runs may differ from the first scan.
latest_run_after_stop="$(max_run_name || true)"
last_successful_run_after_stop="$(max_successful_run_name || true)"
[[ -n "$last_successful_run_after_stop" ]] || { echo "no successful run remains after stopping soak" >&2; exit 7; }

if [[ "$last_successful_run_after_stop" != "$last_run" ]]; then
  # A different run is now the newest successful one: recompute what depends on it.
  last_run="$last_successful_run_after_stop"
  last_status="$(json_get_status "$remote_root/runs/$last_run/run-meta.json")"
  last_summary_status="$(json_get_status "$remote_root/runs/$last_run/run-summary.json")"
  next_index=$((10#${last_run#run_} + 1))
  next_run="$(printf 'run_%04d' "$next_index")"
  backup_root="$remote_root/state/backups/pre_publish_${timestamp}_after_${last_run}"
  log "recomputed last_successful_run=$last_run next_run=$next_run after stopping soak"
fi

if [[ -n "${latest_run_after_stop:-}" && "$latest_run_after_stop" != "$last_run" ]]; then
  log "latest run after stop is incomplete: $latest_run_after_stop; preserving it outside runs/ before publishing"
  incomplete_dir="$backup_root/incomplete-runs"
  run_or_echo mkdir -p "$incomplete_dir"
  latest_index=$((10#${latest_run_after_stop#run_}))
  stable_index=$((10#${last_run#run_}))
  # Move every run directory numbered above the last successful run.
  idx=$((stable_index + 1))
  while (( idx <= latest_index )); do
    candidate="$(printf 'run_%04d' "$idx")"
    if [[ -d "$remote_root/runs/$candidate" ]]; then
      run_or_echo mv "$remote_root/runs/$candidate" "$incomplete_dir/$candidate"
    fi
    idx=$((idx + 1))
  done
fi
# Create the backup and staging directories, unpack the archive into staging
# and record what this publish is based on.
run_or_echo mkdir -p "$backup_root" "$extract_root"
if [[ "$mode" != "execute" ]]; then
  log "would extract archive to $extract_root"
else
  tar -C "$extract_root" -xzf "$remote_archive"
  [[ -x "$new_pkg/bin/rpki" ]] || { echo "extracted package missing bin/rpki" >&2; exit 5; }

  printf '%s\n' \
    "timestamp_utc=$timestamp" \
    "remote_root=$remote_root" \
    "remote_archive=$remote_archive" \
    "last_run=$last_run" \
    "next_run=$next_run" \
    "last_status=$last_status" \
    "last_summary_status=$last_summary_status" \
    "mode=$mode" \
    > "$backup_root/publish-meta.txt"
fi
# Back up the current state. state/db and bin are moved away, while state/meta
# and .env are copied so they stay in place.
[[ ! -d "$remote_root/state/db" ]] || run_or_echo mv "$remote_root/state/db" "$backup_root/db"
[[ ! -d "$remote_root/state/meta" ]] || run_or_echo cp -a "$remote_root/state/meta" "$backup_root/meta-copy"
[[ ! -f "$remote_root/.env" ]] || run_or_echo cp -a "$remote_root/.env" "$backup_root/env.before"
[[ ! -d "$remote_root/bin" ]] || run_or_echo mv "$remote_root/bin" "$backup_root/bin.before"
# Replace each listed entry with the copy from the new package; entries the
# package does not contain are left untouched.
package_entries=(
  run_soak.sh
  run_24h_soak_with_metrics.sh
  scripts
  monitor
  fixtures
  copied-binaries.txt
  missing-optional-binaries.txt
  fixtures.txt
  scripts.txt
  manifest.json
  portable-soak.env.example
)
for path in "${package_entries[@]}"; do
  [[ -e "$new_pkg/$path" ]] || continue
  if [[ -e "$remote_root/$path" ]]; then
    run_or_echo rm -rf "$remote_root/$path"
  fi
  run_or_echo cp -a "$new_pkg/$path" "$remote_root/$path"
done
run_or_echo cp -a "$new_pkg/bin" "$remote_root/bin"

# Save the .env present at this point, then put the pre-publish copy back.
if [[ -f "$remote_root/.env" ]]; then
  run_or_echo cp -a "$remote_root/.env" "$backup_root/env.generated_from_package"
fi
if [[ -f "$backup_root/env.before" ]]; then
  run_or_echo cp -a "$backup_root/env.before" "$remote_root/.env"
fi

# Recreate the working directories, including a fresh state/db.
run_or_echo mkdir -p "$remote_root/state/db" "$remote_root/state/meta" "$remote_root/tmp" "$remote_root/logs" "$remote_root/state/invalid"
if [[ -f "$backup_root/meta-copy/last-run-id" ]]; then
  run_or_echo cp -a "$backup_root/meta-copy/last-run-id" "$remote_root/state/meta/last-run-id"
fi
# Start the new soak controller and verify that its first run is a snapshot.
if [[ "$mode" == "execute" ]]; then
  soak_stdout="$remote_root/logs/run_soak.publish-${timestamp}.stdout"
  soak_stderr="$remote_root/logs/run_soak.publish-${timestamp}.stderr"
  soak_pid_file="$remote_root/state/meta/run_soak-pid"
  expected_start="starting run ${next_run} sync_mode=snapshot"

  # Best effort: some of the listed files may be absent from the package.
  chmod +x "$remote_root/run_soak.sh" "$remote_root/run_24h_soak_with_metrics.sh" "$remote_root/bin/"* 2>/dev/null || true
  nohup bash "$remote_root/run_soak.sh" > "$soak_stdout" 2> "$soak_stderr" &
  soak_pid=$!
  printf '%s\n' "$soak_pid" > "$soak_pid_file"

  # Poll for the startup line instead of sleeping a fixed 3 seconds, so a slow
  # start is not reported as a failed publish. Stop early if the controller
  # has already exited; give up after 30 seconds.
  startup_deadline=$((SECONDS + 30))
  until grep -qF -- "$expected_start" "$soak_stdout" 2>/dev/null; do
    if (( SECONDS >= startup_deadline )) || ! kill -0 "$soak_pid" 2>/dev/null; then
      break
    fi
    sleep 1
  done

  log "started run_soak pid=$(cat "$soak_pid_file")"
  log "startup log"
  sed -n '1,20p' "$soak_stdout" || true
  if grep -qF -- "$expected_start" "$soak_stdout"; then
    log "verified first published run starts as snapshot: $next_run"
  else
    echo "failed to verify snapshot startup for $next_run" >&2
    sed -n '1,40p' "$soak_stdout" >&2 || true
    exit 6
  fi
  log "waiting for first published run to complete"
  wait_for_first_published_run "$next_run"
else
  log "would start: nohup bash $remote_root/run_soak.sh > $remote_root/logs/run_soak.publish-${timestamp}.stdout 2> $remote_root/logs/run_soak.publish-${timestamp}.stderr &"
  log "expected first line: run_soak mode=continuous max_existing_run_index=${last_run#run_} next_run=$next_run"
  log "expected start: starting run $next_run sync_mode=snapshot"
fi
# Optionally restart the query service so that it reopens the repo-bytes db.
if [[ "$restart_query_service" != "1" ]]; then
  log "query service left unchanged"
elif [[ "$mode" != "execute" ]]; then
  log "would restart query service to reopen repo-bytes db"
else
  if [[ -n "$(matching_pids "$query_pattern" | head -1)" ]]; then
    terminate_matching -TERM "$query_pattern"
    sleep 2
  fi

  query_deploy_root="/root/rpki_20260616_query_service_deploy"
  query_service_cmd=(
    "$query_deploy_root/bin/rpki_query_service"
    --query-db "$query_deploy_root/query-db"
    --repo-bytes-db "$remote_root/state/db/repo-bytes.db"
    --export-root "$query_deploy_root/query-exports"
    --listen 0.0.0.0:9560
    --watch-run-root "$remote_root"
    --watch-interval-secs 60
    --watch-min-run-seq "$next_index"
    --retain-indexed-runs 10
    --indexer-bin "$query_deploy_root/bin/rpki_query_indexer"
  )
  nohup "${query_service_cmd[@]}" \
    > "$query_deploy_root/query-service.publish-${timestamp}.log" 2>&1 &
  log "restarted query service"
fi
# Closing report: sidecar processes and disk usage. Both are best effort.
sidecar_pattern='rpki_artifact_metrics|rpki_query_service|rpki_inter_rp_metrics|prometheus|grafana'
log "post-publish sidecars"
pgrep -af "$sidecar_pattern" || true
log "df"
df -h / /root 2>/dev/null | sort -u || true
REMOTE
|
|
)"
|
|
|
|
ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE' '$RESTART_QUERY_SERVICE' '$QUERY_SERVICE_PID_PATTERN' '$FIRST_PUBLISHED_RUN_TIMEOUT_SECS'" <<< "$REMOTE_SCRIPT"
|