rpki/scripts/soak/publish_remote231.sh

369 lines
13 KiB
Bash
Executable File

#!/usr/bin/env bash
# Publish a portable soak package to the remote soak host.
# Every setting below can be pre-seeded from the environment; the command-line
# flags parsed later override these values.
set -euo pipefail

# Location of this script and of the repository two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Assign a default only when the variable is unset or empty.
: "${REMOTE_HOST:=root@47.251.127.231}"
: "${REMOTE_ROOT:=/root/rpki_20260608_2_feature062_24h_20260608T075547Z/portable-soak}"
: "${PACKAGE_ARCHIVE:=}"
: "${MODE:=dry-run}"
: "${RESTART_QUERY_SERVICE:=0}"
: "${QUERY_SERVICE_PID_PATTERN:=rpki_query_service --query-db /root/rpki_20260616_query_service_deploy/query-db}"
# Print the command-line help to stdout.
# The text lists every flag accepted by the argument parser and every
# environment variable read at the top of the script.
usage() {
  cat <<'USAGE'
Usage:
scripts/soak/publish_remote231.sh --package <portable-soak.tar.gz> [--execute] [--remote-root <path>]
Publishes a new portable soak package to remote231 in place:
- stops only the current soak controller/daemon/rpki child under REMOTE_ROOT;
- preserves runs/ so run numbering continues;
- backs up state/db before replacing binaries/scripts;
- moves state/db away and creates a new empty state/db so the next run is snapshot;
- leaves metrics/query/prometheus/grafana configuration untouched.
Default mode is dry-run. Use --execute to apply changes.
Note: the package is uploaded to /root/rpki_publish_packages on the remote host in both modes.
Options:
--package <archive>        package to publish (required)
--remote-host <user@host>  ssh target
--remote-root <path>       soak root directory on the remote host
--execute                  apply changes
--dry-run                  only print the remote actions (default)
--restart-query-service    restart the query service after publishing
-h, --help                 show this help
Environment overrides:
REMOTE_HOST=root@47.251.127.231
REMOTE_ROOT=/root/rpki_20260608_2_feature062_24h_20260608T075547Z/portable-soak
PACKAGE_ARCHIVE=<portable-soak.tar.gz>
MODE=dry-run|execute
RESTART_QUERY_SERVICE=0|1
QUERY_SERVICE_PID_PATTERN=<pgrep -f pattern identifying the query service>
USAGE
}
# Report a fatal usage/validation error on stderr and terminate with status 2.
# Arguments: the message words (joined with single spaces).
die() {
  local message="$*"
  printf 'error: %s\n' "$message" >&2
  exit 2
}
# Parse command-line flags into the global settings.
# Globals written: PACKAGE_ARCHIVE, REMOTE_HOST, REMOTE_ROOT, MODE,
#                  RESTART_QUERY_SERVICE
# Arguments: the script's command-line arguments.
# Exits: 0 after printing help; 2 (via die) on an unknown flag or when a flag
#        that needs a value is given none or an empty one, so every usage
#        error shares the same "error: ..." format and exit status.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --package)
        [[ -n "${2:-}" ]] || die "--package requires a value"
        PACKAGE_ARCHIVE="$2"
        shift
        ;;
      --remote-host)
        [[ -n "${2:-}" ]] || die "--remote-host requires a value"
        REMOTE_HOST="$2"
        shift
        ;;
      --remote-root)
        [[ -n "${2:-}" ]] || die "--remote-root requires a value"
        REMOTE_ROOT="$2"
        shift
        ;;
      --execute)
        MODE="execute"
        ;;
      --dry-run)
        MODE="dry-run"
        ;;
      --restart-query-service)
        RESTART_QUERY_SERVICE=1
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        die "unknown argument: $1"
        ;;
    esac
    # Drop the flag itself; value-taking flags already dropped their value.
    shift
  done
}
parse_args "$@"
# --- Local validation -------------------------------------------------------
if [[ -z "$PACKAGE_ARCHIVE" ]]; then
  die "--package is required"
fi
if [[ ! -f "$PACKAGE_ARCHIVE" ]]; then
  die "package not found: $PACKAGE_ARCHIVE"
fi
if [[ "$MODE" != "dry-run" && "$MODE" != "execute" ]]; then
  die "MODE must be dry-run or execute: $MODE"
fi
for required_tool in ssh scp; do
  command -v "$required_tool" >/dev/null 2>&1 || die "$required_tool is required"
done

# --- Remote staging location and local provenance ---------------------------
REMOTE_STAGE_PARENT="/root/rpki_publish_packages"
PACKAGE_BASENAME="$(basename "$PACKAGE_ARCHIVE")"
REMOTE_ARCHIVE="$REMOTE_STAGE_PARENT/$PACKAGE_BASENAME"
# git failures (e.g. not a checkout) degrade to "unknown" / empty status.
LOCAL_GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')"
LOCAL_GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)"
printf '%s\n' "publish mode=$MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE git=$LOCAL_GIT_SHA"
[[ -z "$LOCAL_GIT_STATUS" ]] \
  || echo "warning: local git worktree is dirty; package manifest should record provenance" >&2

# --- Upload (performed in both dry-run and execute mode) ---------------------
ssh "$REMOTE_HOST" "mkdir -p '$REMOTE_STAGE_PARENT'"
scp "$PACKAGE_ARCHIVE" "$REMOTE_HOST:$REMOTE_ARCHIVE"
# The remote half of the publish procedure, captured verbatim as a string.
# The heredoc delimiter is quoted, so nothing inside is expanded locally; the
# text is fed on stdin to `bash -s` on the remote host by the final ssh call,
# which supplies five positional arguments in this order:
#   $1 remote soak root, $2 uploaded archive path, $3 mode (dry-run|execute),
#   $4 restart-query-service flag (0|1), $5 pgrep pattern for the query service.
REMOTE_SCRIPT="$(cat <<'REMOTE'
set -euo pipefail
remote_root="$1"
remote_archive="$2"
mode="$3"
restart_query_service="$4"
query_pattern="$5"
# Emit one progress line on stdout, tagged with the "[publish]" prefix.
# Arguments: the message words (joined with single spaces).
log() {
  local message="$*"
  printf '[publish] %s\n' "$message"
}
# Run a command in execute mode; otherwise only print it.
# Globals:   mode (read)
# Arguments: the command and its arguments.
# Outputs:   in any mode other than "execute", a "[dry-run] " line with each
#            word shell-quoted (%q) and followed by a space.
# Returns:   the command's status in execute mode, 0 otherwise.
run_or_echo() {
  if [[ "$mode" != "execute" ]]; then
    local rendered
    printf -v rendered '%q ' "$@"
    printf '[dry-run] %s\n' "$rendered"
    return 0
  fi
  "$@"
}
# Print the top-level "status" value of a JSON file.
# Arguments: $1 - path to the JSON file.
# Outputs:   the status value, or "missing" when the key is absent or the
#            file cannot be read or parsed (any exception, including a
#            top-level value that is not an object).
# The Python payload and its PY terminator must stay at column 0: the heredoc
# is not tab-stripped and Python's own indentation is significant.
json_get_status() {
  local path="$1"
  python3 - "$path" <<'PY'
import json, sys

try:
    # Context manager closes the file even when parsing fails.
    with open(sys.argv[1], encoding="utf-8") as handle:
        print(json.load(handle).get("status", "missing"))
except Exception:
    print("missing")
PY
}
# Print the name of the highest-numbered run directory under runs/.
# Globals: remote_root (read)
# Outputs: the last "run_*" directory name in version-sort order; nothing
#          when there are none (find errors are silenced).
max_run_name() {
  local runs_dir="$remote_root/runs"
  find "$runs_dir" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
    | sort -V \
    | tail -n 1
}
# Print the name of the highest-numbered run whose run-meta.json and
# run-summary.json both report status "success".
# Globals: remote_root (read)
# Outputs: the run directory name, or nothing when no run qualifies.
# Returns: 0 always.
# Runs are examined newest-first and the scan stops at the first success, so
# the number of json_get_status (python3) invocations no longer grows with
# the total number of historical runs.
max_successful_run_name() {
  local -a candidates=()
  local idx candidate run_dir
  mapfile -t candidates < <(
    find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
      | sort -V
  )
  for ((idx = ${#candidates[@]} - 1; idx >= 0; idx--)); do
    candidate="${candidates[idx]}"
    [[ -n "$candidate" ]] || continue
    run_dir="$remote_root/runs/$candidate"
    if [[ "$(json_get_status "$run_dir/run-meta.json")" == "success" \
      && "$(json_get_status "$run_dir/run-summary.json")" == "success" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 0
}
# Poll until no soak controller/daemon/rpki process under remote_root remains.
# Globals: remote_root (read)
# Returns: 0 as soon as matching_pids reports nothing; 1 once 120 seconds
#          have elapsed with processes still present. Polls every 2 seconds.
wait_no_soak_children() {
  local soak_pattern="$remote_root/(run_soak.sh|bin/rpki_daemon|bin/rpki)( |$)"
  local give_up_at=$((SECONDS + 120))
  until (( SECONDS >= give_up_at )); do
    # Keep the substitution inside [[ ]] so a non-zero pipeline status is
    # never treated as a failure of its own.
    [[ -n "$(matching_pids "$soak_pattern" | head -1)" ]] || return 0
    sleep 2
  done
  return 1
}
# List PIDs whose full command line matches a pgrep -f pattern, excluding
# this script's own processes.
# Globals:   remote_root (read)
# Arguments: $1 - pattern handed to `pgrep -af`.
# Outputs:   one PID per line, de-duplicated and sorted by `sort -u`.
# Skipped:   lines whose first field is not numeric; this shell ($$), the
#            current subshell ($BASHPID) and the parent ($PPID); and any
#            command line containing both "bash -s --" and remote_root.
matching_pids() {
  local pattern="$1"
  local entry pid cmd
  pgrep -af "$pattern" 2>/dev/null | while IFS= read -r entry; do
    pid="${entry%% *}"
    cmd="${entry#* }"
    if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
      continue
    fi
    case "$pid" in
      "$$" | "$BASHPID" | "${PPID:-}") continue ;;
    esac
    if [[ "$cmd" == *"bash -s --"* && "$cmd" == *"$remote_root"* ]]; then
      continue
    fi
    printf '%s\n' "$pid"
  done | sort -u
}
# Send a signal to every process reported by matching_pids for a pattern.
# Arguments: $1 - signal option for kill (e.g. -TERM, -KILL)
#            $2 - pattern forwarded to matching_pids
# Returns:   0 always; kill failures are deliberately ignored (best effort,
#            a target may already have exited).
terminate_matching() {
  local signal="$1" pattern="$2"
  local -a victims=()
  mapfile -t victims < <(matching_pids "$pattern")
  (( ${#victims[@]} > 0 )) || return 0
  kill "$signal" "${victims[@]}" >/dev/null 2>&1 || true
}
# --- Preflight: validate inputs and derive run numbering ---------------------
[[ -d "$remote_root" ]] || { echo "remote root not found: $remote_root" >&2; exit 2; }
[[ -f "$remote_archive" ]] || { echo "archive not found: $remote_archive" >&2; exit 2; }
timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
latest_run="$(max_run_name || true)"
last_successful_run="$(max_successful_run_name || true)"
if [[ -z "$last_successful_run" ]]; then
  # Distinguish "nothing has ever run" from "runs exist but none succeeded";
  # both are fatal, but they need different operator follow-up.
  if [[ -z "$latest_run" ]]; then
    echo "no existing runs found under $remote_root/runs" >&2
  else
    echo "no successful run found under $remote_root/runs (latest run: $latest_run)" >&2
  fi
  exit 2
fi
last_run="$last_successful_run"
last_status="$(json_get_status "$remote_root/runs/$last_run/run-meta.json")"
last_summary_status="$(json_get_status "$remote_root/runs/$last_run/run-summary.json")"
# 10# forces base 10 so zero-padded indices such as 0008 are not read as octal.
next_index=$((10#${last_run#run_} + 1))
next_run="$(printf 'run_%04d' "$next_index")"
backup_root="$remote_root/state/backups/pre_publish_${timestamp}_after_${last_run}"
extract_root="$remote_root/state/publish-staging/$timestamp"
new_pkg="$extract_root/portable-soak"
log "latest_run=${latest_run:-none} last_successful_run=$last_run run_meta_status=$last_status run_summary_status=$last_summary_status next_run=$next_run"
log "backup_root=$backup_root"
log "extract_root=$extract_root"
log "mode=$mode"
# Re-check the statuses just read: they are fetched again after the scan
# above, so this guards against the files changing in between.
if [[ "$last_status" != "success" || "$last_summary_status" != "success" ]]; then
  echo "last run is not successful; refusing publish: $last_run meta=$last_status summary=$last_summary_status" >&2
  exit 3
fi
# --- Report what is running, then stop the soak processes under remote_root --
log "current monitored sidecars"
pgrep -af 'rpki_artifact_metrics|rpki_query_service|rpki_inter_rp_metrics|prometheus|grafana' || true
log "current soak processes under root"
pgrep -af "$remote_root/(run_soak.sh|bin/rpki_daemon|bin/rpki)( |$)" || true
if [[ "$mode" != "execute" ]]; then
  log "would stop soak processes under $remote_root only"
else
  # Signalled in this order: rpki child, daemon, then the controller script.
  # NOTE(review): the first two patterns require a trailing space, while the
  # listing above and wait_no_soak_children also accept end-of-line; a process
  # started without arguments would be waited for but never signalled. Confirm
  # whether that can happen.
  soak_targets=(
    "$remote_root/bin/rpki "
    "$remote_root/bin/rpki_daemon "
    "$remote_root/run_soak.sh"
  )
  for soak_target in "${soak_targets[@]}"; do
    terminate_matching -TERM "$soak_target"
  done
  if ! wait_no_soak_children; then
    echo "soak processes did not stop cleanly; forcing kill" >&2
    for soak_target in "${soak_targets[@]}"; do
      terminate_matching -KILL "$soak_target"
    done
    if ! wait_no_soak_children; then
      echo "failed to stop soak processes" >&2
      exit 4
    fi
  fi
fi
# --- Re-evaluate run state now that the soak has been stopped ----------------
latest_run_after_stop="$(max_run_name || true)"
last_successful_run_after_stop="$(max_successful_run_name || true)"
if [[ -z "$last_successful_run_after_stop" ]]; then
  echo "no successful run remains after stopping soak" >&2
  exit 7
fi

# The last successful run changed since preflight: refresh everything that
# was derived from it.
if [[ "$last_successful_run_after_stop" != "$last_run" ]]; then
  last_run="$last_successful_run_after_stop"
  last_status="$(json_get_status "$remote_root/runs/$last_run/run-meta.json")"
  last_summary_status="$(json_get_status "$remote_root/runs/$last_run/run-summary.json")"
  next_index=$((10#${last_run#run_} + 1))
  printf -v next_run 'run_%04d' "$next_index"
  backup_root="$remote_root/state/backups/pre_publish_${timestamp}_after_${last_run}"
  log "recomputed last_successful_run=$last_run next_run=$next_run after stopping soak"
fi

# Every run directory numbered after the last successful one is moved out of
# runs/ into the backup area.
if [[ -n "${latest_run_after_stop:-}" && "$latest_run_after_stop" != "$last_run" ]]; then
  log "latest run after stop is incomplete: $latest_run_after_stop; preserving it outside runs/ before publishing"
  incomplete_dir="$backup_root/incomplete-runs"
  run_or_echo mkdir -p "$incomplete_dir"
  latest_index=$((10#${latest_run_after_stop#run_}))
  stable_index=$((10#${last_run#run_}))
  idx=$((stable_index + 1))
  while (( idx <= latest_index )); do
    printf -v candidate 'run_%04d' "$idx"
    if [[ -d "$remote_root/runs/$candidate" ]]; then
      run_or_echo mv "$remote_root/runs/$candidate" "$incomplete_dir/$candidate"
    fi
    idx=$((idx + 1))
  done
fi
# --- Backup the current installation and install the new package -------------
# The steps below are order-dependent: old artifacts are moved/copied into
# backup_root before the corresponding paths under remote_root are replaced.
run_or_echo mkdir -p "$backup_root" "$extract_root"
# Extraction happens only in execute mode, so in dry-run new_pkg does not exist
# and the existence checks against it further down find nothing.
if [[ "$mode" == "execute" ]]; then
tar -C "$extract_root" -xzf "$remote_archive"
[[ -x "$new_pkg/bin/rpki" ]] || { echo "extracted package missing bin/rpki" >&2; exit 5; }
else
log "would extract archive to $extract_root"
fi
# Record what this publish was based on, next to the backups.
if [[ "$mode" == "execute" ]]; then
{
echo "timestamp_utc=$timestamp"
echo "remote_root=$remote_root"
echo "remote_archive=$remote_archive"
echo "last_run=$last_run"
echo "next_run=$next_run"
echo "last_status=$last_status"
echo "last_summary_status=$last_summary_status"
echo "mode=$mode"
} > "$backup_root/publish-meta.txt"
fi
# state/db is moved (not copied); an empty state/db is recreated by the
# mkdir further down.
if [[ -d "$remote_root/state/db" ]]; then
run_or_echo mv "$remote_root/state/db" "$backup_root/db"
fi
# state/meta and .env stay in place; only copies go to the backup.
if [[ -d "$remote_root/state/meta" ]]; then
run_or_echo cp -a "$remote_root/state/meta" "$backup_root/meta-copy"
fi
if [[ -f "$remote_root/.env" ]]; then
run_or_echo cp -a "$remote_root/.env" "$backup_root/env.before"
fi
# The old bin/ is moved away so the cp of the new bin/ below creates it fresh.
if [[ -d "$remote_root/bin" ]]; then
run_or_echo mv "$remote_root/bin" "$backup_root/bin.before"
fi
# Replace only the entries that the new package actually ships; anything the
# package lacks is left as it is under remote_root.
for path in run_soak.sh run_24h_soak_with_metrics.sh scripts monitor fixtures copied-binaries.txt missing-optional-binaries.txt fixtures.txt scripts.txt manifest.json portable-soak.env.example; do
if [[ -e "$new_pkg/$path" ]]; then
if [[ -e "$remote_root/$path" ]]; then
run_or_echo rm -rf "$remote_root/$path"
fi
run_or_echo cp -a "$new_pkg/$path" "$remote_root/$path"
fi
done
run_or_echo cp -a "$new_pkg/bin" "$remote_root/bin"
# NOTE(review): .env is not among the paths replaced above, so the file copied
# here appears to still be the pre-publish .env despite the
# "generated_from_package" name — confirm intent.
if [[ -f "$remote_root/.env" ]]; then
run_or_echo cp -a "$remote_root/.env" "$backup_root/env.generated_from_package"
fi
# Restore the pre-publish .env from the backup copy taken above.
if [[ -f "$backup_root/env.before" ]]; then
run_or_echo cp -a "$backup_root/env.before" "$remote_root/.env"
fi
# Recreate the working directories, including the now-empty state/db.
run_or_echo mkdir -p "$remote_root/state/db" "$remote_root/state/meta" "$remote_root/tmp" "$remote_root/logs" "$remote_root/state/invalid"
# Carry last-run-id over from the backup copy of state/meta.
if [[ -f "$backup_root/meta-copy/last-run-id" ]]; then
run_or_echo cp -a "$backup_root/meta-copy/last-run-id" "$remote_root/state/meta/last-run-id"
fi
# --- Start the new soak controller and verify its first run is a snapshot ----
if [[ "$mode" == "execute" ]]; then
  soak_stdout="$remote_root/logs/run_soak.publish-${timestamp}.stdout"
  soak_stderr="$remote_root/logs/run_soak.publish-${timestamp}.stderr"
  # Best effort: some of these paths may be absent from the package.
  chmod +x "$remote_root/run_soak.sh" "$remote_root/run_24h_soak_with_metrics.sh" "$remote_root/bin/"* 2>/dev/null || true
  nohup bash "$remote_root/run_soak.sh" > "$soak_stdout" 2> "$soak_stderr" &
  echo $! > "$remote_root/state/meta/run_soak-pid"
  log "started run_soak pid=$(cat "$remote_root/state/meta/run_soak-pid")"

  # Poll for the expected start line instead of checking once after a fixed
  # sleep: a slow controller start must not be reported as a failed publish
  # after the installation has already been replaced.
  expected_start="starting run ${next_run} sync_mode=snapshot"
  startup_deadline=$((SECONDS + 30))
  startup_verified=0
  while :; do
    if grep -qF -- "$expected_start" "$soak_stdout"; then
      startup_verified=1
      break
    fi
    (( SECONDS < startup_deadline )) || break
    sleep 1
  done

  log "startup log"
  sed -n '1,20p' "$soak_stdout" || true
  if (( startup_verified )); then
    log "verified first published run starts as snapshot: $next_run"
  else
    echo "failed to verify snapshot startup for $next_run" >&2
    sed -n '1,40p' "$soak_stdout" >&2 || true
    exit 6
  fi
else
  log "would start: nohup bash $remote_root/run_soak.sh > $remote_root/logs/run_soak.publish-${timestamp}.stdout 2> $remote_root/logs/run_soak.publish-${timestamp}.stderr &"
  log "expected first line: run_soak mode=continuous max_existing_run_index=${last_run#run_} next_run=$next_run"
  log "expected start: starting run $next_run sync_mode=snapshot"
fi
# --- Optionally restart the query service, then report final state -----------
if [[ "$restart_query_service" != "1" ]]; then
  log "query service left unchanged"
elif [[ "$mode" != "execute" ]]; then
  log "would restart query service to reopen repo-bytes db"
else
  query_deploy_root="/root/rpki_20260616_query_service_deploy"
  # Stop a running instance first (TERM, then a short grace period).
  if [[ -n "$(matching_pids "$query_pattern" | head -1)" ]]; then
    terminate_matching -TERM "$query_pattern"
    sleep 2
  fi
  query_service_args=(
    --query-db "$query_deploy_root/query-db"
    --repo-bytes-db "$remote_root/state/db/repo-bytes.db"
    --export-root "$query_deploy_root/query-exports"
    --listen 0.0.0.0:9560
    --watch-run-root "$remote_root"
    --watch-interval-secs 60
    --retain-indexed-runs 10
    --indexer-bin "$query_deploy_root/bin/rpki_query_indexer"
  )
  nohup "$query_deploy_root/bin/rpki_query_service" "${query_service_args[@]}" \
    > "$query_deploy_root/query-service.publish-${timestamp}.log" 2>&1 &
  log "restarted query service"
fi

log "post-publish sidecars"
pgrep -af 'rpki_artifact_metrics|rpki_query_service|rpki_inter_rp_metrics|prometheus|grafana' || true
log "df"
df -h / /root 2>/dev/null | sort -u || true
REMOTE
)"
# Run the remote script: its text goes to the remote `bash -s` on stdin and
# the five settings become its positional arguments.
# ssh hands the command string to the remote user's shell, so each argument is
# escaped with %q rather than wrapped in literal single quotes; a value that
# itself contains a single quote can then neither break nor inject into the
# remote command line. (%q output is intended for a Bourne-compatible remote
# shell; control characters would be rendered in bash's $'...' form.)
printf -v remote_args '%q ' \
  "$REMOTE_ROOT" "$REMOTE_ARCHIVE" "$MODE" "$RESTART_QUERY_SERVICE" "$QUERY_SERVICE_PID_PATTERN"
ssh "$REMOTE_HOST" "bash -s -- $remote_args" <<< "$REMOTE_SCRIPT"