753 lines
27 KiB
Bash
Executable File
753 lines
27 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
set -euo pipefail

# Directory holding this script, and the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Configuration. Each setting keeps a non-empty value inherited from the
# environment; otherwise it takes the default written here (":=" assigns only
# when the variable is unset or empty).

# Remote target.
: "${REMOTE_HOST:=root@47.251.127.231}"
: "${REMOTE_ROOT:=/root/ours-rp-continuous/portable-soak}"

# Run mode (dry-run|execute) and publish mode (snapshot|delta).
: "${MODE:=dry-run}"
: "${PUBLISH_MODE:=snapshot}"

# Package build / selection.
: "${PROFILE:=release}"
: "${OUT_DIR:=$REPO_ROOT/target/remote231_publish}"
: "${PACKAGE_PREFIX:=remote231-publish}"
: "${PACKAGE_ARCHIVE:=}"
: "${BUILD_PACKAGE:=1}"

# Service toggles forwarded to the remote scripts.
: "${RESTART_QUERY_SERVICE:=1}"
: "${START_ARTIFACT_METRICS:=1}"
: "${START_QUERY_SERVICE:=1}"
: "${START_INTER_RP:=1}"
: "${START_MONITOR_STACK:=1}"
: "${START_RPKI_SOAK:=1}"
: "${RESTART_FIXED_PHASE_LOOP:=1}"
: "${START_ROUTINATOR_SYNC:=1}"
: "${START_ROUTINATOR_LOCAL_SERVICES:=1}"
: "${VERIFY_INTER_RP_DASHBOARD:=1}"

# Waiting / timeouts (seconds).
: "${WAIT_FIRST_RUN:=1}"
: "${FIRST_RUN_TIMEOUT_SECS:=7200}"
: "${SNAPSHOT_STOP_SIDE_SERVICES:=1}"
: "${ROUTINATOR_READY_TIMEOUT_SECS:=300}"
|
|
|
|
# Print the command-line help text to stdout.
# The option list mirrors the flags handled by the argument parser and the
# environment list mirrors the variables this script gives defaults to.
usage() {
  cat <<'USAGE'
Usage:
  scripts/soak/publish_remote231_full.sh [--snapshot|--delta] [--execute|--dry-run]
      [--package <portable-soak.tar.gz>] [--profile <profile>]
      [--remote-host <root@host>] [--remote-root <path>]
      [--no-query-service] [--no-monitor-stack] [--no-inter-rp]
      [--no-routinator-local-services] [--no-snapshot-stop-side-services]
      [--no-wait-first-run] [--help|-h]

Builds or reuses a portable soak package, publishes it to remote231, restores the fixed-phase
ours RP soak loop, and starts/verifies sidecars:
  - rpki_artifact_metrics (:9556)
  - rpki_query_service (:9560, optional)
  - ours-rp vs Routinator inter-RP exporter (:9557)
  - local Routinator server/loop used by the inter-RP dashboard
  - local Routinator artifact sync helper
  - Prometheus/Grafana monitor stack

Modes:
  --snapshot  Preserve run history but move state/db aside; first new run is snapshot.
  --delta     Preserve state/db and continue from current DB; next run should be delta.
  --execute   Set MODE=execute.
  --dry-run   Set MODE=dry-run.

Default is dry-run + snapshot.

Options:
  --package <archive>      Reuse an existing package archive (implies BUILD_PACKAGE=0).
  --profile <profile>      Package the binaries under target/<profile> (default: release).
  --remote-host <host>     Override REMOTE_HOST.
  --remote-root <path>     Override REMOTE_ROOT.
  --no-query-service       Set START_QUERY_SERVICE=0 and RESTART_QUERY_SERVICE=0.
  --no-monitor-stack       Set START_MONITOR_STACK=0.
  --no-inter-rp            Set START_INTER_RP=0.
  --no-routinator-local-services
                           Set START_ROUTINATOR_LOCAL_SERVICES=0.
  --no-snapshot-stop-side-services
                           Set SNAPSHOT_STOP_SIDE_SERVICES=0.
  --no-wait-first-run      Set WAIT_FIRST_RUN=0.
  --help, -h               Show this help and exit.

Environment overrides:
  REMOTE_HOST=root@47.251.127.231
  REMOTE_ROOT=/root/ours-rp-continuous/portable-soak
  MODE=dry-run
  PUBLISH_MODE=snapshot
  PROFILE=release
  OUT_DIR=<repo>/target/remote231_publish
  PACKAGE_PREFIX=remote231-publish
  PACKAGE_ARCHIVE=
  BUILD_PACKAGE=1
  RESTART_QUERY_SERVICE=1
  START_ARTIFACT_METRICS=1
  START_QUERY_SERVICE=1
  START_INTER_RP=1
  START_ROUTINATOR_SYNC=1
  START_ROUTINATOR_LOCAL_SERVICES=1
  START_RPKI_SOAK=1
  RESTART_FIXED_PHASE_LOOP=1
  SNAPSHOT_STOP_SIDE_SERVICES=1
  START_MONITOR_STACK=1
  VERIFY_INTER_RP_DASHBOARD=1
  WAIT_FIRST_RUN=1
  FIRST_RUN_TIMEOUT_SECS=7200
  ROUTINATOR_READY_TIMEOUT_SECS=300
USAGE
}
|
|
|
|
# Report a fatal error and stop the script.
# Arguments: the message words; they are joined with spaces.
# Outputs:   "error: <message>" on stderr.
# Exits:     always, with status 2.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 2
}
|
|
|
|
# Parse the command-line flags into the configuration globals.
#
# Arguments: the script's own arguments.
# Globals written: PUBLISH_MODE, MODE, PACKAGE_ARCHIVE, BUILD_PACKAGE,
#   REMOTE_HOST, REMOTE_ROOT, PROFILE, START_QUERY_SERVICE,
#   RESTART_QUERY_SERVICE, START_MONITOR_STACK, START_INTER_RP,
#   START_ROUTINATOR_LOCAL_SERVICES, SNAPSHOT_STOP_SIDE_SERVICES,
#   WAIT_FIRST_RUN.
# Errors: an unknown flag, or a value-taking flag with a missing or empty
#   value, is reported through die (stderr, exit status 2) so every argument
#   error looks the same. --help/-h prints usage and exits 0.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --snapshot) PUBLISH_MODE="snapshot" ;;
      --delta) PUBLISH_MODE="delta" ;;
      --execute) MODE="execute" ;;
      --dry-run) MODE="dry-run" ;;
      --package | --remote-host | --remote-root | --profile)
        # All value-taking flags share one presence check.
        [[ -n "${2:-}" ]] || die "$1 requires a value"
        case "$1" in
          --package)
            PACKAGE_ARCHIVE="$2"
            # An explicit archive replaces the local package build.
            BUILD_PACKAGE=0
            ;;
          --remote-host) REMOTE_HOST="$2" ;;
          --remote-root) REMOTE_ROOT="$2" ;;
          --profile) PROFILE="$2" ;;
        esac
        shift # drop the flag; the shared shift below drops its value
        ;;
      --no-query-service)
        START_QUERY_SERVICE=0
        RESTART_QUERY_SERVICE=0
        ;;
      --no-monitor-stack) START_MONITOR_STACK=0 ;;
      --no-inter-rp) START_INTER_RP=0 ;;
      --no-routinator-local-services) START_ROUTINATOR_LOCAL_SERVICES=0 ;;
      --no-snapshot-stop-side-services) SNAPSHOT_STOP_SIDE_SERVICES=0 ;;
      --no-wait-first-run) WAIT_FIRST_RUN=0 ;;
      --help | -h)
        usage
        exit 0
        ;;
      *) die "unknown argument: $1" ;;
    esac
    shift
  done
}

parse_args "$@"
|
|
|
|
# Reject any MODE or PUBLISH_MODE outside the supported values. These can
# arrive from the environment as well as from the command-line flags.
if [[ "$MODE" != "dry-run" && "$MODE" != "execute" ]]; then
  die "invalid mode: $MODE"
fi
if [[ "$PUBLISH_MODE" != "snapshot" && "$PUBLISH_MODE" != "delta" ]]; then
  die "invalid publish mode: $PUBLISH_MODE"
fi
|
|
|
|
# Run a command when MODE is "execute"; otherwise only show it.
# Arguments: the command and its arguments.
# Outputs:   outside execute mode, one line "[dry-run] " followed by every
#            word shell-quoted (%q), each followed by a single space.
# Returns:   the command's status in execute mode, 0 otherwise.
run_or_echo() {
  if [[ "$MODE" != "execute" ]]; then
    printf '[dry-run] %s\n' "$(printf '%q ' "$@")"
    return
  fi
  "$@"
}
|
|
|
|
# Test whether a setting is switched on.
# Arguments: $1 - the value to test (a missing argument counts as "0").
# Returns:   0 for exactly 1, true, TRUE, yes, YES, on or ON; 1 for anything
#            else, including the empty string and other capitalisations.
bool_arg() {
  local truthy='^(1|true|TRUE|yes|YES|on|ON)$'
  [[ "${1:-0}" =~ $truthy ]]
}
|
|
|
|
# Abort through die unless the shell can resolve the named command.
# Arguments: $1 - command name to look up with `command -v`.
require_command() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing required command: $1"
  fi
}
|
|
|
|
# Local tools that must be installed before anything else runs.
for required_tool in ssh scp python3; do
  require_command "$required_tool"
done
|
|
|
|
# Decide which package archive gets published: either build one now from the
# already-compiled binaries, or use the archive supplied by the caller.
if ! bool_arg "$BUILD_PACKAGE"; then
  [[ -n "$PACKAGE_ARCHIVE" ]] || die "--package is required when BUILD_PACKAGE=0"
else
  # Every profile, "release" included, resolves to target/<profile>.
  TARGET_DIR="$REPO_ROOT/target/$PROFILE"

  # Refuse to package unless every required binary is present and executable.
  required_bins=(rpki rpki_daemon db_stats rpki_artifact_metrics rpki_query_service rpki_query_indexer)
  missing_bins=()
  for bin in "${required_bins[@]}"; do
    if [[ ! -x "$TARGET_DIR/$bin" ]]; then
      missing_bins+=("$bin")
    fi
  done
  if (( ${#missing_bins[@]} > 0 )); then
    die "missing required $PROFILE binaries: ${missing_bins[*]}; build them before publish"
  fi

  # The builder's stdout is captured as the archive path.
  PACKAGE_ARCHIVE="$(
    OUT_DIR="$OUT_DIR" PACKAGE_PREFIX="$PACKAGE_PREFIX" \
      "$SCRIPT_DIR/build_portable_soak_package.sh" --profile "$PROFILE"
  )"
fi

if [[ ! -f "$PACKAGE_ARCHIVE" ]]; then
  die "package not found: $PACKAGE_ARCHIVE"
fi

echo "remote231 full publish mode=$MODE publish_mode=$PUBLISH_MODE remote=$REMOTE_HOST root=$REMOTE_ROOT package=$PACKAGE_ARCHIVE"

# A failing or unavailable git is treated the same as a clean worktree.
dirty_state="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)"
if [[ -n "$dirty_state" ]]; then
  echo "warning: local rpki worktree is dirty; package manifest records dirty provenance" >&2
fi
|
|
|
|
if [[ "$PUBLISH_MODE" == "snapshot" ]]; then
|
|
ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$MODE' '$SNAPSHOT_STOP_SIDE_SERVICES'" <<'REMOTE'
|
|
set -euo pipefail

# Positional arguments passed on the `bash -s --` command line.
remote_root=$1
mode=$2
snapshot_stop_side_services=$3
|
|
# Test whether a setting is switched on.
# Arguments: $1 - the value to test (a missing argument counts as "0").
# Returns:   0 for exactly 1, true, TRUE, yes, YES, on or ON; 1 otherwise.
is_true() {
  local truthy='^(1|true|TRUE|yes|YES|on|ON)$'
  [[ "${1:-0}" =~ $truthy ]]
}
|
|
# List the PIDs of processes whose full command line matches a pgrep pattern.
# Arguments: $1 - pattern handed to `pgrep -af`.
# Globals:   remote_root (read).
# Outputs:   unique PIDs, one per line, in `sort -u` order.
# Skipped entries: a non-numeric PID field; this shell ($$), the current
#   subshell ($BASHPID) and the parent ($PPID); and any command line that
#   contains both "bash -s --" and $remote_root.
matching_pids() {
  local pattern="$1"
  local entry pid cmdline
  pgrep -af "$pattern" 2>/dev/null \
    | while IFS= read -r entry; do
      pid="${entry%% *}"
      cmdline="${entry#* }"
      if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
        continue
      fi
      case "$pid" in
        "$$" | "$BASHPID" | "${PPID:-}") continue ;;
      esac
      if [[ "$cmdline" == *"bash -s --"* && "$cmdline" == *"$remote_root"* ]]; then
        continue
      fi
      printf '%s\n' "$pid"
    done \
    | sort -u
}
|
|
# Signal every process matching a pattern, or report what would be signalled.
# Arguments: $1 - signal option for kill (for example -TERM)
#            $2 - pattern for matching_pids
#            $3 - label used in the messages
# Globals:   mode (read).
# Outputs:   one status line on stdout in every case.
# kill failures are ignored on purpose.
terminate_matching() {
  local signal="$1" pattern="$2" label="$3"
  local -a pids=()
  mapfile -t pids < <(matching_pids "$pattern")

  if (( ${#pids[@]} == 0 )); then
    printf '[snapshot-prestop] no %s processes\n' "$label"
    return
  fi
  if [[ "$mode" != "execute" ]]; then
    printf '[dry-run] stop %s pids=%s pattern=%s\n' "$label" "${pids[*]}" "$pattern"
    return
  fi
  printf '[snapshot-prestop] %s %s pids=%s\n' "$signal" "$label" "${pids[*]}"
  kill "$signal" "${pids[@]}" >/dev/null 2>&1 || true
}
|
|
# Side services stopped ahead of a snapshot publish, as pattern/label pairs.
side_service_targets=(
  "$remote_root/bin/rpki_query_service" "query service"
  "$remote_root/bin/rpki_artifact_metrics" "artifact metrics"
  "inter_rp_ours_routinator_exporter.py" "inter-rp exporter"
  "rpki_inter_rp_metrics" "inter-rp exporter"
  "sync_local_routinator_peer.sh" "local routinator sync"
  "fixed_phase_loop.sh --name routinator" "routinator fixed phase loop"
  "/root/inter-rp-runners/scripts/run_single_rp_with_rss.sh --rp routinator" "routinator one-shot wrapper"
  "/root/inter-rp-runners/bin/routinator" "routinator process"
)

# Send signal $1 to every side service in table order; does nothing unless
# snapshot_stop_side_services is enabled.
signal_side_services() {
  local sig="$1"
  local i
  is_true "$snapshot_stop_side_services" || return 0
  for (( i = 0; i < ${#side_service_targets[@]}; i += 2 )); do
    terminate_matching "$sig" "${side_service_targets[i]}" "${side_service_targets[i + 1]}"
  done
}

if [[ "$mode" == "execute" ]]; then
  # The ours-rp loop only receives TERM here; side services get TERM, a
  # three second grace period, then KILL.
  terminate_matching -TERM "fixed_phase_loop.sh --name ours-rp" "ours fixed phase loop"
  signal_side_services -TERM
  sleep 3
  signal_side_services -KILL
else
  echo "[dry-run] stop existing fixed_phase_loop.sh --name ours-rp before snapshot publish"
  # Outside execute mode terminate_matching only reports what it would stop.
  signal_side_services -TERM
fi
|
|
REMOTE
|
|
# Delegate the snapshot publish itself to publish_remote231.sh, forwarding the
# archive, the remote target and the run mode.
publish_args=(
  "$SCRIPT_DIR/publish_remote231.sh"
  --package "$PACKAGE_ARCHIVE"
  --remote-host "$REMOTE_HOST"
  --remote-root "$REMOTE_ROOT"
)
if bool_arg "$RESTART_QUERY_SERVICE"; then
  publish_args+=(--restart-query-service)
fi
if [[ "$MODE" == "execute" ]]; then
  publish_args+=(--execute)
else
  publish_args+=(--dry-run)
fi
"${publish_args[@]}"

# When --restart-query-service was passed above, turn off the query-service
# start in the final remote stage so it is not restarted a second time. The
# flags are read with bool_arg, the same test that decided whether to pass
# --restart-query-service, so spellings such as "true" or "yes" behave like "1".
if [[ "$MODE" == "execute" ]] \
  && bool_arg "$START_QUERY_SERVICE" \
  && bool_arg "$RESTART_QUERY_SERVICE"; then
  START_QUERY_SERVICE=0
fi

# A snapshot publish never waits for the first run; this overrides any
# WAIT_FIRST_RUN value supplied by the caller.
WAIT_FIRST_RUN=0
|
|
else
|
|
# Stage the archive on the remote host for the delta publish script below.
# NOTE(review): the directory creation and the upload run in dry-run mode too;
# the remote script checks that the archive exists in either mode.
REMOTE_STAGE_PARENT="/root/rpki_publish_packages"
PACKAGE_BASENAME="$(basename "$PACKAGE_ARCHIVE")"
REMOTE_ARCHIVE="${REMOTE_STAGE_PARENT}/${PACKAGE_BASENAME}"

ssh "$REMOTE_HOST" "mkdir -p '$REMOTE_STAGE_PARENT'"
scp "$PACKAGE_ARCHIVE" "${REMOTE_HOST}:${REMOTE_ARCHIVE}"
|
|
ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$REMOTE_ARCHIVE' '$MODE'" <<'REMOTE'
|
|
set -euo pipefail

# Positional arguments passed on the `bash -s --` command line.
remote_root=$1
remote_archive=$2
mode=$3
|
|
|
|
# Print one message line on stdout, tagged "[delta-publish]".
log() {
  printf '[delta-publish] %s\n' "$*"
}
|
|
# Run a command when mode is "execute"; otherwise only show it.
# Arguments: the command and its arguments.
# Outputs:   outside execute mode, one line "[dry-run] " followed by every
#            word shell-quoted (%q), each followed by a single space.
# Returns:   the command's status in execute mode, 0 otherwise.
run_or_echo() {
  if [[ "$mode" != "execute" ]]; then
    printf '[dry-run] %s\n' "$(printf '%q ' "$@")"
    return
  fi
  "$@"
}
|
|
# List the PIDs of processes whose full command line matches a pgrep pattern.
# Arguments: $1 - pattern handed to `pgrep -af`.
# Globals:   remote_root (read).
# Outputs:   unique PIDs, one per line, in `sort -u` order.
# Skipped entries: a non-numeric PID field; this shell ($$), the current
#   subshell ($BASHPID) and the parent ($PPID); and any command line that
#   contains both "bash -s --" and $remote_root.
matching_pids() {
  local pattern="$1"
  local entry pid cmdline
  pgrep -af "$pattern" 2>/dev/null \
    | while IFS= read -r entry; do
      pid="${entry%% *}"
      cmdline="${entry#* }"
      if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
        continue
      fi
      case "$pid" in
        "$$" | "$BASHPID" | "${PPID:-}") continue ;;
      esac
      if [[ "$cmdline" == *"bash -s --"* && "$cmdline" == *"$remote_root"* ]]; then
        continue
      fi
      printf '%s\n' "$pid"
    done \
    | sort -u
}
|
|
# Send a signal to every process matching a pattern.
# Arguments: $1 - signal option for kill (for example -TERM)
#            $2 - pattern for matching_pids
# Prints nothing; kill failures are ignored on purpose.
terminate_matching() {
  local signal="$1" pattern="$2"
  local -a pids=()
  mapfile -t pids < <(matching_pids "$pattern")
  (( ${#pids[@]} > 0 )) || return 0
  kill "$signal" "${pids[@]}" >/dev/null 2>&1 || true
}
|
|
# Check whether a JSON file carries a top-level "status" equal to "success".
# Arguments: $1 - path of the JSON file.
# Returns:   0 when it does; 1 when the value differs or the file cannot be
#            opened or parsed.
json_status_is_success() {
  local json_file="$1"
  python3 - "$json_file" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], encoding="utf-8") as handle:
        data = json.load(handle)
except Exception:
    sys.exit(1)

sys.exit(0 if data.get("status") == "success" else 1)
PY
}
|
|
# Print the name of the highest-versioned run directory (run_*) under
# $remote_root/runs whose run-meta.json and run-summary.json both report
# success.
#
# Globals:  remote_root (read).
# Outputs:  the run directory name on stdout, or nothing when no run qualifies
#           or the runs directory is missing.
# Returns:  always 0; callers test the output for emptiness.
#
# Candidates are visited newest-first (reverse version sort) and the scan
# stops at the first success, so the JSON checks are not repeated for the
# whole run history.
max_successful_run_name() {
  local candidate
  while read -r candidate; do
    [[ -n "$candidate" ]] || continue
    if json_status_is_success "$remote_root/runs/$candidate/run-meta.json" \
      && json_status_is_success "$remote_root/runs/$candidate/run-summary.json"; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done < <(
    find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
      | sort -rV
  )
  return 0
}
|
|
|
|
# ---- Preconditions -------------------------------------------------------
if [[ ! -d "$remote_root" ]]; then
  echo "remote root not found: $remote_root" >&2
  exit 2
fi
if [[ ! -f "$remote_archive" ]]; then
  echo "archive not found: $remote_archive" >&2
  exit 2
fi

# A delta publish needs at least one earlier successful run.
last_run="$(max_successful_run_name || true)"
if [[ -z "$last_run" ]]; then
  echo "no successful run found under $remote_root/runs" >&2
  exit 2
fi

timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
backup_root="$remote_root/state/backups/pre_delta_publish_${timestamp}_after_${last_run}"
extract_root="$remote_root/state/publish-staging/$timestamp"
new_pkg="$extract_root/portable-soak"
log "last_successful_run=$last_run backup_root=$backup_root mode=$mode"

# ---- Stop the running ours-rp soak: TERM, three seconds, then KILL --------
if [[ "$mode" == "execute" ]]; then
  soak_patterns=(
    "$remote_root/bin/rpki "
    "$remote_root/bin/rpki_daemon "
    "$remote_root/run_soak.sh"
    "fixed_phase_loop.sh --name ours-rp"
  )
  for soak_pattern in "${soak_patterns[@]}"; do
    terminate_matching -TERM "$soak_pattern"
  done
  sleep 3
  for soak_pattern in "${soak_patterns[@]}"; do
    terminate_matching -KILL "$soak_pattern"
  done
else
  log "would stop current ours-rp soak/fixed-phase processes"
fi

# ---- Unpack the new package and back up what it replaces ------------------
run_or_echo mkdir -p "$backup_root" "$extract_root"
if [[ "$mode" == "execute" ]]; then
  tar -C "$extract_root" -xzf "$remote_archive"
  if [[ ! -x "$new_pkg/bin/rpki" ]]; then
    echo "extracted package missing bin/rpki" >&2
    exit 5
  fi
fi
if [[ -f "$remote_root/.env" ]]; then
  run_or_echo cp -a "$remote_root/.env" "$backup_root/env.before"
fi
if [[ -d "$remote_root/bin" ]]; then
  run_or_echo mv "$remote_root/bin" "$backup_root/bin.before"
fi

# ---- Replace package-owned files; state/ and runs/ are not in this list ---
for item in run_soak.sh run_24h_soak_with_metrics.sh scripts monitor fixtures copied-binaries.txt missing-optional-binaries.txt fixtures.txt scripts.txt manifest.json portable-soak.env.example; do
  if [[ "$mode" != "execute" ]]; then
    printf '[dry-run] replace %s from package\n' "$item"
    continue
  fi
  # Entries absent from the new package leave the installed copy alone.
  [[ -e "$new_pkg/$item" ]] || continue
  rm -rf "$remote_root/$item"
  cp -a "$new_pkg/$item" "$remote_root/$item"
done
run_or_echo cp -a "$new_pkg/bin" "$remote_root/bin"

# Put the saved .env back over whatever is there now.
if [[ -f "$backup_root/env.before" ]]; then
  run_or_echo cp -a "$backup_root/env.before" "$remote_root/.env"
fi

run_or_echo mkdir -p "$remote_root/state/db" "$remote_root/state/meta" "$remote_root/tmp" "$remote_root/logs"
if [[ "$mode" == "execute" ]]; then
  # Best effort: paths that do not exist are tolerated.
  chmod +x "$remote_root/run_soak.sh" "$remote_root/run_24h_soak_with_metrics.sh" "$remote_root/bin/"* "$remote_root/scripts/soak/"* "$remote_root/scripts/inter_rp/"* 2>/dev/null || true
fi
|
|
REMOTE
|
|
fi
|
|
|
|
ssh "$REMOTE_HOST" "bash -s -- '$REMOTE_ROOT' '$MODE' '$START_RPKI_SOAK' '$START_ARTIFACT_METRICS' '$START_QUERY_SERVICE' '$START_INTER_RP' '$START_ROUTINATOR_SYNC' '$START_ROUTINATOR_LOCAL_SERVICES' '$START_MONITOR_STACK' '$VERIFY_INTER_RP_DASHBOARD' '$WAIT_FIRST_RUN' '$FIRST_RUN_TIMEOUT_SECS' '$RESTART_FIXED_PHASE_LOOP' '$ROUTINATOR_READY_TIMEOUT_SECS'" <<'REMOTE'
|
|
set -euo pipefail

# Positional arguments passed on the `bash -s --` command line, in order.
remote_root=$1
mode=$2
start_rpki_soak=$3
start_artifact_metrics=$4
start_query_service=$5
start_inter_rp=$6
start_routinator_sync=$7
start_routinator_local_services=$8
start_monitor_stack=$9
verify_inter_rp_dashboard=${10}
wait_first_run=${11}
first_run_timeout_secs=${12}
restart_fixed_phase_loop=${13}
routinator_ready_timeout_secs=${14}
|
|
|
|
# Print one message line on stdout, tagged "[remote231-full]".
log() {
  printf '[remote231-full] %s\n' "$*"
}
|
|
# Test whether a setting is switched on.
# Arguments: $1 - the value to test (a missing argument counts as "0").
# Returns:   0 for exactly 1, true, TRUE, yes, YES, on or ON; 1 otherwise.
is_true() {
  local truthy='^(1|true|TRUE|yes|YES|on|ON)$'
  [[ "${1:-0}" =~ $truthy ]]
}
|
|
# Run a command when mode is "execute"; otherwise only show it.
# Arguments: the command and its arguments.
# Outputs:   outside execute mode, one line "[dry-run] " followed by every
#            word shell-quoted (%q), each followed by a single space.
# Returns:   the command's status in execute mode, 0 otherwise.
run_or_echo() {
  if [[ "$mode" != "execute" ]]; then
    printf '[dry-run] %s\n' "$(printf '%q ' "$@")"
    return
  fi
  "$@"
}
|
|
# List the PIDs of processes whose full command line matches a pgrep pattern.
# Arguments: $1 - pattern handed to `pgrep -af`.
# Globals:   remote_root (read).
# Outputs:   unique PIDs, one per line, in `sort -u` order.
# Skipped entries: a non-numeric PID field; this shell ($$), the current
#   subshell ($BASHPID) and the parent ($PPID); and any command line that
#   contains both "bash -s --" and $remote_root.
matching_pids() {
  local pattern="$1"
  local entry pid cmdline
  pgrep -af "$pattern" 2>/dev/null \
    | while IFS= read -r entry; do
      pid="${entry%% *}"
      cmdline="${entry#* }"
      if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
        continue
      fi
      case "$pid" in
        "$$" | "$BASHPID" | "${PPID:-}") continue ;;
      esac
      if [[ "$cmdline" == *"bash -s --"* && "$cmdline" == *"$remote_root"* ]]; then
        continue
      fi
      printf '%s\n' "$pid"
    done \
    | sort -u
}
|
|
# Ask every process matching a pattern to terminate (SIGTERM only).
# Arguments: $1 - pattern for matching_pids.
# Globals:   mode (read).
# In execute mode the processes are signalled and, if any were found, the
# function pauses one second. In any other mode it only logs the intent.
stop_matching() {
  local pattern="$1"
  local -a pids=()
  if [[ "$mode" == "execute" ]]; then
    mapfile -t pids < <(matching_pids "$pattern")
    if (( ${#pids[@]} > 0 )); then
      kill -TERM "${pids[@]}" >/dev/null 2>&1 || true
      sleep 1
    fi
    return 0
  fi
  log "would stop processes matching: $pattern"
}
|
|
# Make sure rpki_artifact_metrics is serving on port 9556.
# Globals: start_artifact_metrics, remote_root, mode (read).
# If either /healthz or /metrics already answers, the running instance is
# kept. Otherwise any leftover process is stopped and, in execute mode, a new
# one is started in the background with its output in logs/.
ensure_artifact_metrics() {
  is_true "$start_artifact_metrics" || return 0

  local endpoint
  for endpoint in healthz metrics; do
    if curl -fsS --max-time 3 "http://127.0.0.1:9556/$endpoint" >/dev/null 2>&1; then
      log "artifact metrics already up"
      return 0
    fi
  done

  stop_matching "$remote_root/bin/rpki_artifact_metrics"
  if [[ "$mode" != "execute" ]]; then
    log "would start artifact metrics on :9556"
    return 0
  fi

  local -a metrics_cmd=(
    nohup "$remote_root/bin/rpki_artifact_metrics"
    --run-root "$remote_root"
    --listen 0.0.0.0:9556
    --poll-secs 120
    --instance remote231-continuous
  )
  "${metrics_cmd[@]}" >"$remote_root/logs/artifact-metrics.full-publish.log" 2>&1 &
}
|
|
# (Re)start rpki_query_service on port 9560.
# Globals: start_query_service, remote_root, mode (read).
#
# Any running instance is stopped first. The service is then started with
# --watch-min-run-seq set to one more than the highest existing run_<n>
# directory, or to 1 when there are no runs.
ensure_query_service() {
  is_true "$start_query_service" || return 0
  stop_matching "$remote_root/bin/rpki_query_service"

  local latest_seq
  # `|| true`: with pipefail, a missing runs/ directory makes find (and so the
  # whole pipeline) fail, and under errexit that would abort the script before
  # the "no runs yet" fallback below could apply.
  latest_seq="$(
    find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
      | sort -V | tail -1 | sed 's/run_//;s/^0*//' || true
  )"
  # Leading zeros were stripped above so the arithmetic below never reads the
  # number as octal; an all-zero or absent sequence becomes 0.
  [[ -n "$latest_seq" ]] || latest_seq=0
  local min_seq=$((latest_seq + 1))
  log "starting query service watch_min_run_seq=$min_seq"

  if [[ "$mode" == "execute" ]]; then
    nohup "$remote_root/bin/rpki_query_service" \
      --query-db "$remote_root/state/query-db" \
      --repo-bytes-db "$remote_root/state/db/repo-bytes.db" \
      --export-root "$remote_root/state/query-exports" \
      --listen 0.0.0.0:9560 \
      --watch-run-root "$remote_root" \
      --watch-interval-secs 60 \
      --watch-min-run-seq "$min_seq" \
      --retain-indexed-runs 10 \
      --indexer-bin "$remote_root/bin/rpki_query_indexer" \
      --projection-entry-limit 20 \
      >"$remote_root/logs/query-service.full-publish.log" 2>&1 &
  else
    log "would start query service on :9560"
  fi
}
|
|
# Restart the ours-rp vs Routinator inter-RP exporter on port 9557.
# Globals: start_inter_rp, remote_root, mode (read).
# Processes matching either exporter name are stopped first; in execute mode
# the Python exporter is then started in the background, configured through
# environment variables, with its output in logs/.
ensure_inter_rp() {
  is_true "$start_inter_rp" || return 0

  local stale
  for stale in "rpki_inter_rp_metrics" "inter_rp_ours_routinator_exporter.py"; do
    stop_matching "$stale"
  done

  if [[ "$mode" != "execute" ]]; then
    log "would start inter-rp ours+routinator exporter on :9557"
    return 0
  fi

  nohup env \
    OURS_RUN_ROOT="$remote_root" \
    PEER_ROOT="$remote_root/inter-rp-peers" \
    INTER_RP_INSTANCE=remote231-inter-rp \
    INTER_RP_LISTEN=0.0.0.0:9557 \
    INTER_RP_SCAN_TTL_SECONDS=20 \
    "$remote_root/scripts/inter_rp/inter_rp_ours_routinator_exporter.py" \
    >"$remote_root/logs/inter-rp-metrics.full-publish.log" 2>&1 &
}
|
|
# Make sure the local Routinator HTTP server (127.0.0.1:9558) and the
# Routinator fixed-phase loop are running.
# Globals: start_routinator_local_services, remote_root, mode (read).
# Each half is left alone when pgrep already finds it, started in execute
# mode, and only announced otherwise. Missing binaries or scripts produce a
# warning instead of an error.
ensure_routinator_local_services() {
  is_true "$start_routinator_local_services" || return 0

  local routinator_bin="/root/inter-rp-runners/bin/routinator"
  local routinator_root="/var/lib/inter-rp-runners"

  # --- Routinator server ---------------------------------------------------
  if [[ ! -x "$routinator_bin" ]]; then
    log "warning: missing $routinator_bin; cannot start local routinator server"
  elif pgrep -af "$routinator_bin .* server .*127.0.0.1:9558" >/dev/null 2>&1; then
    log "local routinator server already running"
  elif [[ "$mode" == "execute" ]]; then
    mkdir -p "$routinator_root/routinator-server/repository"
    nohup "$routinator_bin" \
      --repository-dir "$routinator_root/routinator-server/repository" \
      --no-rir-tals \
      --extra-tals-dir "$routinator_root/fixtures/tal" \
      --enable-aspa \
      server \
      --http 127.0.0.1:9558 \
      --rtr 127.0.0.1:0 \
      --refresh 86400 \
      >"$remote_root/logs/routinator-server.full-publish.log" 2>&1 &
    log "started local routinator server on 127.0.0.1:9558"
  else
    log "would start local routinator server on 127.0.0.1:9558"
  fi

  # --- Routinator fixed-phase loop -----------------------------------------
  if pgrep -af "fixed_phase_loop.sh --name routinator" >/dev/null 2>&1; then
    log "routinator fixed phase loop already running"
    return 0
  fi

  local loop_script="$remote_root/scripts/soak/fixed_phase_loop.sh"
  local wrapper_script=/root/inter-rp-runners/scripts/run_single_rp_with_rss.sh
  local once_script=/root/inter-rp-runners/scripts/run_routinator_once.sh
  if [[ ! -x "$loop_script" || ! -x "$wrapper_script" || ! -x "$once_script" ]]; then
    log "warning: missing routinator loop scripts; cannot start routinator fixed phase loop"
    return 0
  fi
  if [[ "$mode" != "execute" ]]; then
    log "would start routinator fixed phase loop"
    return 0
  fi

  # Same 900 second cycle and lock file as the ours-rp loop, offset by 450.
  nohup bash "$loop_script" \
    --name routinator \
    --cycle-secs 900 \
    --offset-secs 450 \
    --lock-file /var/lock/rpki-heavy-run.lock \
    --lock-wait-secs 60 \
    -- "$wrapper_script" \
    --rp routinator \
    --root /var/lib/inter-rp-runners/routinator \
    --command "$once_script" \
    --retain-runs 30 \
    --sample-ms 500 \
    >"$remote_root/logs/fixed-phase-routinator.full-publish.log" 2>&1 &
  log "started routinator fixed phase loop"
}
|
|
# Make sure the local Routinator sync helper is running.
# Globals: start_routinator_sync, remote_root, mode (read).
# A helper already found by pgrep is kept. Otherwise the existing
# ops/sync_local_routinator_peer.sh is started in the background (execute
# mode) or announced; a missing helper only produces a warning.
ensure_routinator_sync() {
  is_true "$start_routinator_sync" || return 0

  if pgrep -af "sync_local_routinator_peer.sh" >/dev/null 2>&1; then
    log "routinator local sync already running"
    return 0
  fi

  local sync_script="$remote_root/ops/sync_local_routinator_peer.sh"
  if [[ ! -x "$sync_script" ]]; then
    log "warning: missing $remote_root/ops/sync_local_routinator_peer.sh; inter-RP routinator latest may stale"
    return 0
  fi
  if [[ "$mode" != "execute" ]]; then
    log "would start existing ops/sync_local_routinator_peer.sh"
    return 0
  fi
  nohup "$sync_script" >"$remote_root/logs/sync-local-routinator.full-publish.log" 2>&1 &
}
|
|
# Bring up the monitor stack with `docker compose up -d`.
# Globals: start_monitor_stack, remote_root, mode (read).
# A missing compose file or a missing docker command only produces a warning.
ensure_monitor_stack() {
  is_true "$start_monitor_stack" || return 0

  if [[ ! -f "$remote_root/monitor/docker-compose.yml" ]]; then
    log "warning: missing monitor/docker-compose.yml"
    return 0
  fi
  if ! command -v docker >/dev/null 2>&1; then
    log "warning: docker not installed; cannot start monitor stack"
    return 0
  fi
  if [[ "$mode" != "execute" ]]; then
    log "would run docker compose up -d under $remote_root/monitor"
    return 0
  fi
  # Subshell keeps the directory change local to the compose call.
  (cd "$remote_root/monitor" && docker compose up -d)
}
|
|
# Make sure the ours-rp fixed-phase soak loop is running.
# Globals: start_rpki_soak, restart_fixed_phase_loop, remote_root, mode (read).
# A loop already found by pgrep is kept. Starting a new one additionally
# requires restart_fixed_phase_loop to be enabled. In execute mode the loop is
# started in the background with its output in logs/.
ensure_fixed_phase_soak() {
  is_true "$start_rpki_soak" || return 0

  if pgrep -af "fixed_phase_loop.sh --name ours-rp" >/dev/null 2>&1; then
    log "ours-rp fixed phase loop already running"
    return 0
  fi
  if ! is_true "$restart_fixed_phase_loop"; then
    log "fixed phase loop restart disabled"
    return 0
  fi
  if [[ "$mode" != "execute" ]]; then
    log "would start ours-rp fixed phase loop"
    return 0
  fi

  local -a loop_cmd=(
    nohup bash "$remote_root/scripts/soak/fixed_phase_loop.sh"
    --name ours-rp
    --cycle-secs 900
    --offset-secs 0
    --lock-file /var/lock/rpki-heavy-run.lock
    --lock-wait-secs 60
    -- env PACKAGE_ROOT="$remote_root" ENV_FILE="$remote_root/.env" "$remote_root/run_soak.sh"
  )
  "${loop_cmd[@]}" >"$remote_root/logs/fixed-phase-ours.full-publish.log" 2>&1 &
}
|
|
# Poll a URL until it answers successfully or a timeout expires.
# Arguments: $1 - display name used in messages
#            $2 - URL fetched with curl (five second limit per attempt)
#            $3 - overall timeout in seconds (default 60)
# Outputs:   "<name> up" through log on success; a failure line on stderr.
# Returns:   0 once the URL answers, 1 on timeout. Attempts are two seconds
#            apart.
wait_url() {
  local name="$1" url="$2"
  local timeout_secs="${3:-60}"
  local deadline=$((SECONDS + timeout_secs))

  until (( SECONDS >= deadline )); do
    if curl -fsS --max-time 5 "$url" >/dev/null 2>&1; then
      log "$name up"
      return 0
    fi
    sleep 2
  done

  echo "$name did not become ready: $url" >&2
  return 1
}
|
|
# Check that every PromQL expression in the inter-RP Grafana dashboard
# returns data from the local Prometheus (127.0.0.1:9090).
# Globals: verify_inter_rp_dashboard, mode, remote_root (read).
# Returns: 0 when every distinct "expr" value yields a non-empty result;
#          non-zero when the dashboard JSON is missing or any query is empty
#          (the empty expressions are printed). Outside execute mode the
#          check is only announced.
verify_prometheus_queries() {
  is_true "$verify_inter_rp_dashboard" || return 0
  if [[ "$mode" != "execute" ]]; then
    log "would verify dashboard PromQL queries"
    return 0
  fi
  python3 - "$remote_root/monitor/grafana/dashboards/ours-rp-inter-rp.json" <<'PY'
import json
import pathlib
import sys
import urllib.parse
import urllib.request

dashboard_path = pathlib.Path(sys.argv[1])
if not dashboard_path.exists():
    print("missing inter-rp dashboard json", file=sys.stderr)
    sys.exit(1)


def iter_exprs(node):
    """Yield every "expr" value found anywhere in the nested JSON."""
    if isinstance(node, dict):
        if "expr" in node:
            yield node["expr"]
        for child in node.values():
            yield from iter_exprs(child)
    elif isinstance(node, list):
        for child in node:
            yield from iter_exprs(child)


unique_exprs = sorted(set(iter_exprs(json.loads(dashboard_path.read_text()))))

empty = []
for expr in unique_exprs:
    url = "http://127.0.0.1:9090/api/v1/query?query=" + urllib.parse.quote(expr)
    with urllib.request.urlopen(url, timeout=10) as response:
        data = json.load(response)
    if not data.get("data", {}).get("result", []):
        empty.append(expr)

if empty:
    print("empty dashboard queries:")
    print("\n".join(empty))
    sys.exit(1)

print(f"inter-rp dashboard queries ok count={len(unique_exprs)}")
PY
}
|
|
# Print the highest-versioned run_* directory name under $remote_root/runs,
# or nothing when there is none.
# `|| true`: with pipefail a missing runs/ directory makes the pipeline fail,
# and under errexit an assignment from it would abort the script.
_latest_run_name() {
  find "$remote_root/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' 2>/dev/null \
    | sort -V | tail -1 || true
}

# Wait until a run newer than the current latest finishes successfully.
# Globals: wait_first_run, mode, remote_root, first_run_timeout_secs (read).
# Outputs: a one-line summary of the new run on success; a timeout message on
#          stderr otherwise.
# Returns: 0 when waiting is disabled, when not in execute mode, or once a
#          newer run's run-summary.json reports "success"; 1 when
#          first_run_timeout_secs elapse first. The runs directory is polled
#          every five seconds and may not exist yet when the wait begins.
wait_next_run_if_requested() {
  is_true "$wait_first_run" || return 0
  if [[ "$mode" != "execute" ]]; then
    log "would wait for next run completion"
    return 0
  fi

  local before latest deadline status summary
  before="$(_latest_run_name)"
  deadline=$((SECONDS + first_run_timeout_secs))
  log "waiting for next completed run after ${before:-none}"

  while (( SECONDS < deadline )); do
    latest="$(_latest_run_name)"
    summary="$remote_root/runs/$latest/run-summary.json"
    if [[ -n "$latest" && "$latest" != "$before" && -f "$summary" ]]; then
      # An unreadable or partly written summary is reported as "missing".
      status="$(
        python3 - "$summary" <<'PY'
import json, sys
try:
    print(json.load(open(sys.argv[1], encoding="utf-8")).get("status", "missing"))
except Exception:
    print("missing")
PY
      )"
      if [[ "$status" == "success" ]]; then
        python3 - "$summary" <<'PY'
import json, sys
s = json.load(open(sys.argv[1], encoding="utf-8"))
st = s.get("stageTiming") or {}
rc = s.get("reportCounts") or {}
print(
    "next run success "
    f"run={s.get('runId')} wall_ms={s.get('wallMs')} "
    f"validation_ms={st.get('validation_ms')} repo_sync_ms_total={st.get('repo_sync_ms_total')} "
    f"vrps={rc.get('vrps')} aspas={rc.get('aspas')} pp={rc.get('publicationPoints')}"
)
PY
        return 0
      fi
    fi
    sleep 5
  done

  echo "timeout waiting for next completed run" >&2
  return 1
}
|
|
|
|
# ---- Start (or confirm) every enabled sidecar ------------------------------
ensure_artifact_metrics
ensure_query_service
ensure_inter_rp
ensure_routinator_local_services
ensure_routinator_sync
ensure_monitor_stack
ensure_fixed_phase_soak

# ---- Readiness checks (execute mode only) ----------------------------------
if [[ "$mode" == "execute" ]]; then
  sleep 5
  # Every wait is tied to its own start toggle, so a service that was switched
  # off on purpose cannot fail the publish by being absent.
  if is_true "$start_artifact_metrics"; then
    wait_url "artifact metrics" http://127.0.0.1:9556/metrics
  fi
  if is_true "$start_query_service"; then
    wait_url "query service" http://127.0.0.1:9560/api/v1
  fi
  if is_true "$start_routinator_local_services"; then
    wait_url "routinator server" http://127.0.0.1:9558/metrics "$routinator_ready_timeout_secs"
  fi
  if is_true "$start_inter_rp"; then
    wait_url "inter-rp metrics" http://127.0.0.1:9557/metrics
  fi
  if is_true "$start_monitor_stack"; then
    wait_url "prometheus" http://127.0.0.1:9090/-/ready
    wait_url "grafana" http://127.0.0.1:3000/api/health
  fi

  if is_true "$start_inter_rp"; then
    # Show at most 30 key exporter lines. awk reads its whole input, so no
    # stage of the pipeline is cut off by SIGPIPE (which `grep | head` risks
    # under pipefail once more than 30 lines match). As before, the step
    # fails when curl fails or when nothing matches.
    curl -fsS http://127.0.0.1:9557/metrics \
      | awk '/inter_rp_(service_last_reload_success|parse_errors|repo_sync_overlap_total|vrps_diff_by_class)/ { if (++shown <= 30) print } END { exit shown == 0 }'
  fi

  verify_prometheus_queries
fi

wait_next_run_if_requested

# ---- Final report ----------------------------------------------------------
log "process summary"
pgrep -af 'fixed_phase_loop|run_soak.sh|rpki_artifact_metrics|rpki_query_service|inter_rp_ours_routinator_exporter|prometheus|grafana|routinator' || true
log "df"
df -h / /root 2>/dev/null | sort -u || true
|
|
REMOTE
|
|
|
|
echo "remote231 full publish finished mode=$MODE publish_mode=$PUBLISH_MODE"
|