rpki/scripts/soak/run_soak.sh
2026-06-29 23:25:19 +08:00

1480 lines
50 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail
# Absolute directory of this script; used as the default package root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Both values may be supplied through the environment.
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"
# The env file is optional; when present it is sourced into this shell, so
# its assignments are visible to everything that follows.
if [[ -f "$ENV_FILE" ]]; then
# shellcheck disable=SC1090
source "$ENV_FILE"
fi
# ---- Run scheduling -------------------------------------------------------
: "${MAX_RUNS:=3}"
: "${INTERVAL_SECS:=0}"
: "${STOP_AFTER_SECS:=0}"

# ---- Inputs and layout root -----------------------------------------------
: "${RIRS:=afrinic,apnic,arin,lacnic,ripe}"
: "${TAL_INPUT_MODE:=file-with-ta}"
: "${RUN_ROOT:=$PACKAGE_ROOT}"

# ---- Retention, cleanup and reporting -------------------------------------
: "${RETAIN_RUNS:=10}"
: "${CLEAN_TMP_AFTER_RUN:=0}"
: "${OUTPUT_COMPACT_REPORT:=1}"

# ---- rsync and snapshot policy --------------------------------------------
: "${ALLOW_RSYNC_MIRROR_REUSE:=1}"
: "${RSYNC_SCOPE:=module-root}"
: "${FAILURE_SNAPSHOT_RESET:=1}"
: "${PERIODIC_SNAPSHOT_RESET:=0}"
: "${PERIODIC_SNAPSHOT_MAX_DELTAS:=100}"
: "${DB_STATS_EXACT_EVERY:=3}"

# ---- Progress logging thresholds ------------------------------------------
: "${RPKI_PROGRESS_LOG:=1}"
: "${RPKI_PROGRESS_SLOW_SECS:=10}"
: "${RPKI_PROGRESS_STAGE_FRESH_SLOW_MS:=1000}"
: "${RPKI_PROGRESS_PP_CONTROL_SLOW_MS:=100}"
: "${RPKI_PROGRESS_PP_CACHE_SLOW_MS:=50}"
: "${RPKI_PROGRESS_CONTROL_LOOP_SLOW_MS:=1000}"

# ---- Miscellaneous switches -----------------------------------------------
: "${DISABLE_COMPETING_RPS:=1}"
: "${ENABLE_CHILD_CERTIFICATE_VALIDATION_CACHE:=0}"
: "${RPKI_EXTRA_ARGS:=}"
: "${RPKI_ANALYZE:=0}"

# ---- Package directories --------------------------------------------------
: "${BIN_DIR:=$PACKAGE_ROOT/bin}"
: "${FIXTURE_DIR:=$PACKAGE_ROOT/fixtures}"

# ---- Run-root layout (the three roots are always derived from RUN_ROOT) ----
STATE_ROOT="$RUN_ROOT/state"
RUNS_ROOT="$RUN_ROOT/runs"
LOG_ROOT="$RUN_ROOT/logs"
: "${DB_DIR:=$STATE_ROOT/db}"
: "${META_DIR:=$STATE_ROOT/meta}"
: "${TMP_DIR:=$RUN_ROOT/tmp}"
: "${RSYNC_MIRROR_ROOT:=$STATE_ROOT/rsync-mirror}"
INVALID_ROOT="$STATE_ROOT/invalid"
RESET_STAGING_ROOT="$STATE_ROOT/reset-staging"

# ---- Live trust-anchor refresh --------------------------------------------
: "${LIVE_TA_REFRESH_DIR:=$META_DIR/live-ta-refresh}"
: "${LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS:=15}"
: "${LIVE_TA_REFRESH_MAX_TIME_SECS:=120}"
: "${LIVE_TA_REFRESH_BEFORE_SNAPSHOT:=1}"

# ---- Run lifecycle state --------------------------------------------------
RUN_LIFECYCLE_STATE_PATH="$STATE_ROOT/run-lifecycle-state.json"
RUN_LIFECYCLE_RECENT_RUNS_LIMIT=200

# ---- Binaries -------------------------------------------------------------
RPKI_BIN="$BIN_DIR/rpki"
RPKI_DAEMON_BIN="$BIN_DIR/rpki_daemon"
DB_STATS_BIN="$BIN_DIR/db_stats"
# Print the usage text to stdout.
usage() {
  printf '%s\n' \
    'Usage:' \
    './run_soak.sh' \
    '配置来自 package 根目录下的 .env也可以用 ENV_FILE=/path/to/.env 覆盖。'
}
# Print "error: <args>" to stderr and terminate the shell with status 2.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 2
}
# Print "warning: <args>" to stderr; does not terminate.
warn() {
  printf 'warning: %s\n' "$*" >&2
}
# Return 0 when $1 is one of the accepted truthy spellings, 1 otherwise
# (including when $1 is missing or empty).
is_true() {
  local candidate="${1:-}"
  local accepted
  for accepted in 1 true TRUE yes YES on ON; do
    if [[ "$candidate" == "$accepted" ]]; then
      return 0
    fi
  done
  return 1
}
# Abort via die unless the command named by $1 can be resolved by the shell.
require_command() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing required command: $1"
  fi
}
# Validate that a setting is a strictly positive decimal integer.
# Arguments: $1 - setting name used in the error message
#            $2 - value to validate
# Aborts via die when the value is not all digits or is numerically zero.
validate_positive_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
  # Reject every spelling of zero ("0", "00", ...). A textual check is used
  # rather than arithmetic so very long digit strings cannot overflow.
  [[ ! "$value" =~ ^0+$ ]] || die "$name must be > 0"
}
# Validate that a setting consists only of decimal digits (zero allowed).
# Arguments: $1 - setting name used in the error message
#            $2 - value to validate
validate_non_negative_int() {
  local setting="$1"
  local candidate="$2"
  if ! [[ "$candidate" =~ ^[0-9]+$ ]]; then
    die "$setting must be an integer: $candidate"
  fi
}
# Validate the global MAX_RUNS: an optionally negative decimal integer that
# is not the literal "0".
validate_max_runs() {
  if ! [[ "$MAX_RUNS" =~ ^-?[0-9]+$ ]]; then
    die "MAX_RUNS must be an integer: $MAX_RUNS"
  fi
  if [[ "$MAX_RUNS" == "0" ]]; then
    die "MAX_RUNS must be non-zero; use a positive value for fixed runs or -1 for continuous mode"
  fi
}
# Validate the global RSYNC_SCOPE against the three accepted values.
validate_rsync_scope() {
  case "$RSYNC_SCOPE" in
    host | publication-point | module-root)
      return 0
      ;;
  esac
  die "RSYNC_SCOPE must be host, publication-point, or module-root: $RSYNC_SCOPE"
}
# Validate the global TAL_INPUT_MODE against the three accepted values.
validate_tal_input_mode() {
  case "$TAL_INPUT_MODE" in
    file-with-ta | file-live-ta | url)
      return 0
      ;;
  esac
  die "TAL_INPUT_MODE must be file-with-ta, file-live-ta or url: $TAL_INPUT_MODE"
}
# Print $1 with leading and trailing whitespace removed and upper-case
# letters converted to lower case (no trailing newline is added).
normalize_token() {
  local token="$1"
  local leading_ws
  local trailing_ws
  # Longest all-whitespace prefix, then strip it.
  leading_ws="${token%%[![:space:]]*}"
  token="${token#"$leading_ws"}"
  # Longest all-whitespace suffix, then strip it.
  trailing_ws="${token##*[![:space:]]}"
  token="${token%"$trailing_ws"}"
  printf '%s' "$token" | tr '[:upper:]' '[:lower:]'
}
# Parse the comma-separated global RIRS into the global array RIR_LIST.
# Each entry is normalized with normalize_token; empty entries are skipped.
# Globals:  RIRS (read), RIR_LIST (written)
# Aborts via die on an unrecognised entry or when no entry remains.
parse_rirs() {
  RIR_LIST=()
  local raw_token
  local normalized
  # Scratch array kept local so it does not leak into the caller's scope.
  local -a raw_rirs=()
  IFS=',' read -r -a raw_rirs <<< "$RIRS"
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # bash releases older than 4.4, so the friendly error below is reached.
  for raw_token in ${raw_rirs[@]+"${raw_rirs[@]}"}; do
    normalized="$(normalize_token "$raw_token")"
    [[ -n "$normalized" ]] || continue
    case "$normalized" in
      afrinic|apnic|arin|lacnic|ripe)
        RIR_LIST+=("$normalized")
        ;;
      *)
        die "invalid RIRS entry: $raw_token; allowed: afrinic,apnic,arin,lacnic,ripe"
        ;;
    esac
  done
  [[ "${#RIR_LIST[@]}" -gt 0 ]] || die "RIRS must contain at least one RIR"
}
# Print the fixture TAL path under $FIXTURE_DIR/tal for the RIR named by $1.
# Aborts via die for an unknown RIR.
tal_file_for_rir() {
  local tal_name
  case "$1" in
    afrinic) tal_name="afrinic.tal" ;;
    apnic) tal_name="apnic-rfc7730-https.tal" ;;
    arin) tal_name="arin.tal" ;;
    lacnic) tal_name="lacnic.tal" ;;
    ripe) tal_name="ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
  printf '%s' "$FIXTURE_DIR/tal/$tal_name"
}
# Print the fixture trust-anchor certificate path under $FIXTURE_DIR/ta for
# the RIR named by $1. Aborts via die for an unknown RIR.
ta_file_for_rir() {
  local ta_name
  case "$1" in
    afrinic) ta_name="afrinic-ta.cer" ;;
    apnic) ta_name="apnic-ta.cer" ;;
    arin) ta_name="arin-ta.cer" ;;
    lacnic) ta_name="lacnic-ta.cer" ;;
    ripe) ta_name="ripe-ncc-ta.cer" ;;
    *) die "unknown RIR: $1" ;;
  esac
  printf '%s' "$FIXTURE_DIR/ta/$ta_name"
}
# Print the TAL download URL for the RIR named by $1.
# Aborts via die for an unknown RIR.
tal_url_for_rir() {
  local tal_url
  case "$1" in
    afrinic) tal_url="https://rpki.afrinic.net/tal/afrinic.tal" ;;
    apnic) tal_url="https://tal.apnic.net/apnic.tal" ;;
    arin) tal_url="https://www.arin.net/resources/manage/rpki/arin.tal" ;;
    lacnic) tal_url="https://www.lacnic.net/innovaportal/file/4983/1/lacnic.tal" ;;
    ripe) tal_url="https://tal.rpki.ripe.net/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
  printf '%s' "$tal_url"
}
# Print the TAL URI for the RIR named by $1; delegates to tal_url_for_rir.
cir_tal_uri_for_rir() {
  local rir_name="$1"
  tal_url_for_rir "$rir_name"
}
# Print the first line of the TAL file $1 that, after trimming surrounding
# whitespace, starts with http:// or https://. Comment lines (first
# non-blank character '#') and blank lines are ignored. Prints nothing when
# no such line exists.
tal_https_uri_from_fixture() {
  local tal_path="$1"
  awk '
    {
      line = $0
      if (line ~ /^[[:space:]]*#/ || line ~ /^[[:space:]]*$/) {
        next
      }
      gsub(/^[[:space:]]+|[[:space:]]+$/, "", line)
      if (line ~ /^https?:\/\//) {
        print line
        exit 0
      }
    }
  ' "$tal_path"
}
# Print the path of the live trust-anchor certificate for the RIR named by
# $1: $STATE_ROOT/live-ta/<fixture TAL basename without .tal>.cer.
# Returns non-zero, printing nothing, when tal_file_for_rir fails (for
# example on an unknown RIR) instead of emitting a path with an empty
# file stem.
live_ta_file_for_rir() {
  local tal_path
  local tal_stem
  tal_path="$(tal_file_for_rir "$1")" || return
  tal_stem="$(basename "$tal_path" .tal)"
  printf '%s' "$STATE_ROOT/live-ta/$tal_stem.cer"
}
# Print the pid-file path used to track a live TA refresh for the RIR $1.
live_ta_refresh_pid_file_for_rir() {
  local rir_name="$1"
  printf '%s' "${LIVE_TA_REFRESH_DIR}/${rir_name}.pid"
}
# Download the trust-anchor certificate for one RIR and atomically replace
# the live TA file. Writes progress lines to stdout; the caller decides
# where stdout/stderr go.
# Arguments: $1 - RIR name
#            $2 - run id used in log lines (default: manual)
# Returns:   0 on success, 1 when no URI is found, curl fails, or the
#            download is empty.
_refresh_live_ta_for_rir_impl() {
  local rir_name="$1"
  local run_id="${2:-manual}"
  local tal_path
  local ta_uri
  local ta_file
  local tmp_file
  echo "live-ta-refresh start run=$run_id rir=$rir_name at=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  tal_path="$(tal_file_for_rir "$rir_name")"
  ta_uri="$(tal_https_uri_from_fixture "$tal_path")"
  if [[ -z "$ta_uri" ]]; then
    echo "live-ta-refresh failed rir=$rir_name reason=missing_https_uri tal=$tal_path"
    return 1
  fi
  ta_file="$(live_ta_file_for_rir "$rir_name")"
  mkdir -p "$(dirname "$ta_file")"
  # Download next to the target so the final mv is a same-directory rename.
  tmp_file="${ta_file}.tmp.$$.$RANDOM"
  if ! curl -fsSL --connect-timeout "$LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" --max-time "$LIVE_TA_REFRESH_MAX_TIME_SECS" "$ta_uri" -o "$tmp_file"; then
    rm -f "$tmp_file"
    echo "live-ta-refresh failed rir=$rir_name reason=curl uri=$ta_uri"
    return 1
  fi
  if [[ ! -s "$tmp_file" ]]; then
    rm -f "$tmp_file"
    echo "live-ta-refresh failed rir=$rir_name reason=empty_download uri=$ta_uri"
    return 1
  fi
  mv "$tmp_file" "$ta_file"
  echo "live-ta-refresh success rir=$rir_name uri=$ta_uri output=$ta_file bytes=$(wc -c < "$ta_file" | tr -d ' ')"
}

# Refresh the live TA for one RIR, optionally appending all output to a log.
# Arguments: $1 - RIR name
#            $2 - run id used in log lines (default: manual)
#            $3 - optional log file; when given, stdout and stderr of the
#                 refresh are appended to it
# Returns:   status of the refresh (0 on success, non-zero on failure).
# The log redirection is applied to the refresh only, so calling this
# function in the foreground does not redirect the caller's own output.
refresh_live_ta_for_rir() {
  local log_path="${3:-}"
  if [[ -n "$log_path" ]]; then
    mkdir -p "$(dirname "$log_path")"
    _refresh_live_ta_for_rir_impl "$@" >> "$log_path" 2>&1
  else
    _refresh_live_ta_for_rir_impl "$@"
  fi
}
# Make sure a non-empty live TA file exists for the RIR named by $1. When it
# is missing or empty it is seeded from the fixture TA; aborts via die when
# the fixture TA is missing or empty too.
ensure_live_ta_for_rir() {
  local rir_name="$1"
  local live_ta_file
  local fixture_ta_file
  live_ta_file="$(live_ta_file_for_rir "$rir_name")"
  if [[ ! -s "$live_ta_file" ]]; then
    fixture_ta_file="$(ta_file_for_rir "$rir_name")"
    if [[ ! -s "$fixture_ta_file" ]]; then
      die "missing live TA and fixture TA for $rir_name: $live_ta_file / $fixture_ta_file"
    fi
    mkdir -p "$(dirname "$live_ta_file")"
    cp "$fixture_ta_file" "$live_ta_file"
  fi
}
# Check the pid file of a background live TA refresh for the RIR named by $1
# and clean it up when that refresh is no longer considered running.
# Returns 0 when no refresh is tracked any more (the pid file is absent or
# was removed here); returns 1 when the recorded process is still alive and
# its pid file is not yet stale.
reap_finished_live_ta_refresh_for_rir() {
local rir_name="$1"
local pid_file
local pid
local pid_state
local pid_file_mtime
local now_epoch
local stale_after_secs
pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")"
[[ -f "$pid_file" ]] || return 0
pid="$(cat "$pid_file" 2>/dev/null || true)"
# kill -0 only probes whether the pid can be signalled; no signal is sent.
if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" >/dev/null 2>&1; then
pid_state=""
# Field 3 of /proc/<pid>/stat is read as the process state; where that file
# is not readable, pid_state stays empty and the zombie branch is skipped.
if [[ -r "/proc/$pid/stat" ]]; then
pid_state="$(awk '{ print $3 }' "/proc/$pid/stat" 2>/dev/null || true)"
fi
# State "Z": the process still answers kill -0 but is treated as finished;
# wait is attempted (errors ignored) and the pid file is dropped.
if [[ "$pid_state" == "Z" ]]; then
wait "$pid" >/dev/null 2>&1 || true
rm -f "$pid_file"
return 0
fi
# If stat fails, the current time is used as the mtime, so the pid file is
# never judged stale in that case.
pid_file_mtime="$(stat -c %Y "$pid_file" 2>/dev/null || date +%s)"
now_epoch="$(date +%s)"
stale_after_secs=$((LIVE_TA_REFRESH_MAX_TIME_SECS + 60))
# A pid file older than the refresh time limit plus 60 seconds is removed
# even though the pid is alive; the process itself is not signalled.
if (( now_epoch - pid_file_mtime > stale_after_secs )); then
rm -f "$pid_file"
return 0
fi
return 1
fi
# The pid is not alive (or not numeric): wait on it if numeric, ignoring
# any error, then drop the pid file.
if [[ "$pid" =~ ^[0-9]+$ ]]; then
wait "$pid" >/dev/null 2>&1 || true
fi
rm -f "$pid_file"
return 0
}
# Start a background live TA refresh for one RIR unless a previous one is
# still running, in which case a skip line is appended to this run's log.
# Arguments: $1 - RIR name
#            $2 - run id (part of the log file name)
# The background pid is recorded in the RIR's pid file.
start_live_ta_refresh_for_rir() {
  local rir_name="$1"
  local run_id="$2"
  local pid_file
  local log_path
  local pid
  mkdir -p "$LIVE_TA_REFRESH_DIR" "$LOG_ROOT"
  pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")"
  log_path="$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log"
  if reap_finished_live_ta_refresh_for_rir "$rir_name"; then
    refresh_live_ta_for_rir "$rir_name" "$run_id" "$log_path" &
    printf '%s\n' "$!" > "$pid_file"
  else
    pid="$(cat "$pid_file" 2>/dev/null || true)"
    echo "live-ta-refresh skip run=$run_id rir=$rir_name reason=previous_refresh_running pid=$pid" \
      >> "$log_path"
    return 0
  fi
}
# Block until a previously started live TA refresh for the RIR named by $1
# is no longer tracked. Polls once per second and aborts via die once
# LIVE_TA_REFRESH_MAX_TIME_SECS + 60 seconds have passed.
wait_for_previous_live_ta_refresh_for_rir() {
  local rir_name="$1"
  local pid_file
  local pid
  local deadline_epoch
  pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")"
  if [[ ! -f "$pid_file" ]]; then
    return 0
  fi
  if reap_finished_live_ta_refresh_for_rir "$rir_name"; then
    return 0
  fi
  pid="$(cat "$pid_file" 2>/dev/null || true)"
  deadline_epoch=$(( $(date +%s) + LIVE_TA_REFRESH_MAX_TIME_SECS + 60 ))
  echo "live-ta-refresh wait rir=$rir_name reason=previous_refresh_running pid=$pid"
  until reap_finished_live_ta_refresh_for_rir "$rir_name"; do
    if (( $(date +%s) > deadline_epoch )); then
      die "timed out waiting for previous live TA refresh for $rir_name pid=$pid"
    fi
    sleep 1
  done
}
# Refresh the live TA of every RIR in RIR_LIST and wait for all of them.
# Arguments: $1 - run id (part of each log file name)
# First waits for any earlier refresh per RIR, then starts one background
# refresh per RIR, then waits for each. Aborts via die if any refresh
# failed; otherwise prints a completion line.
refresh_live_ta_blocking_for_run() {
  local run_id="$1"
  local rir_name
  local pid
  local failed=0
  local pids=()
  local names=()
  local log_path
  local pid_file
  local index

  for rir_name in "${RIR_LIST[@]}"; do
    wait_for_previous_live_ta_refresh_for_rir "$rir_name"
  done

  for rir_name in "${RIR_LIST[@]}"; do
    mkdir -p "$LIVE_TA_REFRESH_DIR" "$LOG_ROOT"
    pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")"
    log_path="$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log"
    refresh_live_ta_for_rir "$rir_name" "$run_id" "$log_path" &
    pid=$!
    printf '%s\n' "$pid" > "$pid_file"
    pids+=("$pid")
    names+=("$rir_name")
  done

  for (( index = 0; index < ${#pids[@]}; index++ )); do
    pid="${pids[$index]}"
    rir_name="${names[$index]}"
    pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")"
    if ! wait "$pid"; then
      failed=1
      rm -f "$pid_file"
      echo "live-ta-refresh failed before snapshot rir=$rir_name log=$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log" >&2
      continue
    fi
    rm -f "$pid_file"
  done

  if [[ "$failed" -ne 0 ]]; then
    die "live TA refresh failed before snapshot run=$run_id; see $LOG_ROOT/live-ta-refresh-$run_id-*.log"
  fi
  echo "live-ta-refresh completed before snapshot run=$run_id rirs=${#RIR_LIST[@]}"
}
# Prepare live TA inputs for a run; a no-op unless TAL_INPUT_MODE is
# file-live-ta.
# Arguments: $1 - run id
#            $2 - sync mode of the run
# For a snapshot run with LIVE_TA_REFRESH_BEFORE_SNAPSHOT enabled the refresh
# is done synchronously. Otherwise every RIR's live TA file is ensured first
# and then a background refresh is started per RIR.
prepare_live_ta_inputs_for_run() {
  local run_id="$1"
  local sync_mode="$2"
  local rir_name

  [[ "$TAL_INPUT_MODE" == "file-live-ta" ]] || return 0

  if [[ "$sync_mode" == "snapshot" ]] && is_true "$LIVE_TA_REFRESH_BEFORE_SNAPSHOT"; then
    refresh_live_ta_blocking_for_run "$run_id"
    return 0
  fi

  for rir_name in "${RIR_LIST[@]}"; do
    ensure_live_ta_for_rir "$rir_name"
  done
  for rir_name in "${RIR_LIST[@]}"; do
    start_live_ta_refresh_for_rir "$rir_name" "$run_id"
  done
}
# Print the single configured RIR name when RIR_LIST has exactly one entry,
# otherwise print "all5".
compare_view_trust_anchor() {
  local view="all5"
  if (( ${#RIR_LIST[@]} == 1 )); then
    view="${RIR_LIST[0]}"
  fi
  printf '%s' "$view"
}
# Print the largest numeric suffix among directories named run_<digits>
# directly under $RUNS_ROOT, or 0 when there is none.
# Suffixes are read as base 10, so zero-padded names are not taken as octal.
# The shell's glob options are left untouched: when the pattern matches
# nothing it stays literal and is filtered out by the directory test.
max_existing_run_index() {
  local max_index=0
  local run_dir
  local run_name
  local numeric_part
  for run_dir in "$RUNS_ROOT"/run_*; do
    [[ -d "$run_dir" ]] || continue
    run_name="$(basename "$run_dir")"
    numeric_part="${run_name#run_}"
    [[ "$numeric_part" =~ ^[0-9]+$ ]] || continue
    if (( 10#$numeric_part > max_index )); then
      max_index=$((10#$numeric_part))
    fi
  done
  printf '%s' "$max_index"
}
# Return 0 when the JSON file $1 can be read and its top-level "status" is
# exactly "success"; return non-zero otherwise (including unreadable or
# malformed files).
json_status_is_success() {
  local json_path="$1"
  python3 - "$json_path" <<'PY'
import json
import sys

path = sys.argv[1]
try:
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
except Exception:
    sys.exit(1)
sys.exit(0 if data.get("status") == "success" else 1)
PY
}
# Return 0 when the run directory $1 looks like a completed successful run:
# it exists, run-meta.json and run-summary.json both report status
# "success", and every required artifact file is present. Returns 1
# otherwise.
previous_run_success() {
  local run_dir="$1"
  # Loop variable kept local so it does not leak into the caller's scope.
  local required_artifact
  [[ -d "$run_dir" ]] || return 1
  [[ -f "$run_dir/run-meta.json" ]] || return 1
  [[ -f "$run_dir/run-summary.json" ]] || return 1
  json_status_is_success "$run_dir/run-meta.json" || return 1
  json_status_is_success "$run_dir/run-summary.json" || return 1
  for required_artifact in report.json result.ccr input.cir stage-timing.json process-time.txt stdout.log stderr.log; do
    [[ -f "$run_dir/$required_artifact" ]] || return 1
  done
  return 0
}
# Move $1 into directory $2 (created if needed) when $1 exists; otherwise
# do nothing and succeed.
move_if_exists() {
  local source_path="$1"
  local target_dir="$2"
  [[ -e "$source_path" ]] || return 0
  mkdir -p "$target_dir"
  mv "$source_path" "$target_dir/"
}
# Remove every entry directly inside directory $1 (including hidden ones)
# while keeping the directory itself. Succeeds without action when $1 is not
# a directory.
clear_dir_contents() {
  local dir_path="$1"
  if [[ -d "$dir_path" ]]; then
    find "$dir_path" -mindepth 1 -maxdepth 1 -exec rm -rf {} +
  fi
}
# Return 0 when either $DB_DIR/work-db or $DB_DIR/repo-bytes.db exists.
db_state_exists() {
  [[ -e "$DB_DIR/work-db" ]] || [[ -e "$DB_DIR/repo-bytes.db" ]]
}
# Return 0 when $DB_DIR/work-db exists, 1 otherwise.
delta_state_available() {
  if [[ -e "$DB_DIR/work-db" ]]; then
    return 0
  fi
  return 1
}
# Set the current DB and meta state aside after a failed run and start from
# empty directories.
# Arguments: $1 - id of the failed previous run (part of the quarantine
#                 directory name, together with a UTC timestamp)
# Globals written: INVALID_DB_PATH, INVALID_STATE_PATH, INVALID_TMP_PATH,
#                  TMP_CLEANUP_STATUS, TMP_CLEANUP_REASON
# DB_DIR and META_DIR are moved under INVALID_ROOT; TMP_DIR is emptied in
# place (not preserved). All three directories exist again on return.
isolate_state_after_failure() {
  local previous_run_id="$1"
  local stamp
  local invalid_dir
  stamp="$(date -u +%Y%m%dT%H%M%SZ)"
  invalid_dir="$INVALID_ROOT/${previous_run_id}-${stamp}"
  mkdir -p "$invalid_dir"

  move_if_exists "$DB_DIR" "$invalid_dir"
  move_if_exists "$META_DIR" "$invalid_dir"

  if [[ ! -e "$TMP_DIR" ]]; then
    INVALID_TMP_PATH=""
    TMP_CLEANUP_STATUS="not_present"
    TMP_CLEANUP_REASON="discard_after_failure"
    mkdir -p "$TMP_DIR"
  else
    INVALID_TMP_PATH="$TMP_DIR"
    # Stays "pending" if clearing or recreating the directory aborts.
    TMP_CLEANUP_STATUS="pending"
    TMP_CLEANUP_REASON="discard_after_failure"
    clear_dir_contents "$TMP_DIR"
    mkdir -p "$TMP_DIR"
    TMP_CLEANUP_STATUS="discarded_recreated"
  fi

  mkdir -p "$DB_DIR" "$META_DIR" "$TMP_DIR"
  INVALID_DB_PATH="$invalid_dir/$(basename "$DB_DIR")"
  INVALID_STATE_PATH="$invalid_dir/$(basename "$META_DIR")"
}
# Run the embedded Python helper that maintains the run lifecycle state file
# ($RUN_LIFECYCLE_STATE_PATH) and print its result as shell-quoted NAME=value
# lines on stdout (one assignment per line).
# Arguments: $1 - helper command: "load" or "update"
#            load:   $2 - maximum successful deltas before a snapshot is due
#            update: $2 - run directory whose metadata is recorded
# Globals read: RUN_LIFECYCLE_STATE_PATH, RUNS_ROOT,
#               RUN_LIFECYCLE_RECENT_RUNS_LIMIT
# load emits PERIODIC_LIFECYCLE_* variables; update emits
# RUN_LIFECYCLE_UPDATE_* variables. A missing state file is rebuilt from the
# run directories under RUNS_ROOT; an unreadable one is renamed to a
# ".corrupt.<timestamp>.<pid>" backup first.
# Returns the helper's exit status (non-zero on an unknown command or error).
periodic_snapshot_delta_scan() {
  local command="$1"
  shift
  python3 - "$command" "$RUN_LIFECYCLE_STATE_PATH" "$RUNS_ROOT" "$RUN_LIFECYCLE_RECENT_RUNS_LIMIT" "$@" <<'PY'
import json
import os
import pathlib
import shlex
import sys
from datetime import datetime, timezone

command = sys.argv[1]
state_path = pathlib.Path(sys.argv[2])
runs_root = pathlib.Path(sys.argv[3])
recent_limit = int(sys.argv[4])
extra_args = sys.argv[5:]


def now_rfc3339() -> str:
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def to_int(value):
    # None, "" and booleans are treated as "no value".
    if value is None or value == "":
        return None
    if isinstance(value, bool):
        return None
    return int(value)


def to_bool(value):
    if isinstance(value, bool):
        return value
    if value in (None, ""):
        return None
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered in {"1", "true", "yes", "on"}:
            return True
        if lowered in {"0", "false", "no", "off"}:
            return False
    raise ValueError(f"invalid bool value: {value!r}")


def normalize_run_entry(entry):
    # Accepts both snake_case keys and the runId/runSeq/syncMode spellings.
    if entry is None:
        return None
    if not isinstance(entry, dict):
        raise ValueError("run entry must be an object")
    run_id = entry.get("run_id") or entry.get("runId")
    run_index = to_int(entry.get("run_index", entry.get("runSeq")))
    if not run_id or run_index is None:
        raise ValueError("run entry missing run_id/run_index")
    return {
        "run_id": run_id,
        "run_index": run_index,
        "status": entry.get("status") or "unknown",
        "sync_mode": entry.get("sync_mode", entry.get("syncMode")),
        "snapshot_reason": entry.get("snapshot_reason"),
        "started_at_rfc3339_utc": entry.get("started_at_rfc3339_utc"),
        "completed_at_rfc3339_utc": entry.get("completed_at_rfc3339_utc"),
        "periodic_snapshot_reset_enabled": to_bool(entry.get("periodic_snapshot_reset_enabled")),
        "periodic_snapshot_max_deltas": to_int(entry.get("periodic_snapshot_max_deltas")),
        "periodic_snapshot_delta_count": to_int(entry.get("periodic_snapshot_delta_count")),
        "periodic_snapshot_forced": to_bool(entry.get("periodic_snapshot_forced")),
    }


def snapshot_ref(entry):
    if entry is None:
        return None
    return {
        "run_id": entry["run_id"],
        "run_index": entry["run_index"],
        "snapshot_reason": entry.get("snapshot_reason"),
        "completed_at_rfc3339_utc": entry.get("completed_at_rfc3339_utc"),
    }


def normalize_snapshot_ref(entry):
    if entry is None:
        return None
    if not isinstance(entry, dict):
        raise ValueError("snapshot ref must be an object")
    run_id = entry.get("run_id") or entry.get("runId")
    run_index = to_int(entry.get("run_index", entry.get("runSeq")))
    if not run_id or run_index is None:
        raise ValueError("snapshot ref missing run_id/run_index")
    return {
        "run_id": run_id,
        "run_index": run_index,
        "snapshot_reason": entry.get("snapshot_reason"),
        "completed_at_rfc3339_utc": entry.get("completed_at_rfc3339_utc"),
    }


def state_health(last_run, last_success_snapshot):
    if last_success_snapshot is not None:
        return "ready", "ok"
    if last_run is None:
        return "empty", "no_runs"
    return "bootstrap_incomplete", "no_successful_snapshot_in_retained_runs"


def default_state():
    health, detail = state_health(None, None)
    return {
        "version": 1,
        "updated_at_rfc3339_utc": now_rfc3339(),
        "state_health": health,
        "state_detail": detail,
        "recent_runs_limit": recent_limit,
        "last_run": None,
        "last_success_snapshot": None,
        "successful_deltas_since_snapshot": None,
        "recent_runs": [],
    }


def finalize_state(data):
    # Normalize every field and recompute the health/detail pair.
    data = dict(data)
    data["version"] = 1
    data["updated_at_rfc3339_utc"] = now_rfc3339()
    data["recent_runs_limit"] = recent_limit
    recent_runs = []
    for item in data.get("recent_runs", []):
        recent_runs.append(normalize_run_entry(item))
    if len(recent_runs) > recent_limit:
        recent_runs = recent_runs[-recent_limit:]
    data["recent_runs"] = recent_runs
    data["last_run"] = normalize_run_entry(data.get("last_run"))
    data["last_success_snapshot"] = normalize_snapshot_ref(data.get("last_success_snapshot"))
    delta_count = data.get("successful_deltas_since_snapshot")
    data["successful_deltas_since_snapshot"] = to_int(delta_count)
    if data["last_success_snapshot"] is None:
        data["successful_deltas_since_snapshot"] = None
    elif data["successful_deltas_since_snapshot"] is None:
        data["successful_deltas_since_snapshot"] = 0
    health, detail = state_health(data["last_run"], data["last_success_snapshot"])
    if health == "ready":
        detail = data.get("state_detail") or detail
    data["state_health"] = health
    data["state_detail"] = detail
    return data


def sorted_run_dirs():
    # run_<digits> directories ordered by their numeric suffix.
    candidates = []
    for path in runs_root.glob("run_*"):
        if not path.is_dir():
            continue
        suffix = path.name[4:]
        if suffix.isdigit():
            candidates.append((int(suffix), path))
    candidates.sort()
    return [path for _, path in candidates]


def parse_run_dir(run_dir):
    meta_path = run_dir / "run-meta.json"
    summary_path = run_dir / "run-summary.json"
    if not meta_path.exists() or not summary_path.exists():
        return None
    with meta_path.open("r", encoding="utf-8") as handle:
        meta = json.load(handle)
    with summary_path.open("r", encoding="utf-8") as handle:
        summary = json.load(handle)
    entry = normalize_run_entry(meta)
    entry["status"] = meta.get("status") or summary.get("status") or entry["status"]
    return entry


def bootstrap_state(exclude_run_id=None):
    # Rebuild the state by replaying the run directories in index order.
    data = default_state()
    last_success_snapshot = None
    delta_count = None
    recent_runs = []
    last_run = None
    for run_dir in sorted_run_dirs():
        if exclude_run_id and run_dir.name == exclude_run_id:
            continue
        try:
            entry = parse_run_dir(run_dir)
        except Exception:
            continue
        if entry is None:
            continue
        last_run = entry
        recent_runs.append(entry)
        if entry["status"] != "success":
            continue
        if entry.get("sync_mode") == "snapshot":
            last_success_snapshot = snapshot_ref(entry)
            delta_count = 0
        elif entry.get("sync_mode") == "delta":
            if last_success_snapshot is not None and delta_count is not None:
                delta_count += 1
    data["last_run"] = last_run
    data["last_success_snapshot"] = last_success_snapshot
    data["successful_deltas_since_snapshot"] = delta_count
    data["recent_runs"] = recent_runs[-recent_limit:]
    return finalize_state(data)


def backup_corrupt_state():
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    backup_path = state_path.with_name(f"{state_path.name}.corrupt.{timestamp}.{os.getpid()}")
    backup_path.parent.mkdir(parents=True, exist_ok=True)
    os.replace(state_path, backup_path)
    return backup_path


def load_state_or_bootstrap(exclude_run_id=None):
    # Returns (state, source, error class name, corrupt backup path).
    if not state_path.exists():
        return bootstrap_state(exclude_run_id), "bootstrapped_from_runs_missing_file", "", ""
    try:
        with state_path.open("r", encoding="utf-8") as handle:
            raw = json.load(handle)
        if not isinstance(raw, dict):
            raise ValueError("state root must be an object")
        if to_int(raw.get("version")) != 1:
            raise ValueError("unsupported version")
        state = finalize_state(raw)
        return state, "state_file", "", ""
    except Exception as exc:
        backup_path = backup_corrupt_state()
        state = bootstrap_state(exclude_run_id)
        return state, "bootstrapped_from_runs_after_corrupt_backup", exc.__class__.__name__, str(backup_path)


def atomic_write_json(path, payload):
    # Write to a sibling temp file, fsync, rename, then fsync the directory.
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(f"{path.name}.tmp.{os.getpid()}")
    with tmp_path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, sort_keys=True)
        handle.write("\n")
        handle.flush()
        os.fsync(handle.fileno())
    os.replace(tmp_path, path)
    try:
        dir_fd = os.open(path.parent, os.O_DIRECTORY)
    except OSError:
        return
    try:
        os.fsync(dir_fd)
    finally:
        os.close(dir_fd)


def emit(name, value):
    # Values are shell-quoted because the caller evaluates these lines.
    if value is None:
        value = ""
    elif isinstance(value, bool):
        value = "true" if value else "false"
    else:
        value = str(value)
    print(f"{name}={shlex.quote(value)}")


if command == "load":
    max_deltas = int(extra_args[0])
    state, source, detail, backup_path = load_state_or_bootstrap()
    if source != "state_file":
        atomic_write_json(state_path, state)
    delta_count = state.get("successful_deltas_since_snapshot")
    force_needed = bool(
        state.get("last_success_snapshot") is not None
        and delta_count is not None
        and delta_count >= max_deltas
    )
    emit("PERIODIC_LIFECYCLE_SOURCE", source)
    emit("PERIODIC_LIFECYCLE_DETAIL", detail or state.get("state_detail"))
    emit("PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH", backup_path)
    emit("PERIODIC_LIFECYCLE_STATE_HEALTH", state.get("state_health"))
    emit(
        "PERIODIC_LIFECYCLE_LAST_SNAPSHOT_RUN_ID",
        (state.get("last_success_snapshot") or {}).get("run_id"),
    )
    emit("PERIODIC_LIFECYCLE_DELTA_COUNT", delta_count)
    emit("PERIODIC_LIFECYCLE_FORCE_NEEDED", force_needed)
elif command == "update":
    run_dir = pathlib.Path(extra_args[0])
    state, source, detail, backup_path = load_state_or_bootstrap(exclude_run_id=run_dir.name)
    entry = parse_run_dir(run_dir)
    if entry is None:
        raise SystemExit(f"missing run metadata for lifecycle update: {run_dir}")
    previous_snapshot = state.get("last_success_snapshot")
    previous_delta_count = state.get("successful_deltas_since_snapshot")
    state["last_run"] = entry
    recent_runs = [item for item in state.get("recent_runs", []) if item.get("run_id") != entry["run_id"]]
    recent_runs.append(entry)
    state["recent_runs"] = recent_runs[-recent_limit:]
    if entry["status"] == "success" and entry.get("sync_mode") == "snapshot":
        state["last_success_snapshot"] = snapshot_ref(entry)
        state["successful_deltas_since_snapshot"] = 0
        state["state_detail"] = "ok"
    elif entry["status"] == "success" and entry.get("sync_mode") == "delta":
        if previous_snapshot is not None and previous_delta_count is not None:
            state["last_success_snapshot"] = previous_snapshot
            state["successful_deltas_since_snapshot"] = previous_delta_count + 1
            state["state_detail"] = "ok"
        else:
            state["last_success_snapshot"] = previous_snapshot
            state["successful_deltas_since_snapshot"] = None
            state["state_detail"] = "success_delta_without_known_snapshot"
    else:
        state["last_success_snapshot"] = previous_snapshot
        state["successful_deltas_since_snapshot"] = previous_delta_count
    state = finalize_state(state)
    atomic_write_json(state_path, state)
    emit("RUN_LIFECYCLE_UPDATE_SOURCE", source)
    emit("RUN_LIFECYCLE_UPDATE_DETAIL", detail or state.get("state_detail"))
    emit("RUN_LIFECYCLE_UPDATE_CORRUPT_BACKUP_PATH", backup_path)
    emit("RUN_LIFECYCLE_UPDATE_STATE_HEALTH", state.get("state_health"))
    emit("RUN_LIFECYCLE_UPDATE_DELTA_COUNT", state.get("successful_deltas_since_snapshot"))
    emit(
        "RUN_LIFECYCLE_UPDATE_LAST_SNAPSHOT_RUN_ID",
        (state.get("last_success_snapshot") or {}).get("run_id"),
    )
else:
    raise SystemExit(f"unknown lifecycle helper command: {command}")
PY
}
# Load the run lifecycle state and import the helper's PERIODIC_LIFECYCLE_*
# variables into the current shell, warning about a corrupt or incomplete
# state.
# Globals read: PERIODIC_SNAPSHOT_MAX_DELTAS
# Aborts via die when the helper fails, so values from an earlier call are
# never reused silently.
load_periodic_snapshot_lifecycle_context() {
  local lifecycle_assignments
  # Capture first: a failure inside eval "$(...)" would go unnoticed.
  lifecycle_assignments="$(periodic_snapshot_delta_scan load "$PERIODIC_SNAPSHOT_MAX_DELTAS")" \
    || die "run lifecycle helper failed (load)"
  # The helper prints shell-quoted NAME=value lines meant to be evaluated.
  eval "$lifecycle_assignments"
  if [[ -n "$PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH" ]]; then
    warn "run lifecycle state corrupt; backed up to $PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH detail=${PERIODIC_LIFECYCLE_DETAIL:-unknown}"
  fi
  if [[ "$PERIODIC_LIFECYCLE_SOURCE" != "state_file" ]]; then
    echo "run lifecycle state source=$PERIODIC_LIFECYCLE_SOURCE state_health=${PERIODIC_LIFECYCLE_STATE_HEALTH:-unknown} snapshot_run=${PERIODIC_LIFECYCLE_LAST_SNAPSHOT_RUN_ID:-none} delta_count=${PERIODIC_LIFECYCLE_DELTA_COUNT:-unknown}"
  fi
  if [[ "$PERIODIC_LIFECYCLE_STATE_HEALTH" == "bootstrap_incomplete" ]]; then
    warn "run lifecycle state bootstrap incomplete detail=${PERIODIC_LIFECYCLE_DETAIL:-unknown}; forced snapshot counting resumes after the next successful snapshot"
  fi
}
# Record the run stored in directory $1 in the run lifecycle state and
# import the helper's RUN_LIFECYCLE_UPDATE_* variables into the current
# shell, warning when a corrupt state file had to be backed up.
# Aborts via die when the helper fails, so values from an earlier call are
# never reused silently.
update_run_lifecycle_state() {
  local run_dir="$1"
  local lifecycle_assignments
  # Capture first: a failure inside eval "$(...)" would go unnoticed.
  lifecycle_assignments="$(periodic_snapshot_delta_scan update "$run_dir")" \
    || die "run lifecycle helper failed (update) for $run_dir"
  # The helper prints shell-quoted NAME=value lines meant to be evaluated.
  eval "$lifecycle_assignments"
  if [[ -n "$RUN_LIFECYCLE_UPDATE_CORRUPT_BACKUP_PATH" ]]; then
    warn "run lifecycle state corrupt during update; backed up to $RUN_LIFECYCLE_UPDATE_CORRUPT_BACKUP_PATH detail=${RUN_LIFECYCLE_UPDATE_DETAIL:-unknown}"
  fi
}
# Move the current DB directory into a per-run staging directory ahead of a
# periodic snapshot reset and recreate an empty DB directory.
# Arguments: $1 - run id (part of the staging directory name, together with
#                 a UTC timestamp)
# Globals written: RESET_DB_STAGING_PATH, RESET_DB_CLEANUP_STATUS (both left
#                  empty when no DB state exists; the status is "pending"
#                  after a move)
prepare_periodic_reset_state_db() {
  local run_id="$1"
  local staging_root
  RESET_DB_STAGING_PATH=""
  RESET_DB_CLEANUP_STATUS=""
  if ! db_state_exists; then
    return 0
  fi
  staging_root="$RESET_STAGING_ROOT/${run_id}-$(date -u +%Y%m%dT%H%M%SZ)"
  mkdir -p "$staging_root"
  mv "$DB_DIR" "$staging_root/"
  mkdir -p "$DB_DIR"
  RESET_DB_STAGING_PATH="$staging_root/$(basename "$DB_DIR")"
  RESET_DB_CLEANUP_STATUS="pending"
}
# Decide what happens to the staged DB of a periodic reset and print the
# resulting cleanup status on stdout.
# Arguments: $1 - final run status
#            $2 - staged DB path (may be empty)
# Prints an empty line when nothing was staged, "deleted" after removing the
# staging directory of a successful run, "cleanup_failed" (returning 1) when
# that removal fails, and "retained_failure" for any non-success status.
finalize_periodic_reset_state_db() {
  local final_status="$1"
  local reset_db_staging_path="$2"
  local staging_root
  if [[ -z "$reset_db_staging_path" ]]; then
    printf '%s\n' ""
    return 0
  fi
  staging_root="$(dirname "$reset_db_staging_path")"
  if [[ "$final_status" != "success" ]]; then
    printf '%s\n' "retained_failure"
    return
  fi
  if ! rm -rf "$staging_root"; then
    warn "failed to delete periodic reset staging: $staging_root"
    printf '%s\n' "cleanup_failed"
    return 1
  fi
  printf '%s\n' "deleted"
  return 0
}
# Write the run metadata JSON file for one run.
# Takes exactly 24 positional arguments, in this order:
#    1 output path                 13 invalid tmp path
#    2 status                      14 daemon exit code (integer or "")
#    3 run index (integer)         15 package root
#    4 run id                      16 env file
#    5 sync mode                   17 periodic snapshot reset enabled
#    6 snapshot reason             18 periodic snapshot max deltas (integer)
#    7 previous run id             19 periodic snapshot delta count
#    8 previous run success        20 periodic snapshot forced
#    9 started at                  21 reset DB staging path
#   10 completed at                22 reset DB cleanup status
#   11 invalid DB path             23 tmp cleanup status
#   12 invalid state path          24 tmp cleanup reason
# Empty strings are written as JSON null for the nullable fields; boolean
# fields are true only for the literal string "true".
write_run_meta() {
  local output_path="$1"
  local status="$2"
  local run_index="$3"
  local run_id="$4"
  local sync_mode="$5"
  local snapshot_reason="$6"
  local previous_run_id="$7"
  local previous_run_success_value="$8"
  local started_at="$9"
  local completed_at="${10}"
  local invalid_db_path="${11}"
  local invalid_state_path="${12}"
  local invalid_tmp_path="${13}"
  local daemon_exit_code="${14}"
  local package_root="${15}"
  local env_file="${16}"
  local periodic_snapshot_reset_enabled="${17}"
  local periodic_snapshot_max_deltas="${18}"
  local periodic_snapshot_delta_count="${19}"
  local periodic_snapshot_forced="${20}"
  local reset_db_staging_path="${21}"
  local reset_db_cleanup_status="${22}"
  local tmp_cleanup_status="${23}"
  local tmp_cleanup_reason="${24}"
  python3 - "$output_path" "$status" "$run_index" "$run_id" "$sync_mode" "$snapshot_reason" \
    "$previous_run_id" "$previous_run_success_value" "$started_at" "$completed_at" \
    "$invalid_db_path" "$invalid_state_path" "$invalid_tmp_path" "$daemon_exit_code" \
    "$package_root" "$env_file" "$periodic_snapshot_reset_enabled" \
    "$periodic_snapshot_max_deltas" "$periodic_snapshot_delta_count" \
    "$periodic_snapshot_forced" "$reset_db_staging_path" "$reset_db_cleanup_status" \
    "$tmp_cleanup_status" "$tmp_cleanup_reason" <<'PY'
import json
import sys


def nullable(value):
    return None if value == "" else value


def nullable_bool(value):
    if value == "":
        return None
    return value == "true"


def nullable_int(value):
    if value == "":
        return None
    return int(value)


def bool_value(value):
    return value == "true"


(
    output_path,
    status,
    run_index,
    run_id,
    sync_mode,
    snapshot_reason,
    previous_run_id,
    previous_run_success,
    started_at,
    completed_at,
    invalid_db_path,
    invalid_state_path,
    invalid_tmp_path,
    daemon_exit_code,
    package_root,
    env_file,
    periodic_snapshot_reset_enabled,
    periodic_snapshot_max_deltas,
    periodic_snapshot_delta_count,
    periodic_snapshot_forced,
    reset_db_staging_path,
    reset_db_cleanup_status,
    tmp_cleanup_status,
    tmp_cleanup_reason,
) = sys.argv[1:]
data = {
    "status": status,
    "run_index": int(run_index),
    "run_id": run_id,
    "sync_mode": sync_mode,
    "snapshot_reason": nullable(snapshot_reason),
    "previous_run_id": nullable(previous_run_id),
    "previous_run_success": nullable_bool(previous_run_success),
    "started_at_rfc3339_utc": started_at,
    "completed_at_rfc3339_utc": nullable(completed_at),
    "invalid_db_path": nullable(invalid_db_path),
    "invalid_state_path": nullable(invalid_state_path),
    "invalid_tmp_path": nullable(invalid_tmp_path),
    "daemon_exit_code": nullable_int(daemon_exit_code),
    "package_root": package_root,
    "env_file": env_file,
    "periodic_snapshot_reset_enabled": bool_value(periodic_snapshot_reset_enabled),
    "periodic_snapshot_max_deltas": int(periodic_snapshot_max_deltas),
    "periodic_snapshot_delta_count": nullable_int(periodic_snapshot_delta_count),
    "periodic_snapshot_forced": bool_value(periodic_snapshot_forced),
    "reset_db_staging_path": nullable(reset_db_staging_path),
    "reset_db_cleanup_status": nullable(reset_db_cleanup_status),
    "tmp_cleanup_status": nullable(tmp_cleanup_status),
    "tmp_cleanup_reason": nullable(tmp_cleanup_reason),
}
with open(output_path, "w", encoding="utf-8") as handle:
    json.dump(data, handle, indent=2, sort_keys=True)
    handle.write("\n")
PY
}
# Print the top-level "status" value of the JSON file $1, or "missing" when
# the file cannot be read or parsed or has no "status" key.
summary_status() {
  local summary_path="$1"
  python3 - "$summary_path" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], "r", encoding="utf-8") as handle:
        print(json.load(handle).get("status", "missing"))
except Exception:
    print("missing")
PY
}
#######################################
# Best-effort shutdown of other relying-party software on this host.
# When DISABLE_COMPETING_RPS is true: disables and stops rpki-client.timer,
# stops rpki-client.service, and kills processes named exactly rpki-client
# or routinator. Every step ignores failure and discards its output.
# Globals: DISABLE_COMPETING_RPS (read)
# Returns: always 0
#######################################
prepare_competing_rp_state() {
  if is_true "$DISABLE_COMPETING_RPS"; then
    systemctl disable --now rpki-client.timer > /dev/null 2>&1 || true
    systemctl stop rpki-client.service > /dev/null 2>&1 || true
    local rp_process
    for rp_process in rpki-client routinator; do
      pkill -x "$rp_process" > /dev/null 2>&1 || true
    done
  fi
  return 0
}
#######################################
# Capture a set of host diagnostics into LOG_ROOT, tagged with a suffix.
# Writes df, free, a filtered process listing (sorted by CPU), and the
# active/enabled state of rpki-client.timer. Each capture is best effort:
# a failing or missing tool never fails the function.
# Globals:   LOG_ROOT (read)
# Arguments: $1 - suffix embedded in every output file name (e.g. "before")
#######################################
write_machine_snapshot() {
  local suffix="$1"
  local timer_unit="rpki-client.timer"
  local process_pattern='rpki_daemon|/bin/rpki|rpki-client|routinator'

  df -h > "$LOG_ROOT/df-${suffix}.txt" 2>&1 || true
  free -h > "$LOG_ROOT/free-${suffix}.txt" 2>&1 || true

  # The second grep drops the grep processes that match their own pattern.
  ps -eo pid,ppid,stat,pcpu,pmem,rss,args --sort=-pcpu \
    | grep -E "$process_pattern" \
    | grep -v grep > "$LOG_ROOT/process-${suffix}.txt" || true

  local query
  for query in active enabled; do
    systemctl "is-${query}" "$timer_unit" \
      > "$LOG_ROOT/rpki-client-timer-${query}-${suffix}.txt" 2>&1 || true
  done
}
#######################################
# Build the argument vector handed to the rpki child process.
# The "{run_id}" and "{run_out}" tokens are passed through literally
# (presumably expanded by the daemon per run - confirm against the daemon).
# Globals:
#   CHILD_ARGS (written, array)
#   DB_DIR, RSYNC_SCOPE, ALLOW_RSYNC_MIRROR_REUSE, RSYNC_MIRROR_ROOT, TMP_DIR,
#   RIR_LIST, TAL_INPUT_MODE, OUTPUT_COMPACT_REPORT,
#   ENABLE_CHILD_CERTIFICATE_VALIDATION_CACHE, RPKI_EXTRA_ARGS (read)
# Arguments: none
#######################################
build_child_args() {
  CHILD_ARGS=(
    --db "$DB_DIR/work-db"
    --repo-bytes-db "$DB_DIR/repo-bytes.db"
    --rsync-scope "$RSYNC_SCOPE"
  )

  # Either reuse one shared rsync mirror or give each run its own directory.
  if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then
    CHILD_ARGS+=(--rsync-mirror-root "$RSYNC_MIRROR_ROOT")
  else
    CHILD_ARGS+=(--rsync-mirror-root "$TMP_DIR/rsync-mirror-{run_id}")
  fi

  CHILD_ARGS+=(
    --parallel-phase2-ready-batch-size 256
    --parallel-phase2-ready-batch-wall-time-budget-ms 100
    --parallel-phase2-result-drain-batch-size 2048
    --parallel-phase2-finalize-batch-size 256
    --parallel-phase2-finalize-batch-wall-time-budget-ms 100
  )

  # One TAL source per RIR; the mode selects URL, fixture + live TA, or
  # fixture + fixture TA (the default for any other mode value).
  local rir_name
  for rir_name in "${RIR_LIST[@]}"; do
    if [[ "$TAL_INPUT_MODE" == "url" ]]; then
      CHILD_ARGS+=(--tal-url "$(tal_url_for_rir "$rir_name")")
    elif [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then
      CHILD_ARGS+=(--tal-path "$(tal_file_for_rir "$rir_name")")
      CHILD_ARGS+=(--ta-path "$(live_ta_file_for_rir "$rir_name")")
    else
      CHILD_ARGS+=(--tal-path "$(tal_file_for_rir "$rir_name")")
      CHILD_ARGS+=(--ta-path "$(ta_file_for_rir "$rir_name")")
    fi
  done

  CHILD_ARGS+=(
    --report-json "{run_out}/report.json"
  )
  if is_true "$OUTPUT_COMPACT_REPORT"; then
    CHILD_ARGS+=(--report-json-compact)
  fi

  CHILD_ARGS+=(
    --ccr-out "{run_out}/result.ccr"
    --cir-enable
    --cir-out "{run_out}/input.cir"
  )
  for rir_name in "${RIR_LIST[@]}"; do
    CHILD_ARGS+=(--cir-tal-uri "$(cir_tal_uri_for_rir "$rir_name")")
  done

  CHILD_ARGS+=(
    --vrps-csv-out "{run_out}/vrps.csv"
    --vaps-csv-out "{run_out}/vaps.csv"
    --compare-view-trust-anchor "$(compare_view_trust_anchor)"
  )
  if is_true "$ENABLE_CHILD_CERTIFICATE_VALIDATION_CACHE"; then
    CHILD_ARGS+=(--enable-child-certificate-validation-cache)
  fi

  if [[ -n "$RPKI_EXTRA_ARGS" ]]; then
    # Split on IFS whitespace WITHOUT pathname expansion, so an argument such
    # as "*.roa" is forwarded literally instead of being replaced by whatever
    # files happen to match in the current directory.
    # read returns non-zero at end of input when -d '' is used; that is
    # expected here, hence "|| true".
    local extra_args=()
    read -r -d '' -a extra_args <<< "$RPKI_EXTRA_ARGS" || true
    if (( ${#extra_args[@]} > 0 )); then
      CHILD_ARGS+=("${extra_args[@]}")
    fi
  fi
}
#######################################
# Copy the daemon's most recent inner run into the outer run directory.
# The newest inner run is the last entry, in sort order, directly under
# <daemon_state_root>/runs. The daemon's status and run-log files are copied
# alongside with an ".inner" infix, then the metadata is rewritten to the
# outer run identity via normalize_outer_run_metadata.
# Arguments:
#   $1 - daemon state root
#   $2 - outer run directory (copy destination)
#   $3 - outer run index
#   $4 - outer run id
# Returns: the status of normalize_outer_run_metadata
#######################################
copy_inner_run_outputs() {
  local daemon_state_root="$1"
  local run_dir="$2"
  local outer_run_index="$3"
  local outer_run_id="$4"

  local inner_run_dir
  inner_run_dir="$(
    find "$daemon_state_root/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null \
      | sort \
      | tail -n 1 || true
  )"

  if [[ -n "$inner_run_dir" ]] && [[ -d "$inner_run_dir" ]]; then
    shopt -s dotglob nullglob
    # "dir/." copies the directory contents, hidden files included.
    cp -a "$inner_run_dir"/. "$run_dir"/
    shopt -u dotglob nullglob
  fi

  # daemon-status.json -> daemon-status.inner.json, and likewise for the
  # run log; either file may legitimately be absent.
  local artifact
  for artifact in daemon-status.json daemon-runs.jsonl; do
    if [[ -f "$daemon_state_root/$artifact" ]]; then
      cp "$daemon_state_root/$artifact" \
        "$run_dir/${artifact%.*}.inner.${artifact##*.}"
    fi
  done

  normalize_outer_run_metadata "$run_dir" "$outer_run_index" "$outer_run_id" \
    "$inner_run_dir" "$daemon_state_root"
}
#######################################
# Rewrite copied daemon metadata so it describes the outer soak run.
# In <run_dir> this updates run-summary.json in place and produces
# daemon-status.json and daemon-runs.jsonl from their ".inner" copies
# (falling back to the raw files under the daemon state root). The daemon's
# own identifiers are preserved under innerRunSeq / innerRunId / innerRunDir
# and innerLastRunId.
# Arguments:
#   $1 - outer run directory
#   $2 - outer run index (integer)
#   $3 - outer run id
#   $4 - inner run directory; may be empty, which disables path rewriting
#   $5 - daemon state root
#######################################
normalize_outer_run_metadata() {
  local run_dir="$1"
  local outer_run_index="$2"
  local outer_run_id="$3"
  local inner_run_dir="$4"
  local daemon_state_root="$5"
  python3 - "$run_dir" "$outer_run_index" "$outer_run_id" "$inner_run_dir" "$daemon_state_root" <<'PY'
import json
import pathlib
import sys

run_dir = pathlib.Path(sys.argv[1]).resolve()
outer_run_index = int(sys.argv[2])
outer_run_id = sys.argv[3]
inner_run_dir = sys.argv[4]
daemon_state_root = pathlib.Path(sys.argv[5])

# (preserved key, key it is copied from)
INNER_KEYS = (
    ("innerRunSeq", "runSeq"),
    ("innerRunId", "runId"),
    ("innerRunDir", "runDir"),
)


def replace_paths(value):
    """Recursively replace the inner run directory with the outer one in strings."""
    if isinstance(value, dict):
        return {key: replace_paths(item) for key, item in value.items()}
    if isinstance(value, list):
        return [replace_paths(item) for item in value]
    if isinstance(value, str) and inner_run_dir:
        return value.replace(inner_run_dir, str(run_dir))
    return value


def normalize_summary(summary):
    """Return a copy of a run summary re-identified as the outer run."""
    original = dict(summary)
    normalized = replace_paths(original)
    # Take the inner* values from the untouched input: running them through
    # replace_paths would turn innerRunDir into the outer directory and lose
    # the value these fields exist to preserve. An inner* key that is already
    # present (a previously normalized record) is kept as is.
    for inner_key, source_key in INNER_KEYS:
        normalized[inner_key] = original.get(inner_key, original.get(source_key))
    normalized["runSeq"] = outer_run_index
    normalized["runId"] = outer_run_id
    normalized["runDir"] = str(run_dir)
    return normalized


summary_path = run_dir / "run-summary.json"
if summary_path.exists():
    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    summary_path.write_text(
        json.dumps(normalize_summary(summary), indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

inner_status_path = run_dir / "daemon-status.inner.json"
if not inner_status_path.exists():
    raw_status_path = daemon_state_root / "daemon-status.json"
    if raw_status_path.exists():
        inner_status_path.write_text(raw_status_path.read_text(encoding="utf-8"), encoding="utf-8")
if inner_status_path.exists():
    status = json.loads(inner_status_path.read_text(encoding="utf-8"))
    status.setdefault("innerLastRunId", status.get("lastRunId"))
    status["lastRunId"] = outer_run_id
    status["outerRunId"] = outer_run_id
    status["outerRunIndex"] = outer_run_index
    (run_dir / "daemon-status.json").write_text(
        json.dumps(status, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

inner_runs_path = run_dir / "daemon-runs.inner.jsonl"
if not inner_runs_path.exists():
    raw_runs_path = daemon_state_root / "daemon-runs.jsonl"
    if raw_runs_path.exists():
        inner_runs_path.write_text(raw_runs_path.read_text(encoding="utf-8"), encoding="utf-8")
if inner_runs_path.exists():
    lines = []
    for line in inner_runs_path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue  # tolerate blank lines in the JSONL log
        lines.append(json.dumps(normalize_summary(json.loads(line)), sort_keys=True))
    (run_dir / "daemon-runs.jsonl").write_text("\n".join(lines) + ("\n" if lines else ""), encoding="utf-8")
PY
}
#######################################
# Delete the oldest outer run directories beyond the retention limit.
# Only directories named run_NNNN (four digits) directly under RUNS_ROOT are
# considered; they are removed in ascending sort order until the count is
# back at RETAIN_RUNS.
# Globals:   RUNS_ROOT, RETAIN_RUNS (read)
# Arguments: $1 - optional run id (directory base name) that is never removed
# Returns:   0
#######################################
apply_outer_retention() {
  local keep_run="${1:-}"
  local retain_limit="$RETAIN_RUNS"
  local run_dirs=()
  local entry

  shopt -s nullglob
  for entry in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do
    if [[ -d "$entry" ]]; then
      run_dirs+=("$entry")
    fi
  done
  shopt -u nullglob

  local excess=$(( ${#run_dirs[@]} - retain_limit ))
  if (( excess <= 0 )); then
    return 0
  fi

  # Fixed-width numbering makes lexical order equal to run order.
  mapfile -t run_dirs < <(printf '%s\n' "${run_dirs[@]}" | sort)

  local deleted=0
  for entry in "${run_dirs[@]}"; do
    if (( deleted >= excess )); then
      break
    fi
    # The protected run is skipped without counting toward the quota, so the
    # next-oldest directory is removed in its place.
    if [[ -n "$keep_run" && "${entry##*/}" == "$keep_run" ]]; then
      continue
    fi
    rm -rf "$entry"
    deleted=$((deleted + 1))
  done
}
#######################################
# Execute one outer soak run: record "running" metadata, launch the daemon
# for a single inner run, collect its outputs, and record the final state.
# A run is successful only when the daemon exits 0, the copied run summary
# reports "success", and (when a reset staging path is set) the periodic
# reset finalization succeeds.
# Globals:
#   RUNS_ROOT, TMP_DIR, META_DIR, DB_DIR, RPKI_BIN, RPKI_DAEMON_BIN,
#   DB_STATS_BIN, DB_STATS_EXACT_EVERY, RETAIN_RUNS, RPKI_ANALYZE,
#   CLEAN_TMP_AFTER_RUN, RPKI_PROGRESS_*, INVALID_*_PATH, PACKAGE_ROOT,
#   ENV_FILE, RUN_META_* (read)
#   CHILD_ARGS, RUN_META_RESET_DB_CLEANUP_STATUS (written)
# Arguments:
#   $1 - run index (integer)
#   $2 - previous run id (may be empty)
#   $3 - previous run success flag ("true", "false" or empty)
#   $4 - sync mode
#   $5 - snapshot reason (may be empty)
# Returns: 0 when the run succeeded, non-zero otherwise
#######################################
run_one_round() {
  local run_index="$1"
  local previous_run_id="$2"
  local previous_success_value="$3"
  local sync_mode="$4"
  local snapshot_reason="$5"

  local run_id
  run_id="$(printf 'run_%04d' "$run_index")"
  local run_dir="$RUNS_ROOT/$run_id"
  local daemon_state_root="$TMP_DIR/daemon-$run_id"
  local run_meta_path="$run_dir/run-meta.json"
  local started_at
  local completed_at
  local daemon_exit_code
  local summary_state

  # Arguments that are identical in the "running" and the final metadata.
  local run_identity=(
    "$run_index" "$run_id" "$sync_mode"
    "$snapshot_reason" "$previous_run_id" "$previous_success_value"
  )

  mkdir -p "$run_dir" "$daemon_state_root"
  # Trim old runs up front, but never the directory just created.
  apply_outer_retention "$run_id"

  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  write_run_meta "$run_meta_path" "running" "${run_identity[@]}" "$started_at" "" \
    "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "" \
    "$PACKAGE_ROOT" "$ENV_FILE" \
    "$RUN_META_PERIODIC_ENABLED" "$RUN_META_PERIODIC_MAX_DELTAS" \
    "$RUN_META_PERIODIC_DELTA_COUNT" "$RUN_META_PERIODIC_FORCED" \
    "$RUN_META_RESET_DB_STAGING_PATH" "$RUN_META_RESET_DB_CLEANUP_STATUS" \
    "$RUN_META_TMP_CLEANUP_STATUS" "$RUN_META_TMP_CLEANUP_REASON"

  prepare_live_ta_inputs_for_run "$run_id" "$sync_mode"
  build_child_args
  if is_true "$RPKI_ANALYZE"; then
    CHILD_ARGS+=(--analyze --analysis-out "$run_dir/analyze")
  fi

  local daemon_args=(
    --state-root "$daemon_state_root"
    --rpki-bin "$RPKI_BIN"
    --interval-secs 0
    --max-runs 1
    --retain-runs "$RETAIN_RUNS"
    --work-db "$DB_DIR/work-db"
    --repo-bytes-db "$DB_DIR/repo-bytes.db"
  )
  if [[ -x "$DB_STATS_BIN" ]]; then
    daemon_args+=(--db-stats-bin "$DB_STATS_BIN")
    if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then
      daemon_args+=(--db-stats-exact-every "$DB_STATS_EXACT_EVERY")
    fi
  fi

  local progress_env=(
    RPKI_PROGRESS_LOG="$RPKI_PROGRESS_LOG"
    RPKI_PROGRESS_SLOW_SECS="$RPKI_PROGRESS_SLOW_SECS"
    RPKI_PROGRESS_STAGE_FRESH_SLOW_MS="$RPKI_PROGRESS_STAGE_FRESH_SLOW_MS"
    RPKI_PROGRESS_PP_CONTROL_SLOW_MS="$RPKI_PROGRESS_PP_CONTROL_SLOW_MS"
    RPKI_PROGRESS_PP_CACHE_SLOW_MS="$RPKI_PROGRESS_PP_CACHE_SLOW_MS"
    RPKI_PROGRESS_CONTROL_LOOP_SLOW_MS="$RPKI_PROGRESS_CONTROL_LOOP_SLOW_MS"
  )

  # The daemon's exit status is data, not a fatal error: capture it.
  set +e
  env "${progress_env[@]}" \
    "$RPKI_DAEMON_BIN" "${daemon_args[@]}" -- "${CHILD_ARGS[@]}" \
    > "$run_dir/daemon-stdout.log" 2> "$run_dir/daemon-stderr.log"
  daemon_exit_code=$?
  set -e

  copy_inner_run_outputs "$daemon_state_root" "$run_dir" "$run_index" "$run_id"
  completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  summary_state="$(summary_status "$run_dir/run-summary.json")"

  local final_status="success"
  if (( daemon_exit_code != 0 )) || [[ "$summary_state" != "success" ]]; then
    final_status="failed"
  fi

  # The cleanup status is captured even when finalization fails; a failed
  # finalization downgrades an otherwise successful run.
  if [[ -n "$RUN_META_RESET_DB_STAGING_PATH" ]]; then
    if ! RUN_META_RESET_DB_CLEANUP_STATUS="$(finalize_periodic_reset_state_db "$final_status" "$RUN_META_RESET_DB_STAGING_PATH")"; then
      final_status="failed"
    fi
  fi

  write_run_meta "$run_meta_path" "$final_status" "${run_identity[@]}" "$started_at" "$completed_at" \
    "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "$daemon_exit_code" \
    "$PACKAGE_ROOT" "$ENV_FILE" \
    "$RUN_META_PERIODIC_ENABLED" "$RUN_META_PERIODIC_MAX_DELTAS" \
    "$RUN_META_PERIODIC_DELTA_COUNT" "$RUN_META_PERIODIC_FORCED" \
    "$RUN_META_RESET_DB_STAGING_PATH" "$RUN_META_RESET_DB_CLEANUP_STATUS" \
    "$RUN_META_TMP_CLEANUP_STATUS" "$RUN_META_TMP_CLEANUP_REASON"
  update_run_lifecycle_state "$run_dir"
  printf '%s\n' "$run_id" > "$META_DIR/last-run-id"

  if is_true "$CLEAN_TMP_AFTER_RUN"; then
    rm -rf "$daemon_state_root"
  fi
  apply_outer_retention

  [[ "$final_status" == "success" ]]
}
#######################################
# Entry point: validate configuration, prepare the working directories, then
# execute outer soak runs until the configured stop condition is met.
# Run numbering continues after the highest existing run index. Each run
# syncs in "delta" mode only when the previous run succeeded and delta state
# is available; otherwise it is a "snapshot" run with a recorded reason.
# Globals:
#   MAX_RUNS (negative means run until stopped), INTERVAL_SECS,
#   STOP_AFTER_SECS, RETAIN_RUNS, TAL_INPUT_MODE, PERIODIC_SNAPSHOT_RESET,
#   PERIODIC_SNAPSHOT_MAX_DELTAS, FAILURE_SNAPSHOT_RESET,
#   DB_STATS_EXACT_EVERY, ALLOW_RSYNC_MIRROR_REUSE, the *_ROOT / *_DIR paths,
#   RPKI_BIN, RPKI_DAEMON_BIN, RIR_LIST (read)
#   INVALID_*_PATH, TMP_CLEANUP_*, PERIODIC_LIFECYCLE_*, RESET_DB_*,
#   RUN_META_* (reset at the start of every iteration)
# Arguments: $1 - optional "--help" / "-h"
# Exits:     0 when every run succeeded, 1 when any run failed
#######################################
main() {
  if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
    usage
    exit 0
  fi

  require_command python3
  require_command date
  require_command find
  if [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then
    require_command curl
    validate_positive_int "LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" "$LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS"
    validate_positive_int "LIVE_TA_REFRESH_MAX_TIME_SECS" "$LIVE_TA_REFRESH_MAX_TIME_SECS"
  fi
  validate_max_runs
  validate_non_negative_int "INTERVAL_SECS" "$INTERVAL_SECS"
  validate_non_negative_int "STOP_AFTER_SECS" "$STOP_AFTER_SECS"
  validate_positive_int "RETAIN_RUNS" "$RETAIN_RUNS"
  validate_rsync_scope
  validate_tal_input_mode
  validate_non_negative_int "PERIODIC_SNAPSHOT_MAX_DELTAS" "$PERIODIC_SNAPSHOT_MAX_DELTAS"
  if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then
    validate_positive_int "DB_STATS_EXACT_EVERY" "$DB_STATS_EXACT_EVERY"
  fi
  parse_rirs

  [[ -x "$RPKI_BIN" ]] || die "missing executable: $RPKI_BIN"
  [[ -x "$RPKI_DAEMON_BIN" ]] || die "missing executable: $RPKI_DAEMON_BIN"

  # Fail early when an input required by the selected TAL mode is missing.
  local rir_name
  for rir_name in "${RIR_LIST[@]}"; do
    if [[ "$TAL_INPUT_MODE" == "url" ]]; then
      [[ -n "$(tal_url_for_rir "$rir_name")" ]] || die "missing TAL URL for $rir_name"
    elif [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then
      [[ -f "$(tal_file_for_rir "$rir_name")" ]] || die "missing TAL fixture for $rir_name"
      [[ -n "$(tal_https_uri_from_fixture "$(tal_file_for_rir "$rir_name")")" ]] || die "missing http(s) TA URI in TAL fixture for $rir_name"
    else
      [[ -f "$(tal_file_for_rir "$rir_name")" ]] || die "missing TAL fixture for $rir_name"
      [[ -f "$(ta_file_for_rir "$rir_name")" ]] || die "missing TA fixture for $rir_name"
    fi
  done

  mkdir -p "$RUNS_ROOT" "$LOG_ROOT" "$DB_DIR" "$META_DIR" "$TMP_DIR" "$INVALID_ROOT" "$RESET_STAGING_ROOT" "$LIVE_TA_REFRESH_DIR"
  if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then
    mkdir -p "$RSYNC_MIRROR_ROOT"
  fi
  prepare_competing_rp_state
  write_machine_snapshot "before"

  local max_index
  local next_index
  local run_forever=0
  local stop_index=0
  local started_epoch
  local elapsed_secs
  started_epoch="$(date +%s)"
  max_index="$(max_existing_run_index)"
  next_index=$((max_index + 1))
  if (( MAX_RUNS < 0 )); then
    run_forever=1
    echo "run_soak mode=continuous max_existing_run_index=$max_index next_run=$(printf 'run_%04d' "$next_index")"
  else
    stop_index=$((max_index + MAX_RUNS))
    echo "run_soak mode=fixed max_existing_run_index=$max_index next_run=$(printf 'run_%04d' "$next_index") stop_run=$(printf 'run_%04d' "$stop_index")"
  fi

  local any_failed=0
  local next_run_id
  while (( run_forever == 1 || next_index <= stop_index )); do
    next_run_id="$(printf 'run_%04d' "$next_index")"

    # Reset all per-run state so nothing leaks from the previous iteration.
    INVALID_DB_PATH=""
    INVALID_STATE_PATH=""
    INVALID_TMP_PATH=""
    TMP_CLEANUP_STATUS=""
    TMP_CLEANUP_REASON=""
    PERIODIC_LIFECYCLE_SOURCE=""
    PERIODIC_LIFECYCLE_DETAIL=""
    PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH=""
    PERIODIC_LIFECYCLE_STATE_HEALTH=""
    PERIODIC_LIFECYCLE_LAST_SNAPSHOT_RUN_ID=""
    PERIODIC_LIFECYCLE_DELTA_COUNT=""
    PERIODIC_LIFECYCLE_FORCE_NEEDED="false"
    RESET_DB_STAGING_PATH=""
    RESET_DB_CLEANUP_STATUS=""

    # The delta-count default must be set BEFORE the lifecycle state is
    # loaded; resetting it afterwards would discard the loaded value and the
    # run metadata would never record a delta count.
    RUN_META_PERIODIC_DELTA_COUNT=""
    if is_true "$PERIODIC_SNAPSHOT_RESET"; then
      RUN_META_PERIODIC_ENABLED="true"
      load_periodic_snapshot_lifecycle_context
      if [[ -n "$PERIODIC_LIFECYCLE_DELTA_COUNT" ]]; then
        RUN_META_PERIODIC_DELTA_COUNT="$PERIODIC_LIFECYCLE_DELTA_COUNT"
      fi
    else
      RUN_META_PERIODIC_ENABLED="false"
    fi
    RUN_META_PERIODIC_MAX_DELTAS="$PERIODIC_SNAPSHOT_MAX_DELTAS"
    RUN_META_PERIODIC_FORCED="false"
    RUN_META_RESET_DB_STAGING_PATH=""
    RUN_META_RESET_DB_CLEANUP_STATUS=""
    RUN_META_TMP_CLEANUP_STATUS=""
    RUN_META_TMP_CLEANUP_REASON=""

    local previous_run_id=""
    local previous_success_value=""
    local sync_mode="snapshot"
    local snapshot_reason=""
    if (( next_index > 1 )); then
      previous_run_id="$(printf 'run_%04d' "$((next_index - 1))")"
      if previous_run_success "$RUNS_ROOT/$previous_run_id"; then
        previous_success_value="true"
        if delta_state_available; then
          sync_mode="delta"
          if is_true "$PERIODIC_SNAPSHOT_RESET"; then
            if [[ "$PERIODIC_LIFECYCLE_FORCE_NEEDED" == "true" ]]; then
              # Delta limit reached: force a snapshot on a reset state DB.
              RUN_META_PERIODIC_FORCED="true"
              sync_mode="snapshot"
              snapshot_reason="periodic_snapshot_delta_limit"
              prepare_periodic_reset_state_db "$next_run_id"
              RUN_META_RESET_DB_STAGING_PATH="$RESET_DB_STAGING_PATH"
              RUN_META_RESET_DB_CLEANUP_STATUS="$RESET_DB_CLEANUP_STATUS"
              echo "periodic snapshot reset forcing snapshot run=$next_run_id delta_count=$PERIODIC_LIFECYCLE_DELTA_COUNT max_deltas=$PERIODIC_SNAPSHOT_MAX_DELTAS"
            fi
          fi
        else
          sync_mode="snapshot"
          snapshot_reason="missing_db"
        fi
      else
        previous_success_value="false"
        if is_true "$FAILURE_SNAPSHOT_RESET"; then
          isolate_state_after_failure "$previous_run_id"
          RUN_META_TMP_CLEANUP_STATUS="$TMP_CLEANUP_STATUS"
          RUN_META_TMP_CLEANUP_REASON="$TMP_CLEANUP_REASON"
          sync_mode="snapshot"
          snapshot_reason="previous_run_failed"
        else
          die "previous run is not successful: $previous_run_id"
        fi
      fi
    else
      # First run index: any leftover DB state has no successful run behind
      # it, so it is isolated before starting from a snapshot.
      sync_mode="snapshot"
      if db_state_exists; then
        isolate_state_after_failure "no_previous_run"
        RUN_META_TMP_CLEANUP_STATUS="$TMP_CLEANUP_STATUS"
        RUN_META_TMP_CLEANUP_REASON="$TMP_CLEANUP_REASON"
        snapshot_reason="no_successful_previous_run"
      else
        snapshot_reason="first_run"
      fi
    fi

    echo "starting run $next_run_id sync_mode=$sync_mode"
    if run_one_round "$next_index" "$previous_run_id" "$previous_success_value" "$sync_mode" "$snapshot_reason"; then
      echo "completed run $next_run_id status=success"
    else
      echo "completed run $next_run_id status=failed" >&2
      any_failed=1
    fi

    if (( STOP_AFTER_SECS > 0 )); then
      elapsed_secs=$(( $(date +%s) - started_epoch ))
      if (( elapsed_secs >= STOP_AFTER_SECS )); then
        echo "run_soak stop_after_secs reached elapsed_secs=$elapsed_secs stop_after_secs=$STOP_AFTER_SECS"
        break
      fi
    fi
    # No sleep after the final run of a fixed-count session.
    if (( (run_forever == 1 || next_index < stop_index) && INTERVAL_SECS > 0 )); then
      sleep "$INTERVAL_SECS"
    fi
    next_index=$((next_index + 1))
  done

  write_machine_snapshot "after"
  exit "$any_failed"
}
# Script entry point: forward all command-line arguments to main.
main "$@"