#!/usr/bin/env bash
#
# Soak-run driver: configuration section.
#
# Every setting written as VAR="${VAR:-default}" can be overridden from the
# environment or from the file named by ENV_FILE (default: $PACKAGE_ROOT/.env),
# which is sourced before the defaults are applied.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"

# Values set in the env file win over the defaults below because the defaults
# only fill in variables that are still unset or empty.
if [[ -f "$ENV_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$ENV_FILE"
fi

# --- Run control -------------------------------------------------------------
MAX_RUNS="${MAX_RUNS:-3}"
INTERVAL_SECS="${INTERVAL_SECS:-0}"
STOP_AFTER_SECS="${STOP_AFTER_SECS:-0}"
RIRS="${RIRS:-afrinic,apnic,arin,lacnic,ripe}"
TAL_INPUT_MODE="${TAL_INPUT_MODE:-file-with-ta}"
RUN_ROOT="${RUN_ROOT:-$PACKAGE_ROOT}"
RETAIN_RUNS="${RETAIN_RUNS:-10}"
CLEAN_TMP_AFTER_RUN="${CLEAN_TMP_AFTER_RUN:-0}"
OUTPUT_COMPACT_REPORT="${OUTPUT_COMPACT_REPORT:-1}"
ALLOW_RSYNC_MIRROR_REUSE="${ALLOW_RSYNC_MIRROR_REUSE:-1}"
RSYNC_SCOPE="${RSYNC_SCOPE:-module-root}"

# --- Snapshot reset policy ---------------------------------------------------
FAILURE_SNAPSHOT_RESET="${FAILURE_SNAPSHOT_RESET:-1}"
PERIODIC_SNAPSHOT_RESET="${PERIODIC_SNAPSHOT_RESET:-0}"
PERIODIC_SNAPSHOT_MAX_DELTAS="${PERIODIC_SNAPSHOT_MAX_DELTAS:-100}"
DB_STATS_EXACT_EVERY="${DB_STATS_EXACT_EVERY:-3}"

# --- Progress logging thresholds --------------------------------------------
RPKI_PROGRESS_LOG="${RPKI_PROGRESS_LOG:-1}"
RPKI_PROGRESS_SLOW_SECS="${RPKI_PROGRESS_SLOW_SECS:-10}"
RPKI_PROGRESS_STAGE_FRESH_SLOW_MS="${RPKI_PROGRESS_STAGE_FRESH_SLOW_MS:-1000}"
RPKI_PROGRESS_PP_CONTROL_SLOW_MS="${RPKI_PROGRESS_PP_CONTROL_SLOW_MS:-100}"
RPKI_PROGRESS_PP_CACHE_SLOW_MS="${RPKI_PROGRESS_PP_CACHE_SLOW_MS:-50}"
RPKI_PROGRESS_CONTROL_LOOP_SLOW_MS="${RPKI_PROGRESS_CONTROL_LOOP_SLOW_MS:-1000}"

# --- Miscellaneous switches --------------------------------------------------
DISABLE_COMPETING_RPS="${DISABLE_COMPETING_RPS:-1}"
ENABLE_CHILD_CERTIFICATE_VALIDATION_CACHE="${ENABLE_CHILD_CERTIFICATE_VALIDATION_CACHE:-0}"
RPKI_EXTRA_ARGS="${RPKI_EXTRA_ARGS:-}"
RPKI_ANALYZE="${RPKI_ANALYZE:-0}"

# --- Directory layout --------------------------------------------------------
BIN_DIR="${BIN_DIR:-$PACKAGE_ROOT/bin}"
FIXTURE_DIR="${FIXTURE_DIR:-$PACKAGE_ROOT/fixtures}"
STATE_ROOT="$RUN_ROOT/state"
RUNS_ROOT="$RUN_ROOT/runs"
LOG_ROOT="$RUN_ROOT/logs"
DB_DIR="${DB_DIR:-$STATE_ROOT/db}"
META_DIR="${META_DIR:-$STATE_ROOT/meta}"
TMP_DIR="${TMP_DIR:-$RUN_ROOT/tmp}"
RSYNC_MIRROR_ROOT="${RSYNC_MIRROR_ROOT:-$STATE_ROOT/rsync-mirror}"
INVALID_ROOT="$STATE_ROOT/invalid"
# --- Derived paths and live-TA refresh settings ------------------------------
RESET_STAGING_ROOT="$STATE_ROOT/reset-staging"
LIVE_TA_REFRESH_DIR="${LIVE_TA_REFRESH_DIR:-$META_DIR/live-ta-refresh}"
LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS="${LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS:-15}"
LIVE_TA_REFRESH_MAX_TIME_SECS="${LIVE_TA_REFRESH_MAX_TIME_SECS:-120}"
LIVE_TA_REFRESH_BEFORE_SNAPSHOT="${LIVE_TA_REFRESH_BEFORE_SNAPSHOT:-1}"
RUN_LIFECYCLE_STATE_PATH="$STATE_ROOT/run-lifecycle-state.json"
RUN_LIFECYCLE_RECENT_RUNS_LIMIT=200

RPKI_BIN="$BIN_DIR/rpki"
RPKI_DAEMON_BIN="$BIN_DIR/rpki_daemon"
DB_STATS_BIN="$BIN_DIR/db_stats"

# Print the usage text to stdout.
usage() {
  cat <<'USAGE'
Usage:
  ./run_soak.sh

配置来自 package 根目录下的 .env;也可以用 ENV_FILE=/path/to/.env 覆盖。
USAGE
}

# Print "error: <message>" to stderr and exit with status 2.
die() {
  echo "error: $*" >&2
  exit 2
}

# Print "warning: <message>" to stderr.
warn() {
  echo "warning: $*" >&2
}

# Return 0 when $1 is one of the accepted truthy spellings, 1 otherwise
# (including when $1 is missing).
is_true() {
  case "${1:-}" in
    1|true|TRUE|yes|YES|on|ON) return 0 ;;
    *) return 1 ;;
  esac
}

# Abort via die unless the command named by $1 can be found.
require_command() {
  command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
}

# Abort via die unless $2 is a decimal integer greater than zero.
#   $1 - setting name used in the error message
#   $2 - value to check
validate_positive_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
  # Compare as a pattern so that "00" is rejected like "0" without risking
  # arithmetic overflow or octal interpretation.
  [[ ! "$value" =~ ^0+$ ]] || die "$name must be > 0"
}

# Abort via die unless $2 is a decimal integer (zero allowed).
#   $1 - setting name used in the error message
#   $2 - value to check
validate_non_negative_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
}

# Abort via die unless MAX_RUNS is a (possibly negative) non-zero integer.
validate_max_runs() {
  [[ "$MAX_RUNS" =~ ^-?[0-9]+$ ]] || die "MAX_RUNS must be an integer: $MAX_RUNS"
  # Any spelling of zero ("0", "00", "-0") is rejected.
  [[ ! "$MAX_RUNS" =~ ^-?0+$ ]] \
    || die "MAX_RUNS must be non-zero; use a positive value for fixed runs or -1 for continuous mode"
}

# Abort via die unless RSYNC_SCOPE is one of the supported values.
validate_rsync_scope() {
  case "$RSYNC_SCOPE" in
    host|publication-point|module-root) ;;
    *) die "RSYNC_SCOPE must be host, publication-point, or module-root: $RSYNC_SCOPE" ;;
  esac
}

# Abort via die unless TAL_INPUT_MODE is one of the supported values.
validate_tal_input_mode() {
  case "$TAL_INPUT_MODE" in
    file-with-ta|file-live-ta|url) ;;
    *) die "TAL_INPUT_MODE must be file-with-ta, file-live-ta or url: $TAL_INPUT_MODE" ;;
  esac
}

# Print $1 with leading/trailing whitespace removed and letters lower-cased
# (no trailing newline).
normalize_token() {
  local token="$1"
  token="${token#"${token%%[![:space:]]*}"}" # strip leading whitespace
  token="${token%"${token##*[![:space:]]}"}" # strip trailing whitespace
  printf '%s' "$token" | tr '[:upper:]' '[:lower:]'
}

# Split the comma-separated RIRS setting into the global array RIR_LIST.
# Entries are normalized with normalize_token; empty entries are skipped.
# Aborts via die on an unknown entry or when no entry remains.
parse_rirs() {
  RIR_LIST=()
  local raw_token
  local normalized
  local -a raw_rirs=()

  IFS=',' read -r -a raw_rirs <<< "$RIRS"
  for raw_token in "${raw_rirs[@]}"; do
    normalized="$(normalize_token "$raw_token")"
    [[ -n "$normalized" ]] || continue
    case "$normalized" in
      afrinic|apnic|arin|lacnic|ripe)
        RIR_LIST+=("$normalized")
        ;;
      *)
        die "invalid RIRS entry: $raw_token; allowed: afrinic,apnic,arin,lacnic,ripe"
        ;;
    esac
  done

  [[ "${#RIR_LIST[@]}" -gt 0 ]] || die "RIRS must contain at least one RIR"
}

# Print the fixture TAL path for RIR $1 (no trailing newline); die if unknown.
tal_file_for_rir() {
  case "$1" in
    afrinic) printf '%s' "$FIXTURE_DIR/tal/afrinic.tal" ;;
    apnic) printf '%s' "$FIXTURE_DIR/tal/apnic-rfc7730-https.tal" ;;
    arin) printf '%s' "$FIXTURE_DIR/tal/arin.tal" ;;
    lacnic) printf '%s' "$FIXTURE_DIR/tal/lacnic.tal" ;;
    ripe) printf '%s' "$FIXTURE_DIR/tal/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the fixture TA certificate path for RIR $1; die if unknown.
ta_file_for_rir() {
  case "$1" in
    afrinic) printf '%s' "$FIXTURE_DIR/ta/afrinic-ta.cer" ;;
    apnic) printf '%s' "$FIXTURE_DIR/ta/apnic-ta.cer" ;;
    arin) printf '%s' "$FIXTURE_DIR/ta/arin-ta.cer" ;;
    lacnic) printf '%s' "$FIXTURE_DIR/ta/lacnic-ta.cer" ;;
    ripe) printf '%s' "$FIXTURE_DIR/ta/ripe-ncc-ta.cer" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the TAL download URL for RIR $1; die if unknown.
tal_url_for_rir() {
  case "$1" in
    afrinic) printf '%s' "https://rpki.afrinic.net/tal/afrinic.tal" ;;
    apnic) printf '%s' "https://tal.apnic.net/apnic.tal" ;;
    arin) printf '%s' "https://www.arin.net/resources/manage/rpki/arin.tal" ;;
    lacnic) printf '%s' "https://www.lacnic.net/innovaportal/file/4983/1/lacnic.tal" ;;
    ripe) printf '%s' "https://tal.rpki.ripe.net/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the TAL URI passed as --cir-tal-uri for RIR $1 (same as the TAL URL).
cir_tal_uri_for_rir() {
  tal_url_for_rir "$1"
}

# Print the first http(s) URI line found in the TAL file $1.
# Comment lines (#) and blank lines are skipped; surrounding whitespace is
# trimmed before matching. Prints nothing when no such line exists.
tal_https_uri_from_fixture() {
  local tal_path="$1"
  awk '
    /^[[:space:]]*#/ { next }
    /^[[:space:]]*$/ { next }
    { gsub(/^[[:space:]]+|[[:space:]]+$/, "", $0) }
    /^https?:\/\// {
      print
      exit 0
    }
  ' "$tal_path"
}

# Print the path of the live (downloaded) TA certificate for RIR $1:
# $STATE_ROOT/live-ta/<fixture TAL basename without .tal>.cer
live_ta_file_for_rir() {
  printf '%s' "$STATE_ROOT/live-ta/$(basename "$(tal_file_for_rir "$1")" .tal).cer"
}

# Print the pid-file path used to track a background TA refresh for RIR $1.
live_ta_refresh_pid_file_for_rir() {
  printf '%s' "$LIVE_TA_REFRESH_DIR/$1.pid"
}
refresh_live_ta_for_rir() { local rir_name="$1" local run_id="${2:-manual}" local log_path="${3:-}" local tal_path local ta_uri local ta_file local tmp_file if [[ -n "$log_path" ]]; then mkdir -p "$(dirname "$log_path")" exec >> "$log_path" 2>&1 fi echo "live-ta-refresh start run=$run_id rir=$rir_name at=$(date -u +%Y-%m-%dT%H:%M:%SZ)" tal_path="$(tal_file_for_rir "$rir_name")" ta_uri="$(tal_https_uri_from_fixture "$tal_path")" if [[ -z "$ta_uri" ]]; then echo "live-ta-refresh failed rir=$rir_name reason=missing_https_uri tal=$tal_path" return 1 fi ta_file="$(live_ta_file_for_rir "$rir_name")" mkdir -p "$(dirname "$ta_file")" tmp_file="${ta_file}.tmp.$$.$RANDOM" if ! curl -fsSL --connect-timeout "$LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" --max-time "$LIVE_TA_REFRESH_MAX_TIME_SECS" "$ta_uri" -o "$tmp_file"; then rm -f "$tmp_file" echo "live-ta-refresh failed rir=$rir_name reason=curl uri=$ta_uri" return 1 fi if [[ ! -s "$tmp_file" ]]; then rm -f "$tmp_file" echo "live-ta-refresh failed rir=$rir_name reason=empty_download uri=$ta_uri" return 1 fi mv "$tmp_file" "$ta_file" echo "live-ta-refresh success rir=$rir_name uri=$ta_uri output=$ta_file bytes=$(wc -c < "$ta_file" | tr -d ' ')" } ensure_live_ta_for_rir() { local rir_name="$1" local live_ta_file local fixture_ta_file live_ta_file="$(live_ta_file_for_rir "$rir_name")" if [[ -s "$live_ta_file" ]]; then return 0 fi fixture_ta_file="$(ta_file_for_rir "$rir_name")" [[ -s "$fixture_ta_file" ]] || die "missing live TA and fixture TA for $rir_name: $live_ta_file / $fixture_ta_file" mkdir -p "$(dirname "$live_ta_file")" cp "$fixture_ta_file" "$live_ta_file" } reap_finished_live_ta_refresh_for_rir() { local rir_name="$1" local pid_file local pid local pid_state local pid_file_mtime local now_epoch local stale_after_secs pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")" [[ -f "$pid_file" ]] || return 0 pid="$(cat "$pid_file" 2>/dev/null || true)" if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" >/dev/null 2>&1; then 
pid_state="" if [[ -r "/proc/$pid/stat" ]]; then pid_state="$(awk '{ print $3 }' "/proc/$pid/stat" 2>/dev/null || true)" fi if [[ "$pid_state" == "Z" ]]; then wait "$pid" >/dev/null 2>&1 || true rm -f "$pid_file" return 0 fi pid_file_mtime="$(stat -c %Y "$pid_file" 2>/dev/null || date +%s)" now_epoch="$(date +%s)" stale_after_secs=$((LIVE_TA_REFRESH_MAX_TIME_SECS + 60)) if (( now_epoch - pid_file_mtime > stale_after_secs )); then rm -f "$pid_file" return 0 fi return 1 fi if [[ "$pid" =~ ^[0-9]+$ ]]; then wait "$pid" >/dev/null 2>&1 || true fi rm -f "$pid_file" return 0 } start_live_ta_refresh_for_rir() { local rir_name="$1" local run_id="$2" local pid_file local log_path local pid mkdir -p "$LIVE_TA_REFRESH_DIR" "$LOG_ROOT" pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")" if ! reap_finished_live_ta_refresh_for_rir "$rir_name"; then pid="$(cat "$pid_file" 2>/dev/null || true)" echo "live-ta-refresh skip run=$run_id rir=$rir_name reason=previous_refresh_running pid=$pid" \ >> "$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log" return 0 fi log_path="$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log" refresh_live_ta_for_rir "$rir_name" "$run_id" "$log_path" & pid=$! printf '%s\n' "$pid" > "$pid_file" } wait_for_previous_live_ta_refresh_for_rir() { local rir_name="$1" local pid_file local pid local deadline_epoch pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")" [[ -f "$pid_file" ]] || return 0 if reap_finished_live_ta_refresh_for_rir "$rir_name"; then return 0 fi pid="$(cat "$pid_file" 2>/dev/null || true)" deadline_epoch=$(( $(date +%s) + LIVE_TA_REFRESH_MAX_TIME_SECS + 60 )) echo "live-ta-refresh wait rir=$rir_name reason=previous_refresh_running pid=$pid" while ! 
reap_finished_live_ta_refresh_for_rir "$rir_name"; do if (( $(date +%s) > deadline_epoch )); then die "timed out waiting for previous live TA refresh for $rir_name pid=$pid" fi sleep 1 done } refresh_live_ta_blocking_for_run() { local run_id="$1" local rir_name local pid local failed=0 local pids=() local names=() local log_path local pid_file for rir_name in "${RIR_LIST[@]}"; do wait_for_previous_live_ta_refresh_for_rir "$rir_name" done for rir_name in "${RIR_LIST[@]}"; do mkdir -p "$LIVE_TA_REFRESH_DIR" "$LOG_ROOT" pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")" log_path="$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log" refresh_live_ta_for_rir "$rir_name" "$run_id" "$log_path" & pid=$! printf '%s\n' "$pid" > "$pid_file" pids+=("$pid") names+=("$rir_name") done local index for index in "${!pids[@]}"; do pid="${pids[$index]}" rir_name="${names[$index]}" pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")" if wait "$pid"; then rm -f "$pid_file" else failed=1 rm -f "$pid_file" echo "live-ta-refresh failed before snapshot rir=$rir_name log=$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log" >&2 fi done if (( failed != 0 )); then die "live TA refresh failed before snapshot run=$run_id; see $LOG_ROOT/live-ta-refresh-$run_id-*.log" fi echo "live-ta-refresh completed before snapshot run=$run_id rirs=${#RIR_LIST[@]}" } prepare_live_ta_inputs_for_run() { local run_id="$1" local sync_mode="$2" local rir_name if [[ "$TAL_INPUT_MODE" != "file-live-ta" ]]; then return 0 fi if [[ "$sync_mode" == "snapshot" ]] && is_true "$LIVE_TA_REFRESH_BEFORE_SNAPSHOT"; then refresh_live_ta_blocking_for_run "$run_id" return 0 fi for rir_name in "${RIR_LIST[@]}"; do ensure_live_ta_for_rir "$rir_name" done for rir_name in "${RIR_LIST[@]}"; do start_live_ta_refresh_for_rir "$rir_name" "$run_id" done } compare_view_trust_anchor() { if [[ "${#RIR_LIST[@]}" -eq 1 ]]; then printf '%s' "${RIR_LIST[0]}" else printf '%s' "all5" fi } max_existing_run_index() { local max_index=0 local 
run_dir local run_name local numeric_part shopt -s nullglob for run_dir in "$RUNS_ROOT"/run_*; do [[ -d "$run_dir" ]] || continue run_name="$(basename "$run_dir")" numeric_part="${run_name#run_}" [[ "$numeric_part" =~ ^[0-9]+$ ]] || continue if (( 10#$numeric_part > max_index )); then max_index=$((10#$numeric_part)) fi done shopt -u nullglob printf '%s' "$max_index" } json_status_is_success() { local json_path="$1" python3 - "$json_path" <<'PY' import json import sys path = sys.argv[1] try: with open(path, "r", encoding="utf-8") as handle: data = json.load(handle) except Exception: sys.exit(1) sys.exit(0 if data.get("status") == "success" else 1) PY } previous_run_success() { local run_dir="$1" [[ -d "$run_dir" ]] || return 1 [[ -f "$run_dir/run-meta.json" ]] || return 1 [[ -f "$run_dir/run-summary.json" ]] || return 1 json_status_is_success "$run_dir/run-meta.json" || return 1 json_status_is_success "$run_dir/run-summary.json" || return 1 for required_artifact in report.json result.ccr input.cir stage-timing.json process-time.txt stdout.log stderr.log; do [[ -f "$run_dir/$required_artifact" ]] || return 1 done return 0 } move_if_exists() { local source_path="$1" local target_dir="$2" if [[ -e "$source_path" ]]; then mkdir -p "$target_dir" mv "$source_path" "$target_dir/" fi } clear_dir_contents() { local dir_path="$1" [[ -d "$dir_path" ]] || return 0 find "$dir_path" -mindepth 1 -maxdepth 1 -exec rm -rf {} + } db_state_exists() { [[ -e "$DB_DIR/work-db" || -e "$DB_DIR/repo-bytes.db" ]] } delta_state_available() { [[ -e "$DB_DIR/work-db" ]] } isolate_state_after_failure() { local previous_run_id="$1" local timestamp timestamp="$(date -u +%Y%m%dT%H%M%SZ)" local invalid_dir="$INVALID_ROOT/${previous_run_id}-${timestamp}" mkdir -p "$invalid_dir" move_if_exists "$DB_DIR" "$invalid_dir" move_if_exists "$META_DIR" "$invalid_dir" if [[ -e "$TMP_DIR" ]]; then INVALID_TMP_PATH="$TMP_DIR" TMP_CLEANUP_STATUS="pending" TMP_CLEANUP_REASON="discard_after_failure" 
clear_dir_contents "$TMP_DIR" mkdir -p "$TMP_DIR" TMP_CLEANUP_STATUS="discarded_recreated" else INVALID_TMP_PATH="" TMP_CLEANUP_STATUS="not_present" TMP_CLEANUP_REASON="discard_after_failure" mkdir -p "$TMP_DIR" fi mkdir -p "$DB_DIR" "$META_DIR" "$TMP_DIR" INVALID_DB_PATH="$invalid_dir/$(basename "$DB_DIR")" INVALID_STATE_PATH="$invalid_dir/$(basename "$META_DIR")" } periodic_snapshot_delta_scan() { local command="$1" shift python3 - "$command" "$RUN_LIFECYCLE_STATE_PATH" "$RUNS_ROOT" "$RUN_LIFECYCLE_RECENT_RUNS_LIMIT" "$@" <<'PY' import json import os import pathlib import shlex import sys from datetime import datetime, timezone command = sys.argv[1] state_path = pathlib.Path(sys.argv[2]) runs_root = pathlib.Path(sys.argv[3]) recent_limit = int(sys.argv[4]) extra_args = sys.argv[5:] def now_rfc3339() -> str: return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def to_int(value): if value is None or value == "": return None if isinstance(value, bool): return None return int(value) def to_bool(value): if isinstance(value, bool): return value if value in (None, ""): return None if isinstance(value, str): lowered = value.strip().lower() if lowered in {"1", "true", "yes", "on"}: return True if lowered in {"0", "false", "no", "off"}: return False raise ValueError(f"invalid bool value: {value!r}") def normalize_run_entry(entry): if entry is None: return None if not isinstance(entry, dict): raise ValueError("run entry must be an object") run_id = entry.get("run_id") or entry.get("runId") run_index = to_int(entry.get("run_index", entry.get("runSeq"))) if not run_id or run_index is None: raise ValueError("run entry missing run_id/run_index") return { "run_id": run_id, "run_index": run_index, "status": entry.get("status") or "unknown", "sync_mode": entry.get("sync_mode", entry.get("syncMode")), "snapshot_reason": entry.get("snapshot_reason"), "started_at_rfc3339_utc": entry.get("started_at_rfc3339_utc"), "completed_at_rfc3339_utc": 
entry.get("completed_at_rfc3339_utc"), "periodic_snapshot_reset_enabled": to_bool(entry.get("periodic_snapshot_reset_enabled")), "periodic_snapshot_max_deltas": to_int(entry.get("periodic_snapshot_max_deltas")), "periodic_snapshot_delta_count": to_int(entry.get("periodic_snapshot_delta_count")), "periodic_snapshot_forced": to_bool(entry.get("periodic_snapshot_forced")), } def snapshot_ref(entry): if entry is None: return None return { "run_id": entry["run_id"], "run_index": entry["run_index"], "snapshot_reason": entry.get("snapshot_reason"), "completed_at_rfc3339_utc": entry.get("completed_at_rfc3339_utc"), } def normalize_snapshot_ref(entry): if entry is None: return None if not isinstance(entry, dict): raise ValueError("snapshot ref must be an object") run_id = entry.get("run_id") or entry.get("runId") run_index = to_int(entry.get("run_index", entry.get("runSeq"))) if not run_id or run_index is None: raise ValueError("snapshot ref missing run_id/run_index") return { "run_id": run_id, "run_index": run_index, "snapshot_reason": entry.get("snapshot_reason"), "completed_at_rfc3339_utc": entry.get("completed_at_rfc3339_utc"), } def state_health(last_run, last_success_snapshot): if last_success_snapshot is not None: return "ready", "ok" if last_run is None: return "empty", "no_runs" return "bootstrap_incomplete", "no_successful_snapshot_in_retained_runs" def default_state(): health, detail = state_health(None, None) return { "version": 1, "updated_at_rfc3339_utc": now_rfc3339(), "state_health": health, "state_detail": detail, "recent_runs_limit": recent_limit, "last_run": None, "last_success_snapshot": None, "successful_deltas_since_snapshot": None, "recent_runs": [], } def finalize_state(data): data = dict(data) data["version"] = 1 data["updated_at_rfc3339_utc"] = now_rfc3339() data["recent_runs_limit"] = recent_limit recent_runs = [] for item in data.get("recent_runs", []): recent_runs.append(normalize_run_entry(item)) if len(recent_runs) > recent_limit: recent_runs 
= recent_runs[-recent_limit:] data["recent_runs"] = recent_runs data["last_run"] = normalize_run_entry(data.get("last_run")) data["last_success_snapshot"] = normalize_snapshot_ref(data.get("last_success_snapshot")) delta_count = data.get("successful_deltas_since_snapshot") data["successful_deltas_since_snapshot"] = to_int(delta_count) if data["last_success_snapshot"] is None: data["successful_deltas_since_snapshot"] = None elif data["successful_deltas_since_snapshot"] is None: data["successful_deltas_since_snapshot"] = 0 health, detail = state_health(data["last_run"], data["last_success_snapshot"]) if health == "ready": detail = data.get("state_detail") or detail data["state_health"] = health data["state_detail"] = detail return data def sorted_run_dirs(): candidates = [] for path in runs_root.glob("run_*"): if not path.is_dir(): continue suffix = path.name[4:] if suffix.isdigit(): candidates.append((int(suffix), path)) candidates.sort() return [path for _, path in candidates] def parse_run_dir(run_dir): meta_path = run_dir / "run-meta.json" summary_path = run_dir / "run-summary.json" if not meta_path.exists() or not summary_path.exists(): return None with meta_path.open("r", encoding="utf-8") as handle: meta = json.load(handle) with summary_path.open("r", encoding="utf-8") as handle: summary = json.load(handle) entry = normalize_run_entry(meta) entry["status"] = meta.get("status") or summary.get("status") or entry["status"] return entry def bootstrap_state(exclude_run_id=None): data = default_state() last_success_snapshot = None delta_count = None recent_runs = [] last_run = None for run_dir in sorted_run_dirs(): if exclude_run_id and run_dir.name == exclude_run_id: continue try: entry = parse_run_dir(run_dir) except Exception: continue if entry is None: continue last_run = entry recent_runs.append(entry) if entry["status"] != "success": continue if entry.get("sync_mode") == "snapshot": last_success_snapshot = snapshot_ref(entry) delta_count = 0 elif 
entry.get("sync_mode") == "delta": if last_success_snapshot is not None and delta_count is not None: delta_count += 1 data["last_run"] = last_run data["last_success_snapshot"] = last_success_snapshot data["successful_deltas_since_snapshot"] = delta_count data["recent_runs"] = recent_runs[-recent_limit:] return finalize_state(data) def backup_corrupt_state(): timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") backup_path = state_path.with_name(f"{state_path.name}.corrupt.{timestamp}.{os.getpid()}") backup_path.parent.mkdir(parents=True, exist_ok=True) os.replace(state_path, backup_path) return backup_path def load_state_or_bootstrap(exclude_run_id=None): if not state_path.exists(): return bootstrap_state(exclude_run_id), "bootstrapped_from_runs_missing_file", "", "" try: with state_path.open("r", encoding="utf-8") as handle: raw = json.load(handle) if not isinstance(raw, dict): raise ValueError("state root must be an object") if to_int(raw.get("version")) != 1: raise ValueError("unsupported version") state = finalize_state(raw) return state, "state_file", "", "" except Exception as exc: backup_path = backup_corrupt_state() state = bootstrap_state(exclude_run_id) return state, "bootstrapped_from_runs_after_corrupt_backup", exc.__class__.__name__, str(backup_path) def atomic_write_json(path, payload): path.parent.mkdir(parents=True, exist_ok=True) tmp_path = path.with_name(f"{path.name}.tmp.{os.getpid()}") with tmp_path.open("w", encoding="utf-8") as handle: json.dump(payload, handle, indent=2, sort_keys=True) handle.write("\n") handle.flush() os.fsync(handle.fileno()) os.replace(tmp_path, path) try: dir_fd = os.open(path.parent, os.O_DIRECTORY) except OSError: return try: os.fsync(dir_fd) finally: os.close(dir_fd) def emit(name, value): if value is None: value = "" elif isinstance(value, bool): value = "true" if value else "false" else: value = str(value) print(f"{name}={shlex.quote(value)}") if command == "load": max_deltas = int(extra_args[0]) state, 
source, detail, backup_path = load_state_or_bootstrap() if source != "state_file": atomic_write_json(state_path, state) delta_count = state.get("successful_deltas_since_snapshot") force_needed = bool( state.get("last_success_snapshot") is not None and delta_count is not None and delta_count >= max_deltas ) emit("PERIODIC_LIFECYCLE_SOURCE", source) emit("PERIODIC_LIFECYCLE_DETAIL", detail or state.get("state_detail")) emit("PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH", backup_path) emit("PERIODIC_LIFECYCLE_STATE_HEALTH", state.get("state_health")) emit( "PERIODIC_LIFECYCLE_LAST_SNAPSHOT_RUN_ID", (state.get("last_success_snapshot") or {}).get("run_id"), ) emit("PERIODIC_LIFECYCLE_DELTA_COUNT", delta_count) emit("PERIODIC_LIFECYCLE_FORCE_NEEDED", force_needed) elif command == "update": run_dir = pathlib.Path(extra_args[0]) state, source, detail, backup_path = load_state_or_bootstrap(exclude_run_id=run_dir.name) entry = parse_run_dir(run_dir) if entry is None: raise SystemExit(f"missing run metadata for lifecycle update: {run_dir}") previous_snapshot = state.get("last_success_snapshot") previous_delta_count = state.get("successful_deltas_since_snapshot") state["last_run"] = entry recent_runs = [item for item in state.get("recent_runs", []) if item.get("run_id") != entry["run_id"]] recent_runs.append(entry) state["recent_runs"] = recent_runs[-recent_limit:] if entry["status"] == "success" and entry.get("sync_mode") == "snapshot": state["last_success_snapshot"] = snapshot_ref(entry) state["successful_deltas_since_snapshot"] = 0 state["state_detail"] = "ok" elif entry["status"] == "success" and entry.get("sync_mode") == "delta": if previous_snapshot is not None and previous_delta_count is not None: state["last_success_snapshot"] = previous_snapshot state["successful_deltas_since_snapshot"] = previous_delta_count + 1 state["state_detail"] = "ok" else: state["last_success_snapshot"] = previous_snapshot state["successful_deltas_since_snapshot"] = None state["state_detail"] = 
"success_delta_without_known_snapshot" else: state["last_success_snapshot"] = previous_snapshot state["successful_deltas_since_snapshot"] = previous_delta_count state = finalize_state(state) atomic_write_json(state_path, state) emit("RUN_LIFECYCLE_UPDATE_SOURCE", source) emit("RUN_LIFECYCLE_UPDATE_DETAIL", detail or state.get("state_detail")) emit("RUN_LIFECYCLE_UPDATE_CORRUPT_BACKUP_PATH", backup_path) emit("RUN_LIFECYCLE_UPDATE_STATE_HEALTH", state.get("state_health")) emit("RUN_LIFECYCLE_UPDATE_DELTA_COUNT", state.get("successful_deltas_since_snapshot")) emit( "RUN_LIFECYCLE_UPDATE_LAST_SNAPSHOT_RUN_ID", (state.get("last_success_snapshot") or {}).get("run_id"), ) else: raise SystemExit(f"unknown lifecycle helper command: {command}") PY } load_periodic_snapshot_lifecycle_context() { eval "$(periodic_snapshot_delta_scan load "$PERIODIC_SNAPSHOT_MAX_DELTAS")" if [[ -n "$PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH" ]]; then warn "run lifecycle state corrupt; backed up to $PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH detail=${PERIODIC_LIFECYCLE_DETAIL:-unknown}" fi if [[ "$PERIODIC_LIFECYCLE_SOURCE" != "state_file" ]]; then echo "run lifecycle state source=$PERIODIC_LIFECYCLE_SOURCE state_health=${PERIODIC_LIFECYCLE_STATE_HEALTH:-unknown} snapshot_run=${PERIODIC_LIFECYCLE_LAST_SNAPSHOT_RUN_ID:-none} delta_count=${PERIODIC_LIFECYCLE_DELTA_COUNT:-unknown}" fi if [[ "$PERIODIC_LIFECYCLE_STATE_HEALTH" == "bootstrap_incomplete" ]]; then warn "run lifecycle state bootstrap incomplete detail=${PERIODIC_LIFECYCLE_DETAIL:-unknown}; forced snapshot counting resumes after the next successful snapshot" fi } update_run_lifecycle_state() { local run_dir="$1" eval "$(periodic_snapshot_delta_scan update "$run_dir")" if [[ -n "$RUN_LIFECYCLE_UPDATE_CORRUPT_BACKUP_PATH" ]]; then warn "run lifecycle state corrupt during update; backed up to $RUN_LIFECYCLE_UPDATE_CORRUPT_BACKUP_PATH detail=${RUN_LIFECYCLE_UPDATE_DETAIL:-unknown}" fi } prepare_periodic_reset_state_db() { local run_id="$1" 
RESET_DB_STAGING_PATH="" RESET_DB_CLEANUP_STATUS="" db_state_exists || return 0 local timestamp local staging_root timestamp="$(date -u +%Y%m%dT%H%M%SZ)" staging_root="$RESET_STAGING_ROOT/${run_id}-${timestamp}" mkdir -p "$staging_root" mv "$DB_DIR" "$staging_root/" mkdir -p "$DB_DIR" RESET_DB_STAGING_PATH="$staging_root/$(basename "$DB_DIR")" RESET_DB_CLEANUP_STATUS="pending" } finalize_periodic_reset_state_db() { local final_status="$1" local reset_db_staging_path="$2" [[ -n "$reset_db_staging_path" ]] || { printf '%s\n' "" return 0 } local staging_root staging_root="$(dirname "$reset_db_staging_path")" if [[ "$final_status" == "success" ]]; then if rm -rf "$staging_root"; then printf '%s\n' "deleted" return 0 fi warn "failed to delete periodic reset staging: $staging_root" printf '%s\n' "cleanup_failed" return 1 fi printf '%s\n' "retained_failure" } write_run_meta() { local output_path="$1" local status="$2" local run_index="$3" local run_id="$4" local sync_mode="$5" local snapshot_reason="$6" local previous_run_id="$7" local previous_run_success_value="$8" local started_at="$9" local completed_at="${10}" local invalid_db_path="${11}" local invalid_state_path="${12}" local invalid_tmp_path="${13}" local daemon_exit_code="${14}" local package_root="${15}" local env_file="${16}" local periodic_snapshot_reset_enabled="${17}" local periodic_snapshot_max_deltas="${18}" local periodic_snapshot_delta_count="${19}" local periodic_snapshot_forced="${20}" local reset_db_staging_path="${21}" local reset_db_cleanup_status="${22}" local tmp_cleanup_status="${23}" local tmp_cleanup_reason="${24}" python3 - "$output_path" "$status" "$run_index" "$run_id" "$sync_mode" "$snapshot_reason" \ "$previous_run_id" "$previous_run_success_value" "$started_at" "$completed_at" \ "$invalid_db_path" "$invalid_state_path" "$invalid_tmp_path" "$daemon_exit_code" \ "$package_root" "$env_file" "$periodic_snapshot_reset_enabled" \ "$periodic_snapshot_max_deltas" "$periodic_snapshot_delta_count" 
\ "$periodic_snapshot_forced" "$reset_db_staging_path" "$reset_db_cleanup_status" \ "$tmp_cleanup_status" "$tmp_cleanup_reason" <<'PY' import json import sys def nullable(value): return None if value == "" else value def nullable_bool(value): if value == "": return None return value == "true" def nullable_int(value): if value == "": return None return int(value) def bool_value(value): return value == "true" ( output_path, status, run_index, run_id, sync_mode, snapshot_reason, previous_run_id, previous_run_success, started_at, completed_at, invalid_db_path, invalid_state_path, invalid_tmp_path, daemon_exit_code, package_root, env_file, periodic_snapshot_reset_enabled, periodic_snapshot_max_deltas, periodic_snapshot_delta_count, periodic_snapshot_forced, reset_db_staging_path, reset_db_cleanup_status, tmp_cleanup_status, tmp_cleanup_reason, ) = sys.argv[1:] data = { "status": status, "run_index": int(run_index), "run_id": run_id, "sync_mode": sync_mode, "snapshot_reason": nullable(snapshot_reason), "previous_run_id": nullable(previous_run_id), "previous_run_success": nullable_bool(previous_run_success), "started_at_rfc3339_utc": started_at, "completed_at_rfc3339_utc": nullable(completed_at), "invalid_db_path": nullable(invalid_db_path), "invalid_state_path": nullable(invalid_state_path), "invalid_tmp_path": nullable(invalid_tmp_path), "daemon_exit_code": nullable_int(daemon_exit_code), "package_root": package_root, "env_file": env_file, "periodic_snapshot_reset_enabled": bool_value(periodic_snapshot_reset_enabled), "periodic_snapshot_max_deltas": int(periodic_snapshot_max_deltas), "periodic_snapshot_delta_count": nullable_int(periodic_snapshot_delta_count), "periodic_snapshot_forced": bool_value(periodic_snapshot_forced), "reset_db_staging_path": nullable(reset_db_staging_path), "reset_db_cleanup_status": nullable(reset_db_cleanup_status), "tmp_cleanup_status": nullable(tmp_cleanup_status), "tmp_cleanup_reason": nullable(tmp_cleanup_reason), } with open(output_path, 
"w", encoding="utf-8") as handle: json.dump(data, handle, indent=2, sort_keys=True) handle.write("\n") PY } summary_status() { local summary_path="$1" python3 - "$summary_path" <<'PY' import json import sys try: with open(sys.argv[1], "r", encoding="utf-8") as handle: print(json.load(handle).get("status", "missing")) except Exception: print("missing") PY } prepare_competing_rp_state() { if ! is_true "$DISABLE_COMPETING_RPS"; then return 0 fi systemctl disable --now rpki-client.timer >/dev/null 2>&1 || true systemctl stop rpki-client.service >/dev/null 2>&1 || true pkill -x rpki-client >/dev/null 2>&1 || true pkill -x routinator >/dev/null 2>&1 || true } write_machine_snapshot() { local suffix="$1" df -h > "$LOG_ROOT/df-${suffix}.txt" 2>&1 || true free -h > "$LOG_ROOT/free-${suffix}.txt" 2>&1 || true ps -eo pid,ppid,stat,pcpu,pmem,rss,args --sort=-pcpu \ | grep -E 'rpki_daemon|/bin/rpki|rpki-client|routinator' \ | grep -v grep > "$LOG_ROOT/process-${suffix}.txt" || true systemctl is-active rpki-client.timer > "$LOG_ROOT/rpki-client-timer-active-${suffix}.txt" 2>&1 || true systemctl is-enabled rpki-client.timer > "$LOG_ROOT/rpki-client-timer-enabled-${suffix}.txt" 2>&1 || true } build_child_args() { CHILD_ARGS=( --db "$DB_DIR/work-db" --repo-bytes-db "$DB_DIR/repo-bytes.db" --rsync-scope "$RSYNC_SCOPE" ) if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then CHILD_ARGS+=(--rsync-mirror-root "$RSYNC_MIRROR_ROOT") else CHILD_ARGS+=(--rsync-mirror-root "$TMP_DIR/rsync-mirror-{run_id}") fi CHILD_ARGS+=( --parallel-phase2-ready-batch-size 256 --parallel-phase2-ready-batch-wall-time-budget-ms 100 --parallel-phase2-result-drain-batch-size 2048 --parallel-phase2-finalize-batch-size 256 --parallel-phase2-finalize-batch-wall-time-budget-ms 100 ) local rir_name for rir_name in "${RIR_LIST[@]}"; do if [[ "$TAL_INPUT_MODE" == "url" ]]; then CHILD_ARGS+=(--tal-url "$(tal_url_for_rir "$rir_name")") elif [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then CHILD_ARGS+=(--tal-path 
"$(tal_file_for_rir "$rir_name")") CHILD_ARGS+=(--ta-path "$(live_ta_file_for_rir "$rir_name")") else CHILD_ARGS+=(--tal-path "$(tal_file_for_rir "$rir_name")") CHILD_ARGS+=(--ta-path "$(ta_file_for_rir "$rir_name")") fi done CHILD_ARGS+=( --report-json "{run_out}/report.json" ) if is_true "$OUTPUT_COMPACT_REPORT"; then CHILD_ARGS+=(--report-json-compact) fi CHILD_ARGS+=( --ccr-out "{run_out}/result.ccr" --cir-enable --cir-out "{run_out}/input.cir" ) for rir_name in "${RIR_LIST[@]}"; do CHILD_ARGS+=(--cir-tal-uri "$(cir_tal_uri_for_rir "$rir_name")") done CHILD_ARGS+=( --vrps-csv-out "{run_out}/vrps.csv" --vaps-csv-out "{run_out}/vaps.csv" --compare-view-trust-anchor "$(compare_view_trust_anchor)" ) if is_true "$ENABLE_CHILD_CERTIFICATE_VALIDATION_CACHE"; then CHILD_ARGS+=(--enable-child-certificate-validation-cache) fi if [[ -n "$RPKI_EXTRA_ARGS" ]]; then # shellcheck disable=SC2206 local extra_args=( $RPKI_EXTRA_ARGS ) CHILD_ARGS+=("${extra_args[@]}") fi } copy_inner_run_outputs() { local daemon_state_root="$1" local run_dir="$2" local outer_run_index="$3" local outer_run_id="$4" local inner_run_dir inner_run_dir="$(find "$daemon_state_root/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -n 1 || true)" if [[ -n "$inner_run_dir" && -d "$inner_run_dir" ]]; then shopt -s dotglob nullglob cp -a "$inner_run_dir"/. 
"$run_dir"/ shopt -u dotglob nullglob fi [[ -f "$daemon_state_root/daemon-status.json" ]] && cp "$daemon_state_root/daemon-status.json" "$run_dir/daemon-status.inner.json" [[ -f "$daemon_state_root/daemon-runs.jsonl" ]] && cp "$daemon_state_root/daemon-runs.jsonl" "$run_dir/daemon-runs.inner.jsonl" normalize_outer_run_metadata "$run_dir" "$outer_run_index" "$outer_run_id" "$inner_run_dir" "$daemon_state_root" } normalize_outer_run_metadata() { local run_dir="$1" local outer_run_index="$2" local outer_run_id="$3" local inner_run_dir="$4" local daemon_state_root="$5" python3 - "$run_dir" "$outer_run_index" "$outer_run_id" "$inner_run_dir" "$daemon_state_root" <<'PY' import json import pathlib import sys run_dir = pathlib.Path(sys.argv[1]).resolve() outer_run_index = int(sys.argv[2]) outer_run_id = sys.argv[3] inner_run_dir = sys.argv[4] daemon_state_root = pathlib.Path(sys.argv[5]) def replace_paths(value): if isinstance(value, dict): return {key: replace_paths(item) for key, item in value.items()} if isinstance(value, list): return [replace_paths(item) for item in value] if isinstance(value, str) and inner_run_dir: return value.replace(inner_run_dir, str(run_dir)) return value def normalize_summary(summary): summary = dict(summary) summary.setdefault("innerRunSeq", summary.get("runSeq")) summary.setdefault("innerRunId", summary.get("runId")) summary.setdefault("innerRunDir", summary.get("runDir")) summary = replace_paths(summary) summary["runSeq"] = outer_run_index summary["runId"] = outer_run_id summary["runDir"] = str(run_dir) return summary summary_path = run_dir / "run-summary.json" if summary_path.exists(): summary = json.loads(summary_path.read_text(encoding="utf-8")) summary_path.write_text( json.dumps(normalize_summary(summary), indent=2, sort_keys=True) + "\n", encoding="utf-8", ) inner_status_path = run_dir / "daemon-status.inner.json" if not inner_status_path.exists(): raw_status_path = daemon_state_root / "daemon-status.json" if raw_status_path.exists(): 
inner_status_path.write_text(raw_status_path.read_text(encoding="utf-8"), encoding="utf-8") if inner_status_path.exists(): status = json.loads(inner_status_path.read_text(encoding="utf-8")) status.setdefault("innerLastRunId", status.get("lastRunId")) status["lastRunId"] = outer_run_id status["outerRunId"] = outer_run_id status["outerRunIndex"] = outer_run_index (run_dir / "daemon-status.json").write_text( json.dumps(status, indent=2, sort_keys=True) + "\n", encoding="utf-8", ) inner_runs_path = run_dir / "daemon-runs.inner.jsonl" if not inner_runs_path.exists(): raw_runs_path = daemon_state_root / "daemon-runs.jsonl" if raw_runs_path.exists(): inner_runs_path.write_text(raw_runs_path.read_text(encoding="utf-8"), encoding="utf-8") if inner_runs_path.exists(): lines = [] for line in inner_runs_path.read_text(encoding="utf-8").splitlines(): if not line.strip(): continue lines.append(json.dumps(normalize_summary(json.loads(line)), sort_keys=True)) (run_dir / "daemon-runs.jsonl").write_text("\n".join(lines) + ("\n" if lines else ""), encoding="utf-8") PY } apply_outer_retention() { local dirs=() local retain_limit="$RETAIN_RUNS" local keep_run="${1:-}" local run_dir shopt -s nullglob for run_dir in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do [[ -d "$run_dir" ]] && dirs+=("$run_dir") done shopt -u nullglob if (( ${#dirs[@]} <= retain_limit )); then return 0 fi mapfile -t dirs < <(printf '%s\n' "${dirs[@]}" | sort) local remove_count=$(( ${#dirs[@]} - retain_limit )) local removed_count=0 local candidate for candidate in "${dirs[@]}"; do if [[ -n "$keep_run" && "$(basename "$candidate")" == "$keep_run" ]]; then continue fi rm -rf "$candidate" removed_count=$((removed_count + 1)) if (( removed_count >= remove_count )); then break fi done } run_one_round() { local run_index="$1" local run_id run_id="$(printf 'run_%04d' "$run_index")" local run_dir="$RUNS_ROOT/$run_id" local previous_run_id="$2" local previous_success_value="$3" local sync_mode="$4" local snapshot_reason="$5" 
local daemon_state_root="$TMP_DIR/daemon-$run_id" local started_at local completed_at local daemon_exit_code local summary_state mkdir -p "$run_dir" "$daemon_state_root" apply_outer_retention "$run_id" started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" write_run_meta "$run_dir/run-meta.json" "running" "$run_index" "$run_id" "$sync_mode" \ "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "" \ "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "" "$PACKAGE_ROOT" "$ENV_FILE" \ "$RUN_META_PERIODIC_ENABLED" "$RUN_META_PERIODIC_MAX_DELTAS" "$RUN_META_PERIODIC_DELTA_COUNT" \ "$RUN_META_PERIODIC_FORCED" "$RUN_META_RESET_DB_STAGING_PATH" "$RUN_META_RESET_DB_CLEANUP_STATUS" \ "$RUN_META_TMP_CLEANUP_STATUS" "$RUN_META_TMP_CLEANUP_REASON" prepare_live_ta_inputs_for_run "$run_id" "$sync_mode" build_child_args if is_true "$RPKI_ANALYZE"; then CHILD_ARGS+=(--analyze --analysis-out "$run_dir/analyze") fi local daemon_args=( --state-root "$daemon_state_root" --rpki-bin "$RPKI_BIN" --interval-secs 0 --max-runs 1 --retain-runs "$RETAIN_RUNS" --work-db "$DB_DIR/work-db" --repo-bytes-db "$DB_DIR/repo-bytes.db" ) if [[ -x "$DB_STATS_BIN" ]]; then daemon_args+=(--db-stats-bin "$DB_STATS_BIN") if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then daemon_args+=(--db-stats-exact-every "$DB_STATS_EXACT_EVERY") fi fi set +e env \ RPKI_PROGRESS_LOG="$RPKI_PROGRESS_LOG" \ RPKI_PROGRESS_SLOW_SECS="$RPKI_PROGRESS_SLOW_SECS" \ RPKI_PROGRESS_STAGE_FRESH_SLOW_MS="$RPKI_PROGRESS_STAGE_FRESH_SLOW_MS" \ RPKI_PROGRESS_PP_CONTROL_SLOW_MS="$RPKI_PROGRESS_PP_CONTROL_SLOW_MS" \ RPKI_PROGRESS_PP_CACHE_SLOW_MS="$RPKI_PROGRESS_PP_CACHE_SLOW_MS" \ RPKI_PROGRESS_CONTROL_LOOP_SLOW_MS="$RPKI_PROGRESS_CONTROL_LOOP_SLOW_MS" \ "$RPKI_DAEMON_BIN" "${daemon_args[@]}" -- "${CHILD_ARGS[@]}" \ > "$run_dir/daemon-stdout.log" 2> "$run_dir/daemon-stderr.log" daemon_exit_code=$? 
set -e copy_inner_run_outputs "$daemon_state_root" "$run_dir" "$run_index" "$run_id" completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" summary_state="$(summary_status "$run_dir/run-summary.json")" local final_status="failed" if [[ "$daemon_exit_code" -eq 0 && "$summary_state" == "success" ]]; then final_status="success" fi if [[ -n "$RUN_META_RESET_DB_STAGING_PATH" ]]; then if RUN_META_RESET_DB_CLEANUP_STATUS="$(finalize_periodic_reset_state_db "$final_status" "$RUN_META_RESET_DB_STAGING_PATH")"; then : else final_status="failed" fi fi write_run_meta "$run_dir/run-meta.json" "$final_status" "$run_index" "$run_id" "$sync_mode" \ "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "$completed_at" \ "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "$daemon_exit_code" "$PACKAGE_ROOT" "$ENV_FILE" \ "$RUN_META_PERIODIC_ENABLED" "$RUN_META_PERIODIC_MAX_DELTAS" "$RUN_META_PERIODIC_DELTA_COUNT" \ "$RUN_META_PERIODIC_FORCED" "$RUN_META_RESET_DB_STAGING_PATH" "$RUN_META_RESET_DB_CLEANUP_STATUS" \ "$RUN_META_TMP_CLEANUP_STATUS" "$RUN_META_TMP_CLEANUP_REASON" update_run_lifecycle_state "$run_dir" printf '%s\n' "$run_id" > "$META_DIR/last-run-id" if is_true "$CLEAN_TMP_AFTER_RUN"; then rm -rf "$daemon_state_root" fi apply_outer_retention [[ "$final_status" == "success" ]] } main() { if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then usage exit 0 fi require_command python3 require_command date require_command find if [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then require_command curl validate_positive_int "LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" "$LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" validate_positive_int "LIVE_TA_REFRESH_MAX_TIME_SECS" "$LIVE_TA_REFRESH_MAX_TIME_SECS" fi validate_max_runs validate_non_negative_int "INTERVAL_SECS" "$INTERVAL_SECS" validate_non_negative_int "STOP_AFTER_SECS" "$STOP_AFTER_SECS" validate_positive_int "RETAIN_RUNS" "$RETAIN_RUNS" validate_rsync_scope validate_tal_input_mode validate_non_negative_int 
"PERIODIC_SNAPSHOT_MAX_DELTAS" "$PERIODIC_SNAPSHOT_MAX_DELTAS" if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then validate_positive_int "DB_STATS_EXACT_EVERY" "$DB_STATS_EXACT_EVERY" fi parse_rirs [[ -x "$RPKI_BIN" ]] || die "missing executable: $RPKI_BIN" [[ -x "$RPKI_DAEMON_BIN" ]] || die "missing executable: $RPKI_DAEMON_BIN" local rir_name for rir_name in "${RIR_LIST[@]}"; do if [[ "$TAL_INPUT_MODE" == "url" ]]; then [[ -n "$(tal_url_for_rir "$rir_name")" ]] || die "missing TAL URL for $rir_name" elif [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then [[ -f "$(tal_file_for_rir "$rir_name")" ]] || die "missing TAL fixture for $rir_name" [[ -n "$(tal_https_uri_from_fixture "$(tal_file_for_rir "$rir_name")")" ]] || die "missing http(s) TA URI in TAL fixture for $rir_name" else [[ -f "$(tal_file_for_rir "$rir_name")" ]] || die "missing TAL fixture for $rir_name" [[ -f "$(ta_file_for_rir "$rir_name")" ]] || die "missing TA fixture for $rir_name" fi done mkdir -p "$RUNS_ROOT" "$LOG_ROOT" "$DB_DIR" "$META_DIR" "$TMP_DIR" "$INVALID_ROOT" "$RESET_STAGING_ROOT" "$LIVE_TA_REFRESH_DIR" if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then mkdir -p "$RSYNC_MIRROR_ROOT" fi prepare_competing_rp_state write_machine_snapshot "before" local max_index local next_index local run_forever=0 local stop_index=0 local started_epoch local elapsed_secs started_epoch="$(date +%s)" max_index="$(max_existing_run_index)" next_index=$((max_index + 1)) if (( MAX_RUNS < 0 )); then run_forever=1 echo "run_soak mode=continuous max_existing_run_index=$max_index next_run=$(printf 'run_%04d' "$next_index")" else stop_index=$((max_index + MAX_RUNS)) echo "run_soak mode=fixed max_existing_run_index=$max_index next_run=$(printf 'run_%04d' "$next_index") stop_run=$(printf 'run_%04d' "$stop_index")" fi local any_failed=0 while (( run_forever == 1 || next_index <= stop_index )); do INVALID_DB_PATH="" INVALID_STATE_PATH="" INVALID_TMP_PATH="" TMP_CLEANUP_STATUS="" TMP_CLEANUP_REASON="" 
PERIODIC_LIFECYCLE_SOURCE="" PERIODIC_LIFECYCLE_DETAIL="" PERIODIC_LIFECYCLE_CORRUPT_BACKUP_PATH="" PERIODIC_LIFECYCLE_STATE_HEALTH="" PERIODIC_LIFECYCLE_LAST_SNAPSHOT_RUN_ID="" PERIODIC_LIFECYCLE_DELTA_COUNT="" PERIODIC_LIFECYCLE_FORCE_NEEDED="false" RESET_DB_STAGING_PATH="" RESET_DB_CLEANUP_STATUS="" if is_true "$PERIODIC_SNAPSHOT_RESET"; then RUN_META_PERIODIC_ENABLED="true" load_periodic_snapshot_lifecycle_context if [[ -n "$PERIODIC_LIFECYCLE_DELTA_COUNT" ]]; then RUN_META_PERIODIC_DELTA_COUNT="$PERIODIC_LIFECYCLE_DELTA_COUNT" fi else RUN_META_PERIODIC_ENABLED="false" fi RUN_META_PERIODIC_MAX_DELTAS="$PERIODIC_SNAPSHOT_MAX_DELTAS" RUN_META_PERIODIC_DELTA_COUNT="" RUN_META_PERIODIC_FORCED="false" RUN_META_RESET_DB_STAGING_PATH="" RUN_META_RESET_DB_CLEANUP_STATUS="" RUN_META_TMP_CLEANUP_STATUS="" RUN_META_TMP_CLEANUP_REASON="" local previous_run_id="" local previous_success_value="" local sync_mode="snapshot" local snapshot_reason="" if (( next_index > 1 )); then previous_run_id="$(printf 'run_%04d' $((next_index - 1)))" if previous_run_success "$RUNS_ROOT/$previous_run_id"; then previous_success_value="true" if delta_state_available; then sync_mode="delta" if is_true "$PERIODIC_SNAPSHOT_RESET"; then if [[ "$PERIODIC_LIFECYCLE_FORCE_NEEDED" == "true" ]]; then RUN_META_PERIODIC_FORCED="true" sync_mode="snapshot" snapshot_reason="periodic_snapshot_delta_limit" prepare_periodic_reset_state_db "$(printf 'run_%04d' "$next_index")" RUN_META_RESET_DB_STAGING_PATH="$RESET_DB_STAGING_PATH" RUN_META_RESET_DB_CLEANUP_STATUS="$RESET_DB_CLEANUP_STATUS" echo "periodic snapshot reset forcing snapshot run=$(printf 'run_%04d' "$next_index") delta_count=$PERIODIC_LIFECYCLE_DELTA_COUNT max_deltas=$PERIODIC_SNAPSHOT_MAX_DELTAS" fi fi else sync_mode="snapshot" snapshot_reason="missing_db" fi else previous_success_value="false" if is_true "$FAILURE_SNAPSHOT_RESET"; then isolate_state_after_failure "$previous_run_id" RUN_META_TMP_CLEANUP_STATUS="$TMP_CLEANUP_STATUS" 
RUN_META_TMP_CLEANUP_REASON="$TMP_CLEANUP_REASON" sync_mode="snapshot" snapshot_reason="previous_run_failed" else die "previous run is not successful: $previous_run_id" fi fi else sync_mode="snapshot" if db_state_exists; then isolate_state_after_failure "no_previous_run" RUN_META_TMP_CLEANUP_STATUS="$TMP_CLEANUP_STATUS" RUN_META_TMP_CLEANUP_REASON="$TMP_CLEANUP_REASON" snapshot_reason="no_successful_previous_run" else snapshot_reason="first_run" fi fi echo "starting run $(printf 'run_%04d' "$next_index") sync_mode=$sync_mode" if run_one_round "$next_index" "$previous_run_id" "$previous_success_value" "$sync_mode" "$snapshot_reason"; then echo "completed run $(printf 'run_%04d' "$next_index") status=success" else echo "completed run $(printf 'run_%04d' "$next_index") status=failed" >&2 any_failed=1 fi if (( STOP_AFTER_SECS > 0 )); then elapsed_secs=$(( $(date +%s) - started_epoch )) if (( elapsed_secs >= STOP_AFTER_SECS )); then echo "run_soak stop_after_secs reached elapsed_secs=$elapsed_secs stop_after_secs=$STOP_AFTER_SECS" break fi fi if (( (run_forever == 1 || next_index < stop_index) && INTERVAL_SECS > 0 )); then sleep "$INTERVAL_SECS" fi next_index=$((next_index + 1)) done write_machine_snapshot "after" exit "$any_failed" } main "$@"