From 265b6f65d0c9a10db43a49924def1e3e2f0d40ec Mon Sep 17 00:00:00 2001
From: yuyr
Date: Sat, 9 May 2026 19:03:23 +0800
Subject: [PATCH] 20260509 add portable soak package

---
 scripts/soak/build_portable_soak_package.sh | 172 ++++
 scripts/soak/portable-soak.env.example      |  49 ++
 scripts/soak/run_soak.sh                    | 570 ++++++++++++++++++++
 3 files changed, 791 insertions(+)
 create mode 100755 scripts/soak/build_portable_soak_package.sh
 create mode 100644 scripts/soak/portable-soak.env.example
 create mode 100755 scripts/soak/run_soak.sh

diff --git a/scripts/soak/build_portable_soak_package.sh b/scripts/soak/build_portable_soak_package.sh
new file mode 100755
index 0000000..941d205
--- /dev/null
+++ b/scripts/soak/build_portable_soak_package.sh
@@ -0,0 +1,172 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
PROFILE="${PROFILE:-release}"
OUT_DIR="${OUT_DIR:-$REPO_ROOT/target/portable-soak}"
PACKAGE_PREFIX="${PACKAGE_PREFIX:-portable-soak}"
PACKAGE_DIR_NAME="${PACKAGE_DIR_NAME:-portable-soak}"

usage() {
  cat <<'USAGE'
Usage:
  scripts/soak/build_portable_soak_package.sh [--out-dir <dir>] [--profile <profile>]

Requires release binaries to already exist. Build them first, for example:
  cargo build --release --bin rpki --bin rpki_daemon --bin db_stats
USAGE
}

die() {
  echo "error: $*" >&2
  exit 2
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --out-dir)
      shift
      OUT_DIR="${1:?--out-dir requires a value}"
      ;;
    --profile)
      shift
      PROFILE="${1:?--profile requires a value}"
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    *)
      die "unknown argument: $1"
      ;;
  esac
  shift
done

command -v python3 >/dev/null 2>&1 || die "python3 is required"
command -v tar >/dev/null 2>&1 || die "tar is required"

if [[ "$PROFILE" == "release" ]]; then
  TARGET_BIN_DIR="$REPO_ROOT/target/release"
else
  TARGET_BIN_DIR="$REPO_ROOT/target/$PROFILE"
fi

REQUIRED_BINS=(rpki rpki_daemon db_stats)
OPTIONAL_BINS=(
  ccr_dump
  ccr_state_compare
  ccr_to_compare_views
  ccr_to_routinator_csv
  ccr_verify
  cir_drop_report
  cir_dump_reject_list
  cir_extract_inputs
  cir_materialize
  cir_probe_rpki_client_cache
  cir_state_compare
  rrdp_state_dump
)

for binary_name in "${REQUIRED_BINS[@]}"; do
  [[ -x "$TARGET_BIN_DIR/$binary_name" ]] || die "missing required binary: $TARGET_BIN_DIR/$binary_name"
done

mkdir -p "$OUT_DIR"
GIT_SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')"
TIMESTAMP="$(date -u +%Y%m%dT%H%M%SZ)"
PACKAGE_NAME="${PACKAGE_PREFIX}-${TIMESTAMP}-${GIT_SHA}"
BUILD_ROOT="$REPO_ROOT/target/portable-soak-build"
STAGE_DIR="$BUILD_ROOT/$PACKAGE_DIR_NAME"
ARCHIVE_PATH="$OUT_DIR/$PACKAGE_NAME.tar.gz"

rm -rf "$STAGE_DIR" "$ARCHIVE_PATH"
mkdir -p "$STAGE_DIR/bin" "$STAGE_DIR/fixtures" "$STAGE_DIR/scripts" \
  "$STAGE_DIR/runs" "$STAGE_DIR/state" "$STAGE_DIR/logs" "$STAGE_DIR/tmp"

install -m 0755 "$SCRIPT_DIR/run_soak.sh" "$STAGE_DIR/run_soak.sh"
install -m 0644 "$SCRIPT_DIR/portable-soak.env.example" "$STAGE_DIR/.env"
install -m 0644 "$SCRIPT_DIR/portable-soak.env.example" "$STAGE_DIR/portable-soak.env.example"

COPIED_BIN_LIST="$STAGE_DIR/copied-binaries.txt"
MISSING_OPTIONAL_BIN_LIST="$STAGE_DIR/missing-optional-binaries.txt"
: > "$COPIED_BIN_LIST"
: > "$MISSING_OPTIONAL_BIN_LIST"
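
# Staging layout assembled below (a sketch of the resulting package tree;
# manifest.json and the copied/missing binary lists are generated further down):
#   portable-soak/
#     run_soak.sh  .env  portable-soak.env.example  manifest.json
#     bin/       required binaries plus any optional ones that were built
#     fixtures/  tal/ and ta/ trust anchor fixtures
#     scripts/   periodic/ and cir/ helper scripts
#     runs/ state/ logs/ tmp/   empty runtime directories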
"$COPIED_BIN_LIST" +done + +for binary_name in "${OPTIONAL_BINS[@]}"; do + if [[ -x "$TARGET_BIN_DIR/$binary_name" ]]; then + install -m 0755 "$TARGET_BIN_DIR/$binary_name" "$STAGE_DIR/bin/$binary_name" + printf '%s\n' "$binary_name" >> "$COPIED_BIN_LIST" + else + printf '%s\n' "$binary_name" >> "$MISSING_OPTIONAL_BIN_LIST" + fi +done + +cp -a "$REPO_ROOT/tests/fixtures/tal" "$STAGE_DIR/fixtures/" +cp -a "$REPO_ROOT/tests/fixtures/ta" "$STAGE_DIR/fixtures/" +cp -a "$REPO_ROOT/scripts/periodic" "$STAGE_DIR/scripts/" +cp -a "$REPO_ROOT/scripts/cir" "$STAGE_DIR/scripts/" +find "$STAGE_DIR/scripts" -type d -name __pycache__ -prune -exec rm -rf {} + + +(cd "$STAGE_DIR" && find fixtures -type f | sort > fixtures.txt) +(cd "$STAGE_DIR" && find scripts -type f | sort > scripts.txt) + +GIT_DIRTY="false" +if [[ -n "$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)" ]]; then + GIT_DIRTY="true" +fi +GIT_STATUS="$(git -C "$REPO_ROOT" status --short 2>/dev/null || true)" + +python3 - "$STAGE_DIR/manifest.json" "$PACKAGE_NAME" "$TIMESTAMP" "$REPO_ROOT" "$GIT_SHA" \ + "$GIT_DIRTY" "$PROFILE" "$TARGET_BIN_DIR" "$GIT_STATUS" <<'PY' +import json +import pathlib +import sys + +( + manifest_path, + package_name, + created_at, + repo_root, + git_sha, + git_dirty, + profile, + target_bin_dir, + git_status, +) = sys.argv[1:] +stage_dir = pathlib.Path(manifest_path).parent + +def read_lines(name): + path = stage_dir / name + if not path.exists(): + return [] + return [line for line in path.read_text(encoding="utf-8").splitlines() if line] + +manifest = { + "packageName": package_name, + "createdAtUtc": created_at, + "sourceRepo": repo_root, + "gitCommit": git_sha, + "gitDirty": git_dirty == "true", + "gitStatusShort": git_status.splitlines(), + "rustProfile": profile, + "targetBinDir": target_bin_dir, + "copiedBinaries": read_lines("copied-binaries.txt"), + "missingOptionalBinaries": read_lines("missing-optional-binaries.txt"), + "fixtures": read_lines("fixtures.txt"), + "scripts": read_lines("scripts.txt"), +} +pathlib.Path(manifest_path).write_text( + json.dumps(manifest, indent=2, sort_keys=True) + "\n", + encoding="utf-8", +) +PY + +tar -C "$BUILD_ROOT" -czf "$ARCHIVE_PATH" "$PACKAGE_DIR_NAME" +printf '%s\n' "$ARCHIVE_PATH" diff --git a/scripts/soak/portable-soak.env.example b/scripts/soak/portable-soak.env.example new file mode 100644 index 0000000..447b1fe --- /dev/null +++ b/scripts/soak/portable-soak.env.example @@ -0,0 +1,49 @@ +# portable soak 运行配置。 +# 复制为 .env 后可以在远端直接调整;所有路径默认相对 package 根目录。 + +# 最大运行轮次。重复执行 run_soak.sh 时会从已有最后一轮之后继续编号。 +MAX_RUNS=3 + +# 两轮之间等待秒数。做连续无等待验收时设置为 0。 +INTERVAL_SECS=0 + +# 要运行的 RIR 列表,逗号分隔。 +# 合法值只有:afrinic, apnic, arin, lacnic, ripe。 +# 示例:RIRS=apnic,arin 或 RIRS=afrinic,apnic,arin,lacnic,ripe +RIRS=afrinic,apnic,arin,lacnic,ripe + +# 运行根目录。默认使用 package 根目录;如需把产物写到独立数据盘,可改成绝对路径。 +RUN_ROOT="${PACKAGE_ROOT}" + +# 保留最近多少轮 run 目录。旧 run 会由 rpki_daemon 自身或后续脚本策略清理。 +RETAIN_RUNS=10 + +# 是否输出 compact report JSON。1 表示启用,0 表示关闭。 +OUTPUT_COMPACT_REPORT=1 + +# 是否复用持久 rsync mirror。1 表示跨 run 复用;失败隔离数据库时也不会清理 mirror。 +ALLOW_RSYNC_MIRROR_REUSE=1 + +# 前一轮失败或不完整时,是否隔离旧数据库和运行态目录后强制下一轮 snapshot。 +# 建议保持 1;设置为 0 时,检测到前一轮失败会直接停止。 +FAILURE_SNAPSHOT_RESET=1 + +# 每隔多少轮执行一次 db_stats --exact。设置为空或 0 表示关闭 exact 统计。 +DB_STATS_EXACT_EVERY=3 + +# 是否开启 ours RP progress log。1 表示开启。 +RPKI_PROGRESS_LOG=1 + +# progress log 慢步骤阈值,单位秒。 +RPKI_PROGRESS_SLOW_SECS=10 + +# 是否在运行前尝试禁用 rpki-client timer 并杀掉竞争 RP 进程。 +DISABLE_COMPETING_RPS=1 + +# 可选覆盖路径;默认由 package 自动推导。 +# BIN_DIR="${PACKAGE_ROOT}/bin" +# 

# Optional path overrides; by default these are derived from the package layout.
# BIN_DIR="${PACKAGE_ROOT}/bin"
# FIXTURE_DIR="${PACKAGE_ROOT}/fixtures"
# DB_DIR="${RUN_ROOT}/state/db"
# META_DIR="${RUN_ROOT}/state/meta"
# TMP_DIR="${RUN_ROOT}/tmp"
# RSYNC_MIRROR_ROOT="${RUN_ROOT}/state/rsync-mirror"
diff --git a/scripts/soak/run_soak.sh b/scripts/soak/run_soak.sh
new file mode 100755
index 0000000..96fd0f0
--- /dev/null
+++ b/scripts/soak/run_soak.sh
@@ -0,0 +1,570 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"

if [[ -f "$ENV_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$ENV_FILE"
fi

MAX_RUNS="${MAX_RUNS:-3}"
INTERVAL_SECS="${INTERVAL_SECS:-0}"
RIRS="${RIRS:-afrinic,apnic,arin,lacnic,ripe}"
RUN_ROOT="${RUN_ROOT:-$PACKAGE_ROOT}"
RETAIN_RUNS="${RETAIN_RUNS:-10}"
OUTPUT_COMPACT_REPORT="${OUTPUT_COMPACT_REPORT:-1}"
ALLOW_RSYNC_MIRROR_REUSE="${ALLOW_RSYNC_MIRROR_REUSE:-1}"
FAILURE_SNAPSHOT_RESET="${FAILURE_SNAPSHOT_RESET:-1}"
DB_STATS_EXACT_EVERY="${DB_STATS_EXACT_EVERY:-3}"
RPKI_PROGRESS_LOG="${RPKI_PROGRESS_LOG:-1}"
RPKI_PROGRESS_SLOW_SECS="${RPKI_PROGRESS_SLOW_SECS:-10}"
DISABLE_COMPETING_RPS="${DISABLE_COMPETING_RPS:-1}"

BIN_DIR="${BIN_DIR:-$PACKAGE_ROOT/bin}"
FIXTURE_DIR="${FIXTURE_DIR:-$PACKAGE_ROOT/fixtures}"
STATE_ROOT="$RUN_ROOT/state"
RUNS_ROOT="$RUN_ROOT/runs"
LOG_ROOT="$RUN_ROOT/logs"
DB_DIR="${DB_DIR:-$STATE_ROOT/db}"
META_DIR="${META_DIR:-$STATE_ROOT/meta}"
TMP_DIR="${TMP_DIR:-$RUN_ROOT/tmp}"
RSYNC_MIRROR_ROOT="${RSYNC_MIRROR_ROOT:-$STATE_ROOT/rsync-mirror}"
INVALID_ROOT="$STATE_ROOT/invalid"

RPKI_BIN="$BIN_DIR/rpki"
RPKI_DAEMON_BIN="$BIN_DIR/rpki_daemon"
DB_STATS_BIN="$BIN_DIR/db_stats"

usage() {
  cat <<'USAGE'
Usage:
  ./run_soak.sh

Configuration comes from .env in the package root; it can be overridden with ENV_FILE=/path/to/.env.
USAGE
}

die() {
  echo "error: $*" >&2
  exit 2
}

is_true() {
  case "${1:-}" in
    1|true|TRUE|yes|YES|on|ON) return 0 ;;
    *) return 1 ;;
  esac
}

require_command() {
  command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
}

validate_positive_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
  [[ "$value" != "0" ]] || die "$name must be > 0"
}

validate_non_negative_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
}

normalize_token() {
  local token="$1"
  token="${token#"${token%%[![:space:]]*}"}"
  token="${token%"${token##*[![:space:]]}"}"
  printf '%s' "$token" | tr '[:upper:]' '[:lower:]'
}

parse_rirs() {
  RIR_LIST=()
  local raw_rirs
  local raw_token
  local normalized
  IFS=',' read -r -a raw_rirs <<< "$RIRS"
  for raw_token in "${raw_rirs[@]}"; do
    normalized="$(normalize_token "$raw_token")"
    [[ -n "$normalized" ]] || continue
    case "$normalized" in
      afrinic|apnic|arin|lacnic|ripe)
        RIR_LIST+=("$normalized")
        ;;
      *)
        die "invalid RIRS entry: $raw_token; allowed: afrinic,apnic,arin,lacnic,ripe"
        ;;
    esac
  done
  [[ "${#RIR_LIST[@]}" -gt 0 ]] || die "RIRS must contain at least one RIR"
}

tal_file_for_rir() {
  case "$1" in
    afrinic) printf '%s' "$FIXTURE_DIR/tal/afrinic.tal" ;;
    apnic) printf '%s' "$FIXTURE_DIR/tal/apnic-rfc7730-https.tal" ;;
    arin) printf '%s' "$FIXTURE_DIR/tal/arin.tal" ;;
    lacnic) printf '%s' "$FIXTURE_DIR/tal/lacnic.tal" ;;
    ripe) printf '%s' "$FIXTURE_DIR/tal/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
}
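
# The three per-RIR helpers (tal_file_for_rir above, ta_file_for_rir and
# cir_tal_uri_for_rir below) map the same RIR keys; e.g. apnic resolves to the
# RFC 7730 HTTPS TAL fixture, the apnic TA certificate, and the matching
# https://rpki.apnic.net TAL URI.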
"$FIXTURE_DIR/ta/afrinic-ta.cer" ;; + apnic) printf '%s' "$FIXTURE_DIR/ta/apnic-ta.cer" ;; + arin) printf '%s' "$FIXTURE_DIR/ta/arin-ta.cer" ;; + lacnic) printf '%s' "$FIXTURE_DIR/ta/lacnic-ta.cer" ;; + ripe) printf '%s' "$FIXTURE_DIR/ta/ripe-ncc-ta.cer" ;; + *) die "unknown RIR: $1" ;; + esac +} + +cir_tal_uri_for_rir() { + case "$1" in + afrinic) printf '%s' "https://rpki.afrinic.net/tal/afrinic.tal" ;; + apnic) printf '%s' "https://rpki.apnic.net/tal/apnic-rfc7730-https.tal" ;; + arin) printf '%s' "https://www.arin.net/resources/manage/rpki/arin.tal" ;; + lacnic) printf '%s' "https://www.lacnic.net/innovaportal/file/4983/1/lacnic.tal" ;; + ripe) printf '%s' "https://tal.rpki.ripe.net/ripe-ncc.tal" ;; + *) die "unknown RIR: $1" ;; + esac +} + +compare_view_trust_anchor() { + if [[ "${#RIR_LIST[@]}" -eq 1 ]]; then + printf '%s' "${RIR_LIST[0]}" + else + printf '%s' "all5" + fi +} + +max_existing_run_index() { + local max_index=0 + local run_dir + local run_name + local numeric_part + shopt -s nullglob + for run_dir in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do + [[ -d "$run_dir" ]] || continue + run_name="$(basename "$run_dir")" + numeric_part="${run_name#run_}" + if (( 10#$numeric_part > max_index )); then + max_index=$((10#$numeric_part)) + fi + done + shopt -u nullglob + printf '%s' "$max_index" +} + +json_status_is_success() { + local json_path="$1" + python3 - "$json_path" <<'PY' +import json +import sys +path = sys.argv[1] +try: + with open(path, "r", encoding="utf-8") as handle: + data = json.load(handle) +except Exception: + sys.exit(1) +sys.exit(0 if data.get("status") == "success" else 1) +PY +} + +previous_run_success() { + local run_dir="$1" + [[ -d "$run_dir" ]] || return 1 + [[ -f "$run_dir/run-meta.json" ]] || return 1 + [[ -f "$run_dir/run-summary.json" ]] || return 1 + json_status_is_success "$run_dir/run-meta.json" || return 1 + json_status_is_success "$run_dir/run-summary.json" || return 1 + for required_artifact in report.json result.ccr input.cir stage-timing.json process-time.txt stdout.log stderr.log; do + [[ -f "$run_dir/$required_artifact" ]] || return 1 + done + return 0 +} + +move_if_exists() { + local source_path="$1" + local target_dir="$2" + if [[ -e "$source_path" ]]; then + mkdir -p "$target_dir" + mv "$source_path" "$target_dir/" + fi +} + +db_state_exists() { + [[ -e "$DB_DIR/work-db" || -e "$DB_DIR/repo-bytes.db" ]] +} + +isolate_state_after_failure() { + local previous_run_id="$1" + local timestamp + timestamp="$(date -u +%Y%m%dT%H%M%SZ)" + local invalid_dir="$INVALID_ROOT/${previous_run_id}-${timestamp}" + mkdir -p "$invalid_dir" + move_if_exists "$DB_DIR" "$invalid_dir" + move_if_exists "$META_DIR" "$invalid_dir" + move_if_exists "$TMP_DIR" "$invalid_dir" + mkdir -p "$DB_DIR" "$META_DIR" "$TMP_DIR" + INVALID_DB_PATH="$invalid_dir/$(basename "$DB_DIR")" + INVALID_STATE_PATH="$invalid_dir/$(basename "$META_DIR")" + INVALID_TMP_PATH="$invalid_dir/$(basename "$TMP_DIR")" +} + +write_run_meta() { + local output_path="$1" + local status="$2" + local run_index="$3" + local run_id="$4" + local sync_mode="$5" + local snapshot_reason="$6" + local previous_run_id="$7" + local previous_run_success_value="$8" + local started_at="$9" + local completed_at="${10}" + local invalid_db_path="${11}" + local invalid_state_path="${12}" + local invalid_tmp_path="${13}" + local daemon_exit_code="${14}" + local package_root="${15}" + local env_file="${16}" + python3 - "$output_path" "$status" "$run_index" "$run_id" "$sync_mode" "$snapshot_reason" \ + "$previous_run_id" 
"$previous_run_success_value" "$started_at" "$completed_at" \ + "$invalid_db_path" "$invalid_state_path" "$invalid_tmp_path" "$daemon_exit_code" \ + "$package_root" "$env_file" <<'PY' +import json +import sys + +def nullable(value): + return None if value == "" else value + +def nullable_bool(value): + if value == "": + return None + return value == "true" + +def nullable_int(value): + if value == "": + return None + return int(value) + +( + output_path, + status, + run_index, + run_id, + sync_mode, + snapshot_reason, + previous_run_id, + previous_run_success, + started_at, + completed_at, + invalid_db_path, + invalid_state_path, + invalid_tmp_path, + daemon_exit_code, + package_root, + env_file, +) = sys.argv[1:] + +data = { + "status": status, + "run_index": int(run_index), + "run_id": run_id, + "sync_mode": sync_mode, + "snapshot_reason": nullable(snapshot_reason), + "previous_run_id": nullable(previous_run_id), + "previous_run_success": nullable_bool(previous_run_success), + "started_at_rfc3339_utc": started_at, + "completed_at_rfc3339_utc": nullable(completed_at), + "invalid_db_path": nullable(invalid_db_path), + "invalid_state_path": nullable(invalid_state_path), + "invalid_tmp_path": nullable(invalid_tmp_path), + "daemon_exit_code": nullable_int(daemon_exit_code), + "package_root": package_root, + "env_file": env_file, +} +with open(output_path, "w", encoding="utf-8") as handle: + json.dump(data, handle, indent=2, sort_keys=True) + handle.write("\n") +PY +} + +summary_status() { + local summary_path="$1" + python3 - "$summary_path" <<'PY' +import json +import sys +try: + with open(sys.argv[1], "r", encoding="utf-8") as handle: + print(json.load(handle).get("status", "missing")) +except Exception: + print("missing") +PY +} + +prepare_competing_rp_state() { + if ! 
is_true "$DISABLE_COMPETING_RPS"; then + return 0 + fi + systemctl disable --now rpki-client.timer >/dev/null 2>&1 || true + systemctl stop rpki-client.service >/dev/null 2>&1 || true + pkill -x rpki-client >/dev/null 2>&1 || true + pkill -x routinator >/dev/null 2>&1 || true +} + +write_machine_snapshot() { + local suffix="$1" + df -h > "$LOG_ROOT/df-${suffix}.txt" 2>&1 || true + free -h > "$LOG_ROOT/free-${suffix}.txt" 2>&1 || true + ps -eo pid,ppid,stat,pcpu,pmem,rss,args --sort=-pcpu \ + | grep -E 'rpki_daemon|/bin/rpki|rpki-client|routinator' \ + | grep -v grep > "$LOG_ROOT/process-${suffix}.txt" || true + systemctl is-active rpki-client.timer > "$LOG_ROOT/rpki-client-timer-active-${suffix}.txt" 2>&1 || true + systemctl is-enabled rpki-client.timer > "$LOG_ROOT/rpki-client-timer-enabled-${suffix}.txt" 2>&1 || true +} + +build_child_args() { + CHILD_ARGS=( + --db "$DB_DIR/work-db" + --repo-bytes-db "$DB_DIR/repo-bytes.db" + ) + if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then + CHILD_ARGS+=(--rsync-mirror-root "$RSYNC_MIRROR_ROOT") + else + CHILD_ARGS+=(--rsync-mirror-root "$TMP_DIR/rsync-mirror-{run_id}") + fi + + CHILD_ARGS+=( + --parallel-phase2-ready-batch-size 256 + --parallel-phase2-ready-batch-wall-time-budget-ms 100 + --parallel-phase2-result-drain-batch-size 2048 + --parallel-phase2-finalize-batch-size 256 + --parallel-phase2-finalize-batch-wall-time-budget-ms 100 + ) + + local rir_name + for rir_name in "${RIR_LIST[@]}"; do + CHILD_ARGS+=(--tal-path "$(tal_file_for_rir "$rir_name")") + CHILD_ARGS+=(--ta-path "$(ta_file_for_rir "$rir_name")") + done + + CHILD_ARGS+=( + --report-json "{run_out}/report.json" + ) + if is_true "$OUTPUT_COMPACT_REPORT"; then + CHILD_ARGS+=(--report-json-compact) + fi + CHILD_ARGS+=( + --ccr-out "{run_out}/result.ccr" + --cir-enable + --cir-out "{run_out}/input.cir" + ) + + for rir_name in "${RIR_LIST[@]}"; do + CHILD_ARGS+=(--cir-tal-uri "$(cir_tal_uri_for_rir "$rir_name")") + done + + CHILD_ARGS+=( + --vrps-csv-out "{run_out}/vrps.csv" + --vaps-csv-out "{run_out}/vaps.csv" + --compare-view-trust-anchor "$(compare_view_trust_anchor)" + ) +} + +copy_inner_run_outputs() { + local daemon_state_root="$1" + local run_dir="$2" + local inner_run_dir + inner_run_dir="$(find "$daemon_state_root/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -n 1 || true)" + if [[ -n "$inner_run_dir" && -d "$inner_run_dir" ]]; then + shopt -s dotglob nullglob + cp -a "$inner_run_dir"/. 
"$run_dir"/ + shopt -u dotglob nullglob + fi + [[ -f "$daemon_state_root/daemon-status.json" ]] && cp "$daemon_state_root/daemon-status.json" "$run_dir/daemon-status.json" + [[ -f "$daemon_state_root/daemon-runs.jsonl" ]] && cp "$daemon_state_root/daemon-runs.jsonl" "$run_dir/daemon-runs.jsonl" +} + +apply_outer_retention() { + local dirs=() + local run_dir + shopt -s nullglob + for run_dir in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do + [[ -d "$run_dir" ]] && dirs+=("$run_dir") + done + shopt -u nullglob + if (( ${#dirs[@]} <= RETAIN_RUNS )); then + return 0 + fi + mapfile -t dirs < <(printf '%s\n' "${dirs[@]}" | sort) + local remove_count=$(( ${#dirs[@]} - RETAIN_RUNS )) + local index + for (( index = 0; index < remove_count; index++ )); do + rm -rf "${dirs[$index]}" + done +} + +run_one_round() { + local run_index="$1" + local run_id + run_id="$(printf 'run_%04d' "$run_index")" + local run_dir="$RUNS_ROOT/$run_id" + local previous_run_id="$2" + local previous_success_value="$3" + local sync_mode="$4" + local snapshot_reason="$5" + local daemon_state_root="$TMP_DIR/daemon-$run_id" + local started_at + local completed_at + local daemon_exit_code + local summary_state + + mkdir -p "$run_dir" "$daemon_state_root" + started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + write_run_meta "$run_dir/run-meta.json" "running" "$run_index" "$run_id" "$sync_mode" \ + "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "" \ + "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "" "$PACKAGE_ROOT" "$ENV_FILE" + + build_child_args + local daemon_args=( + --state-root "$daemon_state_root" + --rpki-bin "$RPKI_BIN" + --interval-secs 0 + --max-runs 1 + --retain-runs "$RETAIN_RUNS" + --work-db "$DB_DIR/work-db" + --repo-bytes-db "$DB_DIR/repo-bytes.db" + ) + if [[ -x "$DB_STATS_BIN" ]]; then + daemon_args+=(--db-stats-bin "$DB_STATS_BIN") + if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then + daemon_args+=(--db-stats-exact-every "$DB_STATS_EXACT_EVERY") + fi + fi + + set +e + env \ + RPKI_PROGRESS_LOG="$RPKI_PROGRESS_LOG" \ + RPKI_PROGRESS_SLOW_SECS="$RPKI_PROGRESS_SLOW_SECS" \ + "$RPKI_DAEMON_BIN" "${daemon_args[@]}" -- "${CHILD_ARGS[@]}" \ + > "$run_dir/daemon-stdout.log" 2> "$run_dir/daemon-stderr.log" + daemon_exit_code=$? 
  set -e

  copy_inner_run_outputs "$daemon_state_root" "$run_dir"
  completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  summary_state="$(summary_status "$run_dir/run-summary.json")"
  local final_status="failed"
  if [[ "$daemon_exit_code" -eq 0 && "$summary_state" == "success" ]]; then
    final_status="success"
  fi
  write_run_meta "$run_dir/run-meta.json" "$final_status" "$run_index" "$run_id" "$sync_mode" \
    "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "$completed_at" \
    "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "$daemon_exit_code" "$PACKAGE_ROOT" "$ENV_FILE"
  printf '%s\n' "$run_id" > "$META_DIR/last-run-id"
  apply_outer_retention
  [[ "$final_status" == "success" ]]
}

main() {
  if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
    usage
    exit 0
  fi
  require_command python3
  require_command date
  require_command find
  validate_positive_int "MAX_RUNS" "$MAX_RUNS"
  validate_non_negative_int "INTERVAL_SECS" "$INTERVAL_SECS"
  validate_positive_int "RETAIN_RUNS" "$RETAIN_RUNS"
  if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then
    validate_positive_int "DB_STATS_EXACT_EVERY" "$DB_STATS_EXACT_EVERY"
  fi
  parse_rirs
  [[ -x "$RPKI_BIN" ]] || die "missing executable: $RPKI_BIN"
  [[ -x "$RPKI_DAEMON_BIN" ]] || die "missing executable: $RPKI_DAEMON_BIN"

  local rir_name
  for rir_name in "${RIR_LIST[@]}"; do
    [[ -f "$(tal_file_for_rir "$rir_name")" ]] || die "missing TAL fixture for $rir_name"
    [[ -f "$(ta_file_for_rir "$rir_name")" ]] || die "missing TA fixture for $rir_name"
  done

  mkdir -p "$RUNS_ROOT" "$LOG_ROOT" "$DB_DIR" "$META_DIR" "$TMP_DIR" "$INVALID_ROOT"
  if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then
    mkdir -p "$RSYNC_MIRROR_ROOT"
  fi
  prepare_competing_rp_state
  write_machine_snapshot "before"

  local max_index
  local next_index
  max_index="$(max_existing_run_index)"
  next_index=$((max_index + 1))
  local stop_index=$((max_index + MAX_RUNS))
  local any_failed=0

  while (( next_index <= stop_index )); do
    INVALID_DB_PATH=""
    INVALID_STATE_PATH=""
    INVALID_TMP_PATH=""
    local previous_run_id=""
    local previous_success_value=""
    local sync_mode="snapshot"
    local snapshot_reason=""
    if (( next_index > 1 )); then
      previous_run_id="$(printf 'run_%04d' $((next_index - 1)))"
      if previous_run_success "$RUNS_ROOT/$previous_run_id"; then
        previous_success_value="true"
        if [[ -e "$DB_DIR/work-db" ]]; then
          sync_mode="delta"
        else
          sync_mode="snapshot"
          snapshot_reason="missing_db"
        fi
      else
        previous_success_value="false"
        if is_true "$FAILURE_SNAPSHOT_RESET"; then
          isolate_state_after_failure "$previous_run_id"
          sync_mode="snapshot"
          snapshot_reason="previous_run_failed"
        else
          die "previous run is not successful: $previous_run_id"
        fi
      fi
    else
      sync_mode="snapshot"
      if db_state_exists; then
        isolate_state_after_failure "no_previous_run"
        snapshot_reason="no_successful_previous_run"
      else
        snapshot_reason="first_run"
      fi
    fi

    echo "starting run $(printf 'run_%04d' "$next_index") sync_mode=$sync_mode"
    if run_one_round "$next_index" "$previous_run_id" "$previous_success_value" "$sync_mode" "$snapshot_reason"; then
      echo "completed run $(printf 'run_%04d' "$next_index") status=success"
    else
      echo "completed run $(printf 'run_%04d' "$next_index") status=failed" >&2
      any_failed=1
    fi
    if (( next_index < stop_index && INTERVAL_SECS > 0 )); then
      sleep "$INTERVAL_SECS"
    fi
    next_index=$((next_index + 1))
  done

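  # Pairs with write_machine_snapshot "before": diffing the before/after df,
  # free, and process listings in logs/ shows what the soak left behind.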
  write_machine_snapshot "after"
  exit "$any_failed"
}

main "$@"
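
Example end-to-end flow (a sketch; the host name and paths are hypothetical):

  # on the build machine
  cargo build --release --bin rpki --bin rpki_daemon --bin db_stats
  scripts/soak/build_portable_soak_package.sh --out-dir target/portable-soak
  scp target/portable-soak/portable-soak-*.tar.gz soak-host:

  # on the soak host: unpack, adjust .env (e.g. MAX_RUNS, RIRS), then run
  tar xzf portable-soak-*.tar.gz
  cd portable-soak && ./run_soak.sh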