#!/usr/bin/env bash
#
# run_soak.sh — drives repeated RPKI validation rounds ("soak" runs) via the
# rpki_daemon binary, archiving per-run artifacts under $RUN_ROOT/runs and
# quarantining on-disk state after failed runs so each round starts clean.
#
# Configuration comes from environment variables, optionally pre-loaded from a
# .env file in the package root (override with ENV_FILE=/path/to/.env).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"

if [[ -f "$ENV_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$ENV_FILE"
fi

# Tunables (all overridable via the environment / .env file).
MAX_RUNS="${MAX_RUNS:-3}"
INTERVAL_SECS="${INTERVAL_SECS:-0}"
RIRS="${RIRS:-afrinic,apnic,arin,lacnic,ripe}"
RUN_ROOT="${RUN_ROOT:-$PACKAGE_ROOT}"
RETAIN_RUNS="${RETAIN_RUNS:-10}"
OUTPUT_COMPACT_REPORT="${OUTPUT_COMPACT_REPORT:-1}"
ALLOW_RSYNC_MIRROR_REUSE="${ALLOW_RSYNC_MIRROR_REUSE:-1}"
FAILURE_SNAPSHOT_RESET="${FAILURE_SNAPSHOT_RESET:-1}"
DB_STATS_EXACT_EVERY="${DB_STATS_EXACT_EVERY:-3}"
RPKI_PROGRESS_LOG="${RPKI_PROGRESS_LOG:-1}"
RPKI_PROGRESS_SLOW_SECS="${RPKI_PROGRESS_SLOW_SECS:-10}"
DISABLE_COMPETING_RPS="${DISABLE_COMPETING_RPS:-1}"

# Derived layout.
BIN_DIR="${BIN_DIR:-$PACKAGE_ROOT/bin}"
FIXTURE_DIR="${FIXTURE_DIR:-$PACKAGE_ROOT/fixtures}"
STATE_ROOT="$RUN_ROOT/state"
RUNS_ROOT="$RUN_ROOT/runs"
LOG_ROOT="$RUN_ROOT/logs"
DB_DIR="${DB_DIR:-$STATE_ROOT/db}"
META_DIR="${META_DIR:-$STATE_ROOT/meta}"
TMP_DIR="${TMP_DIR:-$RUN_ROOT/tmp}"
RSYNC_MIRROR_ROOT="${RSYNC_MIRROR_ROOT:-$STATE_ROOT/rsync-mirror}"
INVALID_ROOT="$STATE_ROOT/invalid"
RPKI_BIN="$BIN_DIR/rpki"
RPKI_DAEMON_BIN="$BIN_DIR/rpki_daemon"
DB_STATS_BIN="$BIN_DIR/db_stats"

# Print usage text (the here-doc body is user-facing output; left verbatim).
usage() {
  cat <<'USAGE'
Usage: ./run_soak.sh
配置来自 package 根目录下的 .env;也可以用 ENV_FILE=/path/to/.env 覆盖。
USAGE
}

# Print an error to stderr and exit with status 2.
die() {
  echo "error: $*" >&2
  exit 2
}

# Return 0 iff $1 is a recognized truthy token.
is_true() {
  case "${1:-}" in
    1|true|TRUE|yes|YES|on|ON) return 0 ;;
    *) return 1 ;;
  esac
}

# Abort unless the named command is on PATH.
require_command() {
  command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
}

# Abort unless $2 is a strictly positive integer; $1 names it in the message.
validate_positive_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
  [[ "$value" != "0" ]] || die "$name must be > 0"
}

# Abort unless $2 is a non-negative integer; $1 names it in the message.
validate_non_negative_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
}

# Trim surrounding whitespace from $1 and lowercase it; prints the result.
normalize_token() {
  local token="$1"
  token="${token#"${token%%[![:space:]]*}"}"
  token="${token%"${token##*[![:space:]]}"}"
  printf '%s' "$token" | tr '[:upper:]' '[:lower:]'
}

# Parse the comma-separated $RIRS into the global RIR_LIST array, validating
# each entry against the five known RIR names. Dies on an unknown entry or an
# effectively empty list.
parse_rirs() {
  RIR_LIST=()
  local raw_token
  local normalized
  local -a raw_rirs
  IFS=',' read -r -a raw_rirs <<< "$RIRS"
  for raw_token in "${raw_rirs[@]}"; do
    normalized="$(normalize_token "$raw_token")"
    [[ -n "$normalized" ]] || continue
    case "$normalized" in
      afrinic|apnic|arin|lacnic|ripe)
        RIR_LIST+=("$normalized")
        ;;
      *)
        die "invalid RIRS entry: $raw_token; allowed: afrinic,apnic,arin,lacnic,ripe"
        ;;
    esac
  done
  [[ "${#RIR_LIST[@]}" -gt 0 ]] || die "RIRS must contain at least one RIR"
}

# Print the TAL fixture path for RIR $1.
tal_file_for_rir() {
  case "$1" in
    afrinic) printf '%s' "$FIXTURE_DIR/tal/afrinic.tal" ;;
    apnic) printf '%s' "$FIXTURE_DIR/tal/apnic-rfc7730-https.tal" ;;
    arin) printf '%s' "$FIXTURE_DIR/tal/arin.tal" ;;
    lacnic) printf '%s' "$FIXTURE_DIR/tal/lacnic.tal" ;;
    ripe) printf '%s' "$FIXTURE_DIR/tal/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the trust-anchor certificate fixture path for RIR $1.
ta_file_for_rir() {
  case "$1" in
    afrinic) printf '%s' "$FIXTURE_DIR/ta/afrinic-ta.cer" ;;
    apnic) printf '%s' "$FIXTURE_DIR/ta/apnic-ta.cer" ;;
    arin) printf '%s' "$FIXTURE_DIR/ta/arin-ta.cer" ;;
    lacnic) printf '%s' "$FIXTURE_DIR/ta/lacnic-ta.cer" ;;
    ripe) printf '%s' "$FIXTURE_DIR/ta/ripe-ncc-ta.cer" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the canonical public TAL URI recorded in CIR output for RIR $1.
cir_tal_uri_for_rir() {
  case "$1" in
    afrinic) printf '%s' "https://rpki.afrinic.net/tal/afrinic.tal" ;;
    apnic) printf '%s' "https://rpki.apnic.net/tal/apnic-rfc7730-https.tal" ;;
    arin) printf '%s' "https://www.arin.net/resources/manage/rpki/arin.tal" ;;
    lacnic) printf '%s' "https://www.lacnic.net/innovaportal/file/4983/1/lacnic.tal" ;;
    ripe) printf '%s' "https://tal.rpki.ripe.net/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the trust-anchor label for the compare view: the single RIR name when
# exactly one is configured, otherwise the aggregate label "all5".
compare_view_trust_anchor() {
  if [[ "${#RIR_LIST[@]}" -eq 1 ]]; then
    printf '%s' "${RIR_LIST[0]}"
  else
    printf '%s' "all5"
  fi
}

# Print the highest numeric index among existing $RUNS_ROOT/run_NNNN
# directories, or 0 when none exist. The 10# prefix forces base-10 so indices
# with leading zeros (e.g. 0008) are not parsed as octal.
max_existing_run_index() {
  local max_index=0
  local run_dir
  local run_name
  local numeric_part
  shopt -s nullglob
  for run_dir in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do
    [[ -d "$run_dir" ]] || continue
    run_name="$(basename "$run_dir")"
    numeric_part="${run_name#run_}"
    if (( 10#$numeric_part > max_index )); then
      max_index=$((10#$numeric_part))
    fi
  done
  shopt -u nullglob
  printf '%s' "$max_index"
}

# Return 0 iff the JSON file at $1 parses and has top-level "status":"success".
json_status_is_success() {
  local json_path="$1"
  python3 - "$json_path" <<'PY'
import json
import sys

path = sys.argv[1]
try:
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
except Exception:
    sys.exit(1)
sys.exit(0 if data.get("status") == "success" else 1)
PY
}

# Return 0 iff run directory $1 represents a fully successful run: both status
# JSON files report success and every expected artifact file is present.
previous_run_success() {
  local run_dir="$1"
  local required_artifact
  [[ -d "$run_dir" ]] || return 1
  [[ -f "$run_dir/run-meta.json" ]] || return 1
  [[ -f "$run_dir/run-summary.json" ]] || return 1
  json_status_is_success "$run_dir/run-meta.json" || return 1
  json_status_is_success "$run_dir/run-summary.json" || return 1
  for required_artifact in report.json result.ccr input.cir stage-timing.json process-time.txt stdout.log stderr.log; do
    [[ -f "$run_dir/$required_artifact" ]] || return 1
  done
  return 0
}

# Move $1 into directory $2 (creating it) if $1 exists; no-op otherwise.
move_if_exists() {
  local source_path="$1"
  local target_dir="$2"
  if [[ -e "$source_path" ]]; then
    mkdir -p "$target_dir"
    mv "$source_path" "$target_dir/"
  fi
}

# Return 0 iff any persistent database state is present under $DB_DIR.
db_state_exists() {
  [[ -e "$DB_DIR/work-db" || -e "$DB_DIR/repo-bytes.db" ]]
}

# Quarantine DB/meta/tmp state into a timestamped directory under
# $INVALID_ROOT after a failed (or unknown) previous run, then recreate empty
# state directories. Records the quarantine paths in the INVALID_* globals
# consumed by write_run_meta.
isolate_state_after_failure() {
  local previous_run_id="$1"
  local timestamp
  timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
  local invalid_dir="$INVALID_ROOT/${previous_run_id}-${timestamp}"
  mkdir -p "$invalid_dir"
  move_if_exists "$DB_DIR" "$invalid_dir"
  move_if_exists "$META_DIR" "$invalid_dir"
  move_if_exists "$TMP_DIR" "$invalid_dir"
  mkdir -p "$DB_DIR" "$META_DIR" "$TMP_DIR"
  INVALID_DB_PATH="$invalid_dir/$(basename "$DB_DIR")"
  INVALID_STATE_PATH="$invalid_dir/$(basename "$META_DIR")"
  INVALID_TMP_PATH="$invalid_dir/$(basename "$TMP_DIR")"
}

# Write run-meta JSON to $1. Positional parameters 2..16 carry the run fields;
# empty strings become JSON null (see the nullable helpers in the embedded
# Python). Output is pretty-printed with sorted keys for stable diffs.
write_run_meta() {
  local output_path="$1"
  local status="$2"
  local run_index="$3"
  local run_id="$4"
  local sync_mode="$5"
  local snapshot_reason="$6"
  local previous_run_id="$7"
  local previous_run_success_value="$8"
  local started_at="$9"
  local completed_at="${10}"
  local invalid_db_path="${11}"
  local invalid_state_path="${12}"
  local invalid_tmp_path="${13}"
  local daemon_exit_code="${14}"
  local package_root="${15}"
  local env_file="${16}"
  python3 - "$output_path" "$status" "$run_index" "$run_id" "$sync_mode" "$snapshot_reason" \
    "$previous_run_id" "$previous_run_success_value" "$started_at" "$completed_at" \
    "$invalid_db_path" "$invalid_state_path" "$invalid_tmp_path" "$daemon_exit_code" \
    "$package_root" "$env_file" <<'PY'
import json
import sys


def nullable(value):
    return None if value == "" else value


def nullable_bool(value):
    if value == "":
        return None
    return value == "true"


def nullable_int(value):
    if value == "":
        return None
    return int(value)


(
    output_path,
    status,
    run_index,
    run_id,
    sync_mode,
    snapshot_reason,
    previous_run_id,
    previous_run_success,
    started_at,
    completed_at,
    invalid_db_path,
    invalid_state_path,
    invalid_tmp_path,
    daemon_exit_code,
    package_root,
    env_file,
) = sys.argv[1:]
data = {
    "status": status,
    "run_index": int(run_index),
    "run_id": run_id,
    "sync_mode": sync_mode,
    "snapshot_reason": nullable(snapshot_reason),
    "previous_run_id": nullable(previous_run_id),
    "previous_run_success": nullable_bool(previous_run_success),
    "started_at_rfc3339_utc": started_at,
    "completed_at_rfc3339_utc": nullable(completed_at),
    "invalid_db_path": nullable(invalid_db_path),
    "invalid_state_path": nullable(invalid_state_path),
    "invalid_tmp_path": nullable(invalid_tmp_path),
    "daemon_exit_code": nullable_int(daemon_exit_code),
    "package_root": package_root,
    "env_file": env_file,
}
with open(output_path, "w", encoding="utf-8") as handle:
    json.dump(data, handle, indent=2, sort_keys=True)
    handle.write("\n")
PY
}

# Print the "status" field of the summary JSON at $1, or "missing" when the
# file is absent/unreadable/unparseable.
summary_status() {
  local summary_path="$1"
  python3 - "$summary_path" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], "r", encoding="utf-8") as handle:
        print(json.load(handle).get("status", "missing"))
except Exception:
    print("missing")
PY
}

# Best-effort shutdown of competing relying-party software (rpki-client,
# routinator) so it does not skew soak measurements. All steps tolerate
# failure (no systemd, nothing running, insufficient privileges).
prepare_competing_rp_state() {
  if ! is_true "$DISABLE_COMPETING_RPS"; then
    return 0
  fi
  systemctl disable --now rpki-client.timer >/dev/null 2>&1 || true
  systemctl stop rpki-client.service >/dev/null 2>&1 || true
  pkill -x rpki-client >/dev/null 2>&1 || true
  pkill -x routinator >/dev/null 2>&1 || true
}

# Capture disk/memory/process/systemd-unit snapshots into $LOG_ROOT, tagged
# with suffix $1 (e.g. "before"/"after"). Every capture is best-effort.
write_machine_snapshot() {
  local suffix="$1"
  df -h > "$LOG_ROOT/df-${suffix}.txt" 2>&1 || true
  free -h > "$LOG_ROOT/free-${suffix}.txt" 2>&1 || true
  ps -eo pid,ppid,stat,pcpu,pmem,rss,args --sort=-pcpu \
    | grep -E 'rpki_daemon|/bin/rpki|rpki-client|routinator' \
    | grep -v grep > "$LOG_ROOT/process-${suffix}.txt" || true
  systemctl is-active rpki-client.timer > "$LOG_ROOT/rpki-client-timer-active-${suffix}.txt" 2>&1 || true
  systemctl is-enabled rpki-client.timer > "$LOG_ROOT/rpki-client-timer-enabled-${suffix}.txt" 2>&1 || true
}

# Populate the global CHILD_ARGS array with the argument vector passed to the
# rpki child process. NOTE(review): the literal {run_id} / {run_out} tokens
# look like placeholders expanded by rpki_daemon — confirm against the daemon.
build_child_args() {
  CHILD_ARGS=(
    --db "$DB_DIR/work-db"
    --repo-bytes-db "$DB_DIR/repo-bytes.db"
  )
  if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then
    CHILD_ARGS+=(--rsync-mirror-root "$RSYNC_MIRROR_ROOT")
  else
    CHILD_ARGS+=(--rsync-mirror-root "$TMP_DIR/rsync-mirror-{run_id}")
  fi
  CHILD_ARGS+=(
    --parallel-phase2-ready-batch-size 256
    --parallel-phase2-ready-batch-wall-time-budget-ms 100
    --parallel-phase2-result-drain-batch-size 2048
    --parallel-phase2-finalize-batch-size 256
    --parallel-phase2-finalize-batch-wall-time-budget-ms 100
  )
  local rir_name
  for rir_name in "${RIR_LIST[@]}"; do
    CHILD_ARGS+=(--tal-path "$(tal_file_for_rir "$rir_name")")
    CHILD_ARGS+=(--ta-path "$(ta_file_for_rir "$rir_name")")
  done
  CHILD_ARGS+=(
    --report-json "{run_out}/report.json"
  )
  if is_true "$OUTPUT_COMPACT_REPORT"; then
    CHILD_ARGS+=(--report-json-compact)
  fi
  CHILD_ARGS+=(
    --ccr-out "{run_out}/result.ccr"
    --cir-enable
    --cir-out "{run_out}/input.cir"
  )
  for rir_name in "${RIR_LIST[@]}"; do
    CHILD_ARGS+=(--cir-tal-uri "$(cir_tal_uri_for_rir "$rir_name")")
  done
  CHILD_ARGS+=(
    --vrps-csv-out "{run_out}/vrps.csv"
    --vaps-csv-out "{run_out}/vaps.csv"
    --compare-view-trust-anchor "$(compare_view_trust_anchor)"
  )
}

# Copy the newest inner daemon run directory (plus daemon status files) from
# daemon state root $1 into outer run directory $2. Every copy is optional.
copy_inner_run_outputs() {
  local daemon_state_root="$1"
  local run_dir="$2"
  local inner_run_dir
  inner_run_dir="$(find "$daemon_state_root/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -n 1 || true)"
  if [[ -n "$inner_run_dir" && -d "$inner_run_dir" ]]; then
    shopt -s dotglob nullglob
    cp -a "$inner_run_dir"/. "$run_dir"/
    shopt -u dotglob nullglob
  fi
  # Bug fix: the original ended with `[[ -f ... ]] && cp ...`; when the
  # optional file was missing the function returned non-zero and `set -e`
  # aborted the whole soak at the call site. Explicit if-blocks keep these
  # copies optional without poisoning the exit status.
  if [[ -f "$daemon_state_root/daemon-status.json" ]]; then
    cp "$daemon_state_root/daemon-status.json" "$run_dir/daemon-status.json"
  fi
  if [[ -f "$daemon_state_root/daemon-runs.jsonl" ]]; then
    cp "$daemon_state_root/daemon-runs.jsonl" "$run_dir/daemon-runs.jsonl"
  fi
}

# Delete the oldest outer run_NNNN directories so at most RETAIN_RUNS remain.
apply_outer_retention() {
  local dirs=()
  local run_dir
  shopt -s nullglob
  for run_dir in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do
    [[ -d "$run_dir" ]] && dirs+=("$run_dir")
  done
  shopt -u nullglob
  if (( ${#dirs[@]} <= RETAIN_RUNS )); then
    return 0
  fi
  mapfile -t dirs < <(printf '%s\n' "${dirs[@]}" | sort)
  local remove_count=$(( ${#dirs[@]} - RETAIN_RUNS ))
  local index
  for (( index = 0; index < remove_count; index++ )); do
    rm -rf "${dirs[$index]}"
  done
}

# Execute one soak round: write a "running" run-meta, launch rpki_daemon for a
# single inner run, harvest its outputs, then finalize run-meta as
# success/failed. Returns 0 iff the round succeeded.
# Args: $1 run index, $2 previous run id, $3 previous-success flag
#       ("true"/"false"/""), $4 sync mode, $5 snapshot reason.
run_one_round() {
  local run_index="$1"
  local run_id
  run_id="$(printf 'run_%04d' "$run_index")"
  local run_dir="$RUNS_ROOT/$run_id"
  local previous_run_id="$2"
  local previous_success_value="$3"
  local sync_mode="$4"
  local snapshot_reason="$5"
  local daemon_state_root="$TMP_DIR/daemon-$run_id"
  local started_at
  local completed_at
  local daemon_exit_code
  local summary_state
  mkdir -p "$run_dir" "$daemon_state_root"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  write_run_meta "$run_dir/run-meta.json" "running" "$run_index" "$run_id" "$sync_mode" \
    "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "" \
    "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "" "$PACKAGE_ROOT" "$ENV_FILE"
  build_child_args
  local daemon_args=(
    --state-root "$daemon_state_root"
    --rpki-bin "$RPKI_BIN"
    --interval-secs 0
    --max-runs 1
    --retain-runs "$RETAIN_RUNS"
    --work-db "$DB_DIR/work-db"
    --repo-bytes-db "$DB_DIR/repo-bytes.db"
  )
  if [[ -x "$DB_STATS_BIN" ]]; then
    daemon_args+=(--db-stats-bin "$DB_STATS_BIN")
    if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then
      daemon_args+=(--db-stats-exact-every "$DB_STATS_EXACT_EVERY")
    fi
  fi
  # Temporarily lift -e so a failing daemon yields an exit code instead of
  # aborting the soak; the status feeds into run-meta.
  set +e
  env \
    RPKI_PROGRESS_LOG="$RPKI_PROGRESS_LOG" \
    RPKI_PROGRESS_SLOW_SECS="$RPKI_PROGRESS_SLOW_SECS" \
    "$RPKI_DAEMON_BIN" "${daemon_args[@]}" -- "${CHILD_ARGS[@]}" \
    > "$run_dir/daemon-stdout.log" 2> "$run_dir/daemon-stderr.log"
  daemon_exit_code=$?
  set -e
  copy_inner_run_outputs "$daemon_state_root" "$run_dir"
  completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  summary_state="$(summary_status "$run_dir/run-summary.json")"
  local final_status="failed"
  if [[ "$daemon_exit_code" -eq 0 && "$summary_state" == "success" ]]; then
    final_status="success"
  fi
  write_run_meta "$run_dir/run-meta.json" "$final_status" "$run_index" "$run_id" "$sync_mode" \
    "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "$completed_at" \
    "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "$daemon_exit_code" "$PACKAGE_ROOT" "$ENV_FILE"
  printf '%s\n' "$run_id" > "$META_DIR/last-run-id"
  apply_outer_retention
  [[ "$final_status" == "success" ]]
}

# Entry point: validate configuration and fixtures, prepare the machine, then
# run MAX_RUNS rounds, deciding per round between delta sync (previous run
# succeeded and DB present) and snapshot sync (first run, missing DB, or
# quarantined state after a failure). Exits 0 iff every round succeeded.
main() {
  if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
    usage
    exit 0
  fi
  require_command python3
  require_command date
  require_command find
  validate_positive_int "MAX_RUNS" "$MAX_RUNS"
  validate_non_negative_int "INTERVAL_SECS" "$INTERVAL_SECS"
  validate_positive_int "RETAIN_RUNS" "$RETAIN_RUNS"
  if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then
    validate_positive_int "DB_STATS_EXACT_EVERY" "$DB_STATS_EXACT_EVERY"
  fi
  parse_rirs
  [[ -x "$RPKI_BIN" ]] || die "missing executable: $RPKI_BIN"
  [[ -x "$RPKI_DAEMON_BIN" ]] || die "missing executable: $RPKI_DAEMON_BIN"
  local rir_name
  for rir_name in "${RIR_LIST[@]}"; do
    [[ -f "$(tal_file_for_rir "$rir_name")" ]] || die "missing TAL fixture for $rir_name"
    [[ -f "$(ta_file_for_rir "$rir_name")" ]] || die "missing TA fixture for $rir_name"
  done
  mkdir -p "$RUNS_ROOT" "$LOG_ROOT" "$DB_DIR" "$META_DIR" "$TMP_DIR" "$INVALID_ROOT"
  if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then
    mkdir -p "$RSYNC_MIRROR_ROOT"
  fi
  prepare_competing_rp_state
  write_machine_snapshot "before"
  local max_index
  local next_index
  max_index="$(max_existing_run_index)"
  next_index=$((max_index + 1))
  local stop_index=$((max_index + MAX_RUNS))
  local any_failed=0
  while (( next_index <= stop_index )); do
    INVALID_DB_PATH=""
    INVALID_STATE_PATH=""
    INVALID_TMP_PATH=""
    local previous_run_id=""
    local previous_success_value=""
    local sync_mode="snapshot"
    local snapshot_reason=""
    if (( next_index > 1 )); then
      previous_run_id="$(printf 'run_%04d' $((next_index - 1)))"
      if previous_run_success "$RUNS_ROOT/$previous_run_id"; then
        previous_success_value="true"
        if [[ -e "$DB_DIR/work-db" ]]; then
          sync_mode="delta"
        else
          sync_mode="snapshot"
          snapshot_reason="missing_db"
        fi
      else
        previous_success_value="false"
        if is_true "$FAILURE_SNAPSHOT_RESET"; then
          isolate_state_after_failure "$previous_run_id"
          sync_mode="snapshot"
          snapshot_reason="previous_run_failed"
        else
          die "previous run is not successful: $previous_run_id"
        fi
      fi
    else
      # First-ever run: any leftover DB state has no successful run backing
      # it, so quarantine it rather than trust it.
      sync_mode="snapshot"
      if db_state_exists; then
        isolate_state_after_failure "no_previous_run"
        snapshot_reason="no_successful_previous_run"
      else
        snapshot_reason="first_run"
      fi
    fi
    echo "starting run $(printf 'run_%04d' "$next_index") sync_mode=$sync_mode"
    if run_one_round "$next_index" "$previous_run_id" "$previous_success_value" "$sync_mode" "$snapshot_reason"; then
      echo "completed run $(printf 'run_%04d' "$next_index") status=success"
    else
      echo "completed run $(printf 'run_%04d' "$next_index") status=failed" >&2
      any_failed=1
    fi
    if (( next_index < stop_index && INTERVAL_SECS > 0 )); then
      sleep "$INTERVAL_SECS"
    fi
    next_index=$((next_index + 1))
  done
  write_machine_snapshot "after"
  exit "$any_failed"
}

main "$@"