#!/usr/bin/env bash
#
# run_soak.sh — drives repeated RPKI validation rounds ("soak" runs) via the
# rpki_daemon binary, archiving per-run artifacts under $RUN_ROOT/runs and
# quarantining on-disk state after failed runs so each round starts clean.
#
# Configuration comes from environment variables, optionally pre-loaded from a
# .env file in the package root (override with ENV_FILE=/path/to/.env).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"

if [[ -f "$ENV_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$ENV_FILE"
fi

# Tunables (all overridable via the environment / .env file).
MAX_RUNS="${MAX_RUNS:-3}"
INTERVAL_SECS="${INTERVAL_SECS:-0}"
RIRS="${RIRS:-afrinic,apnic,arin,lacnic,ripe}"
RUN_ROOT="${RUN_ROOT:-$PACKAGE_ROOT}"
RETAIN_RUNS="${RETAIN_RUNS:-10}"
OUTPUT_COMPACT_REPORT="${OUTPUT_COMPACT_REPORT:-1}"
ALLOW_RSYNC_MIRROR_REUSE="${ALLOW_RSYNC_MIRROR_REUSE:-1}"
FAILURE_SNAPSHOT_RESET="${FAILURE_SNAPSHOT_RESET:-1}"
DB_STATS_EXACT_EVERY="${DB_STATS_EXACT_EVERY:-3}"
RPKI_PROGRESS_LOG="${RPKI_PROGRESS_LOG:-1}"
RPKI_PROGRESS_SLOW_SECS="${RPKI_PROGRESS_SLOW_SECS:-10}"
DISABLE_COMPETING_RPS="${DISABLE_COMPETING_RPS:-1}"

# Derived layout.
BIN_DIR="${BIN_DIR:-$PACKAGE_ROOT/bin}"
FIXTURE_DIR="${FIXTURE_DIR:-$PACKAGE_ROOT/fixtures}"
STATE_ROOT="$RUN_ROOT/state"
RUNS_ROOT="$RUN_ROOT/runs"
LOG_ROOT="$RUN_ROOT/logs"
DB_DIR="${DB_DIR:-$STATE_ROOT/db}"
META_DIR="${META_DIR:-$STATE_ROOT/meta}"
TMP_DIR="${TMP_DIR:-$RUN_ROOT/tmp}"
RSYNC_MIRROR_ROOT="${RSYNC_MIRROR_ROOT:-$STATE_ROOT/rsync-mirror}"
INVALID_ROOT="$STATE_ROOT/invalid"
RPKI_BIN="$BIN_DIR/rpki"
RPKI_DAEMON_BIN="$BIN_DIR/rpki_daemon"
DB_STATS_BIN="$BIN_DIR/db_stats"

# Print usage text (the here-doc body is user-facing output; left verbatim).
usage() {
  cat <<'USAGE'
Usage: ./run_soak.sh
配置来自 package 根目录下的 .env;也可以用 ENV_FILE=/path/to/.env 覆盖。
USAGE
}

# Print an error to stderr and exit with status 2.
die() {
  echo "error: $*" >&2
  exit 2
}

# Return 0 iff $1 is a recognized truthy token.
is_true() {
  case "${1:-}" in
    1|true|TRUE|yes|YES|on|ON) return 0 ;;
    *) return 1 ;;
  esac
}

# Abort unless the named command is on PATH.
require_command() {
  command -v "$1" >/dev/null 2>&1 || die "missing required command: $1"
}

# Abort unless $2 is a strictly positive integer; $1 names it in the message.
validate_positive_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
  [[ "$value" != "0" ]] || die "$name must be > 0"
}

# Abort unless $2 is a non-negative integer; $1 names it in the message.
validate_non_negative_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be an integer: $value"
}

# Trim surrounding whitespace from $1 and lowercase it; prints the result.
normalize_token() {
  local token="$1"
  token="${token#"${token%%[![:space:]]*}"}"
  token="${token%"${token##*[![:space:]]}"}"
  printf '%s' "$token" | tr '[:upper:]' '[:lower:]'
}

# Parse the comma-separated $RIRS into the global RIR_LIST array, validating
# each entry against the five known RIR names. Dies on an unknown entry or an
# effectively empty list.
parse_rirs() {
  RIR_LIST=()
  local raw_token
  local normalized
  local -a raw_rirs
  IFS=',' read -r -a raw_rirs <<< "$RIRS"
  for raw_token in "${raw_rirs[@]}"; do
    normalized="$(normalize_token "$raw_token")"
    [[ -n "$normalized" ]] || continue
    case "$normalized" in
      afrinic|apnic|arin|lacnic|ripe)
        RIR_LIST+=("$normalized")
        ;;
      *)
        die "invalid RIRS entry: $raw_token; allowed: afrinic,apnic,arin,lacnic,ripe"
        ;;
    esac
  done
  [[ "${#RIR_LIST[@]}" -gt 0 ]] || die "RIRS must contain at least one RIR"
}

# Print the TAL fixture path for RIR $1.
tal_file_for_rir() {
  case "$1" in
    afrinic) printf '%s' "$FIXTURE_DIR/tal/afrinic.tal" ;;
    apnic) printf '%s' "$FIXTURE_DIR/tal/apnic-rfc7730-https.tal" ;;
    arin) printf '%s' "$FIXTURE_DIR/tal/arin.tal" ;;
    lacnic) printf '%s' "$FIXTURE_DIR/tal/lacnic.tal" ;;
    ripe) printf '%s' "$FIXTURE_DIR/tal/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the trust-anchor certificate fixture path for RIR $1.
ta_file_for_rir() {
  case "$1" in
    afrinic) printf '%s' "$FIXTURE_DIR/ta/afrinic-ta.cer" ;;
    apnic) printf '%s' "$FIXTURE_DIR/ta/apnic-ta.cer" ;;
    arin) printf '%s' "$FIXTURE_DIR/ta/arin-ta.cer" ;;
    lacnic) printf '%s' "$FIXTURE_DIR/ta/lacnic-ta.cer" ;;
    ripe) printf '%s' "$FIXTURE_DIR/ta/ripe-ncc-ta.cer" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the canonical public TAL URI recorded in CIR output for RIR $1.
cir_tal_uri_for_rir() {
  case "$1" in
    afrinic) printf '%s' "https://rpki.afrinic.net/tal/afrinic.tal" ;;
    apnic) printf '%s' "https://rpki.apnic.net/tal/apnic-rfc7730-https.tal" ;;
    arin) printf '%s' "https://www.arin.net/resources/manage/rpki/arin.tal" ;;
    lacnic) printf '%s' "https://www.lacnic.net/innovaportal/file/4983/1/lacnic.tal" ;;
    ripe) printf '%s' "https://tal.rpki.ripe.net/ripe-ncc.tal" ;;
    *) die "unknown RIR: $1" ;;
  esac
}

# Print the trust-anchor label for the compare view: the single RIR name when
# exactly one is configured, otherwise the aggregate label "all5".
compare_view_trust_anchor() {
  if [[ "${#RIR_LIST[@]}" -eq 1 ]]; then
    printf '%s' "${RIR_LIST[0]}"
  else
    printf '%s' "all5"
  fi
}

# Print the highest numeric index among existing $RUNS_ROOT/run_NNNN
# directories, or 0 when none exist. The 10# prefix forces base-10 so indices
# with leading zeros (e.g. 0008) are not parsed as octal.
max_existing_run_index() {
  local max_index=0
  local run_dir
  local run_name
  local numeric_part
  shopt -s nullglob
  for run_dir in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do
    [[ -d "$run_dir" ]] || continue
    run_name="$(basename "$run_dir")"
    numeric_part="${run_name#run_}"
    if (( 10#$numeric_part > max_index )); then
      max_index=$((10#$numeric_part))
    fi
  done
  shopt -u nullglob
  printf '%s' "$max_index"
}

# Return 0 iff the JSON file at $1 parses and has top-level "status":"success".
json_status_is_success() {
  local json_path="$1"
  python3 - "$json_path" <<'PY'
import json
import sys

path = sys.argv[1]
try:
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
except Exception:
    sys.exit(1)
sys.exit(0 if data.get("status") == "success" else 1)
PY
}

# Return 0 iff run directory $1 represents a fully successful run: both status
# JSON files report success and every expected artifact file is present.
previous_run_success() {
  local run_dir="$1"
  local required_artifact
  [[ -d "$run_dir" ]] || return 1
  [[ -f "$run_dir/run-meta.json" ]] || return 1
  [[ -f "$run_dir/run-summary.json" ]] || return 1
  json_status_is_success "$run_dir/run-meta.json" || return 1
  json_status_is_success "$run_dir/run-summary.json" || return 1
  for required_artifact in report.json result.ccr input.cir stage-timing.json process-time.txt stdout.log stderr.log; do
    [[ -f "$run_dir/$required_artifact" ]] || return 1
  done
  return 0
}

# Move $1 into directory $2 (creating it) if $1 exists; no-op otherwise.
move_if_exists() {
  local source_path="$1"
  local target_dir="$2"
  if [[ -e "$source_path" ]]; then
    mkdir -p "$target_dir"
    mv "$source_path" "$target_dir/"
  fi
}

# Return 0 iff any persistent database state is present under $DB_DIR.
db_state_exists() {
  [[ -e "$DB_DIR/work-db" || -e "$DB_DIR/repo-bytes.db" ]]
}

# Quarantine DB/meta/tmp state into a timestamped directory under
# $INVALID_ROOT after a failed (or unknown) previous run, then recreate empty
# state directories. Records the quarantine paths in the INVALID_* globals
# consumed by write_run_meta.
isolate_state_after_failure() {
  local previous_run_id="$1"
  local timestamp
  timestamp="$(date -u +%Y%m%dT%H%M%SZ)"
  local invalid_dir="$INVALID_ROOT/${previous_run_id}-${timestamp}"
  mkdir -p "$invalid_dir"
  move_if_exists "$DB_DIR" "$invalid_dir"
  move_if_exists "$META_DIR" "$invalid_dir"
  move_if_exists "$TMP_DIR" "$invalid_dir"
  mkdir -p "$DB_DIR" "$META_DIR" "$TMP_DIR"
  INVALID_DB_PATH="$invalid_dir/$(basename "$DB_DIR")"
  INVALID_STATE_PATH="$invalid_dir/$(basename "$META_DIR")"
  INVALID_TMP_PATH="$invalid_dir/$(basename "$TMP_DIR")"
}

# Write run-meta JSON to $1. Positional parameters 2..16 carry the run fields;
# empty strings become JSON null (see the nullable helpers in the embedded
# Python). Output is pretty-printed with sorted keys for stable diffs.
write_run_meta() {
  local output_path="$1"
  local status="$2"
  local run_index="$3"
  local run_id="$4"
  local sync_mode="$5"
  local snapshot_reason="$6"
  local previous_run_id="$7"
  local previous_run_success_value="$8"
  local started_at="$9"
  local completed_at="${10}"
  local invalid_db_path="${11}"
  local invalid_state_path="${12}"
  local invalid_tmp_path="${13}"
  local daemon_exit_code="${14}"
  local package_root="${15}"
  local env_file="${16}"
  python3 - "$output_path" "$status" "$run_index" "$run_id" "$sync_mode" "$snapshot_reason" \
    "$previous_run_id" "$previous_run_success_value" "$started_at" "$completed_at" \
    "$invalid_db_path" "$invalid_state_path" "$invalid_tmp_path" "$daemon_exit_code" \
    "$package_root" "$env_file" <<'PY'
import json
import sys


def nullable(value):
    return None if value == "" else value


def nullable_bool(value):
    if value == "":
        return None
    return value == "true"


def nullable_int(value):
    if value == "":
        return None
    return int(value)


(
    output_path,
    status,
    run_index,
    run_id,
    sync_mode,
    snapshot_reason,
    previous_run_id,
    previous_run_success,
    started_at,
    completed_at,
    invalid_db_path,
    invalid_state_path,
    invalid_tmp_path,
    daemon_exit_code,
    package_root,
    env_file,
) = sys.argv[1:]
data = {
    "status": status,
    "run_index": int(run_index),
    "run_id": run_id,
    "sync_mode": sync_mode,
    "snapshot_reason": nullable(snapshot_reason),
    "previous_run_id": nullable(previous_run_id),
    "previous_run_success": nullable_bool(previous_run_success),
    "started_at_rfc3339_utc": started_at,
    "completed_at_rfc3339_utc": nullable(completed_at),
    "invalid_db_path": nullable(invalid_db_path),
    "invalid_state_path": nullable(invalid_state_path),
    "invalid_tmp_path": nullable(invalid_tmp_path),
    "daemon_exit_code": nullable_int(daemon_exit_code),
    "package_root": package_root,
    "env_file": env_file,
}
with open(output_path, "w", encoding="utf-8") as handle:
    json.dump(data, handle, indent=2, sort_keys=True)
    handle.write("\n")
PY
}

# Print the "status" field of the summary JSON at $1, or "missing" when the
# file is absent/unreadable/unparseable.
summary_status() {
  local summary_path="$1"
  python3 - "$summary_path" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], "r", encoding="utf-8") as handle:
        print(json.load(handle).get("status", "missing"))
except Exception:
    print("missing")
PY
}

# Best-effort shutdown of competing relying-party software (rpki-client,
# routinator) so it does not skew soak measurements. All steps tolerate
# failure (no systemd, nothing running, insufficient privileges).
prepare_competing_rp_state() {
  if ! is_true "$DISABLE_COMPETING_RPS"; then
    return 0
  fi
  systemctl disable --now rpki-client.timer >/dev/null 2>&1 || true
  systemctl stop rpki-client.service >/dev/null 2>&1 || true
  pkill -x rpki-client >/dev/null 2>&1 || true
  pkill -x routinator >/dev/null 2>&1 || true
}

# Capture disk/memory/process/systemd-unit snapshots into $LOG_ROOT, tagged
# with suffix $1 (e.g. "before"/"after"). Every capture is best-effort.
write_machine_snapshot() {
  local suffix="$1"
  df -h > "$LOG_ROOT/df-${suffix}.txt" 2>&1 || true
  free -h > "$LOG_ROOT/free-${suffix}.txt" 2>&1 || true
  ps -eo pid,ppid,stat,pcpu,pmem,rss,args --sort=-pcpu \
    | grep -E 'rpki_daemon|/bin/rpki|rpki-client|routinator' \
    | grep -v grep > "$LOG_ROOT/process-${suffix}.txt" || true
  systemctl is-active rpki-client.timer > "$LOG_ROOT/rpki-client-timer-active-${suffix}.txt" 2>&1 || true
  systemctl is-enabled rpki-client.timer > "$LOG_ROOT/rpki-client-timer-enabled-${suffix}.txt" 2>&1 || true
}

# Populate the global CHILD_ARGS array with the argument vector passed to the
# rpki child process. NOTE(review): the literal {run_id} / {run_out} tokens
# look like placeholders expanded by rpki_daemon — confirm against the daemon.
build_child_args() {
  CHILD_ARGS=(
    --db "$DB_DIR/work-db"
    --repo-bytes-db "$DB_DIR/repo-bytes.db"
  )
  if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then
    CHILD_ARGS+=(--rsync-mirror-root "$RSYNC_MIRROR_ROOT")
  else
    CHILD_ARGS+=(--rsync-mirror-root "$TMP_DIR/rsync-mirror-{run_id}")
  fi
  CHILD_ARGS+=(
    --parallel-phase2-ready-batch-size 256
    --parallel-phase2-ready-batch-wall-time-budget-ms 100
    --parallel-phase2-result-drain-batch-size 2048
    --parallel-phase2-finalize-batch-size 256
    --parallel-phase2-finalize-batch-wall-time-budget-ms 100
  )
  local rir_name
  for rir_name in "${RIR_LIST[@]}"; do
    CHILD_ARGS+=(--tal-path "$(tal_file_for_rir "$rir_name")")
    CHILD_ARGS+=(--ta-path "$(ta_file_for_rir "$rir_name")")
  done
  CHILD_ARGS+=(
    --report-json "{run_out}/report.json"
  )
  if is_true "$OUTPUT_COMPACT_REPORT"; then
    CHILD_ARGS+=(--report-json-compact)
  fi
  CHILD_ARGS+=(
    --ccr-out "{run_out}/result.ccr"
    --cir-enable
    --cir-out "{run_out}/input.cir"
  )
  for rir_name in "${RIR_LIST[@]}"; do
    CHILD_ARGS+=(--cir-tal-uri "$(cir_tal_uri_for_rir "$rir_name")")
  done
  CHILD_ARGS+=(
    --vrps-csv-out "{run_out}/vrps.csv"
    --vaps-csv-out "{run_out}/vaps.csv"
    --compare-view-trust-anchor "$(compare_view_trust_anchor)"
  )
}

# Copy the newest inner daemon run directory (plus daemon status files) from
# daemon state root $1 into outer run directory $2. Every copy is optional.
copy_inner_run_outputs() {
  local daemon_state_root="$1"
  local run_dir="$2"
  local inner_run_dir
  inner_run_dir="$(find "$daemon_state_root/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -n 1 || true)"
  if [[ -n "$inner_run_dir" && -d "$inner_run_dir" ]]; then
    shopt -s dotglob nullglob
    cp -a "$inner_run_dir"/. "$run_dir"/
    shopt -u dotglob nullglob
  fi
  # Bug fix: the original ended with `[[ -f ... ]] && cp ...`; when the
  # optional file was missing the function returned non-zero and `set -e`
  # aborted the whole soak at the call site. Explicit if-blocks keep these
  # copies optional without poisoning the exit status.
  if [[ -f "$daemon_state_root/daemon-status.json" ]]; then
    cp "$daemon_state_root/daemon-status.json" "$run_dir/daemon-status.json"
  fi
  if [[ -f "$daemon_state_root/daemon-runs.jsonl" ]]; then
    cp "$daemon_state_root/daemon-runs.jsonl" "$run_dir/daemon-runs.jsonl"
  fi
}

# Delete the oldest outer run_NNNN directories so at most RETAIN_RUNS remain.
apply_outer_retention() {
  local dirs=()
  local run_dir
  shopt -s nullglob
  for run_dir in "$RUNS_ROOT"/run_[0-9][0-9][0-9][0-9]; do
    [[ -d "$run_dir" ]] && dirs+=("$run_dir")
  done
  shopt -u nullglob
  if (( ${#dirs[@]} <= RETAIN_RUNS )); then
    return 0
  fi
  mapfile -t dirs < <(printf '%s\n' "${dirs[@]}" | sort)
  local remove_count=$(( ${#dirs[@]} - RETAIN_RUNS ))
  local index
  for (( index = 0; index < remove_count; index++ )); do
    rm -rf "${dirs[$index]}"
  done
}

# Execute one soak round: write a "running" run-meta, launch rpki_daemon for a
# single inner run, harvest its outputs, then finalize run-meta as
# success/failed. Returns 0 iff the round succeeded.
# Args: $1 run index, $2 previous run id, $3 previous-success flag
#       ("true"/"false"/""), $4 sync mode, $5 snapshot reason.
run_one_round() {
  local run_index="$1"
  local run_id
  run_id="$(printf 'run_%04d' "$run_index")"
  local run_dir="$RUNS_ROOT/$run_id"
  local previous_run_id="$2"
  local previous_success_value="$3"
  local sync_mode="$4"
  local snapshot_reason="$5"
  local daemon_state_root="$TMP_DIR/daemon-$run_id"
  local started_at
  local completed_at
  local daemon_exit_code
  local summary_state
  mkdir -p "$run_dir" "$daemon_state_root"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  write_run_meta "$run_dir/run-meta.json" "running" "$run_index" "$run_id" "$sync_mode" \
    "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "" \
    "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "" "$PACKAGE_ROOT" "$ENV_FILE"
  build_child_args
  local daemon_args=(
    --state-root "$daemon_state_root"
    --rpki-bin "$RPKI_BIN"
    --interval-secs 0
    --max-runs 1
    --retain-runs "$RETAIN_RUNS"
    --work-db "$DB_DIR/work-db"
    --repo-bytes-db "$DB_DIR/repo-bytes.db"
  )
  if [[ -x "$DB_STATS_BIN" ]]; then
    daemon_args+=(--db-stats-bin "$DB_STATS_BIN")
    if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then
      daemon_args+=(--db-stats-exact-every "$DB_STATS_EXACT_EVERY")
    fi
  fi
  # Temporarily lift -e so a failing daemon yields an exit code instead of
  # aborting the soak; the status feeds into run-meta.
  set +e
  env \
    RPKI_PROGRESS_LOG="$RPKI_PROGRESS_LOG" \
    RPKI_PROGRESS_SLOW_SECS="$RPKI_PROGRESS_SLOW_SECS" \
    "$RPKI_DAEMON_BIN" "${daemon_args[@]}" -- "${CHILD_ARGS[@]}" \
    > "$run_dir/daemon-stdout.log" 2> "$run_dir/daemon-stderr.log"
  daemon_exit_code=$?
  set -e
  copy_inner_run_outputs "$daemon_state_root" "$run_dir"
  completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  summary_state="$(summary_status "$run_dir/run-summary.json")"
  local final_status="failed"
  if [[ "$daemon_exit_code" -eq 0 && "$summary_state" == "success" ]]; then
    final_status="success"
  fi
  write_run_meta "$run_dir/run-meta.json" "$final_status" "$run_index" "$run_id" "$sync_mode" \
    "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "$completed_at" \
    "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "$daemon_exit_code" "$PACKAGE_ROOT" "$ENV_FILE"
  printf '%s\n' "$run_id" > "$META_DIR/last-run-id"
  apply_outer_retention
  [[ "$final_status" == "success" ]]
}

# Entry point: validate configuration and fixtures, prepare the machine, then
# run MAX_RUNS rounds, deciding per round between delta sync (previous run
# succeeded and DB present) and snapshot sync (first run, missing DB, or
# quarantined state after a failure). Exits 0 iff every round succeeded.
main() {
  if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
    usage
    exit 0
  fi
  require_command python3
  require_command date
  require_command find
  validate_positive_int "MAX_RUNS" "$MAX_RUNS"
  validate_non_negative_int "INTERVAL_SECS" "$INTERVAL_SECS"
  validate_positive_int "RETAIN_RUNS" "$RETAIN_RUNS"
  if [[ -n "${DB_STATS_EXACT_EVERY:-}" && "$DB_STATS_EXACT_EVERY" != "0" ]]; then
    validate_positive_int "DB_STATS_EXACT_EVERY" "$DB_STATS_EXACT_EVERY"
  fi
  parse_rirs
  [[ -x "$RPKI_BIN" ]] || die "missing executable: $RPKI_BIN"
  [[ -x "$RPKI_DAEMON_BIN" ]] || die "missing executable: $RPKI_DAEMON_BIN"
  local rir_name
  for rir_name in "${RIR_LIST[@]}"; do
    [[ -f "$(tal_file_for_rir "$rir_name")" ]] || die "missing TAL fixture for $rir_name"
    [[ -f "$(ta_file_for_rir "$rir_name")" ]] || die "missing TA fixture for $rir_name"
  done
  mkdir -p "$RUNS_ROOT" "$LOG_ROOT" "$DB_DIR" "$META_DIR" "$TMP_DIR" "$INVALID_ROOT"
  if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then
    mkdir -p "$RSYNC_MIRROR_ROOT"
  fi
  prepare_competing_rp_state
  write_machine_snapshot "before"
  local max_index
  local next_index
  max_index="$(max_existing_run_index)"
  next_index=$((max_index + 1))
  local stop_index=$((max_index + MAX_RUNS))
  local any_failed=0
  while (( next_index <= stop_index )); do
    INVALID_DB_PATH=""
    INVALID_STATE_PATH=""
    INVALID_TMP_PATH=""
    local previous_run_id=""
    local previous_success_value=""
    local sync_mode="snapshot"
    local snapshot_reason=""
    if (( next_index > 1 )); then
      previous_run_id="$(printf 'run_%04d' $((next_index - 1)))"
      if previous_run_success "$RUNS_ROOT/$previous_run_id"; then
        previous_success_value="true"
        if [[ -e "$DB_DIR/work-db" ]]; then
          sync_mode="delta"
        else
          sync_mode="snapshot"
          snapshot_reason="missing_db"
        fi
      else
        previous_success_value="false"
        if is_true "$FAILURE_SNAPSHOT_RESET"; then
          isolate_state_after_failure "$previous_run_id"
          sync_mode="snapshot"
          snapshot_reason="previous_run_failed"
        else
          die "previous run is not successful: $previous_run_id"
        fi
      fi
    else
      # First-ever run: any leftover DB state has no successful run backing
      # it, so quarantine it rather than trust it.
      sync_mode="snapshot"
      if db_state_exists; then
        isolate_state_after_failure "no_previous_run"
        snapshot_reason="no_successful_previous_run"
      else
        snapshot_reason="first_run"
      fi
    fi
    echo "starting run $(printf 'run_%04d' "$next_index") sync_mode=$sync_mode"
    if run_one_round "$next_index" "$previous_run_id" "$previous_success_value" "$sync_mode" "$snapshot_reason"; then
      echo "completed run $(printf 'run_%04d' "$next_index") status=success"
    else
      echo "completed run $(printf 'run_%04d' "$next_index") status=failed" >&2
      any_failed=1
    fi
    if (( next_index < stop_index && INTERVAL_SECS > 0 )); then
      sleep "$INTERVAL_SECS"
    fi
    next_index=$((next_index + 1))
  done
  write_machine_snapshot "after"
  exit "$any_failed"
}

main "$@"