#!/usr/bin/env bash
#
# Soak-run wrapper.
#
# Launches the soak script in the background, optionally starts the
# rpki_artifact_metrics service and a docker compose monitor stack, and
# invokes the hourly report script every HOURLY_REPORT_INTERVAL_SECS until
# the soak process exits. A final report is produced for the last partial
# window. The script's exit status is the status returned by `wait` for the
# soak process.
#
# All settings below may be overridden through the environment or through
# ENV_FILE (default: $PACKAGE_ROOT/.env), which is sourced when it exists.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"

if [[ -f "$ENV_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$ENV_FILE"
fi

# --- Paths -----------------------------------------------------------------
RUN_ROOT="${RUN_ROOT:-$PACKAGE_ROOT}"
BIN_DIR="${BIN_DIR:-$PACKAGE_ROOT/bin}"
LOG_ROOT="${LOG_ROOT:-$RUN_ROOT/logs}"
REPORTS_DIR="${HOURLY_REPORTS_DIR:-$RUN_ROOT/hourly_reports}"
INCIDENT_DIR="${INCIDENT_DIR:-$RUN_ROOT/incident_runs}"
MONITOR_DIR="${MONITOR_DIR:-$PACKAGE_ROOT/monitor}"
SOAK_SCRIPT="${SOAK_SCRIPT:-$PACKAGE_ROOT/run_soak.sh}"
HOURLY_REPORT_SCRIPT="${HOURLY_REPORT_SCRIPT:-$PACKAGE_ROOT/scripts/soak/hourly_soak_report.py}"

# --- Soak / reporting schedule --------------------------------------------
# SOAK_DURATION_SECS=0 means no deadline is computed by this wrapper.
SOAK_DURATION_SECS="${SOAK_DURATION_SECS:-0}"
HOURLY_REPORT_INTERVAL_SECS="${HOURLY_REPORT_INTERVAL_SECS:-3600}"
SOAK_RETAIN_RUNS="${SOAK_RETAIN_RUNS:-100}"
CLEAN_TMP_AFTER_RUN="${CLEAN_TMP_AFTER_RUN:-1}"

# --- Monitoring / metrics --------------------------------------------------
START_MONITOR_STACK="${START_MONITOR_STACK:-1}"
STOP_MONITOR_STACK_ON_EXIT="${STOP_MONITOR_STACK_ON_EXIT:-0}"
START_METRICS_SERVICE="${START_METRICS_SERVICE:-1}"
STOP_METRICS_SERVICE_ON_EXIT="${STOP_METRICS_SERVICE_ON_EXIT:-0}"
METRICS_LISTEN="${METRICS_LISTEN:-0.0.0.0:9556}"
METRICS_POLL_SECS="${METRICS_POLL_SECS:-5}"
METRICS_INSTANCE="${METRICS_INSTANCE:-remote231-24h}"
PROMETHEUS_RETENTION="${PROMETHEUS_RETENTION:-7d}"

# --- Feishu notification ---------------------------------------------------
SEND_FEISHU="${SEND_FEISHU:-1}"
FEISHU_DRY_RUN="${FEISHU_DRY_RUN:-0}"
FEISHU_WEBHOOK_SCRIPT="${FEISHU_WEBHOOK_SCRIPT:-}"
FEISHU_WEBHOOK_URL="${FEISHU_WEBHOOK_URL:-}"
export FEISHU_WEBHOOK_URL

# --- Thresholds forwarded verbatim to the hourly report script -------------
WALL_WARN_SECS="${WALL_WARN_SECS:-140}"
VRP_MIN="${VRP_MIN:-900000}"
VAPS_MIN="${VAPS_MIN:-1000}"
PP_MIN="${PP_MIN:-50000}"
WARNING_MAX="${WARNING_MAX:--1}"

# --- Runtime state shared between main and cleanup -------------------------
SOAK_PID=""
METRICS_PID=""
REPORTER_STOP=0

# Print an error message to stderr and exit with status 2.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 2
}

# Return 0 when $1 is one of the accepted truthy spellings, 1 otherwise.
is_true() {
  case "${1:-}" in
    1 | true | TRUE | yes | YES | on | ON) return 0 ;;
    *) return 1 ;;
  esac
}

# Abort via die unless $2 consists solely of decimal digits.
#   $1 - variable name used in the error message
#   $2 - value to check
# Leading zeros are accepted here; callers that feed the value to shell
# arithmetic must normalize it to base 10 first (main does this).
validate_non_negative_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be a non-negative integer: $value"
}

# Trap handler registered for EXIT, INT and TERM.
# Sets REPORTER_STOP so the polling loop in main breaks, terminates the soak
# process if it is still alive, and stops the metrics service / monitor stack
# only when the matching STOP_*_ON_EXIT switch is truthy.
cleanup() {
  REPORTER_STOP=1

  if [[ -n "$SOAK_PID" ]] && kill -0 "$SOAK_PID" >/dev/null 2>&1; then
    kill "$SOAK_PID" >/dev/null 2>&1 || true
    wait "$SOAK_PID" >/dev/null 2>&1 || true
  fi

  if is_true "$STOP_METRICS_SERVICE_ON_EXIT" \
    && [[ -n "$METRICS_PID" ]] \
    && kill -0 "$METRICS_PID" >/dev/null 2>&1; then
    kill "$METRICS_PID" >/dev/null 2>&1 || true
    wait "$METRICS_PID" >/dev/null 2>&1 || true
  fi

  if is_true "$START_MONITOR_STACK" \
    && is_true "$STOP_MONITOR_STACK_ON_EXIT" \
    && [[ -f "$MONITOR_DIR/docker-compose.yml" ]]; then
    (cd "$MONITOR_DIR" && PROMETHEUS_RETENTION="$PROMETHEUS_RETENTION" docker compose down) \
      >/dev/null 2>&1 || true
  fi
}
trap cleanup EXIT INT TERM

# Run the hourly report script for one time window.
#   $1 - window start timestamp
#   $2 - window end timestamp
# Output is appended to hourly-reporter.stdout / .stderr under LOG_ROOT.
# A failing report is deliberately ignored so it cannot abort the soak run.
run_hourly_report() {
  local window_start="$1"
  local window_end="$2"
  local feishu_args=()

  if is_true "$SEND_FEISHU"; then
    feishu_args+=(--send-feishu)
    if is_true "$FEISHU_DRY_RUN"; then
      feishu_args+=(--dry-run-feishu)
    fi
    if [[ -n "$FEISHU_WEBHOOK_SCRIPT" ]]; then
      feishu_args+=(--feishu-script "$FEISHU_WEBHOOK_SCRIPT")
    fi
  fi

  # ${arr[@]+"${arr[@]}"}: bash releases before 4.4 treat the expansion of an
  # empty array as an unbound variable under `set -u`; this form expands to
  # nothing in that case instead of aborting the script.
  python3 "$HOURLY_REPORT_SCRIPT" \
    --run-root "$RUN_ROOT" \
    --reports-dir "$REPORTS_DIR" \
    --incident-dir "$INCIDENT_DIR" \
    --window-start "$window_start" \
    --window-end "$window_end" \
    --wall-warn-secs "$WALL_WARN_SECS" \
    --vrp-min "$VRP_MIN" \
    --vaps-min "$VAPS_MIN" \
    --pp-min "$PP_MIN" \
    --warning-max "$WARNING_MAX" \
    ${feishu_args[@]+"${feishu_args[@]}"} \
    >>"$LOG_ROOT/hourly-reporter.stdout" 2>>"$LOG_ROOT/hourly-reporter.stderr" || true
}

# Print epoch seconds $1 as a UTC timestamp (YYYY-MM-DDTHH:MM:SSZ).
# Uses the `-d "@N"` form of date.
format_epoch_rfc3339() {
  date -u -d "@$1" +%Y-%m-%dT%H:%M:%SZ
}

# Entry point: validate settings, start optional services, launch the soak
# script, emit periodic reports while it runs, then return its exit status.
main() {
  validate_non_negative_int "SOAK_DURATION_SECS" "$SOAK_DURATION_SECS"
  validate_non_negative_int "HOURLY_REPORT_INTERVAL_SECS" "$HOURLY_REPORT_INTERVAL_SECS"

  # Force base 10: shell arithmetic reads a leading 0 as octal, so "08" would
  # be an error, "010" would mean 8, and "00" would pass a string test
  # against "0".
  SOAK_DURATION_SECS=$((10#$SOAK_DURATION_SECS))
  HOURLY_REPORT_INTERVAL_SECS=$((10#$HOURLY_REPORT_INTERVAL_SECS))
  ((HOURLY_REPORT_INTERVAL_SECS > 0)) || die "HOURLY_REPORT_INTERVAL_SECS must be > 0"

  # Pre-flight checks. Everything that can call die runs before any
  # background process is started, so a failed check leaves nothing behind.
  [[ -x "$SOAK_SCRIPT" ]] || die "missing executable: $SOAK_SCRIPT"
  [[ -x "$BIN_DIR/rpki_artifact_metrics" ]] || die "missing executable: $BIN_DIR/rpki_artifact_metrics"
  [[ -f "$HOURLY_REPORT_SCRIPT" ]] || die "missing hourly report script: $HOURLY_REPORT_SCRIPT"
  if is_true "$START_MONITOR_STACK" && [[ ! -f "$MONITOR_DIR/docker-compose.yml" ]]; then
    die "missing monitor compose: $MONITOR_DIR/docker-compose.yml"
  fi

  mkdir -p "$LOG_ROOT" "$REPORTS_DIR" "$INCIDENT_DIR"

  # Best-effort resource snapshots; failures are ignored.
  df -h >"$LOG_ROOT/24h-df-before.txt" 2>&1 || true
  free -h >"$LOG_ROOT/24h-free-before.txt" 2>&1 || true

  if is_true "$START_METRICS_SERVICE"; then
    "$BIN_DIR/rpki_artifact_metrics" \
      --run-root "$RUN_ROOT" \
      --listen "$METRICS_LISTEN" \
      --poll-secs "$METRICS_POLL_SECS" \
      --instance "$METRICS_INSTANCE" \
      >"$LOG_ROOT/metrics.stdout" 2>"$LOG_ROOT/metrics.stderr" &
    METRICS_PID="$!"
    echo "$METRICS_PID" >"$LOG_ROOT/metrics.pid"
  fi

  if is_true "$START_MONITOR_STACK"; then
    (cd "$MONITOR_DIR" && PROMETHEUS_RETENTION="$PROMETHEUS_RETENTION" docker compose up -d) \
      >"$LOG_ROOT/monitor-start.stdout" 2>"$LOG_ROOT/monitor-start.stderr"
  fi

  local start_epoch
  local deadline_epoch
  local next_report_epoch
  local window_start_epoch
  local now_epoch
  local window_start
  local window_end
  local run_soak_env
  local grace_warned=0
  local soak_exit_code=0

  start_epoch="$(date +%s)"
  if ((SOAK_DURATION_SECS > 0)); then
    deadline_epoch=$((start_epoch + SOAK_DURATION_SECS))
  else
    deadline_epoch=0
  fi
  next_report_epoch=$((start_epoch + HOURLY_REPORT_INTERVAL_SECS))
  window_start_epoch="$start_epoch"

  # Build the env file handed to the soak script: the user's ENV_FILE (if
  # any) followed by the overrides generated here. The overrides come last
  # in the file.
  run_soak_env="$LOG_ROOT/24h-run-soak.env"
  {
    if [[ -f "$ENV_FILE" ]]; then
      cat "$ENV_FILE"
    fi
    printf '\n# Generated by run_24h_soak_with_metrics.sh\n'
    printf 'MAX_RUNS=-1\n'
    printf 'INTERVAL_SECS=0\n'
    if ((SOAK_DURATION_SECS > 0)); then
      printf 'STOP_AFTER_SECS=%q\n' "$SOAK_DURATION_SECS"
    else
      printf 'STOP_AFTER_SECS=0\n'
    fi
    printf 'RETAIN_RUNS=%q\n' "$SOAK_RETAIN_RUNS"
    printf 'CLEAN_TMP_AFTER_RUN=%q\n' "$CLEAN_TMP_AFTER_RUN"
  } >"$run_soak_env"

  env \
    ENV_FILE="$run_soak_env" \
    "$SOAK_SCRIPT" \
    >"$LOG_ROOT/24h-soak.stdout" 2>"$LOG_ROOT/24h-soak.stderr" &
  SOAK_PID="$!"
  echo "$SOAK_PID" >"$LOG_ROOT/24h-soak.pid"

  # Poll every 5 seconds while the soak process is alive, emitting a report
  # each time the reporting interval has elapsed.
  while kill -0 "$SOAK_PID" >/dev/null 2>&1; do
    if ((REPORTER_STOP == 1)); then
      break
    fi

    now_epoch="$(date +%s)"
    if ((now_epoch >= next_report_epoch)); then
      window_start="$(format_epoch_rfc3339 "$window_start_epoch")"
      window_end="$(format_epoch_rfc3339 "$now_epoch")"
      run_hourly_report "$window_start" "$window_end"
      window_start_epoch="$now_epoch"
      next_report_epoch=$((now_epoch + HOURLY_REPORT_INTERVAL_SECS))
    fi

    # Warn once (not on every poll) when the soak process is still running
    # one report interval plus 300 seconds past its deadline.
    if ((deadline_epoch > 0 && grace_warned == 0 \
      && now_epoch > deadline_epoch + HOURLY_REPORT_INTERVAL_SECS + 300)); then
      echo "deadline grace exceeded; waiting for soak process pid=$SOAK_PID" >&2
      grace_warned=1
    fi

    sleep 5
  done

  # Capture the exit status without letting `set -e` abort the script.
  wait "$SOAK_PID" || soak_exit_code=$?
  SOAK_PID=""

  # Final report covering the trailing partial window, if any time elapsed.
  now_epoch="$(date +%s)"
  if ((now_epoch > window_start_epoch)); then
    window_start="$(format_epoch_rfc3339 "$window_start_epoch")"
    window_end="$(format_epoch_rfc3339 "$now_epoch")"
    run_hourly_report "$window_start" "$window_end"
  fi

  df -h >"$LOG_ROOT/24h-df-after.txt" 2>&1 || true
  free -h >"$LOG_ROOT/24h-free-after.txt" 2>&1 || true

  return "$soak_exit_code"
}

main "$@"