rpki/scripts/soak/run_24h_soak_with_metrics.sh

227 lines
7.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Wrapper around the soak script: runs it in the background, optionally starts
# the rpki_artifact_metrics service and the monitor docker-compose stack, and
# periodically invokes the hourly report script while the soak is running.
#
# Every setting below can be overridden from the environment or from ENV_FILE
# (sourced first, if it exists); the `${VAR:-default}` forms only fill in
# values that are still unset or empty afterwards.
set -euo pipefail
# PACKAGE_ROOT defaults to the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"
if [[ -f "$ENV_FILE" ]]; then
# shellcheck disable=SC1090
source "$ENV_FILE"
fi
# --- Paths -------------------------------------------------------------------
RUN_ROOT="${RUN_ROOT:-$PACKAGE_ROOT}"
BIN_DIR="${BIN_DIR:-$PACKAGE_ROOT/bin}"
LOG_ROOT="${LOG_ROOT:-$RUN_ROOT/logs}"
# Note: the override variable is HOURLY_REPORTS_DIR, not REPORTS_DIR.
REPORTS_DIR="${HOURLY_REPORTS_DIR:-$RUN_ROOT/hourly_reports}"
INCIDENT_DIR="${INCIDENT_DIR:-$RUN_ROOT/incident_runs}"
MONITOR_DIR="${MONITOR_DIR:-$PACKAGE_ROOT/monitor}"
SOAK_SCRIPT="${SOAK_SCRIPT:-$PACKAGE_ROOT/run_soak.sh}"
HOURLY_REPORT_SCRIPT="${HOURLY_REPORT_SCRIPT:-$PACKAGE_ROOT/scripts/soak/hourly_soak_report.py}"
# --- Soak run settings ---------------------------------------------------------
# SOAK_DURATION_SECS is written to the generated soak env file as
# STOP_AFTER_SECS; 0 also disables this wrapper's deadline warning.
SOAK_DURATION_SECS="${SOAK_DURATION_SECS:-0}"
# Seconds between hourly report invocations; main rejects 0.
HOURLY_REPORT_INTERVAL_SECS="${HOURLY_REPORT_INTERVAL_SECS:-3600}"
# Written to the generated soak env file as RETAIN_RUNS / CLEAN_TMP_AFTER_RUN.
SOAK_RETAIN_RUNS="${SOAK_RETAIN_RUNS:-100}"
CLEAN_TMP_AFTER_RUN="${CLEAN_TMP_AFTER_RUN:-1}"
# --- Monitor stack and metrics service -----------------------------------------
# The START_* / STOP_* flags are interpreted by is_true. With the defaults,
# both services are started and neither is stopped when this script exits.
START_MONITOR_STACK="${START_MONITOR_STACK:-1}"
STOP_MONITOR_STACK_ON_EXIT="${STOP_MONITOR_STACK_ON_EXIT:-0}"
START_METRICS_SERVICE="${START_METRICS_SERVICE:-1}"
STOP_METRICS_SERVICE_ON_EXIT="${STOP_METRICS_SERVICE_ON_EXIT:-0}"
# Passed to rpki_artifact_metrics as --listen / --poll-secs / --instance.
METRICS_LISTEN="${METRICS_LISTEN:-0.0.0.0:9556}"
METRICS_POLL_SECS="${METRICS_POLL_SECS:-5}"
METRICS_INSTANCE="${METRICS_INSTANCE:-remote231-24h}"
# Placed in the environment of `docker compose up` / `docker compose down`.
PROMETHEUS_RETENTION="${PROMETHEUS_RETENTION:-7d}"
# --- Feishu notification options for the hourly report -------------------------
SEND_FEISHU="${SEND_FEISHU:-1}"
FEISHU_DRY_RUN="${FEISHU_DRY_RUN:-0}"
FEISHU_WEBHOOK_SCRIPT="${FEISHU_WEBHOOK_SCRIPT:-}"
FEISHU_WEBHOOK_URL="${FEISHU_WEBHOOK_URL:-}"
# Exported so child processes inherit it.
# NOTE(review): presumably read by the report/webhook script - confirm.
export FEISHU_WEBHOOK_URL
# --- Values forwarded verbatim to the hourly report script ---------------------
# (--wall-warn-secs, --vrp-min, --vaps-min, --pp-min, --warning-max)
WALL_WARN_SECS="${WALL_WARN_SECS:-140}"
VRP_MIN="${VRP_MIN:-900000}"
VAPS_MIN="${VAPS_MIN:-1000}"
PP_MIN="${PP_MIN:-50000}"
WARNING_MAX="${WARNING_MAX:--1}"
# --- Runtime state ---------------------------------------------------------------
# PIDs of the background processes started by main (empty until started), and
# the flag cleanup sets to make main's polling loop stop.
SOAK_PID=""
METRICS_PID=""
REPORTER_STOP=0
# Report a fatal error and terminate the script.
# Arguments: $* - message words; printed after an "error: " prefix.
# Outputs:   one line on stderr.
# Exits:     always, with status 2.
die() {
  {
    printf '%s\n' "error: $*"
  } >&2
  exit 2
}
# Test whether a flag value means "enabled".
# Arguments: $1 - flag value (may be empty or omitted).
# Returns:   0 if $1 is exactly one of 1, true, TRUE, yes, YES, on, ON;
#            1 for anything else.
is_true() {
  local candidate="${1:-}"
  local accepted
  for accepted in 1 true TRUE yes YES on ON; do
    if [[ "$candidate" == "$accepted" ]]; then
      return 0
    fi
  done
  return 1
}
# Abort via die unless a setting is a plain, arithmetic-safe decimal integer.
# Arguments: $1 - setting name, used only in the error message
#            $2 - value to check
# Returns:   0 when the value is acceptable; otherwise does not return
#            (die terminates the script).
validate_non_negative_int() {
  local name="$1"
  local value="$2"
  [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be a non-negative integer: $value"
  # Bash arithmetic reads a leading 0 as an octal prefix: "010" would count as
  # 8 and "08" is an arithmetic error, so such spellings are rejected here
  # rather than being misinterpreted when the value is used in (( )).
  [[ "$value" =~ ^(0|[1-9][0-9]*)$ ]] || die "$name must not have leading zeros: $value"
  # Up to 18 digits always fits in bash's signed 64-bit arithmetic; longer
  # values could silently wrap around.
  (( ${#value} <= 18 )) || die "$name is too large: $value"
}
# Trap handler: stop the processes this wrapper is responsible for.
# Globals:   REPORTER_STOP (written); SOAK_PID, METRICS_PID,
#            STOP_METRICS_SERVICE_ON_EXIT, START_MONITOR_STACK,
#            STOP_MONITOR_STACK_ON_EXIT, MONITOR_DIR, PROMETHEUS_RETENTION (read)
# Returns:   0. Every step is best-effort: output is discarded and failures
#            are ignored.
cleanup() {
  # Ask main's polling loop to stop.
  REPORTER_STOP=1

  # The soak process is always stopped (and reaped) if it is still alive.
  if [[ -n "$SOAK_PID" ]]; then
    if kill -0 "$SOAK_PID" >/dev/null 2>&1; then
      {
        kill "$SOAK_PID" || true
        wait "$SOAK_PID" || true
      } >/dev/null 2>&1
    fi
  fi

  # The metrics service is stopped only when explicitly requested.
  if is_true "$STOP_METRICS_SERVICE_ON_EXIT"; then
    if [[ -n "$METRICS_PID" ]] && kill -0 "$METRICS_PID" >/dev/null 2>&1; then
      {
        kill "$METRICS_PID" || true
        wait "$METRICS_PID" || true
      } >/dev/null 2>&1
    fi
  fi

  # The monitor stack is torn down only if this script was configured to start
  # it, teardown was requested, and the compose file is present.
  if is_true "$START_MONITOR_STACK" && is_true "$STOP_MONITOR_STACK_ON_EXIT"; then
    if [[ -f "$MONITOR_DIR/docker-compose.yml" ]]; then
      (
        cd "$MONITOR_DIR" \
          && PROMETHEUS_RETENTION="$PROMETHEUS_RETENTION" docker compose down
      ) >/dev/null 2>&1 || true
    fi
  fi
}
# Run cleanup when the script exits and when SIGINT/SIGTERM is received.
# cleanup does not call exit itself, so after a signal execution resumes in
# main, whose polling loop stops once cleanup has set REPORTER_STOP and
# terminated the soak process; cleanup then runs again for EXIT.
trap cleanup EXIT INT TERM
# Invoke the hourly report script for one reporting window.
# Globals:   HOURLY_REPORT_SCRIPT, RUN_ROOT, REPORTS_DIR, INCIDENT_DIR,
#            WALL_WARN_SECS, VRP_MIN, VAPS_MIN, PP_MIN, WARNING_MAX,
#            SEND_FEISHU, FEISHU_DRY_RUN, FEISHU_WEBHOOK_SCRIPT, LOG_ROOT (read)
# Arguments: $1 - window start timestamp, passed as --window-start
#            $2 - window end timestamp, passed as --window-end
# Outputs:   the report script's stdout/stderr are appended to
#            $LOG_ROOT/hourly-reporter.stdout and hourly-reporter.stderr;
#            a one-line warning goes to this script's stderr if it fails.
# Returns:   always 0 - reporting is best-effort and must not stop the soak.
run_hourly_report() {
  local window_start="$1"
  local window_end="$2"

  # The whole command line is built in a single array that is never empty.
  # Expanding a separate, possibly empty options array under `set -u` is a
  # fatal "unbound variable" error on bash releases older than 4.4.
  local -a report_cmd=(
    python3 "$HOURLY_REPORT_SCRIPT"
    --run-root "$RUN_ROOT"
    --reports-dir "$REPORTS_DIR"
    --incident-dir "$INCIDENT_DIR"
    --window-start "$window_start"
    --window-end "$window_end"
    --wall-warn-secs "$WALL_WARN_SECS"
    --vrp-min "$VRP_MIN"
    --vaps-min "$VAPS_MIN"
    --pp-min "$PP_MIN"
    --warning-max "$WARNING_MAX"
  )

  # Feishu options are only meaningful when sending is enabled.
  if is_true "$SEND_FEISHU"; then
    report_cmd+=(--send-feishu)
    if is_true "$FEISHU_DRY_RUN"; then
      report_cmd+=(--dry-run-feishu)
    fi
    if [[ -n "$FEISHU_WEBHOOK_SCRIPT" ]]; then
      report_cmd+=(--feishu-script "$FEISHU_WEBHOOK_SCRIPT")
    fi
  fi

  local rc=0
  "${report_cmd[@]}" \
    >> "$LOG_ROOT/hourly-reporter.stdout" 2>> "$LOG_ROOT/hourly-reporter.stderr" || rc=$?

  # A failed report is tolerated, but leave a trace instead of hiding it.
  if (( rc != 0 )); then
    echo "warning: hourly report for $window_start..$window_end exited with status $rc; see $LOG_ROOT/hourly-reporter.stderr" >&2
  fi
  return 0
}
# Print a Unix timestamp as a UTC date-time of the form YYYY-MM-DDTHH:MM:SSZ.
# Arguments: $1 - seconds since the epoch
# Outputs:   the formatted timestamp on stdout
# Returns:   the exit status of date(1); the `-d @SECONDS` form is GNU date syntax.
format_epoch_rfc3339() {
  local epoch_secs="$1"
  date -u -d "@${epoch_secs}" '+%Y-%m-%dT%H:%M:%SZ'
}
# Orchestrate one soak run.
#
# Steps: validate settings and required files, snapshot disk/memory usage,
# optionally start the metrics service and the monitor compose stack, write
# the env file for the soak script, launch the soak script in the background,
# run the hourly report every HOURLY_REPORT_INTERVAL_SECS while it is alive,
# then run a final report for the remaining window and snapshot disk/memory
# usage again.
#
# Globals:   reads the configuration variables defined at the top of the file;
#            writes SOAK_PID and METRICS_PID; reads REPORTER_STOP.
# Arguments: none are read.
# Returns:   the soak script's exit status as reported by `wait`. Failed
#            preconditions terminate the script through die.
main() {
  validate_non_negative_int "SOAK_DURATION_SECS" "$SOAK_DURATION_SECS"
  validate_non_negative_int "HOURLY_REPORT_INTERVAL_SECS" "$HOURLY_REPORT_INTERVAL_SECS"

  # Force base 10: bash arithmetic would otherwise read a value with a leading
  # zero as octal ("010" -> 8, "08" -> arithmetic error).
  local duration_secs=$((10#$SOAK_DURATION_SECS))
  local interval_secs=$((10#$HOURLY_REPORT_INTERVAL_SECS))
  (( interval_secs > 0 )) || die "HOURLY_REPORT_INTERVAL_SECS must be > 0"

  [[ -x "$SOAK_SCRIPT" ]] || die "missing executable: $SOAK_SCRIPT"
  [[ -x "$BIN_DIR/rpki_artifact_metrics" ]] || die "missing executable: $BIN_DIR/rpki_artifact_metrics"
  [[ -f "$HOURLY_REPORT_SCRIPT" ]] || die "missing hourly report script: $HOURLY_REPORT_SCRIPT"
  # Checked before anything is spawned, so a missing compose file cannot leave
  # a background metrics service (and its pid file) behind.
  if is_true "$START_MONITOR_STACK" && [[ ! -f "$MONITOR_DIR/docker-compose.yml" ]]; then
    die "missing monitor compose: $MONITOR_DIR/docker-compose.yml"
  fi

  mkdir -p "$LOG_ROOT" "$REPORTS_DIR" "$INCIDENT_DIR"

  # Best-effort resource snapshots taken before the run.
  df -h > "$LOG_ROOT/24h-df-before.txt" 2>&1 || true
  free -h > "$LOG_ROOT/24h-free-before.txt" 2>&1 || true

  if is_true "$START_METRICS_SERVICE"; then
    "$BIN_DIR/rpki_artifact_metrics" \
      --run-root "$RUN_ROOT" \
      --listen "$METRICS_LISTEN" \
      --poll-secs "$METRICS_POLL_SECS" \
      --instance "$METRICS_INSTANCE" \
      > "$LOG_ROOT/metrics.stdout" 2> "$LOG_ROOT/metrics.stderr" &
    METRICS_PID="$!"
    echo "$METRICS_PID" > "$LOG_ROOT/metrics.pid"
  fi

  if is_true "$START_MONITOR_STACK"; then
    # compose output goes to log files, so name the log in the error instead
    # of letting `set -e` end the script without any message.
    (cd "$MONITOR_DIR" && PROMETHEUS_RETENTION="$PROMETHEUS_RETENTION" docker compose up -d) \
      > "$LOG_ROOT/monitor-start.stdout" 2> "$LOG_ROOT/monitor-start.stderr" \
      || die "monitor stack failed to start; see $LOG_ROOT/monitor-start.stderr"
  fi

  local start_epoch deadline_epoch next_report_epoch window_start_epoch now_epoch
  local window_start window_end run_soak_env
  start_epoch="$(date +%s)"
  # deadline_epoch == 0 means "no deadline".
  if (( duration_secs > 0 )); then
    deadline_epoch=$((start_epoch + duration_secs))
  else
    deadline_epoch=0
  fi
  next_report_epoch=$((start_epoch + interval_secs))
  window_start_epoch="$start_epoch"

  # Env file handed to the soak script: the user's ENV_FILE (if any) followed
  # by generated settings; being appended last, the generated lines win when
  # the file is read top to bottom.
  run_soak_env="$LOG_ROOT/24h-run-soak.env"
  {
    if [[ -f "$ENV_FILE" ]]; then
      cat "$ENV_FILE"
    fi
    printf '\n# Generated by run_24h_soak_with_metrics.sh\n'
    printf 'MAX_RUNS=-1\n'
    printf 'INTERVAL_SECS=0\n'
    printf 'STOP_AFTER_SECS=%s\n' "$duration_secs"
    printf 'RETAIN_RUNS=%q\n' "$SOAK_RETAIN_RUNS"
    printf 'CLEAN_TMP_AFTER_RUN=%q\n' "$CLEAN_TMP_AFTER_RUN"
  } > "$run_soak_env"

  env \
    ENV_FILE="$run_soak_env" \
    "$SOAK_SCRIPT" \
    > "$LOG_ROOT/24h-soak.stdout" 2> "$LOG_ROOT/24h-soak.stderr" &
  SOAK_PID="$!"
  echo "$SOAK_PID" > "$LOG_ROOT/24h-soak.pid"

  # Poll every 5 seconds while the soak process is alive; REPORTER_STOP is set
  # by cleanup when a signal is being handled.
  local grace_warned=0
  while kill -0 "$SOAK_PID" >/dev/null 2>&1; do
    if (( REPORTER_STOP == 1 )); then
      break
    fi
    now_epoch="$(date +%s)"
    if (( now_epoch >= next_report_epoch )); then
      window_start="$(format_epoch_rfc3339 "$window_start_epoch")"
      window_end="$(format_epoch_rfc3339 "$now_epoch")"
      run_hourly_report "$window_start" "$window_end"
      # The next window begins where this one ended.
      window_start_epoch="$now_epoch"
      next_report_epoch=$((now_epoch + interval_secs))
    fi
    # Warn once (not on every poll) when the soak is still running one report
    # interval plus 300 seconds past its deadline; the soak is not killed.
    if (( grace_warned == 0 && deadline_epoch > 0 \
          && now_epoch > deadline_epoch + interval_secs + 300 )); then
      echo "deadline grace exceeded; waiting for soak process pid=$SOAK_PID" >&2
      grace_warned=1
    fi
    sleep 5
  done

  # Collect the soak's exit status without letting `set -e` abort on failure.
  local soak_exit_code=0
  wait "$SOAK_PID" || soak_exit_code=$?
  SOAK_PID=""

  # Final report covering the time since the last reported window.
  now_epoch="$(date +%s)"
  if (( now_epoch > window_start_epoch )); then
    window_start="$(format_epoch_rfc3339 "$window_start_epoch")"
    window_end="$(format_epoch_rfc3339 "$now_epoch")"
    run_hourly_report "$window_start" "$window_end"
  fi

  df -h > "$LOG_ROOT/24h-df-after.txt" 2>&1 || true
  free -h > "$LOG_ROOT/24h-free-after.txt" 2>&1 || true
  return "$soak_exit_code"
}
# Entry point. The command-line arguments are forwarded, although main does
# not read them; as the last command, main's return status becomes the
# script's exit status.
main "$@"