#!/usr/bin/env bash
#
# Runs a soak script in the background together with an optional metrics
# exporter, an optional docker-compose monitoring stack and a periodic
# report generator.
#
# Every setting below falls back to the default shown when the variable is
# unset or empty. Settings may come from the environment or from ENV_FILE,
# which is sourced before the defaults are applied.

set -euo pipefail

# Directory holding this script; the default anchor for every other path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PACKAGE_ROOT="${PACKAGE_ROOT:-$SCRIPT_DIR}"
ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}"

# Optional settings file. It is executed as shell code, so plain assignments
# in it replace values inherited from the environment.
if [[ -f "$ENV_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$ENV_FILE"
fi

# --- Paths -------------------------------------------------------------------
RUN_ROOT="${RUN_ROOT:-$PACKAGE_ROOT}"
BIN_DIR="${BIN_DIR:-$PACKAGE_ROOT/bin}"
LOG_ROOT="${LOG_ROOT:-$RUN_ROOT/logs}"
# Note: the override variable is HOURLY_REPORTS_DIR, not REPORTS_DIR.
REPORTS_DIR="${HOURLY_REPORTS_DIR:-$RUN_ROOT/hourly_reports}"
INCIDENT_DIR="${INCIDENT_DIR:-$RUN_ROOT/incident_runs}"
MONITOR_DIR="${MONITOR_DIR:-$PACKAGE_ROOT/monitor}"
SOAK_SCRIPT="${SOAK_SCRIPT:-$PACKAGE_ROOT/run_soak.sh}"
HOURLY_REPORT_SCRIPT="${HOURLY_REPORT_SCRIPT:-$PACKAGE_ROOT/scripts/soak/hourly_soak_report.py}"

# --- Soak run ----------------------------------------------------------------
# SOAK_DURATION_SECS=0 means the wrapper tracks no deadline; the value is
# forwarded to the soak script as STOP_AFTER_SECS.
SOAK_DURATION_SECS="${SOAK_DURATION_SECS:-0}"
HOURLY_REPORT_INTERVAL_SECS="${HOURLY_REPORT_INTERVAL_SECS:-3600}"
SOAK_RETAIN_RUNS="${SOAK_RETAIN_RUNS:-100}"
CLEAN_TMP_AFTER_RUN="${CLEAN_TMP_AFTER_RUN:-1}"

# --- Monitoring stack and metrics exporter ------------------------------------
# The START_* / STOP_*_ON_EXIT switches are interpreted by is_true.
START_MONITOR_STACK="${START_MONITOR_STACK:-1}"
STOP_MONITOR_STACK_ON_EXIT="${STOP_MONITOR_STACK_ON_EXIT:-0}"
START_METRICS_SERVICE="${START_METRICS_SERVICE:-1}"
STOP_METRICS_SERVICE_ON_EXIT="${STOP_METRICS_SERVICE_ON_EXIT:-0}"
METRICS_LISTEN="${METRICS_LISTEN:-0.0.0.0:9556}"
METRICS_POLL_SECS="${METRICS_POLL_SECS:-5}"
METRICS_INSTANCE="${METRICS_INSTANCE:-remote231-24h}"
PROMETHEUS_RETENTION="${PROMETHEUS_RETENTION:-7d}"

# --- Feishu notification -------------------------------------------------------
SEND_FEISHU="${SEND_FEISHU:-1}"
FEISHU_DRY_RUN="${FEISHU_DRY_RUN:-0}"
FEISHU_WEBHOOK_SCRIPT="${FEISHU_WEBHOOK_SCRIPT:-}"
FEISHU_WEBHOOK_URL="${FEISHU_WEBHOOK_URL:-}"
# Exported so child processes (the report script) can read it.
export FEISHU_WEBHOOK_URL

# --- Report thresholds (passed verbatim to the report script) -----------------
WALL_WARN_SECS="${WALL_WARN_SECS:-140}"
VRP_MIN="${VRP_MIN:-900000}"
VAPS_MIN="${VAPS_MIN:-1000}"
PP_MIN="${PP_MIN:-50000}"
# NOTE(review): -1 presumably disables the warning ceiling; confirm against
# the report script.
WARNING_MAX="${WARNING_MAX:--1}"

# --- Runtime state shared between main and cleanup ----------------------------
SOAK_PID=""
METRICS_PID=""
REPORTER_STOP=0

#######################################
# Prints an error message and terminates the script.
# Arguments: all arguments are joined with spaces to form the message
# Outputs:   "error: <message>" on stderr
# Returns:   never returns; exits the shell with status 2
#######################################
die() {
  # printf keeps backslashes and dash-prefixed text in the message literal.
  printf '%s\n' "error: $*" >&2
  exit 2
}

#######################################
# Tests whether a setting value is an affirmative flag.
# Accepts "1" and, in any letter case, "true", "yes" and "on".
# Arguments: $1 - value to test (optional; missing counts as false)
# Returns:   0 when the value is affirmative, 1 otherwise
#######################################
is_true() {
  case "${1:-}" in
    # Bracket patterns make the match case-insensitive without needing
    # bash 4 case conversion or a global nocasematch toggle.
    1 | [Tt][Rr][Uu][Ee] | [Yy][Ee][Ss] | [Oo][Nn]) return 0 ;;
    *) return 1 ;;
  esac
}

#######################################
# Aborts unless a value is a canonical non-negative decimal integer.
# Leading zeros are rejected because the shell's arithmetic treats a leading
# 0 as an octal prefix: "010" would be read as 8 and "08" is an error.
# Arguments: $1 - setting name, used only in the error message
#            $2 - value to check
# Returns:   0 when valid; otherwise calls die (which exits the shell)
#######################################
validate_non_negative_int() {
  local name="$1"
  local value="$2"
  # Either a lone 0, or a digit string that does not start with 0.
  local canonical_int='^(0|[1-9][0-9]*)$'
  [[ "$value" =~ $canonical_int ]] \
    || die "$name must be a non-negative integer without leading zeros: $value"
}

#######################################
# Terminates a background child of this shell and reaps it.
# Does nothing when the PID is empty or the process is no longer running.
# Arguments: $1 - PID (may be empty)
# Returns:   always 0
#######################################
_stop_background_pid() {
  local pid="$1"
  [[ -n "$pid" ]] || return 0
  kill -0 "$pid" >/dev/null 2>&1 || return 0
  # Best effort: the process may exit between the probe and the signal.
  kill "$pid" >/dev/null 2>&1 || true
  wait "$pid" >/dev/null 2>&1 || true
}

#######################################
# Trap handler that winds down everything this script started.
# Globals:   REPORTER_STOP (written), SOAK_PID, METRICS_PID,
#            STOP_METRICS_SERVICE_ON_EXIT, START_MONITOR_STACK,
#            STOP_MONITOR_STACK_ON_EXIT, MONITOR_DIR, PROMETHEUS_RETENTION
# Returns:   0; individual teardown failures are deliberately ignored
#######################################
cleanup() {
  # Tells the reporting loop in main to stop iterating.
  REPORTER_STOP=1

  # The soak process is always stopped.
  _stop_background_pid "$SOAK_PID"

  # The metrics exporter is stopped only when explicitly requested.
  if is_true "$STOP_METRICS_SERVICE_ON_EXIT"; then
    _stop_background_pid "$METRICS_PID"
  fi

  # The monitor stack is taken down only when it is both enabled and
  # configured to stop on exit.
  if is_true "$START_MONITOR_STACK" && is_true "$STOP_MONITOR_STACK_ON_EXIT" \
    && [[ -f "$MONITOR_DIR/docker-compose.yml" ]]; then
    (cd "$MONITOR_DIR" && PROMETHEUS_RETENTION="$PROMETHEUS_RETENTION" docker compose down) \
      >/dev/null 2>&1 || true
  fi
}

# The handler does not exit on INT/TERM: it sets REPORTER_STOP and returns,
# so the script resumes, and cleanup runs once more from the EXIT trap.
trap cleanup EXIT INT TERM

#######################################
# Runs the report script for one time window.
# stdout and stderr of the report script are appended to
# $LOG_ROOT/hourly-reporter.stdout and $LOG_ROOT/hourly-reporter.stderr.
# Globals:   SEND_FEISHU, FEISHU_DRY_RUN, FEISHU_WEBHOOK_SCRIPT,
#            HOURLY_REPORT_SCRIPT, RUN_ROOT, REPORTS_DIR, INCIDENT_DIR,
#            WALL_WARN_SECS, VRP_MIN, VAPS_MIN, PP_MIN, WARNING_MAX, LOG_ROOT
# Arguments: $1 - window start timestamp, passed as --window-start
#            $2 - window end timestamp, passed as --window-end
# Outputs:   a warning on stderr when the report script fails
# Returns:   always 0; reporting is best effort and must not stop the soak
#######################################
run_hourly_report() {
  local window_start="$1"
  local window_end="$2"
  local feishu_args=()
  local report_status=0

  # The Feishu flags are only meaningful when sending is enabled.
  if is_true "$SEND_FEISHU"; then
    feishu_args+=(--send-feishu)
    if is_true "$FEISHU_DRY_RUN"; then
      feishu_args+=(--dry-run-feishu)
    fi
    if [[ -n "$FEISHU_WEBHOOK_SCRIPT" ]]; then
      feishu_args+=(--feishu-script "$FEISHU_WEBHOOK_SCRIPT")
    fi
  fi

  # ${arr[@]+...} expands to nothing for an empty array. A bare
  # "${feishu_args[@]}" is an unbound-variable error under `set -u` on bash
  # releases older than 4.4, which would abort the script when Feishu is off.
  python3 "$HOURLY_REPORT_SCRIPT" \
    --run-root "$RUN_ROOT" \
    --reports-dir "$REPORTS_DIR" \
    --incident-dir "$INCIDENT_DIR" \
    --window-start "$window_start" \
    --window-end "$window_end" \
    --wall-warn-secs "$WALL_WARN_SECS" \
    --vrp-min "$VRP_MIN" \
    --vaps-min "$VAPS_MIN" \
    --pp-min "$PP_MIN" \
    --warning-max "$WARNING_MAX" \
    ${feishu_args[@]+"${feishu_args[@]}"} \
    >> "$LOG_ROOT/hourly-reporter.stdout" 2>> "$LOG_ROOT/hourly-reporter.stderr" \
    || report_status=$?

  if (( report_status != 0 )); then
    # Surface the failure instead of swallowing it, but keep going.
    printf 'warning: hourly report failed (status %d) for window %s .. %s; see %s\n' \
      "$report_status" "$window_start" "$window_end" \
      "$LOG_ROOT/hourly-reporter.stderr" >&2 || true
  fi
  return 0
}

#######################################
# Formats a Unix epoch timestamp as a UTC string, YYYY-MM-DDTHH:MM:SSZ.
# Arguments: $1 - whole seconds since the epoch
# Outputs:   the formatted timestamp on stdout
# Returns:   0 on success, non-zero when the input is not an integer or
#            neither date dialect can format it
#######################################
format_epoch_rfc3339() {
  local epoch="$1"
  local -r layout='+%Y-%m-%dT%H:%M:%SZ'

  if [[ ! "$epoch" =~ ^-?[0-9]+$ ]]; then
    printf 'format_epoch_rfc3339: not an epoch timestamp: %s\n' "$epoch" >&2
    return 1
  fi

  # GNU date takes the epoch as "-d @N"; BSD/macOS date takes it as "-r N".
  # Try the GNU form first and fall back to the BSD form when it is rejected.
  date -u -d "@${epoch}" "$layout" 2>/dev/null || date -u -r "$epoch" "$layout"
}

#######################################
# Script entry point: validates settings, starts the optional metrics
# exporter and monitor stack, launches the soak script in the background,
# produces a report every HOURLY_REPORT_INTERVAL_SECS while it runs, and
# emits a final report when it ends.
# Globals:   reads the configuration variables defined at the top of the
#            file; writes SOAK_PID and METRICS_PID; reads REPORTER_STOP
# Outputs:   log, pid and snapshot files under $LOG_ROOT
# Returns:   the status reported by `wait` for the soak process
#######################################
main() {
  validate_non_negative_int "SOAK_DURATION_SECS" "$SOAK_DURATION_SECS"
  validate_non_negative_int "HOURLY_REPORT_INTERVAL_SECS" "$HOURLY_REPORT_INTERVAL_SECS"

  # Force base 10 so that a value such as 010 or 08 is never parsed as octal
  # by arithmetic expansion, and so that 00 is recognised as zero.
  local soak_duration_secs
  local report_interval_secs
  soak_duration_secs=$((10#$SOAK_DURATION_SECS))
  report_interval_secs=$((10#$HOURLY_REPORT_INTERVAL_SECS))

  (( report_interval_secs > 0 )) || die "HOURLY_REPORT_INTERVAL_SECS must be > 0"
  [[ -x "$SOAK_SCRIPT" ]] || die "missing executable: $SOAK_SCRIPT"
  [[ -x "$BIN_DIR/rpki_artifact_metrics" ]] || die "missing executable: $BIN_DIR/rpki_artifact_metrics"
  [[ -f "$HOURLY_REPORT_SCRIPT" ]] || die "missing hourly report script: $HOURLY_REPORT_SCRIPT"

  mkdir -p "$LOG_ROOT" "$REPORTS_DIR" "$INCIDENT_DIR"
  # Resource snapshots are informational; a missing tool must not stop the run.
  df -h > "$LOG_ROOT/24h-df-before.txt" 2>&1 || true
  free -h > "$LOG_ROOT/24h-free-before.txt" 2>&1 || true

  if is_true "$START_METRICS_SERVICE"; then
    "$BIN_DIR/rpki_artifact_metrics" \
      --run-root "$RUN_ROOT" \
      --listen "$METRICS_LISTEN" \
      --poll-secs "$METRICS_POLL_SECS" \
      --instance "$METRICS_INSTANCE" \
      > "$LOG_ROOT/metrics.stdout" 2> "$LOG_ROOT/metrics.stderr" &
    METRICS_PID="$!"
    echo "$METRICS_PID" > "$LOG_ROOT/metrics.pid"
  fi

  if is_true "$START_MONITOR_STACK"; then
    if [[ ! -f "$MONITOR_DIR/docker-compose.yml" ]]; then
      die "missing monitor compose: $MONITOR_DIR/docker-compose.yml"
    fi
    # Not guarded with `|| true`: under `set -e` a failed start aborts the run.
    (cd "$MONITOR_DIR" && PROMETHEUS_RETENTION="$PROMETHEUS_RETENTION" docker compose up -d) \
      > "$LOG_ROOT/monitor-start.stdout" 2> "$LOG_ROOT/monitor-start.stderr"
  fi

  local start_epoch
  local deadline_epoch
  local next_report_epoch
  local window_start_epoch
  local now_epoch
  local window_start
  local window_end
  local run_soak_env
  local deadline_warned=0

  start_epoch="$(date +%s)"
  # deadline_epoch == 0 means "no deadline".
  if (( soak_duration_secs > 0 )); then
    deadline_epoch=$((start_epoch + soak_duration_secs))
  else
    deadline_epoch=0
  fi
  next_report_epoch=$((start_epoch + report_interval_secs))
  window_start_epoch="$start_epoch"

  # Build the env file handed to the soak script: the user's env file first,
  # then the generated settings so that they are the last assignments.
  run_soak_env="$LOG_ROOT/24h-run-soak.env"
  {
    if [[ -f "$ENV_FILE" ]]; then
      cat "$ENV_FILE"
    fi
    printf '\n# Generated by run_24h_soak_with_metrics.sh\n'
    printf 'MAX_RUNS=-1\n'
    printf 'INTERVAL_SECS=0\n'
    # The normalised duration is 0 when no duration was requested.
    printf 'STOP_AFTER_SECS=%q\n' "$soak_duration_secs"
    printf 'RETAIN_RUNS=%q\n' "$SOAK_RETAIN_RUNS"
    printf 'CLEAN_TMP_AFTER_RUN=%q\n' "$CLEAN_TMP_AFTER_RUN"
  } > "$run_soak_env"

  env \
    ENV_FILE="$run_soak_env" \
    "$SOAK_SCRIPT" \
    > "$LOG_ROOT/24h-soak.stdout" 2> "$LOG_ROOT/24h-soak.stderr" &
  SOAK_PID="$!"
  echo "$SOAK_PID" > "$LOG_ROOT/24h-soak.pid"

  # Poll every 5 seconds while the soak process is alive.
  while kill -0 "$SOAK_PID" >/dev/null 2>&1; do
    # cleanup sets REPORTER_STOP when a signal arrives.
    if (( REPORTER_STOP == 1 )); then
      break
    fi

    now_epoch="$(date +%s)"
    if (( now_epoch >= next_report_epoch )); then
      window_start="$(format_epoch_rfc3339 "$window_start_epoch")"
      window_end="$(format_epoch_rfc3339 "$now_epoch")"
      run_hourly_report "$window_start" "$window_end"
      # The next window starts where this one ended.
      window_start_epoch="$now_epoch"
      next_report_epoch=$((now_epoch + report_interval_secs))
    fi

    # Grace period: one report interval plus 300 seconds past the deadline.
    # The soak process is not killed here; the overrun is only reported, and
    # only once, rather than on every 5-second poll.
    if (( deadline_warned == 0 && deadline_epoch > 0 \
      && now_epoch > deadline_epoch + report_interval_secs + 300 )); then
      echo "deadline grace exceeded; waiting for soak process pid=$SOAK_PID" >&2
      deadline_warned=1
    fi

    sleep 5
  done

  # Collect the soak status without letting `set -e` abort on failure.
  local soak_exit_code=0
  wait "$SOAK_PID" || soak_exit_code=$?
  # Cleared so cleanup does not signal a PID that has already been reaped.
  SOAK_PID=""

  # Final report for whatever remains of the last window.
  now_epoch="$(date +%s)"
  if (( now_epoch > window_start_epoch )); then
    window_start="$(format_epoch_rfc3339 "$window_start_epoch")"
    window_end="$(format_epoch_rfc3339 "$now_epoch")"
    run_hourly_report "$window_start" "$window_end"
  fi

  df -h > "$LOG_ROOT/24h-df-after.txt" 2>&1 || true
  free -h > "$LOG_ROOT/24h-free-after.txt" 2>&1 || true
  return "$soak_exit_code"
}

main "$@"