rpki/scripts/inter_rp/run_single_rp_with_rss.sh

267 lines
6.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# Runs one RP command in its own session, samples the resident set size (RSS)
# of the command and its related processes while it runs, then writes
# run-meta.json into the run directory and repoints the "latest" symlink.
#
# Strict mode: abort on failing commands, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail
# Print the command-line synopsis and the contract for --command on stdout.
# Callers redirect to stderr themselves when reporting a usage error.
usage() {
  printf '%s\n' \
    'Usage: run_single_rp_with_rss.sh --rp <name> --root <path> --command <shell-command> [--retain-runs <n>] [--sample-ms <n>]' \
    'The command runs with RUN_DIR, RP_ROOT, RUN_SEQ, RUN_ID, and RP_NAME exported.' \
    'It must write artifacts into RUN_DIR. The wrapper writes run-meta.json and atomically updates latest.'
}
# Option values. The retention count and the sampling interval can be preset
# through the RETAIN_RUNS and RSS_SAMPLE_MS environment variables; the
# command-line flags override them.
RP_NAME=""
RP_ROOT=""
RUN_COMMAND=""
RETAIN_RUNS="${RETAIN_RUNS:-20}"
SAMPLE_MS="${RSS_SAMPLE_MS:-500}"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --rp | --root | --command | --retain-runs | --sample-ms)
      # Each of these flags consumes one value. Check it is present so that a
      # trailing flag produces a usage error rather than an "unbound variable"
      # abort from `set -u`.
      if [[ $# -lt 2 ]]; then
        echo "missing value for $1" >&2
        usage >&2
        exit 2
      fi
      case "$1" in
        --rp) RP_NAME="$2" ;;
        --root) RP_ROOT="$2" ;;
        --command) RUN_COMMAND="$2" ;;
        --retain-runs) RETAIN_RUNS="$2" ;;
        --sample-ms) SAMPLE_MS="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# --rp, --root and --command are mandatory.
if [[ -z "$RP_NAME" || -z "$RP_ROOT" || -z "$RUN_COMMAND" ]]; then
  usage >&2
  exit 2
fi

# The sampling interval is used as a number later in the script. Reject
# anything that is not a non-negative integer now, before a run is started.
if [[ ! "$SAMPLE_MS" =~ ^[0-9]+$ ]]; then
  echo "invalid --sample-ms value: $SAMPLE_MS (expected a non-negative integer)" >&2
  usage >&2
  exit 2
fi
# Force base 10 so a value such as "0500" is normalised to 500.
SAMPLE_MS=$((10#$SAMPLE_MS))

# `--` keeps a root path that starts with "-" from being parsed as an option.
mkdir -p -- "$RP_ROOT/runs" "$RP_ROOT/logs"
# Print the next run sequence number, zero-padded to at least six digits.
#
# Globals:  RP_ROOT (read)
# Outputs:  the highest numeric suffix found among $RP_ROOT/runs/run_* plus
#           one, without a trailing newline. Entries whose suffix is not purely
#           numeric are ignored.
# Note:     leaves the nullglob shell option disabled on return.
next_seq() {
  local highest=0
  local entry candidate

  # With nullglob an empty runs/ directory yields zero iterations instead of
  # the literal pattern.
  shopt -s nullglob
  for entry in "$RP_ROOT"/runs/run_*; do
    if [[ "${entry##*/}" =~ ^run_([0-9]+)$ ]]; then
      # 10# forces base 10 so zero-padded suffixes are not read as octal.
      candidate=$((10#${BASH_REMATCH[1]}))
      if ((candidate > highest)); then
        highest=$candidate
      fi
    fi
  done
  shopt -u nullglob

  printf '%06d' "$((highest + 1))"
}
# Print the VmRSS figure from /proc/<pid>/status.
#
# Arguments: $1 - process id
# Outputs:   exactly one line holding a non-negative integer. Prints 0 when the
#            status file cannot be read (for example the process has already
#            exited) or when it has no VmRSS line.
rss_kb_for_pid() {
  local pid="$1"
  local rss_kb=""

  # awk only extracts the value; every fallback is decided in one place below.
  # Keeping a single fallback matters because callers feed this output into
  # shell arithmetic, which breaks on anything but one integer.
  rss_kb="$(awk '/^VmRSS:/ {print $2; exit}' "/proc/$pid/status" 2>/dev/null)" || rss_kb=""

  if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$rss_kb"
  else
    printf '0\n'
  fi
}
# Print, one per line and without duplicates, the root pid, every pid that
# pgrep reports for the given process group, and all of their descendants.
#
# Arguments: $1 - root pid (always printed first when non-empty)
#            $2 - process group id; an empty value skips the group lookup
# Outputs:   pids on stdout in breadth-first discovery order
# Notes:     pgrep failures (no match, process already gone) are treated as
#            "no pids" on purpose, because the process tree changes while it is
#            being walked.
collect_related_pids() {
  local root_pid="$1"
  local process_group="$2"
  local -a queue=("$root_pid")
  local -A seen=()
  local pid child_pid
  local next_index=0

  if [[ -n "$process_group" ]]; then
    while read -r pid; do
      if [[ -n "$pid" ]]; then
        queue+=("$pid")
      fi
    done < <(pgrep -g "$process_group" 2>/dev/null || true)
  fi

  # Walk the queue with a cursor instead of re-slicing the array on every pop.
  while ((next_index < ${#queue[@]})); do
    pid="${queue[next_index]}"
    next_index=$((next_index + 1))
    if [[ -z "$pid" || -n "${seen[$pid]:-}" ]]; then
      continue
    fi
    seen[$pid]=1
    printf '%s\n' "$pid"

    while read -r child_pid; do
      if [[ -n "$child_pid" ]]; then
        queue+=("$child_pid")
      fi
    done < <(pgrep -P "$pid" 2>/dev/null || true)
  done
}
# Sum the RSS of every process related to the monitored command.
#
# Arguments: $1 - process group id
#            $2 - pid of the monitored command (the "parent")
# Outputs:   one line "<total> <child_max>", where <total> is the sum over all
#            pids from collect_related_pids and <child_max> is the largest
#            single value among pids other than $2
sum_related_rss() {
  local process_group="$1"
  local parent_pid="$2"
  local total_rss=0
  local child_max_rss=0
  local pid rss

  while read -r pid; do
    [[ -z "$pid" ]] && continue

    rss="$(rss_kb_for_pid "$pid")" || rss=0
    # A process can vanish while it is being sampled. Keep only the first line
    # of the reading and count anything that is not a plain integer as 0, so
    # one bad sample cannot abort the arithmetic and drop the whole total.
    rss="${rss%%$'\n'*}"
    if [[ ! "$rss" =~ ^[0-9]+$ ]]; then
      rss=0
    fi
    rss=$((10#$rss))

    total_rss=$((total_rss + rss))
    if [[ "$pid" != "$parent_pid" ]] && ((rss > child_max_rss)); then
      child_max_rss="$rss"
    fi
  done < <(collect_related_pids "$parent_pid" "$process_group")

  printf '%s %s\n' "$total_rss" "$child_max_rss"
}
# Count the data rows of a CSV file.
#
# Arguments: $1 - path to the CSV file
# Outputs:   the number of lines that are not the first physical line, not
#            blank/whitespace-only, and do not start with "#"; prints 0 when
#            the path is not a regular file.
# NOTE(review): the header is assumed to be physical line 1. If a comment or
# blank line precedes it, the header itself is counted as a row.
count_csv_rows() {
  local csv_file="$1"

  if [[ -f "$csv_file" ]]; then
    awk '
      NR == 1 { next }
      /^[[:space:]]*$/ || /^#/ { next }
      { rows++ }
      END { print rows + 0 }
    ' "$csv_file"
  else
    echo 0
  fi
}
# Allocate the directory for this run and record when it started.
RUN_SEQ="$(next_seq)"
RUN_ID="run_${RUN_SEQ}"
RUN_DIR="${RP_ROOT}/runs/${RUN_ID}"
mkdir -p "$RUN_DIR"

# Start time twice: RFC 3339 UTC for the metadata, epoch milliseconds for the
# wall-clock duration.
STARTED_AT="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
START_EPOCH_MS="$(python3 -c 'import time; print(int(time.time() * 1000))')"

# Exported so the wrapped command and the metadata writer can read them.
export RP_NAME RP_ROOT
export RUN_SEQ RUN_ID RUN_DIR RUN_COMMAND
# Resolve the sampling interval BEFORE the command is launched, so that a bad
# value can never abort the wrapper while the command is already running.
# The value is handled as data only; it is never spliced into generated code.
if [[ ! "$SAMPLE_MS" =~ ^[0-9]+$ ]]; then
  echo "invalid sample interval '$SAMPLE_MS' (ms); falling back to 500" >&2
  SAMPLE_MS=500
fi
SAMPLE_MS=$((10#$SAMPLE_MS))
# The effective interval is floored at 50 ms; SAMPLE_MS keeps the requested value.
sample_interval_ms="$SAMPLE_MS"
if ((sample_interval_ms < 50)); then
  sample_interval_ms=50
fi
printf -v SAMPLE_INTERVAL_SECONDS '%d.%03d' \
  "$((sample_interval_ms / 1000))" "$((sample_interval_ms % 1000))"

# Launch the command in its own session (setsid) under a login shell, with
# stdout/stderr captured in the run directory.
set +e
setsid bash -lc "$RUN_COMMAND" >"$RUN_DIR/stdout.log" 2>"$RUN_DIR/stderr.log" &
CHILD_PID=$!
set -e

# Brief pause before asking ps for the process group id (presumably to let
# setsid take effect first). Fall back to the child pid if ps reports nothing.
sleep 0.05
PROCESS_GROUP="$(ps -o pgid= -p "$CHILD_PID" 2>/dev/null | tr -d '[:space:]' || true)"
if [[ -z "$PROCESS_GROUP" ]]; then
  PROCESS_GROUP="$CHILD_PID"
fi

PARENT_MAX_RSS_KB="0"
CHILD_MAX_RSS_KB="0"
AGGREGATE_PEAK_RSS_KB="0"

# Poll until the command exits, keeping the running maximum of each metric.
while kill -0 "$CHILD_PID" 2>/dev/null; do
  parent_rss="$(rss_kb_for_pid "$CHILD_PID")" || parent_rss=0
  parent_rss="${parent_rss%%$'\n'*}"

  aggregate_rss=0
  child_rss=0
  # `read` returns non-zero when the sampler printed no complete line (for
  # example the process tree vanished mid-sample). That must not abort the
  # wrapper under `set -e`.
  read -r aggregate_rss child_rss < <(sum_related_rss "$PROCESS_GROUP" "$CHILD_PID") || true

  # Count anything that is not a plain integer as 0 before doing arithmetic.
  [[ "$parent_rss" =~ ^[0-9]+$ ]] || parent_rss=0
  [[ "$aggregate_rss" =~ ^[0-9]+$ ]] || aggregate_rss=0
  [[ "$child_rss" =~ ^[0-9]+$ ]] || child_rss=0

  if ((parent_rss > PARENT_MAX_RSS_KB)); then
    PARENT_MAX_RSS_KB="$parent_rss"
  fi
  if ((child_rss > CHILD_MAX_RSS_KB)); then
    CHILD_MAX_RSS_KB="$child_rss"
  fi
  if ((aggregate_rss > AGGREGATE_PEAK_RSS_KB)); then
    AGGREGATE_PEAK_RSS_KB="$aggregate_rss"
  fi

  sleep "$SAMPLE_INTERVAL_SECONDS"
done
# Reap the command and keep its exit status; the `||` stops errexit from
# ending the wrapper when the command failed.
EXIT_CODE=0
wait "$CHILD_PID" || EXIT_CODE=$?

FINISHED_AT="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
END_EPOCH_MS="$(python3 -c 'import time; print(int(time.time() * 1000))')"
WALL_MS=$((END_EPOCH_MS - START_EPOCH_MS))

# The run counts as successful only when the command exited with status 0.
case "$EXIT_CODE" in
  0) SUCCESS=true ;;
  *) SUCCESS=false ;;
esac

VRPS_COUNT="$(count_csv_rows "$RUN_DIR/vrps.csv")"
VAPS_COUNT="$(count_csv_rows "$RUN_DIR/vaps.csv")"

# Record each optional artifact by its name relative to RUN_DIR; the variable
# stays empty when the command did not produce that file.
CCR_ARTIFACT_PATH=""
VRPS_ARTIFACT_PATH=""
VAPS_ARTIFACT_PATH=""
for artifact_spec in \
  CCR_ARTIFACT_PATH:result.ccr \
  VRPS_ARTIFACT_PATH:vrps.csv \
  VAPS_ARTIFACT_PATH:vaps.csv; do
  artifact_file="${artifact_spec#*:}"
  if [[ -f "$RUN_DIR/$artifact_file" ]]; then
    printf -v "${artifact_spec%%:*}" '%s' "$artifact_file"
  fi
done

# Everything the metadata writer reads comes from the environment.
export STARTED_AT FINISHED_AT WALL_MS EXIT_CODE SUCCESS
export PARENT_MAX_RSS_KB CHILD_MAX_RSS_KB AGGREGATE_PEAK_RSS_KB SAMPLE_MS
export VRPS_COUNT VAPS_COUNT
export CCR_ARTIFACT_PATH VRPS_ARTIFACT_PATH VAPS_ARTIFACT_PATH
# Serialise the run metadata to run-meta.json. All inputs are read from
# exported environment variables; the quoted heredoc delimiter keeps the shell
# from expanding anything inside the Python source.
python3 - >"$RUN_DIR/run-meta.json" <<'PY'
import json
import os
import socket
import sys

env = os.environ


def optional(name):
    # An unset or empty variable is reported as JSON null.
    return env.get(name) or None


identity = {
    "schemaVersion": 1,
    "rp": env["RP_NAME"],
    "runSeq": int(env["RUN_SEQ"]),
    "runId": env["RUN_ID"],
    "host": socket.gethostname(),
    "command": env["RUN_COMMAND"],
}
timing = {
    "startedAtRfc3339Utc": env["STARTED_AT"],
    "finishedAtRfc3339Utc": env["FINISHED_AT"],
    "wallMs": int(env["WALL_MS"]),
    "exitCode": int(env["EXIT_CODE"]),
    "success": env["SUCCESS"] == "true",
}
max_rss_kb = {
    "parent": int(env["PARENT_MAX_RSS_KB"]),
    "childMax": int(env["CHILD_MAX_RSS_KB"]),
    "aggregatePeak": int(env["AGGREGATE_PEAK_RSS_KB"]),
    "sampleIntervalMs": int(env["SAMPLE_MS"]),
}
artifacts = {
    "vrpsCsv": optional("VRPS_ARTIFACT_PATH"),
    "vapsCsv": optional("VAPS_ARTIFACT_PATH"),
    "ccr": optional("CCR_ARTIFACT_PATH"),
    "stdout": "stdout.log",
    "stderr": "stderr.log",
}
counts = {
    "vrps": int(env["VRPS_COUNT"]),
    "vaps": int(env["VAPS_COUNT"]),
}

# Key order is part of the output: identity, timing, then the nested sections.
meta = {}
meta.update(identity)
meta.update(timing)
meta["maxRssKb"] = max_rss_kb
meta["artifacts"] = artifacts
meta["counts"] = counts

sys.stdout.write(json.dumps(meta, indent=2, ensure_ascii=False) + "\n")
PY
# Publish the run: create the symlink under a temporary name, then rename it
# over "latest" so the link is replaced in a single step.
ln -sfn "runs/$RUN_ID" "$RP_ROOT/latest.tmp"
mv -Tf "$RP_ROOT/latest.tmp" "$RP_ROOT/latest"

# One-line summary on stdout.
echo "$RUN_ID $RP_NAME exit=$EXIT_CODE wall_ms=$WALL_MS vrps=$VRPS_COUNT vaps=$VAPS_COUNT rss_kb=$AGGREGATE_PEAK_RSS_KB"

# Retention: keep only the newest RETAIN_RUNS run directories. A value of 0
# or a non-numeric value disables pruning.
if [[ "$RETAIN_RUNS" =~ ^[0-9]+$ ]] && ((10#$RETAIN_RUNS > 0)); then
  # Base 10 so a value such as "08" is not rejected as invalid octal.
  retain_count=$((10#$RETAIN_RUNS))
  old_runs=()
  # Only run_<digits> directories take part, and they are ordered by numeric
  # sequence rather than lexically, so hand-made or non-padded directory names
  # can neither be deleted by accident nor push real runs out of the window.
  mapfile -t old_runs < <(
    find "$RP_ROOT/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' \
      | grep -E '^run_[0-9]+$' \
      | LC_ALL=C sort -t _ -k 2,2n \
      | head -n -"$retain_count" || true
  )
  # Expanding an empty array under `set -u` aborts on bash older than 4.4, so
  # only iterate when there is something to delete.
  if ((${#old_runs[@]} > 0)); then
    for old_run in "${old_runs[@]}"; do
      # Never delete the run that was just published.
      [[ -n "$old_run" && "$old_run" != "$RUN_ID" ]] || continue
      # :? aborts instead of expanding to a dangerously short path.
      rm -rf -- "${RP_ROOT:?}/runs/${old_run:?}"
    done
  fi
elif [[ ! "$RETAIN_RUNS" =~ ^[0-9]+$ ]]; then
  echo "ignoring non-numeric retain-runs value '$RETAIN_RUNS'; old runs are kept" >&2
fi

# Propagate the wrapped command's exit status to the caller.
exit "$EXIT_CODE"