267 lines
6.6 KiB
Bash
Executable File
267 lines
6.6 KiB
Bash
Executable File
#!/usr/bin/env bash
# Strict mode: stop on the first failing command, treat references to unset
# variables as errors, and make a pipeline fail when any of its stages fails.
set -o errexit -o nounset -o pipefail
|
|
|
|
# Print the command-line synopsis and the contract for the wrapped command.
# Outputs: the help text on stdout (callers redirect to stderr on errors).
usage() {
  printf '%s\n' \
    'Usage: run_single_rp_with_rss.sh --rp <name> --root <path> --command <shell-command> [--retain-runs <n>] [--sample-ms <n>]' \
    '' \
    'The command runs with RUN_DIR, RP_ROOT, RUN_SEQ, RUN_ID, and RP_NAME exported.' \
    'It must write artifacts into RUN_DIR. The wrapper writes run-meta.json and atomically updates latest.'
}
|
|
|
|
# Mandatory settings; they stay empty until the option parser fills them in.
RP_NAME=
RP_ROOT=
RUN_COMMAND=

# Optional settings: take the value from the environment when it is set and
# non-empty, otherwise fall back to the built-in default.
: "${RETAIN_RUNS:=20}"
SAMPLE_MS="${RSS_SAMPLE_MS:-500}"
|
|
|
|
#######################################
# Parse the wrapper's command-line options.
# Globals:   RP_NAME, RP_ROOT, RUN_COMMAND, RETAIN_RUNS, SAMPLE_MS (written)
# Arguments: the script's positional parameters
# Outputs:   usage text on stdout for -h/--help; diagnostics plus usage on
#            stderr for an unknown option or an option missing its value
# Returns:   0 when every argument was consumed; otherwise the script exits
#            with 0 (help) or 2 (usage error)
#######################################
parse_cli_args() {
  local option
  while (( $# > 0 )); do
    option="$1"
    case "$option" in
      --rp | --root | --command | --retain-runs | --sample-ms)
        # Without this check a trailing option would trip `set -u` on "$2"
        # and die with an "unbound variable" message instead of usage help.
        if (( $# < 2 )); then
          echo "missing value for $option" >&2
          usage >&2
          exit 2
        fi
        case "$option" in
          --rp) RP_NAME="$2" ;;
          --root) RP_ROOT="$2" ;;
          --command) RUN_COMMAND="$2" ;;
          --retain-runs) RETAIN_RUNS="$2" ;;
          --sample-ms) SAMPLE_MS="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown argument: $option" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_cli_args "$@"
|
|
|
|
# --rp, --root and --command are all mandatory.
if [[ -z "$RP_NAME" || -z "$RP_ROOT" || -z "$RUN_COMMAND" ]]; then
  usage >&2
  exit 2
fi

# SAMPLE_MS is parsed as a number for the sampling interval and as an integer
# when run-meta.json is written. Reject bad values here, before the command is
# launched, so a typo cannot abort the wrapper mid-run or lose the metadata of
# a finished run.
if [[ ! "$SAMPLE_MS" =~ ^[0-9]+$ ]]; then
  echo "invalid sample interval (expected a non-negative integer of milliseconds): $SAMPLE_MS" >&2
  exit 2
fi

# Create the per-RP layout: one directory per run, plus a logs directory.
mkdir -p "$RP_ROOT/runs" "$RP_ROOT/logs"
|
|
|
|
#######################################
# Compute the sequence number for the next run.
# Globals:   RP_ROOT (read)
# Outputs:   one more than the highest numeric suffix among
#            $RP_ROOT/runs/run_* entries, zero-padded to six digits, with no
#            trailing newline ("000001" when there are no numbered runs)
# Note:      enables nullglob while scanning and leaves it disabled afterwards
#######################################
next_seq() {
  local highest=0
  local candidate suffix

  shopt -s nullglob
  for candidate in "$RP_ROOT"/runs/run_*; do
    suffix="${candidate##*/}"
    suffix="${suffix#run_}"
    # Entries whose suffix is not purely numeric are not numbered runs.
    [[ "$suffix" =~ ^[0-9]+$ ]] || continue
    # 10# forces base ten so zero-padded suffixes are not read as octal.
    if (( 10#$suffix > highest )); then
      highest=$((10#$suffix))
    fi
  done
  shopt -u nullglob

  printf '%06d' "$((highest + 1))"
}
|
|
|
|
#######################################
# Report the VmRSS value of one process.
# Arguments: $1 - process id
# Outputs:   exactly one line on stdout: the number from the VmRSS line of
#            /proc/<pid>/status, or 0 when the file is unreadable (for example
#            because the process already exited) or has no VmRSS line
# Returns:   0 always, so a process vanishing mid-sample never aborts callers
#######################################
rss_kb_for_pid() {
  local pid="$1"
  local status_file="/proc/${pid}/status"
  local rss_kb=""

  if [[ -r "$status_file" ]]; then
    # The process may still disappear between the check and the read, so awk
    # errors are tolerated and handled by the fallback below.
    rss_kb="$(awk '/^VmRSS:/ {print $2; exit}' "$status_file" 2>/dev/null)" || rss_kb=""
  fi

  # Callers use the output in shell arithmetic: guarantee a single integer,
  # whatever the awk implementation does when the file cannot be opened.
  [[ "$rss_kb" =~ ^[0-9]+$ ]] || rss_kb=0
  printf '%s\n' "$rss_kb"
}
|
|
|
|
#######################################
# List a process together with everything related to it.
# Starting from the root pid and every member of the given process group
# (as reported by `pgrep -g`), walks child processes breadth-first using
# `pgrep -P`.
# Arguments: $1 - root process id
#            $2 - process group id; empty skips the group lookup
# Outputs:   one pid per line on stdout, each printed once, root pid first
#######################################
collect_related_pids() {
  local root_pid="$1"
  local process_group="$2"
  local -a queue=("$root_pid")
  local -A seen=()
  local pid child_pid
  local next_index=0

  if [[ -n "$process_group" ]]; then
    while read -r pid; do
      if [[ -n "$pid" ]]; then
        queue+=("$pid")
      fi
    done < <(pgrep -g "$process_group" 2>/dev/null || true)
  fi

  # Walk the queue with an index instead of re-slicing the array per pop.
  while (( next_index < ${#queue[@]} )); do
    pid="${queue[next_index]}"
    next_index=$((next_index + 1))
    if [[ -z "$pid" || -n "${seen[$pid]:-}" ]]; then
      continue
    fi
    seen[$pid]=1
    printf '%s\n' "$pid"
    # pgrep exits non-zero when nothing matches; that is not an error here.
    while read -r child_pid; do
      if [[ -n "$child_pid" ]]; then
        queue+=("$child_pid")
      fi
    done < <(pgrep -P "$pid" 2>/dev/null || true)
  done
}
|
|
|
|
#######################################
# Aggregate the RSS of a process and every process related to it.
# Arguments: $1 - process group id
#            $2 - parent (root) process id
# Outputs:   one line "<total> <largest-non-parent>" on stdout: the sum of the
#            per-pid values from rss_kb_for_pid over every pid listed by
#            collect_related_pids, and the largest single value among the
#            pids other than the parent
#######################################
sum_related_rss() {
  local process_group="$1"
  local parent_pid="$2"
  local aggregate_kb=0
  local largest_child_kb=0
  local member_pid member_kb

  while read -r member_pid; do
    if [[ -n "$member_pid" ]]; then
      member_kb="$(rss_kb_for_pid "$member_pid")"
      aggregate_kb=$((aggregate_kb + member_kb))
      # The parent counts towards the total but not towards the child peak.
      if [[ "$member_pid" != "$parent_pid" ]] && (( member_kb > largest_child_kb )); then
        largest_child_kb="$member_kb"
      fi
    fi
  done < <(collect_related_pids "$parent_pid" "$process_group")

  printf '%s %s\n' "$aggregate_kb" "$largest_child_kb"
}
|
|
|
|
#######################################
# Count the data rows of a CSV file.
# The first physical line is never counted; lines that are empty or
# whitespace-only, and lines starting with '#', are skipped as well.
# Arguments: $1 - path to the CSV file
# Outputs:   the row count on stdout; 0 when the path is not a regular file
#######################################
count_csv_rows() {
  local path="$1"

  [[ -f "$path" ]] || {
    echo 0
    return
  }

  awk 'NR > 1 && !/^[[:space:]]*$/ && !/^#/ {rows++} END {print rows + 0}' "$path"
}
|
|
|
|
# Allocate this run's identity: the next free sequence number, the run id
# derived from it, and a directory for the run under $RP_ROOT/runs.
RUN_SEQ="$(next_seq)"
RUN_ID="run_${RUN_SEQ}"
RUN_DIR="${RP_ROOT}/runs/${RUN_ID}"
mkdir -p "${RUN_DIR}"
|
|
|
|
# Record the start of the run twice: as a UTC timestamp with second
# resolution, and as epoch milliseconds used for the wall-clock duration.
STARTED_AT="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
START_EPOCH_MS="$(python3 -c 'import time; print(int(time.time() * 1000))')"
|
|
|
|
# Make the run context visible to the wrapped command.
export RP_NAME RP_ROOT
export RUN_SEQ RUN_ID RUN_DIR
export RUN_COMMAND

# Start the command in the background through setsid and a login shell,
# capturing stdout and stderr in the run directory.
set +e
setsid bash -lc "$RUN_COMMAND" >"$RUN_DIR/stdout.log" 2>"$RUN_DIR/stderr.log" &
CHILD_PID=$!
set -e

# Short pause before asking ps for the child's process group id.
sleep 0.05
PROCESS_GROUP="$(ps -o pgid= -p "$CHILD_PID" 2>/dev/null | tr -d '[:space:]' || true)"
# Fall back to the child's pid when ps reported nothing.
: "${PROCESS_GROUP:=$CHILD_PID}"
|
|
|
|
# Peak values observed so far; updated by the sampling loop.
PARENT_MAX_RSS_KB="0"
CHILD_MAX_RSS_KB="0"
AGGREGATE_PEAK_RSS_KB="0"

#######################################
# Convert a sampling period in milliseconds into a `sleep` argument.
# Arguments: $1 - sampling period in milliseconds
# Outputs:   the period in seconds on stdout, never below 0.05
# Returns:   non-zero when the value cannot be parsed as a number
#######################################
sample_interval_seconds() {
  # The value travels through argv and is parsed as data; it is never spliced
  # into the Python source, so it cannot be executed as code.
  python3 -c 'import sys; print(max(0.05, float(sys.argv[1]) / 1000.0))' "$1"
}

SAMPLE_INTERVAL_SECONDS="$(sample_interval_seconds "$SAMPLE_MS")"
|
|
|
|
# Sample memory for as long as the child pid can still be signalled. Each
# pass reads the child's own RSS and the aggregate / largest-descendant
# figures, then raises the corresponding running maximum where exceeded.
while kill -0 "$CHILD_PID" 2>/dev/null; do
  parent_rss="$(rss_kb_for_pid "$CHILD_PID")"
  read -r aggregate_rss child_rss < <(sum_related_rss "$PROCESS_GROUP" "$CHILD_PID")

  (( parent_rss > PARENT_MAX_RSS_KB )) && PARENT_MAX_RSS_KB="$parent_rss"
  (( child_rss > CHILD_MAX_RSS_KB )) && CHILD_MAX_RSS_KB="$child_rss"
  (( aggregate_rss > AGGREGATE_PEAK_RSS_KB )) && AGGREGATE_PEAK_RSS_KB="$aggregate_rss"

  sleep "$SAMPLE_INTERVAL_SECONDS"
done
|
|
|
|
# Reap the child and keep its exit status; errexit is suspended so that a
# failing command does not terminate the wrapper itself.
set +e
wait "$CHILD_PID"
EXIT_CODE=$?
set -e

# Closing timestamps and the wall-clock duration in milliseconds.
FINISHED_AT="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
END_EPOCH_MS="$(python3 -c 'import time; print(int(time.time() * 1000))')"
WALL_MS=$((END_EPOCH_MS - START_EPOCH_MS))

# Only an exit status of exactly 0 counts as success.
case "$EXIT_CODE" in
  0) SUCCESS=true ;;
  *) SUCCESS=false ;;
esac
|
|
|
|
# Row counts of the CSV artifacts the command may have produced.
VRPS_COUNT="$(count_csv_rows "$RUN_DIR/vrps.csv")"
VAPS_COUNT="$(count_csv_rows "$RUN_DIR/vaps.csv")"

# Everything the metadata writer reads comes from the environment.
export STARTED_AT FINISHED_AT WALL_MS EXIT_CODE SUCCESS
export PARENT_MAX_RSS_KB CHILD_MAX_RSS_KB AGGREGATE_PEAK_RSS_KB SAMPLE_MS
export VRPS_COUNT VAPS_COUNT

# Artifact paths are relative to the run directory and stay empty for files
# that were not produced.
export CCR_ARTIFACT_PATH=""
export VRPS_ARTIFACT_PATH=""
export VAPS_ARTIFACT_PATH=""
[[ ! -f "$RUN_DIR/result.ccr" ]] || CCR_ARTIFACT_PATH="result.ccr"
[[ ! -f "$RUN_DIR/vrps.csv" ]] || VRPS_ARTIFACT_PATH="vrps.csv"
[[ ! -f "$RUN_DIR/vaps.csv" ]] || VAPS_ARTIFACT_PATH="vaps.csv"
|
|
|
|
# Serialise the run's metadata, taken from the exported environment, as JSON
# into run-meta.json inside the run directory.
python3 - <<'PY' >"$RUN_DIR/run-meta.json"
import json
import os
import socket

env = os.environ


def optional(name):
    """Return the variable's value, or None when it is unset or empty."""
    return env.get(name, "") or None


def as_int(name):
    """Return a required environment variable parsed as an integer."""
    return int(env[name])


meta = {
    "schemaVersion": 1,
    "rp": env["RP_NAME"],
    "runSeq": as_int("RUN_SEQ"),
    "runId": env["RUN_ID"],
    "host": socket.gethostname(),
    "command": env["RUN_COMMAND"],
    "startedAtRfc3339Utc": env["STARTED_AT"],
    "finishedAtRfc3339Utc": env["FINISHED_AT"],
    "wallMs": as_int("WALL_MS"),
    "exitCode": as_int("EXIT_CODE"),
    "success": env["SUCCESS"] == "true",
    "maxRssKb": {
        "parent": as_int("PARENT_MAX_RSS_KB"),
        "childMax": as_int("CHILD_MAX_RSS_KB"),
        "aggregatePeak": as_int("AGGREGATE_PEAK_RSS_KB"),
        "sampleIntervalMs": as_int("SAMPLE_MS"),
    },
    "artifacts": {
        "vrpsCsv": optional("VRPS_ARTIFACT_PATH"),
        "vapsCsv": optional("VAPS_ARTIFACT_PATH"),
        "ccr": optional("CCR_ARTIFACT_PATH"),
        "stdout": "stdout.log",
        "stderr": "stderr.log",
    },
    "counts": {
        "vrps": as_int("VRPS_COUNT"),
        "vaps": as_int("VAPS_COUNT"),
    },
}

print(json.dumps(meta, indent=2, ensure_ascii=False))
PY
|
|
|
|
# Repoint "latest" at this run: build the new relative symlink under a
# temporary name, then rename it over the old one in a single step.
ln -s -f -n "runs/$RUN_ID" "$RP_ROOT/latest.tmp"
mv -T -f "$RP_ROOT/latest.tmp" "$RP_ROOT/latest"

# One-line summary of the run on stdout.
printf '%s %s exit=%s wall_ms=%s vrps=%s vaps=%s rss_kb=%s\n' \
  "$RUN_ID" "$RP_NAME" "$EXIT_CODE" "$WALL_MS" \
  "$VRPS_COUNT" "$VAPS_COUNT" "$AGGREGATE_PEAK_RSS_KB"
|
|
|
|
#######################################
# Delete the oldest run directories so that at most a given number remain.
# Directories directly under the runs directory whose name starts with
# "run_" are ordered by name; the surplus at the front of that order is
# removed.
# Arguments: $1 - runs directory
#            $2 - number of runs to keep; pruning is skipped when this is
#                 not a positive decimal integer
# Returns:   0 when nothing had to be removed; otherwise the status of the
#            last removal
#######################################
prune_old_runs() {
  local runs_dir="$1"
  local keep="$2"
  local -a run_names=()
  local run_path run_name
  local surplus

  [[ "$keep" =~ ^[0-9]+$ ]] || return 0
  # 10# keeps zero-padded values such as "08" from being parsed as octal.
  (( 10#$keep > 0 )) || return 0

  # Glob expansion is already sorted by name; symlinks are skipped so that
  # only real directories are candidates.
  for run_path in "$runs_dir"/run_*; do
    [[ -d "$run_path" && ! -L "$run_path" ]] || continue
    run_names+=("${run_path##*/}")
  done

  # Returning early also avoids expanding an empty array, which older bash
  # releases treat as an unset variable under `set -u`.
  surplus=$(( ${#run_names[@]} - 10#$keep ))
  (( surplus > 0 )) || return 0

  for run_name in "${run_names[@]:0:surplus}"; do
    rm -rf -- "${runs_dir:?}/${run_name:?}"
  done
}

prune_old_runs "$RP_ROOT/runs" "$RETAIN_RUNS"
|
|
|
|
# Finish with the wrapped command's exit status as the wrapper's own.
exit "$EXIT_CODE"
|