#!/usr/bin/env bash
#
# Run one command inside a freshly numbered run directory while sampling the
# resident set size (VmRSS from /proc/<pid>/status) of the command and its
# related processes, then record the outcome in run-meta.json.
#
# Layout created under --root:
#   runs/run_NNNNNN/  per-run directory holding stdout.log, stderr.log,
#                     run-meta.json and whatever the command writes there
#   logs/             created, but not otherwise used by this script
#   latest            symlink to the most recent run directory
#
# Environment:
#   RETAIN_RUNS    default for --retain-runs (20)
#   RSS_SAMPLE_MS  default for --sample-ms (500)
#
# Exit status: 2 on usage errors; otherwise, when the wrapper itself does not
# fail, the exit status of the wrapped command.
#
# Requires: bash 4+, python3, awk, pgrep, ps, setsid, GNU find/head/mv and a
# Linux-style /proc.
set -euo pipefail

#######################################
# Print the usage text.
# Outputs: usage text on stdout (callers redirect to stderr for errors)
#######################################
usage() {
  cat <<'USAGE'
Usage:
  run_single_rp_with_rss.sh --rp <name> --root <dir> --command <shell-command>
                            [--retain-runs <count>] [--sample-ms <ms>]

The command runs with RUN_DIR, RP_ROOT, RUN_SEQ, RUN_ID, and RP_NAME exported.
It must write artifacts into RUN_DIR. The wrapper writes run-meta.json and
atomically updates latest.
USAGE
}

#######################################
# Report a command-line error and terminate.
# Arguments: $1 - message printed before the usage text
# Outputs:   message and usage text on stderr
# Returns:   never; exits with status 2
#######################################
usage_error() {
  printf '%s\n' "$1" >&2
  usage >&2
  exit 2
}

#######################################
# Print the sequence number for the next run.
# Scans RP_ROOT/runs for entries named run_<digits> and prints the highest
# number plus one, zero-padded to six digits.
# Globals:   RP_ROOT (read)
# Outputs:   the sequence number on stdout, without a trailing newline
#######################################
next_seq() {
  local max_seq=0
  local run_path run_name seq
  for run_path in "$RP_ROOT"/runs/run_*; do
    run_name="${run_path##*/}"
    seq="${run_name#run_}"
    # When nothing matches, the glob stays literal and seq becomes "*"; the
    # digits-only check rejects that, so nullglob is not needed.
    # 10# forces base 10 so zero-padded names are not parsed as octal.
    if [[ "$seq" =~ ^[0-9]+$ ]] && ((10#$seq > max_seq)); then
      max_seq=$((10#$seq))
    fi
  done
  printf '%06d' $((max_seq + 1))
}

#######################################
# Print the current time as integer milliseconds since the Unix epoch.
# Outputs: the timestamp on stdout
#######################################
now_epoch_ms() {
  python3 -c 'import time; print(int(time.time() * 1000))'
}

#######################################
# Print the VmRSS value (kB) of one process.
# Arguments: $1 - process id
# Outputs:   exactly one non-negative integer on stdout; 0 when the status
#            file cannot be read or has no VmRSS line
#######################################
rss_kb_for_pid() {
  local pid="$1"
  local rss
  # The process may exit between discovery and this read, so a failing awk is
  # expected; it is mapped to 0 instead of aborting or emitting extra lines.
  rss="$(awk '/^VmRSS:/ {print $2; exit}' "/proc/$pid/status" 2>/dev/null)" || rss=""
  if [[ ! "$rss" =~ ^[0-9]+$ ]]; then
    rss=0
  fi
  printf '%s\n' "$rss"
}

#######################################
# Print the pids related to a root process, one per line, without duplicates.
# Starts from the root pid plus every pid that pgrep reports for the process
# group, then repeatedly adds the children (pgrep -P) of each pid visited.
# Arguments: $1 - root pid
#            $2 - process group id; may be empty to skip the group lookup
# Outputs:   pids on stdout in visit order
#######################################
collect_related_pids() {
  local root_pid="$1"
  local process_group="$2"
  local -a queue=("$root_pid")
  local -A seen=()
  local pid child_pid
  local cursor=0

  if [[ -n "$process_group" ]]; then
    while read -r pid; do
      if [[ -n "$pid" ]]; then
        queue+=("$pid")
      fi
    done < <(pgrep -g "$process_group" 2>/dev/null || true)
  fi

  # Walk the queue with a cursor rather than re-slicing the array each round.
  while ((cursor < ${#queue[@]})); do
    pid="${queue[cursor]}"
    cursor=$((cursor + 1))
    if [[ -z "$pid" || -n "${seen[$pid]:-}" ]]; then
      continue
    fi
    seen[$pid]=1
    printf '%s\n' "$pid"
    while read -r child_pid; do
      if [[ -n "$child_pid" ]]; then
        queue+=("$child_pid")
      fi
    done < <(pgrep -P "$pid" 2>/dev/null || true)
  done
}

#######################################
# Sum VmRSS over all related processes and find the largest non-parent value.
# Arguments: $1 - process group id
#            $2 - parent (root) pid
# Outputs:   one line "<total_kb> <largest_non_parent_kb>" on stdout
#######################################
sum_related_rss() {
  local process_group="$1"
  local parent_pid="$2"
  local total_rss=0
  local child_max_rss=0
  local pid rss

  while read -r pid; do
    [[ -n "$pid" ]] || continue
    rss="$(rss_kb_for_pid "$pid")"
    total_rss=$((total_rss + rss))
    if [[ "$pid" != "$parent_pid" ]] && ((rss > child_max_rss)); then
      child_max_rss="$rss"
    fi
  done < <(collect_related_pids "$parent_pid" "$process_group")

  # The trailing newline matters: the caller uses `read`, which returns
  # non-zero on input that ends without one.
  printf '%s %s\n' "$total_rss" "$child_max_rss"
}

#######################################
# Count data rows in a CSV file.
# Blank lines and lines starting with '#' are ignored, and physical line 1 is
# always skipped (treated as the header).
# Arguments: $1 - path to the CSV file
# Outputs:   the row count on stdout; 0 when the path is not a regular file
#######################################
count_csv_rows() {
  local path="$1"
  if [[ ! -f "$path" ]]; then
    echo 0
    return
  fi
  awk '
    BEGIN { count = 0 }
    /^[[:space:]]*$/ { next }
    /^#/ { next }
    NR == 1 { next }
    { count++ }
    END { print count }
  ' "$path"
}

# --- Argument parsing -------------------------------------------------------

RP_NAME=""
RP_ROOT=""
RUN_COMMAND=""
RETAIN_RUNS="${RETAIN_RUNS:-20}"
SAMPLE_MS="${RSS_SAMPLE_MS:-500}"

while (($# > 0)); do
  case "$1" in
    --rp | --root | --command | --retain-runs | --sample-ms)
      # Check for the value first; otherwise "$2" trips `set -u` with an
      # unhelpful "unbound variable" message.
      (($# >= 2)) || usage_error "missing value for $1"
      case "$1" in
        --rp) RP_NAME="$2" ;;
        --root) RP_ROOT="$2" ;;
        --command) RUN_COMMAND="$2" ;;
        --retain-runs) RETAIN_RUNS="$2" ;;
        --sample-ms) SAMPLE_MS="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      usage_error "unknown argument: $1"
      ;;
  esac
done

if [[ -z "$RP_NAME" || -z "$RP_ROOT" || -z "$RUN_COMMAND" ]]; then
  usage_error "--rp, --root and --command are all required"
fi

# Validate the sampling interval before starting anything: the value is used
# for `sleep` and is later parsed as an integer for run-meta.json.
if [[ ! "$SAMPLE_MS" =~ ^[0-9]+$ ]] || ((10#$SAMPLE_MS < 1)); then
  usage_error "sample interval must be a positive integer of milliseconds (got: $SAMPLE_MS)"
fi
SAMPLE_MS=$((10#$SAMPLE_MS))
# Build the seconds value with a literal '.' so the result does not depend on
# the locale's decimal separator.
printf -v SAMPLE_INTERVAL_SECONDS '%d.%03d' $((SAMPLE_MS / 1000)) $((SAMPLE_MS % 1000))

# --- Run setup --------------------------------------------------------------

mkdir -p "$RP_ROOT/runs" "$RP_ROOT/logs"

RUN_SEQ="$(next_seq)"
RUN_ID="run_${RUN_SEQ}"
RUN_DIR="$RP_ROOT/runs/$RUN_ID"
mkdir -p "$RUN_DIR"

STARTED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
START_EPOCH_MS="$(now_epoch_ms)"

export RP_NAME RP_ROOT RUN_SEQ RUN_ID RUN_DIR RUN_COMMAND

# --- Launch and sample ------------------------------------------------------

# Run the command through a login shell under setsid, in the background, with
# its stdout/stderr captured in the run directory.
setsid bash -lc "$RUN_COMMAND" >"$RUN_DIR/stdout.log" 2>"$RUN_DIR/stderr.log" &
CHILD_PID=$!

# Brief pause before asking ps for the child's process group id.
sleep 0.05
PROCESS_GROUP="$(ps -o pgid= -p "$CHILD_PID" 2>/dev/null | tr -d '[:space:]' || true)"
if [[ -z "$PROCESS_GROUP" ]]; then
  PROCESS_GROUP="$CHILD_PID"
fi

PARENT_MAX_RSS_KB=0
CHILD_MAX_RSS_KB=0
AGGREGATE_PEAK_RSS_KB=0

# Sample until the child can no longer be signalled, keeping running maxima.
while kill -0 "$CHILD_PID" 2>/dev/null; do
  parent_rss="$(rss_kb_for_pid "$CHILD_PID")"

  aggregate_rss=0
  child_rss=0
  # A failed or empty sample must not abort the wrapper while the command is
  # still running, so the read is guarded and the values are validated.
  read -r aggregate_rss child_rss < <(sum_related_rss "$PROCESS_GROUP" "$CHILD_PID") || true
  [[ "$aggregate_rss" =~ ^[0-9]+$ ]] || aggregate_rss=0
  [[ "$child_rss" =~ ^[0-9]+$ ]] || child_rss=0

  if ((parent_rss > PARENT_MAX_RSS_KB)); then
    PARENT_MAX_RSS_KB="$parent_rss"
  fi
  if ((child_rss > CHILD_MAX_RSS_KB)); then
    CHILD_MAX_RSS_KB="$child_rss"
  fi
  if ((aggregate_rss > AGGREGATE_PEAK_RSS_KB)); then
    AGGREGATE_PEAK_RSS_KB="$aggregate_rss"
  fi

  sleep "$SAMPLE_INTERVAL_SECONDS"
done

# Collect the command's exit status without letting `set -e` end the script.
EXIT_CODE=0
wait "$CHILD_PID" || EXIT_CODE=$?

# --- Record results ---------------------------------------------------------

FINISHED_AT="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
END_EPOCH_MS="$(now_epoch_ms)"
WALL_MS=$((END_EPOCH_MS - START_EPOCH_MS))

SUCCESS=false
if [[ "$EXIT_CODE" == "0" ]]; then
  SUCCESS=true
fi

VRPS_COUNT="$(count_csv_rows "$RUN_DIR/vrps.csv")"
VAPS_COUNT="$(count_csv_rows "$RUN_DIR/vaps.csv")"

export STARTED_AT FINISHED_AT WALL_MS EXIT_CODE SUCCESS
export PARENT_MAX_RSS_KB CHILD_MAX_RSS_KB AGGREGATE_PEAK_RSS_KB SAMPLE_MS
export VRPS_COUNT VAPS_COUNT

# Artifact paths are relative to RUN_DIR; an empty value becomes null in the
# JSON written below.
export CCR_ARTIFACT_PATH=""
export VRPS_ARTIFACT_PATH=""
export VAPS_ARTIFACT_PATH=""
if [[ -f "$RUN_DIR/result.ccr" ]]; then
  CCR_ARTIFACT_PATH="result.ccr"
fi
if [[ -f "$RUN_DIR/vrps.csv" ]]; then
  VRPS_ARTIFACT_PATH="vrps.csv"
fi
if [[ -f "$RUN_DIR/vaps.csv" ]]; then
  VAPS_ARTIFACT_PATH="vaps.csv"
fi

# Values reach Python through the environment (not interpolated into the
# program text), so arbitrary characters in RUN_COMMAND are handled as data.
python3 - <<'PY' >"$RUN_DIR/run-meta.json"
import json
import os
import socket


def optional(name):
    value = os.environ.get(name, "")
    return value if value else None


meta = {
    "schemaVersion": 1,
    "rp": os.environ["RP_NAME"],
    "runSeq": int(os.environ["RUN_SEQ"]),
    "runId": os.environ["RUN_ID"],
    "host": socket.gethostname(),
    "command": os.environ["RUN_COMMAND"],
    "startedAtRfc3339Utc": os.environ["STARTED_AT"],
    "finishedAtRfc3339Utc": os.environ["FINISHED_AT"],
    "wallMs": int(os.environ["WALL_MS"]),
    "exitCode": int(os.environ["EXIT_CODE"]),
    "success": os.environ["SUCCESS"] == "true",
    "maxRssKb": {
        "parent": int(os.environ["PARENT_MAX_RSS_KB"]),
        "childMax": int(os.environ["CHILD_MAX_RSS_KB"]),
        "aggregatePeak": int(os.environ["AGGREGATE_PEAK_RSS_KB"]),
        "sampleIntervalMs": int(os.environ["SAMPLE_MS"]),
    },
    "artifacts": {
        "vrpsCsv": optional("VRPS_ARTIFACT_PATH"),
        "vapsCsv": optional("VAPS_ARTIFACT_PATH"),
        "ccr": optional("CCR_ARTIFACT_PATH"),
        "stdout": "stdout.log",
        "stderr": "stderr.log",
    },
    "counts": {
        "vrps": int(os.environ["VRPS_COUNT"]),
        "vaps": int(os.environ["VAPS_COUNT"]),
    },
}
print(json.dumps(meta, indent=2, ensure_ascii=False))
PY

# Point "latest" at this run: create the link under a temporary name, then
# rename it over the old one (-T keeps mv from descending into the old target).
ln -sfn "runs/$RUN_ID" "$RP_ROOT/latest.tmp"
mv -Tf "$RP_ROOT/latest.tmp" "$RP_ROOT/latest"

printf '%s\n' "$RUN_ID $RP_NAME exit=$EXIT_CODE wall_ms=$WALL_MS vrps=$VRPS_COUNT vaps=$VAPS_COUNT rss_kb=$AGGREGATE_PEAK_RSS_KB"

# --- Retention --------------------------------------------------------------

# Pruning only happens for a positive integer RETAIN_RUNS; any other value
# keeps every run. 10# avoids octal parsing of values such as "08".
if [[ "$RETAIN_RUNS" =~ ^[0-9]+$ ]] && ((10#$RETAIN_RUNS > 0)); then
  retain_count=$((10#$RETAIN_RUNS))
  # `head -n -N` drops the last N names, leaving everything except the N that
  # sort last.
  mapfile -t old_runs < <(
    find "$RP_ROOT/runs" -maxdepth 1 -type d -name 'run_*' -printf '%f\n' \
      | sort \
      | head -n "-$retain_count" || true
  )
  # Guard the expansion: with `set -u`, bash releases before 4.4 treat
  # "${array[@]}" of an empty array as an unbound variable.
  if ((${#old_runs[@]} > 0)); then
    for old_run in "${old_runs[@]}"; do
      [[ -n "$old_run" ]] || continue
      rm -rf -- "${RP_ROOT:?}/runs/${old_run:?}"
    done
  fi
fi

exit "$EXIT_CODE"