#!/usr/bin/env bash set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PACKAGE_ROOT="${PACKAGE_ROOT:-$(cd "$SCRIPT_DIR/../.." && pwd)}" RUN_SOAK_SCRIPT="${RUN_SOAK_SCRIPT:-$PACKAGE_ROOT/run_soak.sh}" if [[ ! -x "$RUN_SOAK_SCRIPT" && -x "$SCRIPT_DIR/run_soak.sh" ]]; then RUN_SOAK_SCRIPT="$SCRIPT_DIR/run_soak.sh" fi if [[ ! -x "$RUN_SOAK_SCRIPT" && -x "$SCRIPT_DIR/../../run_soak.sh" ]]; then RUN_SOAK_SCRIPT="$SCRIPT_DIR/../../run_soak.sh" fi ENV_FILE="${ENV_FILE:-$PACKAGE_ROOT/.env}" EXPERIMENT_RUN_ROOT="${EXPERIMENT_RUN_ROOT:-$PACKAGE_ROOT}" EXPERIMENT_DIR="${EXPERIMENT_DIR:-$EXPERIMENT_RUN_ROOT/experiments/cache-ablation-$(date -u +%Y%m%dT%H%M%SZ)}" CASE_RUNS="${CASE_RUNS:-10}" RUN_START_INTERVAL_SECS="${RUN_START_INTERVAL_SECS:-600}" FIRST_RUN_DELAY_SECS="${FIRST_RUN_DELAY_SECS:-0}" SNAPSHOT_EXTRA_ARGS="${SNAPSHOT_EXTRA_ARGS:-}" EXPERIMENT_CASE_SET="${EXPERIMENT_CASE_SET:-default}" DRY_RUN="${DRY_RUN:-0}" RUN_SNAPSHOT="${RUN_SNAPSHOT:-1}" BASE_RETAIN_RUNS="${RETAIN_RUNS:-100}" usage() { cat <<'USAGE' Usage: run_cache_ablation_experiment.sh [--dry-run] [--experiment-dir ] Runs a fixed-cadence cache ablation experiment: 1 snapshot warmup, then selected cases x CASE_RUNS delta runs. 
Environment: PACKAGE_ROOT portable package root RUN_SOAK_SCRIPT path to run_soak.sh ENV_FILE base .env for run_soak.sh EXPERIMENT_RUN_ROOT shared run root/state root; default PACKAGE_ROOT EXPERIMENT_DIR experiment metadata output directory CASE_RUNS delta runs per case; default 10 EXPERIMENT_CASE_SET default or cache-only; default runs the original 4-case matrix RUN_START_INTERVAL_SECS fixed start cadence for all runs; default 600 FIRST_RUN_DELAY_SECS delay before the first scheduled run; default 0 SNAPSHOT_EXTRA_ARGS extra rpki args for snapshot warmup DRY_RUN=1 print plan without executing run_soak.sh RUN_SNAPSHOT=0 skip snapshot warmup, useful when continuing a prepared state USAGE } die() { echo "error: $*" >&2 exit 2 } is_true() { case "${1:-}" in 1|true|TRUE|yes|YES|on|ON) return 0 ;; *) return 1 ;; esac } validate_non_negative_int() { local name="$1" local value="$2" [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be a non-negative integer: $value" } validate_positive_int() { local name="$1" local value="$2" [[ "$value" =~ ^[0-9]+$ ]] || die "$name must be a positive integer: $value" [[ "$value" != "0" ]] || die "$name must be > 0" } shell_quote() { printf '%q' "$1" } append_env_assignment() { local env_path="$1" local name="$2" local value="$3" printf '%s=%s\n' "$name" "$(shell_quote "$value")" >> "$env_path" } timestamp_utc() { date -u +%Y-%m-%dT%H:%M:%SZ } format_epoch_utc() { date -u -d "@$1" +%Y-%m-%dT%H:%M:%SZ } case_count() { case "$EXPERIMENT_CASE_SET" in default) printf '%s' 4 ;; cache-only) printf '%s' 3 ;; *) die "EXPERIMENT_CASE_SET must be default or cache-only: $EXPERIMENT_CASE_SET" ;; esac } case_id_for_index() { case "$EXPERIMENT_CASE_SET:$1" in default:1) printf '%s' "case1" ;; default:2) printf '%s' "case2" ;; default:3) printf '%s' "case3" ;; default:4) printf '%s' "case4" ;; cache-only:1) printf '%s' "pp-only" ;; cache-only:2) printf '%s' "object-only" ;; cache-only:3) printf '%s' "pp-object-only" ;; *) die "unknown case index: $1 for set 
$EXPERIMENT_CASE_SET" ;; esac } case_name_for_index() { case "$EXPERIMENT_CASE_SET:$1" in default:1) printf '%s' "all-cache-off" ;; default:2) printf '%s' "prefetch-only" ;; default:3) printf '%s' "prefetch-pp-cache" ;; default:4) printf '%s' "full-cache" ;; cache-only:1) printf '%s' "pp-cache-only" ;; cache-only:2) printf '%s' "object-cache-only" ;; cache-only:3) printf '%s' "pp-cache-object-cache-only" ;; *) die "unknown case index: $1 for set $EXPERIMENT_CASE_SET" ;; esac } case_extra_args_for_index() { case "$EXPERIMENT_CASE_SET:$1" in default:1) printf '%s' "" ;; default:2) printf '%s' "--enable-transport-request-prefetch" ;; default:3) printf '%s' "--enable-transport-request-prefetch --enable-publication-point-validation-cache" ;; default:4) printf '%s' "--enable-transport-request-prefetch --enable-publication-point-validation-cache --enable-roa-validation-cache" ;; cache-only:1) printf '%s' "--enable-publication-point-validation-cache" ;; cache-only:2) printf '%s' "--enable-roa-validation-cache" ;; cache-only:3) printf '%s' "--enable-publication-point-validation-cache --enable-roa-validation-cache" ;; *) die "unknown case index: $1 for set $EXPERIMENT_CASE_SET" ;; esac } case_child_cert_cache_for_index() { case "$EXPERIMENT_CASE_SET:$1" in default:4|cache-only:2|cache-only:3) printf '%s' "1" ;; default:1|default:2|default:3|cache-only:1) printf '%s' "0" ;; *) die "unknown case index: $1 for set $EXPERIMENT_CASE_SET" ;; esac } write_cases_json() { python3 - "$EXPERIMENT_CASE_SET" <<'PY' import json, sys case_set = sys.argv[1] if case_set == "default": cases = [ {"caseId": "case1", "caseName": "all-cache-off", "extraArgs": "", "enableChildCertificateValidationCache": False}, {"caseId": "case2", "caseName": "prefetch-only", "extraArgs": "--enable-transport-request-prefetch", "enableChildCertificateValidationCache": False}, {"caseId": "case3", "caseName": "prefetch-pp-cache", "extraArgs": "--enable-transport-request-prefetch 
--enable-publication-point-validation-cache", "enableChildCertificateValidationCache": False}, {"caseId": "case4", "caseName": "full-cache", "extraArgs": "--enable-transport-request-prefetch --enable-publication-point-validation-cache --enable-roa-validation-cache", "enableChildCertificateValidationCache": True}, ] elif case_set == "cache-only": cases = [ {"caseId": "pp-only", "caseName": "pp-cache-only", "extraArgs": "--enable-publication-point-validation-cache", "enableChildCertificateValidationCache": False}, {"caseId": "object-only", "caseName": "object-cache-only", "extraArgs": "--enable-roa-validation-cache", "enableChildCertificateValidationCache": True}, {"caseId": "pp-object-only", "caseName": "pp-cache-object-cache-only", "extraArgs": "--enable-publication-point-validation-cache --enable-roa-validation-cache", "enableChildCertificateValidationCache": True}, ] else: raise SystemExit(f"unknown case set: {case_set}") print(json.dumps(cases, ensure_ascii=False, indent=8)) PY } max_existing_run_index() { local runs_root="$EXPERIMENT_RUN_ROOT/runs" local max_index if [[ ! 
-d "$runs_root" ]]; then printf '%s\n' 0 return 0 fi max_index="$(find "$runs_root" -maxdepth 1 -type d -name 'run_[0-9][0-9][0-9][0-9]*' -printf '%f\n' \ | sed -E 's/^run_0*([0-9]+)$/\1/' \ | sort -n \ | tail -1 \ | awk '{print $1 + 0}')" printf '%s\n' "${max_index:-0}" } write_config() { local path="$1" local git_sha local git_dirty local git_dirty_py git_sha="$(git -C "$PACKAGE_ROOT" rev-parse --short HEAD 2>/dev/null || printf 'unknown')" if [[ -n "$(git -C "$PACKAGE_ROOT" status --short 2>/dev/null || true)" ]]; then git_dirty=true git_dirty_py=True else git_dirty=false git_dirty_py=False fi python3 - "$path" < "$env_path" fi { printf '\n# cache ablation experiment overrides generated at %s\n' "$(timestamp_utc)" } >> "$env_path" append_env_assignment "$env_path" "RUN_ROOT" "$EXPERIMENT_RUN_ROOT" append_env_assignment "$env_path" "MAX_RUNS" "1" append_env_assignment "$env_path" "INTERVAL_SECS" "0" append_env_assignment "$env_path" "RETAIN_RUNS" "$BASE_RETAIN_RUNS" append_env_assignment "$env_path" "RPKI_EXTRA_ARGS" "$extra_args" append_env_assignment "$env_path" "ENABLE_CHILD_CERTIFICATE_VALIDATION_CACHE" "$child_cert_cache" printf '%s\n' "$env_path" } extract_summary() { local event="$1" local case_id="$2" local case_name="$3" local case_run_index="$4" local planned_epoch="$5" local actual_epoch="$6" local completed_epoch="$7" local schedule_lag_ms="$8" local extra_args="$9" local child_cert_cache="${10}" local max_index_before="${11}" local max_index_after="${12}" local run_dir="$EXPERIMENT_RUN_ROOT/runs/$(printf 'run_%04d' "$max_index_after")" local summary_path="$run_dir/run-summary.json" local meta_path="$run_dir/run-meta.json" python3 - "$event" "$case_id" "$case_name" "$case_run_index" "$planned_epoch" "$actual_epoch" \ "$completed_epoch" "$schedule_lag_ms" "$extra_args" "$child_cert_cache" "$max_index_before" \ "$max_index_after" "$summary_path" "$meta_path" <<'PY' import json, sys ( event, case_id, case_name, case_run_index, planned_epoch, 
actual_epoch, completed_epoch, schedule_lag_ms, extra_args, child_cert_cache, max_index_before, max_index_after, summary_path, meta_path, ) = sys.argv[1:] def ts(epoch): import datetime if int(epoch) <= 0: return None return datetime.datetime.fromtimestamp(int(epoch), datetime.timezone.utc).isoformat().replace("+00:00", "Z") record = { "event": event, "caseId": case_id, "caseName": case_name, "caseRunIndex": int(case_run_index), "plannedStartEpoch": int(planned_epoch), "plannedStartRfc3339Utc": ts(planned_epoch), "actualStartEpoch": int(actual_epoch), "actualStartRfc3339Utc": ts(actual_epoch), "completedEpoch": int(completed_epoch), "completedRfc3339Utc": ts(completed_epoch), "scheduleLagMs": int(schedule_lag_ms), "extraArgs": extra_args, "enableChildCertificateValidationCache": child_cert_cache == "1", "maxRunIndexBefore": int(max_index_before), "maxRunIndexAfter": int(max_index_after), "summaryPath": summary_path, "metaPath": meta_path, } try: with open(meta_path, "r", encoding="utf-8") as f: meta = json.load(f) except Exception as exc: record["metaError"] = str(exc) else: record["syncMode"] = meta.get("syncMode") or meta.get("sync_mode") record["snapshotReason"] = meta.get("snapshotReason") or meta.get("snapshot_reason") record["runMetaStatus"] = meta.get("status") try: with open(summary_path, "r", encoding="utf-8") as f: summary = json.load(f) except Exception as exc: record["summaryError"] = str(exc) else: record["status"] = summary.get("status") record["runId"] = summary.get("runId") record["runSeq"] = summary.get("runSeq") counts = summary.get("reportCounts") or {} record["wallMs"] = summary.get("wallMs") record["vrps"] = counts.get("vrps") record["vaps"] = counts.get("aspas") record["publicationPoints"] = counts.get("publicationPoints") record["warnings"] = counts.get("warnings") metrics = summary.get("processMetrics") or {} record["maxRssKb"] = metrics.get("maxRssKb") record["cpuPercent"] = metrics.get("cpuPercent") stage = summary.get("stageTiming") or {} 
record["stageTimingMs"] = { k: v for k, v in stage.items() if isinstance(v, (int, float)) and "_ms" in k } for key in [ "download_bytes_total", "download_event_count", "enable_transport_request_prefetch", "enable_publication_point_validation_cache", "enable_roa_validation_cache", "enable_child_certificate_validation_cache", "publication_point_cache_index_load", "publication_point_cache_index_refresh", "roa_validation_cache", ]: if key in stage: record[key] = stage.get(key) analysis_counts = stage.get("analysis_counts") or {} interesting = [ "publication_point_cache_lookup_total", "publication_point_cache_reuse_hits", "publication_point_cache_miss_total", "roa_validation_cache_hit_roas", "roa_validation_cache_miss_roas", "child_certificate_cache_hit", "child_certificate_cache_lookup", "child_certificate_cache_miss_not_found", "fresh_publication_points", "fresh_manifest_files_total", ] record["cacheCounts"] = {k: analysis_counts.get(k) for k in interesting if k in analysis_counts} record["repoSyncStats"] = summary.get("repoSyncStats") print(json.dumps(record, ensure_ascii=False, sort_keys=True)) PY } run_soak_once() { local event="$1" local case_id="$2" local case_name="$3" local case_run_index="$4" local planned_epoch="$5" local extra_args="$6" local child_cert_cache="$7" local max_index_before local max_index_after local actual_epoch local completed_epoch local schedule_lag_ms local effective_env max_index_before="$(max_existing_run_index)" actual_epoch="$(date +%s)" if (( actual_epoch > planned_epoch )); then schedule_lag_ms=$(( (actual_epoch - planned_epoch) * 1000 )) else schedule_lag_ms=0 fi echo "[$(timestamp_utc)] start event=$event case=$case_id run=$case_run_index planned=$(format_epoch_utc "$planned_epoch") lag_ms=$schedule_lag_ms args='$extra_args'" >&2 if is_true "$DRY_RUN"; then completed_epoch="$(date +%s)" max_index_after="$max_index_before" python3 - "$event" "$case_id" "$case_name" "$case_run_index" "$planned_epoch" "$actual_epoch" 
"$completed_epoch" "$schedule_lag_ms" "$extra_args" "$child_cert_cache" "$max_index_before" "$max_index_after" <<'PY' import json, sys keys = ["event","caseId","caseName","caseRunIndex","plannedStartEpoch","actualStartEpoch","completedEpoch","scheduleLagMs","extraArgs","enableChildCertificateValidationCache","maxRunIndexBefore","maxRunIndexAfter"] values = sys.argv[1:] record = dict(zip(keys, values)) record["caseRunIndex"] = int(record["caseRunIndex"]) record["plannedStartEpoch"] = int(record["plannedStartEpoch"]) record["actualStartEpoch"] = int(record["actualStartEpoch"]) record["completedEpoch"] = int(record["completedEpoch"]) record["scheduleLagMs"] = int(record["scheduleLagMs"]) record["enableChildCertificateValidationCache"] = record["enableChildCertificateValidationCache"] == "1" record["maxRunIndexBefore"] = int(record["maxRunIndexBefore"]) record["maxRunIndexAfter"] = int(record["maxRunIndexAfter"]) record["dryRun"] = True print(json.dumps(record, ensure_ascii=False, sort_keys=True)) PY return 0 fi effective_env="$(write_effective_env "$event" "$case_id" "$case_run_index" "$extra_args" "$child_cert_cache")" env \ PACKAGE_ROOT="$PACKAGE_ROOT" \ ENV_FILE="$effective_env" \ "$RUN_SOAK_SCRIPT" >&2 completed_epoch="$(date +%s)" max_index_after="$(max_existing_run_index)" extract_summary "$event" "$case_id" "$case_name" "$case_run_index" "$planned_epoch" \ "$actual_epoch" "$completed_epoch" "$schedule_lag_ms" "$extra_args" "$child_cert_cache" \ "$max_index_before" "$max_index_after" } while [[ $# -gt 0 ]]; do case "$1" in --dry-run) DRY_RUN=1 ;; --experiment-dir) shift EXPERIMENT_DIR="${1:?--experiment-dir requires a value}" ;; --help|-h) usage exit 0 ;; *) die "unknown argument: $1" ;; esac shift done command -v python3 >/dev/null 2>&1 || die "python3 is required" command -v date >/dev/null 2>&1 || die "date is required" validate_positive_int "CASE_RUNS" "$CASE_RUNS" validate_positive_int "RUN_START_INTERVAL_SECS" "$RUN_START_INTERVAL_SECS" 
validate_non_negative_int "FIRST_RUN_DELAY_SECS" "$FIRST_RUN_DELAY_SECS"
# BASE_RETAIN_RUNS is populated from the RETAIN_RUNS environment variable, so
# report the name the operator actually sets.
validate_positive_int "RETAIN_RUNS" "$BASE_RETAIN_RUNS"
[[ -x "$RUN_SOAK_SCRIPT" ]] || die "missing executable run_soak.sh: $RUN_SOAK_SCRIPT"

# Resolve the case count once, before any output is created. case_count calls
# die for an unknown EXPERIMENT_CASE_SET; inside a `for ... in $(...)` word
# list that failure would only end the subshell and the loop would silently
# run zero cases, so propagate it explicitly here.
case_total="$(case_count)" || exit $?

# The validators accept any run of decimal digits, but shell arithmetic reads
# a leading zero as octal ("0600" -> 384, "08" -> error). Force base 10.
case_runs=$(( 10#$CASE_RUNS ))
run_start_interval_secs=$(( 10#$RUN_START_INTERVAL_SECS ))
first_run_delay_secs=$(( 10#$FIRST_RUN_DELAY_SECS ))
# Values such as "00" are digit strings that are not the literal "0"; reject
# them once their numeric value is known.
(( case_runs > 0 )) || die "CASE_RUNS must be > 0"
(( run_start_interval_secs > 0 )) || die "RUN_START_INTERVAL_SECS must be > 0"

#######################################
# Sleep until a planned start time unless this is a dry run.
# Globals:   DRY_RUN (read)
# Arguments: $1 - planned start, epoch seconds
#            $2 - label describing the run, used in the wait message
# Outputs:   one "waiting ..." line on stdout when it actually sleeps
# Returns:   0; does nothing if the planned time has already passed
#######################################
wait_until_planned_start() {
  local planned_epoch="$1"
  local label="$2"
  local now_epoch
  local sleep_secs

  if is_true "$DRY_RUN"; then
    return 0
  fi
  now_epoch="$(date +%s)"
  if (( now_epoch < planned_epoch )); then
    sleep_secs=$(( planned_epoch - now_epoch ))
    echo "[$(timestamp_utc)] waiting ${sleep_secs}s for $label target=$(format_epoch_utc "$planned_epoch")"
    sleep "$sleep_secs"
  fi
}

mkdir -p "$EXPERIMENT_DIR" "$EXPERIMENT_RUN_ROOT"
SUMMARY_JSONL="$EXPERIMENT_DIR/experiment-summary.jsonl"
CONFIG_JSON="$EXPERIMENT_DIR/experiment-config.json"
# Start from an empty summary file; each run appends one JSON line to it.
: > "$SUMMARY_JSONL"
write_config "$CONFIG_JSON"

echo "experiment_dir=$EXPERIMENT_DIR"
echo "experiment_run_root=$EXPERIMENT_RUN_ROOT"
echo "run_soak_script=$RUN_SOAK_SCRIPT"
echo "case_set=$EXPERIMENT_CASE_SET case_runs=$CASE_RUNS run_start_interval_secs=$RUN_START_INTERVAL_SECS dry_run=$DRY_RUN"

# Every run (warmup included) takes the next slot on one fixed cadence:
#   planned start = first_run_epoch + slot * run_start_interval_secs
first_run_epoch=$(( $(date +%s) + first_run_delay_secs ))
run_index_global=0

if is_true "$RUN_SNAPSHOT"; then
  planned_epoch=$(( first_run_epoch + run_index_global * run_start_interval_secs ))
  wait_until_planned_start "$planned_epoch" "snapshot"
  # run_soak_once prints its JSON record on stdout; tee shows it and appends it
  # to the summary. With pipefail (set at the top of the script) a failing run
  # fails the pipeline.
  run_soak_once "snapshot-warmup" "warmup" "snapshot-warmup" 1 "$planned_epoch" "$SNAPSHOT_EXTRA_ARGS" "0" \
    | tee -a "$SUMMARY_JSONL"
  run_index_global=$(( run_index_global + 1 ))
fi

for (( case_index = 1; case_index <= case_total; case_index++ )); do
  case_id="$(case_id_for_index "$case_index")"
  case_name="$(case_name_for_index "$case_index")"
  extra_args="$(case_extra_args_for_index "$case_index")"
  child_cert_cache="$(case_child_cert_cache_for_index "$case_index")"

  for (( case_run_index = 1; case_run_index <= case_runs; case_run_index++ )); do
    planned_epoch=$(( first_run_epoch + run_index_global * run_start_interval_secs ))
    wait_until_planned_start "$planned_epoch" "$case_id run=$case_run_index"
    run_soak_once "delta" "$case_id" "$case_name" "$case_run_index" "$planned_epoch" "$extra_args" "$child_cert_cache" \
      | tee -a "$SUMMARY_JSONL"
    run_index_global=$(( run_index_global + 1 ))
  done
done

echo "[$(timestamp_utc)] cache ablation experiment complete summary=$SUMMARY_JSONL"