diff --git a/monitor/grafana/dashboards/ours-rp-inter-rp.json b/monitor/grafana/dashboards/ours-rp-inter-rp.json index ee430ce..474df02 100644 --- a/monitor/grafana/dashboards/ours-rp-inter-rp.json +++ b/monitor/grafana/dashboards/ours-rp-inter-rp.json @@ -47,7 +47,7 @@ }, "targets": [ { - "expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_ours\"})", + "expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_ours\"})", "legendFormat": "only ours", "refId": "A", "instant": true @@ -92,7 +92,7 @@ }, "targets": [ { - "expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_routinator\"})", + "expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_routinator\"})", "legendFormat": "only routinator", "refId": "A", "instant": true @@ -137,7 +137,7 @@ }, "targets": [ { - "expr": "max(inter_rp_vaps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"})", + "expr": "max(inter_rp_vaps_diff{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\"})", "legendFormat": "vap diff", "refId": "A", "instant": true @@ -182,7 +182,7 @@ }, "targets": [ { - "expr": "max(inter_rp_vrps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"})", + "expr": "max(inter_rp_vrps_diff{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\"})", "legendFormat": "vrp diff", "refId": "A", "instant": true @@ -226,8 +226,8 @@ }, "targets": [ { - "expr": "inter_rp_run_wall_seconds{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}", - "legendFormat": "{{exported_rp}}", + "expr": "inter_rp_run_wall_seconds{exported_instance=~\".*inter-rp\",rp=~\"ours-rp|routinator\"}", + 
"legendFormat": "{{rp}}", "refId": "A" } ] @@ -269,8 +269,8 @@ }, "targets": [ { - "expr": "inter_rp_run_max_rss_bytes{exported_instance=\"remote200-inter-rp\",kind=\"aggregate_peak\",exported_rp=~\"ours-rp|routinator\"}", - "legendFormat": "{{exported_rp}}", + "expr": "inter_rp_run_max_rss_bytes{exported_instance=~\".*inter-rp\",kind=\"aggregate_peak\",rp=~\"ours-rp|routinator\"}", + "legendFormat": "{{rp}}", "refId": "A" } ] @@ -312,8 +312,8 @@ }, "targets": [ { - "expr": "inter_rp_vrps{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}", - "legendFormat": "{{exported_rp}}", + "expr": "inter_rp_vrps{exported_instance=~\".*inter-rp\",rp=~\"ours-rp|routinator\"}", + "legendFormat": "{{rp}}", "refId": "A" } ] @@ -355,8 +355,8 @@ }, "targets": [ { - "expr": "inter_rp_vaps{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}", - "legendFormat": "{{exported_rp}}", + "expr": "inter_rp_vaps{exported_instance=~\".*inter-rp\",rp=~\"ours-rp|routinator\"}", + "legendFormat": "{{rp}}", "refId": "A" } ] @@ -388,24 +388,24 @@ }, "targets": [ { - "expr": "inter_rp_run_seq{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}", + "expr": "inter_rp_run_seq{exported_instance=~\".*inter-rp\",rp=~\"ours-rp|routinator\"}", "format": "table", "instant": true, - "legendFormat": "{{exported_rp}} seq", + "legendFormat": "{{rp}} seq", "refId": "A" }, { - "expr": "inter_rp_run_success{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}", + "expr": "inter_rp_run_success{exported_instance=~\".*inter-rp\",rp=~\"ours-rp|routinator\"}", "format": "table", "instant": true, - "legendFormat": "{{exported_rp}} success", + "legendFormat": "{{rp}} success", "refId": "B" }, { - "expr": "inter_rp_run_wall_seconds{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}", + "expr": "inter_rp_run_wall_seconds{exported_instance=~\".*inter-rp\",rp=~\"ours-rp|routinator\"}", "format": 
"table", "instant": true, - "legendFormat": "{{exported_rp}} wall", + "legendFormat": "{{rp}} wall", "refId": "C" } ] @@ -437,14 +437,14 @@ }, "targets": [ { - "expr": "inter_rp_vrps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}", + "expr": "inter_rp_vrps_diff{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\"}", "format": "table", "instant": true, "legendFormat": "vrps ours-rp-routinator", "refId": "A" }, { - "expr": "inter_rp_vaps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}", + "expr": "inter_rp_vaps_diff{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\"}", "format": "table", "instant": true, "legendFormat": "vaps ours-rp-routinator", @@ -489,8 +489,8 @@ }, "targets": [ { - "expr": "inter_rp_artifact_age_seconds{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}", - "legendFormat": "{{exported_rp}}", + "expr": "inter_rp_artifact_age_seconds{exported_instance=~\".*inter-rp\",rp=~\"ours-rp|routinator\"}", + "legendFormat": "{{rp}}", "refId": "A" } ] @@ -533,8 +533,8 @@ }, "targets": [ { - "expr": "inter_rp_repo_sync_total{exported_instance=\"remote200-inter-rp\",state=~\"available|failed\",exported_rp=~\"ours-rp|routinator\"}", - "legendFormat": "{{exported_rp}} {{state}}", + "expr": "inter_rp_repo_sync_total{exported_instance=~\".*inter-rp\",state=~\"available|failed\",rp=~\"ours-rp|routinator\"}", + "legendFormat": "{{rp}} {{state}}", "refId": "A" } ] @@ -577,7 +577,7 @@ }, "targets": [ { - "expr": "inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}", + "expr": "inter_rp_repo_sync_overlap_total{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\"}", "legendFormat": "{{class}}", "refId": "A" } @@ -651,7 +651,7 @@ }, "targets": [ { - "expr": 
"inter_rp_repo_sync_diff_info{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}", + "expr": "inter_rp_repo_sync_diff_info{exported_instance=~\".*inter-rp\",left=\"ours-rp\",right=\"routinator\"}", "format": "table", "instant": true, "legendFormat": "{{class}} #{{rank}}", diff --git a/monitor/grafana/dashboards/ours-rp-soak-overview.json b/monitor/grafana/dashboards/ours-rp-soak-overview.json index 805bb56..0762bc0 100644 --- a/monitor/grafana/dashboards/ours-rp-soak-overview.json +++ b/monitor/grafana/dashboards/ours-rp-soak-overview.json @@ -186,6 +186,205 @@ "title": "Publication Points", "type": "stat" }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "x": 0, + "y": 4, + "w": 6, + "h": 4 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "expr": "ours_rp_run_sequence", + "legendFormat": "seq", + "refId": "A" + } + ], + "title": "Latest Run Sequence", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "green", + "value": 98 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "x": 6, + "y": 4, + "w": 6, + "h": 4 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + 
"textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "expr": "100 * sum by (job, instance, exported_instance) (ours_rp_repo_terminal_state_count{terminal_state=\"publication_point_cache\"}) / sum by (job, instance, exported_instance) (ours_rp_publication_points)", + "legendFormat": "PP cache hit ratio", + "refId": "A" + } + ], + "title": "Latest PP Cache Hit Ratio", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "none", + "decimals": 0 + }, + "overrides": [] + }, + "gridPos": { + "x": 12, + "y": 4, + "w": 6, + "h": 4 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "expr": "ours_rp_vrps{kind=\"total\"}", + "legendFormat": "VRPs raw", + "refId": "A" + } + ], + "title": "VRPs", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "none", + "decimals": 0 + }, + "overrides": [] + }, + "gridPos": { + "x": 18, + "y": 4, + "w": 6, + "h": 4 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "expr": "ours_rp_vaps", + "legendFormat": "VAPs", + "refId": "A" + } + ], + "title": "VAPs", + "type": "stat" + }, { "datasource": { "type": "prometheus", @@ -334,186 +533,6 @@ "title": "Large Publication Points by Object Count", "type": "timeseries" }, - { - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - 
"fieldConfig": { - "defaults": { - "decimals": 0, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "x": 0, - "y": 4, - "w": 6, - "h": 4 - }, - "id": 9, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.3.1", - "targets": [ - { - "expr": "ours_rp_run_sequence", - "legendFormat": "seq", - "refId": "A" - } - ], - "title": "Latest Run Sequence", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - "fieldConfig": { - "defaults": { - "decimals": 0, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "x": 6, - "y": 4, - "w": 6, - "h": 4 - }, - "id": 10, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.3.1", - "targets": [ - { - "expr": "ours_rp_run_success", - "legendFormat": "success", - "refId": "A" - } - ], - "title": "Latest Run Success", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - "fieldConfig": { - "defaults": { - "unit": "none", - "decimals": 0 - }, - "overrides": [] - }, - "gridPos": { - "x": 12, - "y": 4, - "w": 6, - "h": 4 - }, - "id": 11, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.3.1", - "targets": [ - { - "expr": "ours_rp_vrps{kind=\"total\"}", - "legendFormat": "VRPs raw", - "refId": "A" - } - ], - "title": "VRPs", - "type": "stat" - }, - { - "datasource": { - 
"type": "prometheus", - "uid": "Prometheus" - }, - "fieldConfig": { - "defaults": { - "unit": "none", - "decimals": 0 - }, - "overrides": [] - }, - "gridPos": { - "x": 18, - "y": 4, - "w": 6, - "h": 4 - }, - "id": 12, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.3.1", - "targets": [ - { - "expr": "ours_rp_vaps", - "legendFormat": "VAPs", - "refId": "A" - } - ], - "title": "VAPs", - "type": "stat" - }, { "datasource": { "type": "prometheus", @@ -586,7 +605,7 @@ "gridPos": { "x": 0, "y": 24, - "w": 24, + "w": 12, "h": 8 }, "id": 14, @@ -615,6 +634,70 @@ "title": "Max RSS Over Time", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "decimals": 2, + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "green", + "value": 98 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "x": 12, + "y": 24, + "w": 12, + "h": 8 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "100 * sum by (job, instance, exported_instance) (ours_rp_repo_terminal_state_count{terminal_state=\"publication_point_cache\"}) / sum by (job, instance, exported_instance) (ours_rp_publication_points)", + "legendFormat": "PP cache hit ratio", + "refId": "A" + } + ], + "title": "PP Cache Hit Ratio", + "type": "timeseries" + }, { "datasource": { "type": "prometheus", diff --git a/scripts/inter_rp/inter_rp_ours_routinator_exporter.py 
b/scripts/inter_rp/inter_rp_ours_routinator_exporter.py index 64448ef..2d6d757 100755 --- a/scripts/inter_rp/inter_rp_ours_routinator_exporter.py +++ b/scripts/inter_rp/inter_rp_ours_routinator_exporter.py @@ -243,6 +243,7 @@ def load_routinator_repo_sets(errors): failed = set() duration = {} object_counts = {} + publication_point_states = {} for metric in ["routinator_rrdp_status", "routinator_rsync_status"]: for labels, value in parse_prometheus_samples(text, metric): uri = labels.get("uri") @@ -254,6 +255,20 @@ def load_routinator_repo_sets(errors): success.add(uri) else: failed.add(uri) + for labels, value in parse_prometheus_samples(text, "routinator_repository_publication_points_total"): + uri = labels.get("uri") + state = labels.get("state", "unknown") + if not uri: + continue + total.add(uri) + publication_point_states.setdefault(uri, {})[state] = publication_point_states.setdefault(uri, {}).get(state, 0.0) + value + for uri, states in publication_point_states.items(): + valid_count = states.get("valid", 0.0) + non_valid_count = sum(value for state, value in states.items() if state != "valid") + if valid_count > 0: + success.add(uri) + elif non_valid_count > 0: + failed.add(uri) for metric in ["routinator_rrdp_duration", "routinator_rsync_duration"]: for labels, value in parse_prometheus_samples(text, metric): uri = labels.get("uri") @@ -261,6 +276,7 @@ def load_routinator_repo_sets(errors): duration[uri] = max(duration.get(uri, 0.0), value) for labels, value in parse_prometheus_samples(text, "routinator_repository_objects_total"): add_object_count(object_counts, labels.get("uri"), labels.get("type"), value) + failed = failed - success return {"total": total, "success": success, "failed": failed, "duration": duration, "object_counts": object_counts} def emit_repo_diff_metrics(out, errors): diff --git a/scripts/soak/build_portable_soak_package.sh b/scripts/soak/build_portable_soak_package.sh index 477c4fc..fcbae31 100755 --- 
a/scripts/soak/build_portable_soak_package.sh +++ b/scripts/soak/build_portable_soak_package.sh @@ -87,6 +87,7 @@ mkdir -p "$STAGE_DIR/bin" "$STAGE_DIR/fixtures" "$STAGE_DIR/scripts" "$STAGE_DIR install -m 0755 "$SCRIPT_DIR/run_soak.sh" "$STAGE_DIR/run_soak.sh" install -m 0755 "$SCRIPT_DIR/run_24h_soak_with_metrics.sh" "$STAGE_DIR/run_24h_soak_with_metrics.sh" +install -m 0755 "$SCRIPT_DIR/fixed_phase_loop.sh" "$STAGE_DIR/scripts/soak/fixed_phase_loop.sh" install -m 0755 "$SCRIPT_DIR/hourly_soak_report.py" "$STAGE_DIR/scripts/soak/hourly_soak_report.py" install -m 0644 "$SCRIPT_DIR/portable-soak.env.example" "$STAGE_DIR/.env" install -m 0644 "$SCRIPT_DIR/portable-soak.env.example" "$STAGE_DIR/portable-soak.env.example" diff --git a/scripts/soak/fixed_phase_loop.sh b/scripts/soak/fixed_phase_loop.sh new file mode 100644 index 0000000..8deb8c9 --- /dev/null +++ b/scripts/soak/fixed_phase_loop.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +set -euo pipefail + +NAME="fixed-phase" +CYCLE_SECS="${PHASE_CYCLE_SECS:-900}" +OFFSET_SECS="${PHASE_OFFSET_SECS:-0}" +LOCK_FILE="${RPKI_HEAVY_LOCK:-/var/lock/rpki-heavy-run.lock}" +LOCK_WAIT_SECS="${LOCK_WAIT_SECS:-30}" + +usage() { + cat <<'USAGE' +Usage: + fixed_phase_loop.sh [--name <name>] [--cycle-secs <seconds>] [--offset-secs <seconds>] + [--lock-file <path>] [--lock-wait-secs <seconds>] -- <command> [args...] + +Runs one command at fixed wall-clock phases. Missed phases are skipped rather than caught up, +which keeps independent RP jobs from drifting into each other. A shared flock protects against +unexpected overruns. 
+USAGE +} + +die() { + echo "error: $*" >&2 + exit 2 +} + +is_non_negative_int() { + [[ "$1" =~ ^[0-9]+$ ]] +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --name) + shift + NAME="${1:?--name requires a value}" + ;; + --cycle-secs) + shift + CYCLE_SECS="${1:?--cycle-secs requires a value}" + ;; + --offset-secs) + shift + OFFSET_SECS="${1:?--offset-secs requires a value}" + ;; + --lock-file) + shift + LOCK_FILE="${1:?--lock-file requires a value}" + ;; + --lock-wait-secs) + shift + LOCK_WAIT_SECS="${1:?--lock-wait-secs requires a value}" + ;; + --help|-h) + usage + exit 0 + ;; + --) + shift + break + ;; + *) + die "unknown argument: $1" + ;; + esac + shift +done + +[[ $# -gt 0 ]] || die "missing command after --" +is_non_negative_int "$CYCLE_SECS" || die "--cycle-secs must be a non-negative integer" +is_non_negative_int "$OFFSET_SECS" || die "--offset-secs must be a non-negative integer" +is_non_negative_int "$LOCK_WAIT_SECS" || die "--lock-wait-secs must be a non-negative integer" +(( CYCLE_SECS > 0 )) || die "--cycle-secs must be > 0" +(( OFFSET_SECS < CYCLE_SECS )) || die "--offset-secs must be < --cycle-secs" + +mkdir -p "$(dirname "$LOCK_FILE")" + +timestamp_utc() { + date -u +%Y-%m-%dT%H:%M:%SZ +} + +format_epoch_utc() { + date -u -d "@$1" +%Y-%m-%dT%H:%M:%SZ +} + +LAST_TARGET_EPOCH=-1 + +next_phase_epoch() { + local now="$1" + local shifted=$((now - OFFSET_SECS)) + local remainder=$((shifted % CYCLE_SECS)) + if (( remainder < 0 )); then + remainder=$((remainder + CYCLE_SECS)) + fi + local sleep_secs=$((CYCLE_SECS - remainder)) + if (( sleep_secs == CYCLE_SECS )); then + sleep_secs=0 + fi + printf '%s\n' "$((now + sleep_secs))" +} + +while true; do + now_epoch="$(date +%s)" + target_epoch="$(next_phase_epoch "$now_epoch")" + if (( target_epoch <= LAST_TARGET_EPOCH )); then + target_epoch=$((LAST_TARGET_EPOCH + CYCLE_SECS)) + fi + LAST_TARGET_EPOCH="$target_epoch" + sleep_secs=$((target_epoch - now_epoch)) + echo "[$(timestamp_utc)] $NAME 
next_phase=$(format_epoch_utc "$target_epoch") sleep=${sleep_secs}s cycle=${CYCLE_SECS}s offset=${OFFSET_SECS}s" >&2 + if (( sleep_secs > 0 )); then + sleep "$sleep_secs" + fi + + started_epoch="$(date +%s)" + echo "[$(timestamp_utc)] $NAME phase_start target=$(format_epoch_utc "$target_epoch") lock=$LOCK_FILE wait=${LOCK_WAIT_SECS}s" >&2 + set +e + flock -w "$LOCK_WAIT_SECS" "$LOCK_FILE" "$@" + code=$? + set -e + ended_epoch="$(date +%s)" + if (( code == 0 )); then + echo "[$(timestamp_utc)] $NAME phase_done exit=0 elapsed=$((ended_epoch - started_epoch))s" >&2 + else + echo "[$(timestamp_utc)] $NAME phase_done exit=$code elapsed=$((ended_epoch - started_epoch))s skipped_or_failed=1" >&2 + fi +done diff --git a/scripts/soak/portable-soak.env.example b/scripts/soak/portable-soak.env.example index cb13fe0..d8f7be9 100644 --- a/scripts/soak/portable-soak.env.example +++ b/scripts/soak/portable-soak.env.example @@ -17,6 +17,17 @@ STOP_AFTER_SECS=0 # 示例:RIRS=apnic,arin 或 RIRS=afrinic,apnic,arin,lacnic,ripe RIRS=afrinic,apnic,arin,lacnic,ripe +# TAL/TA 输入模式。 +# file-with-ta:使用 package 内置 fixtures/tal + fixtures/ta,完全离线固定输入。 +# file-live-ta:使用 package 内置 fixtures/tal;每轮后台 best-effort 刷新 TA 到 state/live-ta, +# 子进程不等待刷新,直接使用当前已有的 state/live-ta,首次缺失时从 fixtures/ta 初始化。 +# url:直接把 TAL URL 传给子进程,由子进程处理 TAL/TA 获取。 +TAL_INPUT_MODE=file-with-ta + +# file-live-ta 后台刷新 TA 的 curl 超时配置。刷新失败只写日志,不阻断本轮 run。 +LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS=15 +LIVE_TA_REFRESH_MAX_TIME_SECS=120 + # 运行根目录。默认使用 package 根目录;如需把产物写到独立数据盘,可改成绝对路径。 RUN_ROOT="${PACKAGE_ROOT}" diff --git a/scripts/soak/run_soak.sh b/scripts/soak/run_soak.sh index c915e8f..f81e557 100755 --- a/scripts/soak/run_soak.sh +++ b/scripts/soak/run_soak.sh @@ -43,6 +43,9 @@ META_DIR="${META_DIR:-$STATE_ROOT/meta}" TMP_DIR="${TMP_DIR:-$RUN_ROOT/tmp}" RSYNC_MIRROR_ROOT="${RSYNC_MIRROR_ROOT:-$STATE_ROOT/rsync-mirror}" INVALID_ROOT="$STATE_ROOT/invalid" +LIVE_TA_REFRESH_DIR="${LIVE_TA_REFRESH_DIR:-$META_DIR/live-ta-refresh}" 
+LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS="${LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS:-15}" +LIVE_TA_REFRESH_MAX_TIME_SECS="${LIVE_TA_REFRESH_MAX_TIME_SECS:-120}" RPKI_BIN="$BIN_DIR/rpki" RPKI_DAEMON_BIN="$BIN_DIR/rpki_daemon" @@ -194,22 +197,129 @@ live_ta_file_for_rir() { printf '%s' "$STATE_ROOT/live-ta/$(basename "$(tal_file_for_rir "$1")" .tal).cer" } +live_ta_refresh_pid_file_for_rir() { + printf '%s' "$LIVE_TA_REFRESH_DIR/$1.pid" +} + refresh_live_ta_for_rir() { local rir_name="$1" + local run_id="${2:-manual}" + local log_path="${3:-}" local tal_path local ta_uri local ta_file local tmp_file + if [[ -n "$log_path" ]]; then + mkdir -p "$(dirname "$log_path")" + exec >> "$log_path" 2>&1 + fi + echo "live-ta-refresh start run=$run_id rir=$rir_name at=$(date -u +%Y-%m-%dT%H:%M:%SZ)" tal_path="$(tal_file_for_rir "$rir_name")" ta_uri="$(tal_https_uri_from_fixture "$tal_path")" - [[ -n "$ta_uri" ]] || die "missing http(s) TA URI in TAL fixture for $rir_name: $tal_path" + if [[ -z "$ta_uri" ]]; then + echo "live-ta-refresh failed rir=$rir_name reason=missing_https_uri tal=$tal_path" + return 1 + fi ta_file="$(live_ta_file_for_rir "$rir_name")" mkdir -p "$(dirname "$ta_file")" - tmp_file="${ta_file}.tmp.$$" - curl -fsSL --connect-timeout 15 --max-time 120 "$ta_uri" -o "$tmp_file" \ - || { rm -f "$tmp_file"; die "failed to refresh TA for $rir_name from $ta_uri"; } - [[ -s "$tmp_file" ]] || { rm -f "$tmp_file"; die "empty TA download for $rir_name from $ta_uri"; } + tmp_file="${ta_file}.tmp.$$.$RANDOM" + if ! curl -fsSL --connect-timeout "$LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" --max-time "$LIVE_TA_REFRESH_MAX_TIME_SECS" "$ta_uri" -o "$tmp_file"; then + rm -f "$tmp_file" + echo "live-ta-refresh failed rir=$rir_name reason=curl uri=$ta_uri" + return 1 + fi + if [[ ! 
-s "$tmp_file" ]]; then + rm -f "$tmp_file" + echo "live-ta-refresh failed rir=$rir_name reason=empty_download uri=$ta_uri" + return 1 + fi mv "$tmp_file" "$ta_file" + echo "live-ta-refresh success rir=$rir_name uri=$ta_uri output=$ta_file bytes=$(wc -c < "$ta_file" | tr -d ' ')" +} + +ensure_live_ta_for_rir() { + local rir_name="$1" + local live_ta_file + local fixture_ta_file + live_ta_file="$(live_ta_file_for_rir "$rir_name")" + if [[ -s "$live_ta_file" ]]; then + return 0 + fi + fixture_ta_file="$(ta_file_for_rir "$rir_name")" + [[ -s "$fixture_ta_file" ]] || die "missing live TA and fixture TA for $rir_name: $live_ta_file / $fixture_ta_file" + mkdir -p "$(dirname "$live_ta_file")" + cp "$fixture_ta_file" "$live_ta_file" +} + +reap_finished_live_ta_refresh_for_rir() { + local rir_name="$1" + local pid_file + local pid + local pid_state + local pid_file_mtime + local now_epoch + local stale_after_secs + pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")" + [[ -f "$pid_file" ]] || return 0 + pid="$(cat "$pid_file" 2>/dev/null || true)" + if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" >/dev/null 2>&1; then + pid_state="" + if [[ -r "/proc/$pid/stat" ]]; then + pid_state="$(awk '{ print $3 }' "/proc/$pid/stat" 2>/dev/null || true)" + fi + if [[ "$pid_state" == "Z" ]]; then + wait "$pid" >/dev/null 2>&1 || true + rm -f "$pid_file" + return 0 + fi + pid_file_mtime="$(stat -c %Y "$pid_file" 2>/dev/null || date +%s)" + now_epoch="$(date +%s)" + stale_after_secs=$((LIVE_TA_REFRESH_MAX_TIME_SECS + 60)) + if (( now_epoch - pid_file_mtime > stale_after_secs )); then + rm -f "$pid_file" + return 0 + fi + return 1 + fi + if [[ "$pid" =~ ^[0-9]+$ ]]; then + wait "$pid" >/dev/null 2>&1 || true + fi + rm -f "$pid_file" + return 0 +} + +start_live_ta_refresh_for_rir() { + local rir_name="$1" + local run_id="$2" + local pid_file + local log_path + local pid + mkdir -p "$LIVE_TA_REFRESH_DIR" "$LOG_ROOT" + pid_file="$(live_ta_refresh_pid_file_for_rir "$rir_name")" + if ! 
reap_finished_live_ta_refresh_for_rir "$rir_name"; then + pid="$(cat "$pid_file" 2>/dev/null || true)" + echo "live-ta-refresh skip run=$run_id rir=$rir_name reason=previous_refresh_running pid=$pid" \ + >> "$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log" + return 0 + fi + log_path="$LOG_ROOT/live-ta-refresh-$run_id-$rir_name.log" + refresh_live_ta_for_rir "$rir_name" "$run_id" "$log_path" & + pid=$! + printf '%s\n' "$pid" > "$pid_file" +} + +prepare_live_ta_inputs_for_run() { + local run_id="$1" + local rir_name + if [[ "$TAL_INPUT_MODE" != "file-live-ta" ]]; then + return 0 + fi + for rir_name in "${RIR_LIST[@]}"; do + ensure_live_ta_for_rir "$rir_name" + done + for rir_name in "${RIR_LIST[@]}"; do + start_live_ta_refresh_for_rir "$rir_name" "$run_id" + done } compare_view_trust_anchor() { @@ -432,7 +542,6 @@ build_child_args() { if [[ "$TAL_INPUT_MODE" == "url" ]]; then CHILD_ARGS+=(--tal-url "$(tal_url_for_rir "$rir_name")") elif [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then - refresh_live_ta_for_rir "$rir_name" CHILD_ARGS+=(--tal-path "$(tal_file_for_rir "$rir_name")") CHILD_ARGS+=(--ta-path "$(live_ta_file_for_rir "$rir_name")") else @@ -613,6 +722,7 @@ run_one_round() { "$snapshot_reason" "$previous_run_id" "$previous_success_value" "$started_at" "" \ "$INVALID_DB_PATH" "$INVALID_STATE_PATH" "$INVALID_TMP_PATH" "" "$PACKAGE_ROOT" "$ENV_FILE" + prepare_live_ta_inputs_for_run "$run_id" build_child_args if is_true "$RPKI_ANALYZE"; then CHILD_ARGS+=(--analyze --analysis-out "$run_dir/analyze") @@ -674,6 +784,8 @@ main() { require_command find if [[ "$TAL_INPUT_MODE" == "file-live-ta" ]]; then require_command curl + validate_positive_int "LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" "$LIVE_TA_REFRESH_CONNECT_TIMEOUT_SECS" + validate_positive_int "LIVE_TA_REFRESH_MAX_TIME_SECS" "$LIVE_TA_REFRESH_MAX_TIME_SECS" fi validate_max_runs validate_non_negative_int "INTERVAL_SECS" "$INTERVAL_SECS" @@ -701,7 +813,7 @@ main() { fi done - mkdir -p "$RUNS_ROOT" "$LOG_ROOT" 
"$DB_DIR" "$META_DIR" "$TMP_DIR" "$INVALID_ROOT" + mkdir -p "$RUNS_ROOT" "$LOG_ROOT" "$DB_DIR" "$META_DIR" "$TMP_DIR" "$INVALID_ROOT" "$LIVE_TA_REFRESH_DIR" if is_true "$ALLOW_RSYNC_MIRROR_REUSE"; then mkdir -p "$RSYNC_MIRROR_ROOT" fi