rpki/scripts/soak/hourly_soak_report.py

553 lines
22 KiB
Python
Executable File

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from statistics import median
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
def parse_rfc3339(value: str) -> datetime:
    """Parse an RFC 3339 / ISO 8601 timestamp into an aware UTC datetime.

    Surrounding whitespace is ignored and a trailing ``Z`` is rewritten to
    ``+00:00`` before parsing. Text that carries no offset is taken to be UTC.

    Raises:
        ValueError: if ``datetime.fromisoformat`` rejects the text.
    """
    text = value.strip()
    if text.endswith("Z"):
        text = f"{text[:-1]}+00:00"
    moment = datetime.fromisoformat(text)
    if moment.tzinfo is not None:
        return moment.astimezone(timezone.utc)
    # Naive input: label it as UTC rather than converting from local time.
    return moment.replace(tzinfo=timezone.utc)
def format_rfc3339(value: datetime) -> str:
    """Render ``value`` as a whole-second UTC timestamp ending in ``Z``."""
    utc_whole_seconds = value.astimezone(timezone.utc).replace(microsecond=0)
    rendered = utc_whole_seconds.isoformat()
    return rendered.replace("+00:00", "Z")
def default_window() -> tuple[datetime, datetime]:
    """Return ``(start, end)`` covering the hour that ends at the current UTC time."""
    now_utc = datetime.now(timezone.utc)
    one_hour = timedelta(hours=1)
    return (now_utc - one_hour, now_utc)
def read_json(path: Path) -> dict[str, Any] | None:
    """Best-effort load of a JSON object from ``path``.

    Returns the decoded object, or ``None`` when the file cannot be read, is
    not valid UTF-8 JSON, or its top-level value is not a JSON object.
    Callers use ``.get`` on the result, so a list or scalar is treated the
    same as an unreadable file instead of surfacing later as AttributeError.
    """
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file. ValueError: covers both
        # UnicodeDecodeError and json.JSONDecodeError. RecursionError:
        # pathologically nested documents.
        return None
    return loaded if isinstance(loaded, dict) else None
def nested_get(data: dict[str, Any] | None, keys: list[str], default: Any = None) -> Any:
    """Walk ``keys`` through nested dicts and return the value found.

    ``default`` is returned as soon as a level is not a dict or lacks the
    key. An empty ``keys`` list returns ``data`` itself.
    """
    node: Any = data
    for key in keys:
        if isinstance(node, dict) and key in node:
            node = node[key]
        else:
            return default
    return node
def as_int(value: Any, default: int = 0) -> int:
    """Coerce ``value`` to ``int``, returning ``default`` when that is impossible.

    ``None``, unparseable strings, unsupported types, NaN and infinities all
    yield ``default``.
    """
    if value is None:
        return default
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")). Python's json module accepts the
        # non-standard ``Infinity`` literal, so this can come from input files.
        return default
def as_float(value: Any, default: float = 0.0) -> float:
    """Coerce ``value`` to ``float``, returning ``default`` when that is impossible.

    ``None``, unparseable strings, unsupported types and integers too large
    for a float all yield ``default``.
    """
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large to represent, e.g. float(10**400).
        return default
def bytes_to_mib(value: int | None) -> str:
    """Format a byte count as MiB with one decimal place; ``None`` becomes ``"-"``."""
    if value is not None:
        mebibytes = value / 1024 / 1024
        return f"{mebibytes:.1f}"
    return "-"
def millis_to_seconds_text(value: int | None) -> str:
    """Format milliseconds as seconds with three decimals; ``None`` becomes ``"-"``."""
    if value is not None:
        seconds = value / 1000
        return f"{seconds:.3f}"
    return "-"
def seconds_text(value: float | None) -> str:
    """Format seconds with three decimals; ``None`` becomes ``"-"``."""
    return "-" if value is None else f"{value:.3f}"
def percent_text(numerator: int, denominator: int) -> str:
    """Format ``numerator / denominator`` as a one-decimal percentage.

    A zero or negative denominator yields ``"-"``.
    """
    if denominator > 0:
        ratio = (numerator / denominator) * 100
        return f"{ratio:.1f}%"
    return "-"
@dataclass
class RunRecord:
    """Normalized view of one soak run, as assembled by ``load_run``.

    Values come from the run directory's ``run-meta.json`` and
    ``run-summary.json``. Metrics typed ``int | None`` are ``None`` when
    their source section is unavailable.
    """

    # --- identity -----------------------------------------------------
    run_id: str  # meta "run_id", falling back to the run directory name
    run_index: int  # meta "run_index" coerced to int (0 when absent)
    run_dir: Path  # directory the record was read from
    status: str  # meta/summary "status"; "success" is the only non-incident value
    sync_mode: str  # meta "sync_mode"; build_summary counts "snapshot" and "delta"
    # --- timing -------------------------------------------------------
    started_at: datetime | None  # UTC; None when absent or unparseable
    completed_at: datetime | None  # UTC; load_run only builds records where this parsed
    wall_ms: int | None  # summary "wallMs"
    validation_ms: int | None  # the next five come from summary "stageTiming"
    repo_sync_ms_total: int | None
    rrdp_download_ms_total: int | None
    rsync_download_ms_total: int | None
    download_bytes_total: int | None
    # --- resources and outputs ----------------------------------------
    max_rss_bytes: int | None  # summary processMetrics "maxRssKb" multiplied by 1024
    vrps: int  # summary reportCounts "vrps"
    vaps: int  # summary reportCounts "aspas"
    publication_points: int  # summary reportCounts "publicationPoints"
    warnings: int  # summary reportCounts "warnings"
    repo_sync_stats: dict[str, Any]  # summary "repoSyncStats", {} when not a dict
    artifact_sizes: dict[str, int]  # artifact file name -> "sizeBytes"
    # --- diagnostics --------------------------------------------------
    parse_errors: list[str]  # markers for missing/invalid run files
    incident_reasons: list[str]  # filled from classify_incident; empty means healthy
def _mapping_or_empty(value: Any) -> dict[str, Any]:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _parse_optional_rfc3339(raw: Any) -> datetime | None:
    """Parse ``raw`` as an RFC 3339 timestamp; ``None`` when empty or malformed."""
    if not raw:
        return None
    try:
        return parse_rfc3339(str(raw))
    except ValueError:
        return None


def load_run(run_dir: Path, window_start: datetime, window_end: datetime, args: argparse.Namespace) -> RunRecord | None:
    """Load one run directory into a ``RunRecord`` if it completed inside the window.

    Args:
        run_dir: directory holding ``run-meta.json`` and ``run-summary.json``.
        window_start: inclusive lower bound on the completion time (UTC).
        window_end: exclusive upper bound on the completion time (UTC).
        args: parsed CLI options; forwarded to ``classify_incident``.

    Returns:
        The populated record, or ``None`` when the run metadata is missing or
        not a JSON object, the completion time is absent or unparseable, or
        the run completed outside ``[window_start, window_end)``.
    """
    meta = read_json(run_dir / "run-meta.json")
    summary = read_json(run_dir / "run-summary.json")
    # A JSON file whose top level is a list/scalar is as unusable as a missing
    # one; guard here so ``meta.get`` below can never raise AttributeError.
    if not isinstance(meta, dict):
        return None
    if not isinstance(summary, dict):
        summary = None

    completed_at = _parse_optional_rfc3339(
        meta.get("completed_at_rfc3339_utc") or nested_get(summary, ["finishedAtRfc3339Utc"])
    )
    if completed_at is None or not (window_start <= completed_at < window_end):
        return None
    started_at = _parse_optional_rfc3339(
        meta.get("started_at_rfc3339_utc") or nested_get(summary, ["startedAtRfc3339Utc"])
    )

    # A null/empty status in the summary falls through to "unknown" instead
    # of being rendered as the string "None".
    status = str(meta.get("status") or nested_get(summary, ["status"]) or "unknown")
    sync_mode = str(meta.get("sync_mode") or "unknown")
    run_index = as_int(meta.get("run_index"), default=0)
    run_id = str(meta.get("run_id") or run_dir.name)

    # Sections are coerced to dicts so a malformed summary degrades to
    # "no data" rather than crashing the whole report.
    stage = _mapping_or_empty(nested_get(summary, ["stageTiming"]))
    process_metrics = _mapping_or_empty(nested_get(summary, ["processMetrics"]))
    report_counts = _mapping_or_empty(nested_get(summary, ["reportCounts"]))
    repo_sync_stats = _mapping_or_empty(nested_get(summary, ["repoSyncStats"]))
    wall_ms = nested_get(summary, ["wallMs"])
    max_rss_kb = process_metrics.get("maxRssKb")

    artifact_sizes: dict[str, int] = {}
    artifacts = nested_get(summary, ["artifacts"])
    if isinstance(artifacts, list):
        for artifact in artifacts:
            if isinstance(artifact, dict):
                artifact_path = str(artifact.get("path", ""))
                artifact_sizes[Path(artifact_path).name] = as_int(artifact.get("sizeBytes"))

    parse_errors: list[str] = []
    if summary is None:
        parse_errors.append("missing_or_invalid_run_summary")
    if not (run_dir / "stage-timing.json").exists():
        parse_errors.append("missing_stage_timing")
    if not (run_dir / "process-time.txt").exists():
        parse_errors.append("missing_process_time")

    def stage_metric(key: str) -> int | None:
        # With no stage section at all the metric is unknown (None); with a
        # section present, an absent key counts as 0.
        return as_int(stage.get(key)) if stage else None

    record = RunRecord(
        run_id=run_id,
        run_index=run_index,
        run_dir=run_dir,
        status=status,
        sync_mode=sync_mode,
        started_at=started_at,
        completed_at=completed_at,
        wall_ms=as_int(wall_ms) if wall_ms is not None else None,
        validation_ms=stage_metric("validation_ms"),
        repo_sync_ms_total=stage_metric("repo_sync_ms_total"),
        rrdp_download_ms_total=stage_metric("rrdp_download_ms_total"),
        rsync_download_ms_total=stage_metric("rsync_download_ms_total"),
        download_bytes_total=stage_metric("download_bytes_total"),
        max_rss_bytes=as_int(max_rss_kb) * 1024 if max_rss_kb is not None else None,
        vrps=as_int(report_counts.get("vrps")),
        vaps=as_int(report_counts.get("aspas")),
        publication_points=as_int(report_counts.get("publicationPoints")),
        warnings=as_int(report_counts.get("warnings")),
        repo_sync_stats=repo_sync_stats,
        artifact_sizes=artifact_sizes,
        parse_errors=parse_errors,
        incident_reasons=[],
    )
    # Classification needs the finished record, so it is filled in afterwards.
    record.incident_reasons = classify_incident(record, args)
    return record
def classify_incident(record: RunRecord, args: argparse.Namespace) -> list[str]:
    """List the reasons ``record`` counts as an incident (empty when healthy).

    Threshold checks are reported in a fixed order, followed by the record's
    ``parse_errors``. A negative ``args.warning_max`` disables the warning
    check; an unknown wall time never triggers the wall check.
    """
    wall_limit_ms = args.wall_warn_secs * 1000
    checks = (
        (record.status != "success", f"status={record.status}"),
        (
            record.wall_ms is not None and record.wall_ms > wall_limit_ms,
            f"wall>{args.wall_warn_secs}s",
        ),
        (record.vrps < args.vrp_min, f"vrps<{args.vrp_min}"),
        (record.vaps < args.vaps_min, f"vaps<{args.vaps_min}"),
        (record.publication_points < args.pp_min, f"pp<{args.pp_min}"),
        (
            args.warning_max >= 0 and record.warnings > args.warning_max,
            f"warnings>{args.warning_max}",
        ),
    )
    reasons = [label for triggered, label in checks if triggered]
    reasons.extend(record.parse_errors)
    return reasons
def discover_runs(run_root: Path, window_start: datetime, window_end: datetime, args: argparse.Namespace) -> list[RunRecord]:
    """Load every ``runs/run_NNNN`` directory under ``run_root`` that falls in the window.

    Directories are visited in sorted name order; entries ``load_run``
    rejects (returns ``None`` for) are dropped.
    """
    pattern = "run_[0-9][0-9][0-9][0-9]"
    candidates = (
        entry for entry in sorted((run_root / "runs").glob(pattern)) if entry.is_dir()
    )
    loaded = (load_run(entry, window_start, window_end, args) for entry in candidates)
    return [record for record in loaded if record is not None]
def percentile(values: list[float], target_percentile: float) -> float | None:
    """Return the sample of ``values`` nearest to ``target_percentile`` (0-100).

    The rank is ``round(p / 100 * (n - 1))`` over the sorted samples, so the
    result is always one of the inputs. Empty input yields ``None``.
    """
    if values:
        ranked = sorted(values)
        last_index = len(ranked) - 1
        position = int(round((target_percentile / 100.0) * last_index))
        return ranked[position]
    return None
def aggregate_count_duration(records: list[RunRecord], group_key: str) -> dict[str, dict[str, int]]:
    """Sum ``count`` and ``duration_ms_total`` per entry name across all records.

    Reads ``record.repo_sync_stats[group_key]`` from each record; groups or
    entries that are not dicts are ignored.
    """
    totals: dict[str, dict[str, int]] = {}
    groups = (record.repo_sync_stats.get(group_key, {}) for record in records)
    for group in groups:
        if not isinstance(group, dict):
            continue
        for name, stats in group.items():
            if isinstance(stats, dict):
                label = str(name)
                if label not in totals:
                    totals[label] = {"count": 0, "duration_ms_total": 0}
                slot = totals[label]
                slot["count"] += as_int(stats.get("count"))
                slot["duration_ms_total"] += as_int(stats.get("duration_ms_total"))
    return totals
def copy_tree_preserve(source_dir: Path, target_dir: Path) -> None:
    """Copy ``source_dir`` to ``target_dir`` unless the target already exists.

    Files are hard-linked first; if that raises ``OSError`` any partial
    target is removed and the tree is copied normally instead.
    """
    if not target_dir.exists():
        target_dir.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copytree(source_dir, target_dir, copy_function=os.link)
        except OSError:
            # Clear whatever the linking attempt left behind so the plain
            # copy starts from a clean slate.
            if target_dir.exists():
                shutil.rmtree(target_dir)
            shutil.copytree(source_dir, target_dir)
def preserve_incidents(records: list[RunRecord], incident_dir: Path) -> list[dict[str, Any]]:
    """Copy every incident run into ``incident_dir`` and describe what was kept.

    Returns one manifest entry per record with non-empty ``incident_reasons``,
    holding the run id, the source and destination directories, and the reasons.
    """
    manifest: list[dict[str, Any]] = []
    for record in (item for item in records if item.incident_reasons):
        destination = incident_dir / record.run_id
        copy_tree_preserve(record.run_dir, destination)
        entry = {
            "runId": record.run_id,
            "sourceDir": str(record.run_dir),
            "incidentDir": str(destination),
            "reasons": record.incident_reasons,
        }
        manifest.append(entry)
    return manifest
def disk_snapshot(path: Path) -> dict[str, Any]:
    """Report ``shutil.disk_usage`` figures for the filesystem holding ``path``.

    ``usedPercent`` is ``used / total * 100``, or ``0.0`` when the reported
    total is zero.
    """
    total, used, free = shutil.disk_usage(path)
    used_percent = (used / total) * 100 if total else 0.0
    return {
        "path": str(path),
        "totalBytes": total,
        "usedBytes": used,
        "freeBytes": free,
        "usedPercent": used_percent,
    }
def build_summary(
    run_root: Path,
    window_start: datetime,
    window_end: datetime,
    records: list[RunRecord],
    preserved: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the JSON-serializable hourly summary for ``records``.

    The result holds run counts, wall/validation timing statistics, peak RSS,
    output ranges, repo-sync aggregates, total download bytes, the preserved
    incident manifest and a disk usage snapshot of ``run_root``. Statistics
    over an empty sample are ``None``.
    """

    def low_high(numbers: list[int]) -> dict[str, int | None]:
        if numbers:
            return {"min": min(numbers), "max": max(numbers)}
        return {"min": None, "max": None}

    def stat(reducer, samples):
        # Apply ``reducer`` only when there is at least one sample.
        return reducer(samples) if samples else None

    def tally(predicate) -> int:
        return sum(1 for record in records if predicate(record))

    wall_seconds = [record.wall_ms / 1000 for record in records if record.wall_ms is not None]
    validation_seconds = [
        record.validation_ms / 1000 for record in records if record.validation_ms is not None
    ]
    rss_samples = [record.max_rss_bytes for record in records if record.max_rss_bytes is not None]

    run_counts = {
        "total": len(records),
        "success": tally(lambda record: record.status == "success"),
        "failed": tally(lambda record: record.status != "success"),
        "snapshot": tally(lambda record: record.sync_mode == "snapshot"),
        "delta": tally(lambda record: record.sync_mode == "delta"),
        "incidents": tally(lambda record: bool(record.incident_reasons)),
    }
    wall_stats = {
        "min": stat(min, wall_seconds),
        "median": stat(median, wall_seconds),
        "p95": percentile(wall_seconds, 95),
        "max": stat(max, wall_seconds),
    }
    validation_stats = {
        "min": stat(min, validation_seconds),
        "median": stat(median, validation_seconds),
        "max": stat(max, validation_seconds),
    }
    output_ranges = {
        "vrps": low_high([record.vrps for record in records]),
        "vaps": low_high([record.vaps for record in records]),
        "publicationPoints": low_high([record.publication_points for record in records]),
        "warnings": low_high([record.warnings for record in records]),
    }

    return {
        "schemaVersion": 1,
        "generatedAtUtc": format_rfc3339(datetime.now(timezone.utc)),
        "windowStartUtc": format_rfc3339(window_start),
        "windowEndUtc": format_rfc3339(window_end),
        "runRoot": str(run_root),
        "runs": run_counts,
        "wallSeconds": wall_stats,
        "validationSeconds": validation_stats,
        "maxRssBytes": {"max": stat(max, rss_samples)},
        "outputs": output_ranges,
        "repoSyncByPhase": aggregate_count_duration(records, "by_phase"),
        "repoSyncByTerminalState": aggregate_count_duration(records, "by_terminal_state"),
        "downloadBytesTotal": sum(record.download_bytes_total or 0 for record in records),
        "preservedIncidents": preserved,
        "disk": disk_snapshot(run_root),
    }
def render_markdown(summary: dict[str, Any], records: list[RunRecord]) -> str:
    """Render the hourly report as a Markdown document.

    Args:
        summary: the dict produced by ``build_summary``.
        records: the runs in the window, one detail row each.

    Returns:
        The Markdown text, ending with a newline. Metrics with no data are
        shown as ``-``; this includes the VRPs/VAPs/PP ranges of an empty
        window, which would otherwise print as ``None - None``.
    """

    def range_text(bounds: dict[str, Any]) -> str:
        low, high = bounds["min"], bounds["max"]
        if low is None or high is None:
            return "-"
        return f"{low} - {high}"

    def duration_rows(groups: dict[str, dict[str, int]]) -> list[str]:
        return [
            f"| {name} | {value['count']} | {value['duration_ms_total'] / 1000:.3f} |"
            for name, value in sorted(groups.items())
        ]

    runs = summary["runs"]
    wall = summary["wallSeconds"]
    validation = summary["validationSeconds"]
    outputs = summary["outputs"]
    wall_cells = " / ".join(seconds_text(wall[key]) for key in ("min", "median", "p95", "max"))
    validation_cells = " / ".join(seconds_text(validation[key]) for key in ("min", "median", "max"))

    lines = [
        "# Ours RP 24h Soak 小时级报告",
        "",
        f"- Window: `{summary['windowStartUtc']}` → `{summary['windowEndUtc']}`",
        f"- Generated: `{summary['generatedAtUtc']}`",
        f"- Run root: `{summary['runRoot']}`",
        "",
        "## 汇总",
        "",
        "| 指标 | 值 |",
        "|---|---:|",
        f"| Runs | {runs['total']} |",
        f"| Success / Failed | {runs['success']} / {runs['failed']} |",
        f"| Snapshot / Delta | {runs['snapshot']} / {runs['delta']} |",
        f"| Incidents | {runs['incidents']} |",
        f"| Wall min / median / p95 / max (s) | {wall_cells} |",
        f"| Validation min / median / max (s) | {validation_cells} |",
        f"| Max RSS max (MiB) | {bytes_to_mib(summary['maxRssBytes']['max'])} |",
        f"| VRPs range | {range_text(outputs['vrps'])} |",
        f"| VAPs range | {range_text(outputs['vaps'])} |",
        f"| PP range | {range_text(outputs['publicationPoints'])} |",
        f"| Disk used | {summary['disk']['usedPercent']:.1f}% |",
        "",
        "## 每轮明细",
        "",
        "| Run | Mode | Status | Wall(s) | Validation(s) | RRDP(s) | Rsync(s) | Max RSS(MiB) | VRPs | VAPs | PP | Warnings | Incident |",
        "|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|",
    ]

    for record in records:
        cells = [
            record.run_id,
            record.sync_mode,
            record.status,
            millis_to_seconds_text(record.wall_ms),
            millis_to_seconds_text(record.validation_ms),
            millis_to_seconds_text(record.rrdp_download_ms_total),
            millis_to_seconds_text(record.rsync_download_ms_total),
            bytes_to_mib(record.max_rss_bytes),
            str(record.vrps),
            str(record.vaps),
            str(record.publication_points),
            str(record.warnings),
            ", ".join(record.incident_reasons),
        ]
        lines.append("| " + " | ".join(cells) + " |")

    lines.extend(["", "## Repo Sync 聚合", "", "### By Phase", "", "| Phase | Count | Duration(s) |", "|---|---:|---:|"])
    lines.extend(duration_rows(summary["repoSyncByPhase"]))
    lines.extend(["", "### By Terminal State", "", "| State | Count | Duration(s) |", "|---|---:|---:|"])
    lines.extend(duration_rows(summary["repoSyncByTerminalState"]))

    lines.extend(["", "## Incident 固化", "", "| Run | Incident Dir | Reasons |", "|---|---|---|"])
    preserved = summary["preservedIncidents"]
    for incident in preserved:
        lines.append(
            f"| {incident['runId']} | `{incident['incidentDir']}` | {', '.join(incident['reasons'])} |"
        )
    if not preserved:
        # Keep the table well-formed when nothing was preserved.
        lines.append("| - | - | - |")
    lines.append("")
    return "\n".join(lines)
def append_jsonl(path: Path, summary: dict[str, Any]) -> None:
    """Append ``summary`` to ``path`` as one JSON line, creating parent directories.

    Keys are sorted and non-ASCII characters are written unescaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as handle:
        serialized = json.dumps(summary, ensure_ascii=False, sort_keys=True)
        handle.write(f"{serialized}\n")
def send_feishu_with_script(script_path: Path, message: str, dry_run: bool) -> int:
    """Run ``script_path`` under the current interpreter, piping ``message`` to its stdin.

    The script is invoked with ``--stdin`` and, when ``dry_run`` is true,
    ``--dry-run``. Returns the script's exit code.
    """
    extra_flags = ["--dry-run"] if dry_run else []
    argv = [sys.executable, str(script_path), "--stdin", *extra_flags]
    outcome = subprocess.run(argv, input=message, text=True, check=False)
    return outcome.returncode
def send_feishu_with_webhook(message: str, timeout_seconds: float = 10.0) -> int:
    """POST ``message`` as a text message to the URL in ``FEISHU_WEBHOOK_URL``.

    Args:
        message: text to send; ``"From codex: "`` is prepended unless the
            message already starts with that prefix (case-insensitive).
        timeout_seconds: socket timeout for the HTTP request.

    Returns:
        0 when the endpoint answers HTTP 200 with ``code`` or ``StatusCode``
        equal to 0; 2 when the webhook URL is unset or is not an http(s) URL;
        1 for every other failure (transport error, timeout, non-200 status,
        unparseable or unexpected response body). Failures are described on
        stderr; nothing is raised for them.
    """
    from http.client import HTTPException

    webhook_url = os.environ.get("FEISHU_WEBHOOK_URL", "").strip()
    if not webhook_url:
        return 2
    # urlopen also understands schemes such as file: and ftp:. A webhook is
    # always HTTP(S), so anything else is reported as a configuration problem.
    if not webhook_url.lower().startswith(("http://", "https://")):
        print("feishu send failed: FEISHU_WEBHOOK_URL must be an http(s) URL", file=sys.stderr)
        return 2

    final_text = message if message.lower().startswith("from codex:") else f"From codex: {message}"
    payload = json.dumps(
        {"msg_type": "text", "content": {"text": final_text}},
        ensure_ascii=False,
    ).encode("utf-8")

    try:
        request = Request(
            webhook_url,
            data=payload,
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        with urlopen(request, timeout=timeout_seconds) as response:
            status = getattr(response, "status", 0)
            body = response.read().decode("utf-8", errors="replace")
        if status != 200:
            print(body, file=sys.stderr)
            return 1
        parsed = json.loads(body)
    except (OSError, HTTPException, ValueError) as error:
        # OSError covers HTTPError, URLError and socket timeouts raised while
        # reading the body; ValueError covers malformed URLs and JSON decode
        # errors; HTTPException covers protocol-level failures.
        print(f"feishu send failed: {error}", file=sys.stderr)
        return 1

    # Only a JSON object can carry the success code; lists and scalars are failures.
    if isinstance(parsed, dict) and (parsed.get("code") == 0 or parsed.get("StatusCode") == 0):
        return 0
    print(body, file=sys.stderr)
    return 1
def build_feishu_message(summary: dict[str, Any], report_path: Path) -> str:
    """Condense ``summary`` into the single-line notification text.

    The line carries the window, run/incident counts, median and max wall
    seconds, VRP and VAP ranges, and the report path, separated by spaces.
    """
    runs = summary["runs"]
    wall = summary["wallSeconds"]
    vrps = summary["outputs"]["vrps"]
    vaps = summary["outputs"]["vaps"]
    fields = [
        "#062 24h soak hourly",
        f"{summary['windowStartUtc']}..{summary['windowEndUtc']}",
        f"runs={runs['total']} ok={runs['success']} fail={runs['failed']}",
        f"incidents={runs['incidents']}",
        f"wall_s={seconds_text(wall['median'])}/{seconds_text(wall['max'])}",
        f"vrps={vrps['min']}-{vrps['max']}",
        f"vaps={vaps['min']}-{vaps['max']}",
        f"report={report_path}",
    ]
    return " ".join(fields)
def parse_args() -> argparse.Namespace:
    """Parse the command line (``sys.argv``) into report options.

    Only ``--run-root`` is required. The integer thresholds feed incident
    classification; a negative ``--warning-max`` disables the warning check.
    """
    parser = argparse.ArgumentParser(description="Generate hourly reports for portable ours RP soak runs.")
    add = parser.add_argument

    add("--run-root", required=True, type=Path)
    for flag in ("--reports-dir", "--incident-dir"):
        add(flag, type=Path)

    add("--window-start", help="RFC3339 UTC timestamp. Defaults to now-1h.")
    add("--window-end", help="RFC3339 UTC timestamp. Defaults to now.")
    add("--report-name", help="Override report filename.")

    thresholds = (
        ("--wall-warn-secs", 140),
        ("--vrp-min", 900_000),
        ("--vaps-min", 1_000),
        ("--pp-min", 50_000),
    )
    for flag, default in thresholds:
        add(flag, type=int, default=default)
    add(
        "--warning-max",
        type=int,
        default=-1,
        help="Incident threshold for warnings; negative disables warning-based incidents.",
    )

    for flag in ("--send-feishu", "--dry-run-feishu"):
        add(flag, action="store_true")
    add("--feishu-script", type=Path)
    return parser.parse_args()
def main() -> int:
    """Generate the hourly report and optionally send a Feishu notification.

    Writes the Markdown report and appends to ``hourly_summary.jsonl`` in the
    reports directory, preserving incident runs along the way.

    ``--window-start`` and ``--window-end`` default independently: a missing
    end is "now" and a missing start is one hour before the end. Supplying
    just one of them therefore no longer discards it.

    Returns:
        The process exit status: 0 on success, otherwise the status of the
        notification step that failed.

    Raises:
        SystemExit: when a window timestamp cannot be parsed or the window is
            empty or inverted.
    """
    args = parse_args()
    if args.window_start or args.window_end:
        try:
            window_end = parse_rfc3339(args.window_end) if args.window_end else datetime.now(timezone.utc)
            window_start = (
                parse_rfc3339(args.window_start)
                if args.window_start
                else window_end - timedelta(hours=1)
            )
        except ValueError as error:
            # Report a usage error instead of an unhandled traceback.
            raise SystemExit(f"invalid --window-start/--window-end timestamp: {error}") from error
    else:
        window_start, window_end = default_window()
    if window_end <= window_start:
        raise SystemExit("--window-end must be later than --window-start")

    run_root = args.run_root.resolve()
    reports_dir = (args.reports_dir or run_root / "hourly_reports").resolve()
    incident_dir = (args.incident_dir or run_root / "incident_runs").resolve()
    reports_dir.mkdir(parents=True, exist_ok=True)

    records = discover_runs(run_root, window_start, window_end, args)
    preserved = preserve_incidents(records, incident_dir)
    summary = build_summary(run_root, window_start, window_end, records, preserved)

    # Default file name is derived from the window start, e.g. hour_20240101T000000Z.md.
    report_name = args.report_name or f"hour_{format_rfc3339(window_start).replace(':', '').replace('-', '')}.md"
    report_path = reports_dir / report_name
    report_path.write_text(render_markdown(summary, records), encoding="utf-8")
    append_jsonl(reports_dir / "hourly_summary.jsonl", summary)

    if args.send_feishu:
        message = build_feishu_message(summary, report_path)
        feishu_script = args.feishu_script
        if feishu_script is None:
            env_script = os.environ.get("FEISHU_WEBHOOK_SCRIPT", "").strip()
            feishu_script = Path(env_script) if env_script else None
        if feishu_script and feishu_script.exists():
            # NOTE(review): this branch returns without printing report_path,
            # unlike the webhook and dry-run paths; kept as-is.
            return send_feishu_with_script(feishu_script, message, args.dry_run_feishu)
        if args.dry_run_feishu:
            print(f"DRY RUN FEISHU: {message}")
        else:
            result = send_feishu_with_webhook(message)
            if result != 0:
                print("Feishu not sent: configure --feishu-script or FEISHU_WEBHOOK_URL", file=sys.stderr)
                return result
    print(report_path)
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())