rpki/scripts/soak/hourly_soak_report.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from statistics import median
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen


def parse_rfc3339(value: str) -> datetime:
    """Parse an RFC 3339 / ISO 8601 timestamp into an aware UTC datetime.

    Surrounding whitespace is ignored and a trailing ``Z`` is accepted as a
    UTC designator. A timestamp without any offset is taken to be UTC.

    Raises:
        ValueError: if ``datetime.fromisoformat`` rejects the text.
    """
    text = value.strip()
    # Rewrite the "Z" suffix as an explicit offset before handing the text
    # to datetime.fromisoformat.
    iso_text = f"{text[:-1]}+00:00" if text.endswith("Z") else text
    moment = datetime.fromisoformat(iso_text)
    if moment.tzinfo is None:
        # Naive input: label it as UTC rather than shifting it.
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def format_rfc3339(value: datetime) -> str:
    """Render a datetime as a second-precision UTC timestamp ending in ``Z``."""
    utc_value = value.astimezone(timezone.utc)
    # timespec="seconds" drops the sub-second part, exactly like zeroing
    # the microsecond field before formatting.
    stamp = utc_value.isoformat(timespec="seconds")
    return stamp.replace("+00:00", "Z")


def default_window() -> tuple[datetime, datetime]:
    """Return ``(start, end)`` for the one-hour window that ends right now (UTC)."""
    now_utc = datetime.now(timezone.utc)
    one_hour_ago = now_utc - timedelta(hours=1)
    return one_hour_ago, now_utc


def read_json(path: Path) -> dict[str, Any] | None:
    """Read ``path`` as UTF-8 JSON and return it only if it is a JSON object.

    This is a best-effort reader: a missing or unreadable file, undecodable
    bytes, malformed JSON, or a document whose root is not an object all
    yield ``None`` so callers can treat them uniformly as "no data".

    Args:
        path: File to read.

    Returns:
        The decoded object as a dict, or ``None`` when it cannot be loaded.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError. RecursionError:
        # pathologically nested input.
        return None
    # The annotated contract is "dict or None"; a list/str/number root would
    # otherwise blow up later on the first ``.get`` call.
    return payload if isinstance(payload, dict) else None


def nested_get(data: dict[str, Any] | None, keys: list[str], default: Any = None) -> Any:
    """Walk ``keys`` through nested dicts and return the value found.

    ``default`` is returned as soon as a level is not a dict or lacks the
    requested key. A value that is present but ``None`` is returned as is.
    """
    node: Any = data
    for step in keys:
        if isinstance(node, dict) and step in node:
            node = node[step]
        else:
            return default
    return node


def as_int(value: Any, default: int = 0) -> int:
    """Coerce ``value`` to ``int``, falling back to ``default``.

    Args:
        value: Anything ``int()`` might accept; ``None`` means "absent".
        default: Returned when ``value`` is ``None`` or cannot be converted.

    Returns:
        ``int(value)``, or ``default`` when conversion is impossible.
    """
    if value is None:
        return default
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")). Python's json module decodes the
        # non-standard literal ``Infinity``, so this can come from input data.
        return default


def as_float(value: Any, default: float = 0.0) -> float:
    """Coerce ``value`` to ``float``, falling back to ``default``.

    Args:
        value: Anything ``float()`` might accept; ``None`` means "absent".
        default: Returned when ``value`` is ``None`` or cannot be converted.

    Returns:
        ``float(value)``, or ``default`` when conversion is impossible.
    """
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large for a double, e.g. float(10**400).
        return default


def bytes_to_mib(value: int | None) -> str:
    """Format a byte count as MiB with one decimal, or ``-`` when absent."""
    if value is not None:
        mib = value / 1024 / 1024
        return format(mib, ".1f")
    return "-"


def millis_to_seconds_text(value: int | None) -> str:
    """Format milliseconds as seconds with three decimals, or ``-`` when absent."""
    if value is not None:
        seconds = value / 1000
        return format(seconds, ".3f")
    return "-"


def seconds_text(value: float | None) -> str:
    """Format seconds with three decimals, or ``-`` when absent."""
    return "-" if value is None else format(value, ".3f")


def percent_text(numerator: int, denominator: int) -> str:
    """Format ``numerator / denominator`` as a one-decimal percentage.

    Returns ``-`` when the denominator is zero or negative.
    """
    if denominator > 0:
        share = (numerator / denominator) * 100
        return f"{share:.1f}%"
    return "-"


@dataclass
class RunRecord:
    """Everything the hourly report needs to know about one soak run.

    Instances are built by ``load_run`` from a run directory's
    ``run-meta.json`` and ``run-summary.json``; timing and memory fields are
    ``None`` when the corresponding data was not available.
    """

    # Identifier from run-meta.json, falling back to the directory name.
    run_id: str
    # Sequence number from run-meta.json; 0 when absent or unparsable.
    run_index: int
    # Directory the run was loaded from.
    run_dir: Path
    # Run status; the report treats anything other than "success" as failed.
    status: str
    # Sync mode from run-meta.json; the report counts "snapshot" and "delta".
    sync_mode: str
    # UTC start time, or None when missing/unparsable.
    started_at: datetime | None
    # UTC completion time; used to decide window membership.
    completed_at: datetime | None
    # Wall-clock duration in milliseconds (summary "wallMs").
    wall_ms: int | None
    # Stage timing values taken from the summary's "stageTiming" object.
    validation_ms: int | None
    repo_sync_ms_total: int | None
    rrdp_download_ms_total: int | None
    rsync_download_ms_total: int | None
    download_bytes_total: int | None
    # Peak RSS in bytes, derived from "processMetrics.maxRssKb" * 1024.
    max_rss_bytes: int | None
    # Output counters from the summary's "reportCounts" object; 0 when absent.
    vrps: int
    # Populated from the "aspas" counter.
    vaps: int
    publication_points: int
    warnings: int
    # Raw "repoSyncStats" object from the summary (empty dict when absent).
    repo_sync_stats: dict[str, Any]
    # Artifact file name -> size in bytes, from the summary's "artifacts" list.
    artifact_sizes: dict[str, int]
    # Markers for missing/invalid run files, e.g. "missing_stage_timing".
    parse_errors: list[str]
    # Reasons this run is flagged as an incident; empty means healthy.
    incident_reasons: list[str]


def load_run(run_dir: Path, window_start: datetime, window_end: datetime, args: argparse.Namespace) -> RunRecord | None:
    """Load one run directory into a ``RunRecord`` if it completed in the window.

    Reads ``run-meta.json`` (required) and ``run-summary.json`` (optional)
    from ``run_dir``. Malformed or unexpectedly shaped JSON is tolerated: a
    section that is not of the expected JSON type is treated as absent
    instead of raising.

    Args:
        run_dir: Directory of a single run.
        window_start: Inclusive lower bound on the completion time.
        window_end: Exclusive upper bound on the completion time.
        args: Parsed CLI options; forwarded to ``classify_incident``.

    Returns:
        The populated record, or ``None`` when the metadata is missing or
        invalid, the completion time is missing or unparsable, or the run
        completed outside ``[window_start, window_end)``.
    """
    meta = read_json(run_dir / "run-meta.json")
    summary = read_json(run_dir / "run-summary.json")
    # Guard against a JSON root that is not an object (list, string, ...):
    # every lookup below relies on dict semantics.
    if not isinstance(meta, dict):
        return None
    if not isinstance(summary, dict):
        summary = None

    completed_raw = meta.get("completed_at_rfc3339_utc") or nested_get(summary, ["finishedAtRfc3339Utc"])
    if not completed_raw:
        return None
    try:
        completed_at = parse_rfc3339(str(completed_raw))
    except ValueError:
        return None
    if not window_start <= completed_at < window_end:
        return None

    started_at = None
    started_raw = meta.get("started_at_rfc3339_utc") or nested_get(summary, ["startedAtRfc3339Utc"])
    if started_raw:
        try:
            started_at = parse_rfc3339(str(started_raw))
        except ValueError:
            # The start time is informational only; keep the run.
            started_at = None

    def summary_section(name: str) -> dict[str, Any]:
        """Return a top-level summary object, or {} if absent or not an object."""
        section = nested_get(summary, [name])
        return section if isinstance(section, dict) else {}

    stage = summary_section("stageTiming")
    process_metrics = summary_section("processMetrics")
    report_counts = summary_section("reportCounts")
    repo_sync_stats = summary_section("repoSyncStats")

    def stage_value(key: str) -> int | None:
        """Return a stage timing as int; None when there is no stage data at all."""
        return as_int(stage.get(key)) if stage else None

    wall_ms = nested_get(summary, ["wallMs"])
    max_rss_kb = process_metrics.get("maxRssKb")

    artifacts = nested_get(summary, ["artifacts"])
    artifact_sizes: dict[str, int] = {}
    for artifact in artifacts if isinstance(artifacts, list) else []:
        if isinstance(artifact, dict):
            artifact_name = Path(str(artifact.get("path", ""))).name
            artifact_sizes[artifact_name] = as_int(artifact.get("sizeBytes"))

    parse_errors = []
    if summary is None:
        parse_errors.append("missing_or_invalid_run_summary")
    if not (run_dir / "stage-timing.json").exists():
        parse_errors.append("missing_stage_timing")
    if not (run_dir / "process-time.txt").exists():
        parse_errors.append("missing_process_time")

    record = RunRecord(
        run_id=str(meta.get("run_id") or run_dir.name),
        run_index=as_int(meta.get("run_index"), default=0),
        run_dir=run_dir,
        status=str(meta.get("status") or nested_get(summary, ["status"], "unknown")),
        sync_mode=str(meta.get("sync_mode") or "unknown"),
        started_at=started_at,
        completed_at=completed_at,
        wall_ms=as_int(wall_ms) if wall_ms is not None else None,
        validation_ms=stage_value("validation_ms"),
        repo_sync_ms_total=stage_value("repo_sync_ms_total"),
        rrdp_download_ms_total=stage_value("rrdp_download_ms_total"),
        rsync_download_ms_total=stage_value("rsync_download_ms_total"),
        download_bytes_total=stage_value("download_bytes_total"),
        # maxRssKb is converted to bytes here.
        max_rss_bytes=as_int(max_rss_kb) * 1024 if max_rss_kb is not None else None,
        vrps=as_int(report_counts.get("vrps")),
        vaps=as_int(report_counts.get("aspas")),
        publication_points=as_int(report_counts.get("publicationPoints")),
        warnings=as_int(report_counts.get("warnings")),
        repo_sync_stats=repo_sync_stats,
        artifact_sizes=artifact_sizes,
        parse_errors=parse_errors,
        incident_reasons=[],
    )
    # Classification needs the finished record (including parse_errors).
    record.incident_reasons = classify_incident(record, args)
    return record


def classify_incident(record: RunRecord, args: argparse.Namespace) -> list[str]:
    """List the reasons a run counts as an incident (empty list if healthy).

    Threshold checks come first, in a fixed order, followed by the record's
    own ``parse_errors``. A negative ``args.warning_max`` disables the
    warnings check.
    """
    # (triggered, label) pairs; list order defines the order of the output.
    checks = [
        (record.status != "success", f"status={record.status}"),
        (
            record.wall_ms is not None and record.wall_ms > args.wall_warn_secs * 1000,
            f"wall>{args.wall_warn_secs}s",
        ),
        (record.vrps < args.vrp_min, f"vrps<{args.vrp_min}"),
        (record.vaps < args.vaps_min, f"vaps<{args.vaps_min}"),
        (record.publication_points < args.pp_min, f"pp<{args.pp_min}"),
        (
            args.warning_max >= 0 and record.warnings > args.warning_max,
            f"warnings>{args.warning_max}",
        ),
    ]
    findings = [label for triggered, label in checks if triggered]
    return findings + list(record.parse_errors)


def discover_runs(run_root: Path, window_start: datetime, window_end: datetime, args: argparse.Namespace) -> list[RunRecord]:
    """Load every ``runs/run_NNNN`` directory that yields a record.

    Directories are visited in sorted name order; only names made of
    ``run_`` followed by exactly four digits are considered. Entries for
    which ``load_run`` returns ``None`` are dropped.
    """
    matches = sorted((run_root / "runs").glob("run_[0-9][0-9][0-9][0-9]"))
    # Both generators are lazy, so each directory is checked and loaded in
    # turn rather than in two separate passes.
    directories = (entry for entry in matches if entry.is_dir())
    loaded = (load_run(entry, window_start, window_end, args) for entry in directories)
    return [found for found in loaded if found is not None]


def percentile(values: list[float], target_percentile: float) -> float | None:
    """Pick the sample nearest to the requested percentile rank.

    The rank is ``target_percentile / 100`` scaled onto the sorted sample
    indices and rounded with the built-in ``round``; no interpolation is
    done. Returns ``None`` for an empty list.
    """
    if not values:
        return None
    ranked = sorted(values)
    last_index = len(ranked) - 1
    position = int(round(target_percentile / 100.0 * last_index))
    return ranked[position]


def aggregate_count_duration(records: list[RunRecord], group_key: str) -> dict[str, dict[str, int]]:
    """Sum ``count`` and ``duration_ms_total`` per name across all runs.

    For each record, ``repo_sync_stats[group_key]`` is expected to map a
    name to an object holding those two counters. Groups or entries that
    are not dicts are skipped.
    """
    totals: dict[str, dict[str, int]] = {}
    for run in records:
        section = run.repo_sync_stats.get(group_key, {})
        if isinstance(section, dict):
            for label, stats in section.items():
                if isinstance(stats, dict):
                    key = str(label)
                    if key not in totals:
                        totals[key] = {"count": 0, "duration_ms_total": 0}
                    entry = totals[key]
                    entry["count"] = entry["count"] + as_int(stats.get("count"))
                    entry["duration_ms_total"] = entry["duration_ms_total"] + as_int(
                        stats.get("duration_ms_total")
                    )
    return totals


def copy_tree_preserve(source_dir: Path, target_dir: Path) -> None:
    """Copy ``source_dir`` to ``target_dir`` unless the target already exists.

    Hard links are tried first; if that raises ``OSError`` any partial
    result is removed and a regular copy is made instead.
    """
    if not target_dir.exists():
        target_dir.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copytree(source_dir, target_dir, copy_function=os.link)
        except OSError:
            # Discard whatever the failed link pass left behind so the plain
            # copy starts from a clean slate.
            if target_dir.exists():
                shutil.rmtree(target_dir)
            shutil.copytree(source_dir, target_dir)


def preserve_incidents(records: list[RunRecord], incident_dir: Path) -> list[dict[str, Any]]:
    """Copy every incident run into ``incident_dir`` and describe the copies.

    Runs without incident reasons are ignored. Each flagged run is copied
    to ``incident_dir / run_id``, and one entry per flagged run is returned
    with the keys ``runId``, ``sourceDir``, ``incidentDir`` and ``reasons``.
    """
    flagged = (run for run in records if run.incident_reasons)
    results: list[dict[str, Any]] = []
    for run in flagged:
        destination = incident_dir / run.run_id
        copy_tree_preserve(run.run_dir, destination)
        entry = {
            "runId": run.run_id,
            "sourceDir": str(run.run_dir),
            "incidentDir": str(destination),
            "reasons": run.incident_reasons,
        }
        results.append(entry)
    return results


def disk_snapshot(path: Path) -> dict[str, Any]:
    """Report disk usage for the filesystem containing ``path``.

    Returns total, used and free bytes plus the used share as a percentage
    (``0.0`` when the reported total is zero).
    """
    total, used, free = shutil.disk_usage(path)
    if total:
        used_percent = (used / total) * 100
    else:
        used_percent = 0.0
    return {
        "path": str(path),
        "totalBytes": total,
        "usedBytes": used,
        "freeBytes": free,
        "usedPercent": used_percent,
    }


def build_summary(
    run_root: Path,
    window_start: datetime,
    window_end: datetime,
    records: list[RunRecord],
    preserved: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the JSON-serialisable hourly summary for ``records``.

    The result carries run counts, wall/validation timing statistics in
    seconds, the peak RSS, min/max of the output counters, repo-sync
    aggregates, the total downloaded bytes, the preserved incidents and a
    disk usage snapshot of ``run_root``. Statistics over an empty sample
    are ``None``.
    """

    def low_high(values: list[int]) -> dict[str, int | None]:
        # Min/max pair; both None when there is nothing to measure.
        if values:
            return {"min": min(values), "max": max(values)}
        return {"min": None, "max": None}

    def count_where(predicate: Any) -> int:
        # Number of records for which ``predicate`` is truthy.
        return len([run for run in records if predicate(run)])

    wall_s = [run.wall_ms / 1000 for run in records if run.wall_ms is not None]
    validation_s = [run.validation_ms / 1000 for run in records if run.validation_ms is not None]
    rss_samples = [run.max_rss_bytes for run in records if run.max_rss_bytes is not None]

    run_counts = {
        "total": len(records),
        "success": count_where(lambda run: run.status == "success"),
        "failed": count_where(lambda run: run.status != "success"),
        "snapshot": count_where(lambda run: run.sync_mode == "snapshot"),
        "delta": count_where(lambda run: run.sync_mode == "delta"),
        "incidents": count_where(lambda run: run.incident_reasons),
    }

    # percentile() already yields None for an empty sample; the other
    # statistics are filled in only when there is data.
    wall_stats: dict[str, Any] = {"min": None, "median": None, "p95": percentile(wall_s, 95), "max": None}
    if wall_s:
        wall_stats.update(min=min(wall_s), median=median(wall_s), max=max(wall_s))

    validation_stats: dict[str, Any] = {"min": None, "median": None, "max": None}
    if validation_s:
        validation_stats.update(
            min=min(validation_s), median=median(validation_s), max=max(validation_s)
        )

    output_ranges = {
        "vrps": low_high([run.vrps for run in records]),
        "vaps": low_high([run.vaps for run in records]),
        "publicationPoints": low_high([run.publication_points for run in records]),
        "warnings": low_high([run.warnings for run in records]),
    }

    result: dict[str, Any] = {}
    result["schemaVersion"] = 1
    result["generatedAtUtc"] = format_rfc3339(datetime.now(timezone.utc))
    result["windowStartUtc"] = format_rfc3339(window_start)
    result["windowEndUtc"] = format_rfc3339(window_end)
    result["runRoot"] = str(run_root)
    result["runs"] = run_counts
    result["wallSeconds"] = wall_stats
    result["validationSeconds"] = validation_stats
    result["maxRssBytes"] = {"max": max(rss_samples) if rss_samples else None}
    result["outputs"] = output_ranges
    result["repoSyncByPhase"] = aggregate_count_duration(records, "by_phase")
    result["repoSyncByTerminalState"] = aggregate_count_duration(records, "by_terminal_state")
    result["downloadBytesTotal"] = sum(run.download_bytes_total or 0 for run in records)
    result["preservedIncidents"] = preserved
    result["disk"] = disk_snapshot(run_root)
    return result


def render_markdown(summary: dict[str, Any], records: list[RunRecord]) -> str:
    """Render the hourly report as Markdown.

    Sections, in order: the header, a summary table, one row per run, the
    repo-sync aggregates by phase and by terminal state, and the preserved
    incidents (a placeholder row is emitted when there are none).

    Args:
        summary: Dictionary with the layout produced by ``build_summary``.
        records: Runs to list in the per-run table, in display order.

    Returns:
        The Markdown document, terminated by a newline.
    """

    def range_text(bounds: dict[str, Any]) -> str:
        # An empty window has no min/max; show "-" like the other formatters
        # instead of leaking Python's "None - None" into the report.
        if bounds["min"] is None or bounds["max"] is None:
            return "-"
        return f"{bounds['min']} - {bounds['max']}"

    runs = summary["runs"]
    wall = summary["wallSeconds"]
    validation = summary["validationSeconds"]
    outputs = summary["outputs"]

    lines = [
        "# Ours RP 24h Soak 小时级报告",
        "",
        f"- Window: `{summary['windowStartUtc']}` → `{summary['windowEndUtc']}`",
        f"- Generated: `{summary['generatedAtUtc']}`",
        f"- Run root: `{summary['runRoot']}`",
        "",
        "## 汇总",
        "",
        "| 指标 | 值 |",
        "|---|---:|",
        f"| Runs | {runs['total']} |",
        f"| Success / Failed | {runs['success']} / {runs['failed']} |",
        f"| Snapshot / Delta | {runs['snapshot']} / {runs['delta']} |",
        f"| Incidents | {runs['incidents']} |",
        f"| Wall min / median / p95 / max (s) | {seconds_text(wall['min'])} / {seconds_text(wall['median'])} / {seconds_text(wall['p95'])} / {seconds_text(wall['max'])} |",
        f"| Validation min / median / max (s) | {seconds_text(validation['min'])} / {seconds_text(validation['median'])} / {seconds_text(validation['max'])} |",
        f"| Max RSS max (MiB) | {bytes_to_mib(summary['maxRssBytes']['max'])} |",
        f"| VRPs range | {range_text(outputs['vrps'])} |",
        f"| VAPs range | {range_text(outputs['vaps'])} |",
        f"| PP range | {range_text(outputs['publicationPoints'])} |",
        f"| Disk used | {summary['disk']['usedPercent']:.1f}% |",
        "",
        "## 每轮明细",
        "",
        "| Run | Mode | Status | Wall(s) | Validation(s) | RRDP(s) | Rsync(s) | Max RSS(MiB) | VRPs | VAPs | PP | Warnings | Incident |",
        "|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|",
    ]
    for record in records:
        cells = [
            record.run_id,
            record.sync_mode,
            record.status,
            millis_to_seconds_text(record.wall_ms),
            millis_to_seconds_text(record.validation_ms),
            millis_to_seconds_text(record.rrdp_download_ms_total),
            millis_to_seconds_text(record.rsync_download_ms_total),
            bytes_to_mib(record.max_rss_bytes),
            str(record.vrps),
            str(record.vaps),
            str(record.publication_points),
            str(record.warnings),
            ", ".join(record.incident_reasons),
        ]
        lines.append("| " + " | ".join(cells) + " |")

    lines.extend(["", "## Repo Sync 聚合", "", "### By Phase", "", "| Phase | Count | Duration(s) |", "|---|---:|---:|"])
    for phase, value in sorted(summary["repoSyncByPhase"].items()):
        lines.append(f"| {phase} | {value['count']} | {value['duration_ms_total'] / 1000:.3f} |")
    lines.extend(["", "### By Terminal State", "", "| State | Count | Duration(s) |", "|---|---:|---:|"])
    for state, value in sorted(summary["repoSyncByTerminalState"].items()):
        lines.append(f"| {state} | {value['count']} | {value['duration_ms_total'] / 1000:.3f} |")

    lines.extend(["", "## Incident 固化", "", "| Run | Incident Dir | Reasons |", "|---|---|---|"])
    for incident in summary["preservedIncidents"]:
        lines.append(
            f"| {incident['runId']} | `{incident['incidentDir']}` | {', '.join(incident['reasons'])} |"
        )
    if not summary["preservedIncidents"]:
        # Keep the table well-formed when nothing was preserved.
        lines.append("| - | - | - |")
    lines.append("")
    return "\n".join(lines)


def append_jsonl(path: Path, summary: dict[str, Any]) -> None:
    """Append ``summary`` to ``path`` as one line of key-sorted JSON.

    Missing parent directories are created. Non-ASCII characters are
    written as is (UTF-8) rather than escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "a", encoding="utf-8") as sink:
        encoded = json.dumps(summary, ensure_ascii=False, sort_keys=True)
        sink.write(f"{encoded}\n")


def send_feishu_with_script(script_path: Path, message: str, dry_run: bool) -> int:
    """Pipe ``message`` into a helper script and return its exit code.

    The script is run with the current Python interpreter and the
    ``--stdin`` flag; ``--dry-run`` is appended when ``dry_run`` is true.
    A non-zero exit code is returned to the caller, not raised.
    """
    extra_flags = ["--dry-run"] if dry_run else []
    argv = [sys.executable, str(script_path), "--stdin", *extra_flags]
    outcome = subprocess.run(argv, input=message, text=True, check=False)
    return outcome.returncode


def send_feishu_with_webhook(message: str, timeout_seconds: float = 10.0) -> int:
    """POST ``message`` as a text message to the Feishu webhook.

    The URL is read from the ``FEISHU_WEBHOOK_URL`` environment variable.
    Messages not already starting with ``from codex:`` (case-insensitive)
    are prefixed with ``From codex: ``.

    Args:
        message: Text to deliver.
        timeout_seconds: Timeout passed to ``urlopen``.

    Returns:
        ``0`` when the service acknowledged the message (``code`` or
        ``StatusCode`` equal to 0 in the JSON reply), ``2`` when no webhook
        URL is configured, ``1`` for every other failure. Failures are
        reported on stderr and never raised.
    """
    # Function-scope import: http.client is not imported at module level.
    from http.client import HTTPException

    webhook_url = os.environ.get("FEISHU_WEBHOOK_URL", "").strip()
    if not webhook_url:
        return 2
    final_text = message if message.lower().startswith("from codex:") else f"From codex: {message}"
    payload = json.dumps(
        {"msg_type": "text", "content": {"text": final_text}},
        ensure_ascii=False,
    ).encode("utf-8")
    try:
        # Request() itself raises ValueError for a URL without a scheme, so
        # it has to live inside the guarded region too.
        request = Request(
            webhook_url,
            data=payload,
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        with urlopen(request, timeout=timeout_seconds) as response:
            status = getattr(response, "status", 0)
            body = response.read().decode("utf-8", errors="replace")
    except (OSError, HTTPException, ValueError) as error:
        # OSError is the base of HTTPError, URLError and socket timeouts;
        # HTTPException covers protocol errors such as an incomplete read.
        print(f"feishu send failed: {error}", file=sys.stderr)
        return 1

    if status != 200:
        print(body, file=sys.stderr)
        return 1
    try:
        parsed = json.loads(body)
    except ValueError as error:  # json.JSONDecodeError is a ValueError
        print(f"feishu send failed: {error}", file=sys.stderr)
        return 1
    # A reply that is valid JSON but not an object cannot be an acknowledgement.
    if isinstance(parsed, dict) and (parsed.get("code") == 0 or parsed.get("StatusCode") == 0):
        return 0
    print(body, file=sys.stderr)
    return 1


def build_feishu_message(summary: dict[str, Any], report_path: Path) -> str:
    """Build the one-line chat notification for an hourly report.

    Args:
        summary: Dictionary with the layout produced by ``build_summary``.
        report_path: Location of the rendered Markdown report.

    Returns:
        A single-line message with the window, run counts, median/max wall
        time, VRP/VAP ranges and the report path.
    """

    def range_text(bounds: dict[str, Any]) -> str:
        # A window without runs has no min/max; use "-" (as seconds_text
        # does for missing values) instead of emitting "None-None".
        if bounds["min"] is None or bounds["max"] is None:
            return "-"
        return f"{bounds['min']}-{bounds['max']}"

    runs = summary["runs"]
    wall = summary["wallSeconds"]
    outputs = summary["outputs"]
    return (
        "#062 24h soak hourly "
        f"{summary['windowStartUtc']}..{summary['windowEndUtc']} "
        f"runs={runs['total']} ok={runs['success']} fail={runs['failed']} "
        f"incidents={runs['incidents']} "
        f"wall_s={seconds_text(wall['median'])}/{seconds_text(wall['max'])} "
        f"vrps={range_text(outputs['vrps'])} "
        f"vaps={range_text(outputs['vaps'])} "
        f"report={report_path}"
    )


def parse_args() -> argparse.Namespace:
    """Parse the command line of the hourly report generator.

    Options are registered in a fixed order so the ``--help`` listing stays
    stable. Only ``--run-root`` is required.
    """
    cli = argparse.ArgumentParser(description="Generate hourly reports for portable ours RP soak runs.")

    cli.add_argument("--run-root", type=Path, required=True)
    for flag in ("--reports-dir", "--incident-dir"):
        cli.add_argument(flag, type=Path)

    text_options = (
        ("--window-start", "RFC3339 UTC timestamp. Defaults to now-1h."),
        ("--window-end", "RFC3339 UTC timestamp. Defaults to now."),
        ("--report-name", "Override report filename."),
    )
    for flag, help_text in text_options:
        cli.add_argument(flag, help=help_text)

    # Incident thresholds and their default values.
    threshold_options = (
        ("--wall-warn-secs", 140),
        ("--vrp-min", 900_000),
        ("--vaps-min", 1_000),
        ("--pp-min", 50_000),
    )
    for flag, default_value in threshold_options:
        cli.add_argument(flag, type=int, default=default_value)
    cli.add_argument(
        "--warning-max",
        type=int,
        default=-1,
        help="Incident threshold for warnings; negative disables warning-based incidents.",
    )

    for flag in ("--send-feishu", "--dry-run-feishu"):
        cli.add_argument(flag, action="store_true")
    cli.add_argument("--feishu-script", type=Path)
    return cli.parse_args()


def main() -> int:
    """Generate the hourly report and optionally announce it on Feishu.

    Steps: resolve the reporting window, load the runs that completed in
    it, preserve incident runs, write the Markdown report, append the
    summary to ``hourly_summary.jsonl`` and, with ``--send-feishu``, send a
    notification through the helper script or the webhook.

    Returns:
        The process exit code: ``0`` on success, otherwise the code
        reported by the Feishu delivery step.

    Raises:
        SystemExit: if the window end is not later than the window start.
        ValueError: if a supplied window timestamp cannot be parsed.
    """
    args = parse_args()
    # Each bound falls back to its documented default on its own, so a lone
    # --window-start or --window-end is honoured rather than silently dropped.
    default_start, default_end = default_window()
    window_start = parse_rfc3339(args.window_start) if args.window_start else default_start
    window_end = parse_rfc3339(args.window_end) if args.window_end else default_end
    if window_end <= window_start:
        raise SystemExit("--window-end must be later than --window-start")

    run_root = args.run_root.resolve()
    reports_dir = (args.reports_dir or run_root / "hourly_reports").resolve()
    incident_dir = (args.incident_dir or run_root / "incident_runs").resolve()
    reports_dir.mkdir(parents=True, exist_ok=True)
    records = discover_runs(run_root, window_start, window_end, args)
    preserved = preserve_incidents(records, incident_dir)
    summary = build_summary(run_root, window_start, window_end, records, preserved)

    # Default report name: the window start with ":" and "-" stripped.
    report_name = args.report_name or f"hour_{format_rfc3339(window_start).replace(':', '').replace('-', '')}.md"
    report_path = reports_dir / report_name
    report_path.write_text(render_markdown(summary, records), encoding="utf-8")
    append_jsonl(reports_dir / "hourly_summary.jsonl", summary)

    if args.send_feishu:
        message = build_feishu_message(summary, report_path)
        feishu_script = args.feishu_script
        if feishu_script is None:
            env_script = os.environ.get("FEISHU_WEBHOOK_SCRIPT", "").strip()
            feishu_script = Path(env_script) if env_script else None
        if feishu_script and feishu_script.exists():
            # NOTE(review): this path returns the script's exit code without
            # printing the report path, unlike the other paths - confirm
            # that this is intended.
            return send_feishu_with_script(feishu_script, message, args.dry_run_feishu)
        if args.dry_run_feishu:
            print(f"DRY RUN FEISHU: {message}")
        else:
            result = send_feishu_with_webhook(message)
            if result == 2:
                # Exit code 2 means no webhook URL was configured.
                print("Feishu not sent: configure --feishu-script or FEISHU_WEBHOOK_URL", file=sys.stderr)
                return result
            if result != 0:
                # The webhook is configured but delivery failed; the sender
                # has already written the details to stderr.
                print("Feishu not sent: webhook delivery failed", file=sys.stderr)
                return result

    print(report_path)
    return 0


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())