#!/usr/bin/env python3
"""Generate an hourly Markdown/JSONL report for soak-test runs.

The script scans ``<run-root>/runs/run_NNNN`` directories, keeps the runs whose
completion time falls inside the requested window, flags runs that breach the
configured thresholds as incidents (and copies them aside), writes a Markdown
report plus one JSONL summary line, and optionally sends a short notification
through a Feishu webhook or helper script.
"""
from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from statistics import median
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen


def parse_rfc3339(value: str) -> datetime:
    """Parse an RFC3339-style timestamp and return it as an aware UTC datetime.

    A trailing ``Z`` is accepted; a timestamp without an offset is treated as UTC.

    Raises:
        ValueError: if ``datetime.fromisoformat`` rejects the text.
    """
    normalized = value.strip()
    if normalized.endswith("Z"):
        # fromisoformat() on older Pythons does not understand the "Z" suffix.
        normalized = normalized[:-1] + "+00:00"
    parsed = datetime.fromisoformat(normalized)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def format_rfc3339(value: datetime) -> str:
    """Format *value* as a second-resolution UTC timestamp ending in ``Z``."""
    return value.astimezone(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def default_window() -> tuple[datetime, datetime]:
    """Return the default ``(start, end)`` window: the hour ending now (UTC)."""
    window_end = datetime.now(timezone.utc)
    return window_end - timedelta(hours=1), window_end


def read_json(path: Path) -> dict[str, Any] | None:
    """Load a JSON object from *path*.

    Returns ``None`` when the file is missing, unreadable, not valid JSON, or
    when its top-level value is not an object; callers rely on getting either a
    ``dict`` or ``None``.
    """
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    return loaded if isinstance(loaded, dict) else None


def nested_get(data: dict[str, Any] | None, keys: list[str], default: Any = None) -> Any:
    """Walk *keys* through nested dicts, returning *default* on any miss."""
    current: Any = data
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


def as_int(value: Any, default: int = 0) -> int:
    """Convert *value* to ``int``; return *default* for ``None`` or bad input."""
    if value is None:
        return default
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def as_float(value: Any, default: float = 0.0) -> float:
    """Convert *value* to ``float``; return *default* for ``None`` or bad input."""
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def bytes_to_mib(value: int | None) -> str:
    """Render a byte count as MiB with one decimal, or ``-`` when unknown."""
    if value is None:
        return "-"
    return f"{value / 1024 / 1024:.1f}"


def millis_to_seconds_text(value: int | None) -> str:
    """Render milliseconds as seconds with three decimals, or ``-`` when unknown."""
    if value is None:
        return "-"
    return f"{value / 1000:.3f}"


def seconds_text(value: float | None) -> str:
    """Render seconds with three decimals, or ``-`` when unknown."""
    if value is None:
        return "-"
    return f"{value:.3f}"


def percent_text(numerator: int, denominator: int) -> str:
    """Render ``numerator / denominator`` as a percentage, or ``-`` if undefined."""
    if denominator <= 0:
        return "-"
    return f"{(numerator / denominator) * 100:.1f}%"


@dataclass
class RunRecord:
    """Metrics and classification for one run directory inside the window."""

    run_id: str
    run_index: int
    run_dir: Path
    status: str
    sync_mode: str
    started_at: datetime | None
    completed_at: datetime | None
    wall_ms: int | None
    validation_ms: int | None
    repo_sync_ms_total: int | None
    rrdp_download_ms_total: int | None
    rsync_download_ms_total: int | None
    download_bytes_total: int | None
    max_rss_bytes: int | None
    vrps: int
    vaps: int
    publication_points: int
    warnings: int
    repo_sync_stats: dict[str, Any]
    # Artifact file name -> size in bytes, as listed in run-summary.json.
    artifact_sizes: dict[str, int]
    # Names of expected files that were missing or unparsable.
    parse_errors: list[str]
    # Filled in by classify_incident(); empty means the run is healthy.
    incident_reasons: list[str]


def _dict_or_empty(value: Any) -> dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def load_run(
    run_dir: Path,
    window_start: datetime,
    window_end: datetime,
    args: argparse.Namespace,
) -> RunRecord | None:
    """Build a :class:`RunRecord` for *run_dir* if it completed inside the window.

    Returns ``None`` when ``run-meta.json`` is missing/invalid, when no parsable
    completion timestamp is available, or when the completion time lies outside
    the half-open window ``[window_start, window_end)``.
    """
    meta = read_json(run_dir / "run-meta.json")
    summary = read_json(run_dir / "run-summary.json")
    if meta is None:
        return None

    completed_raw = meta.get("completed_at_rfc3339_utc") or nested_get(summary, ["finishedAtRfc3339Utc"])
    if not completed_raw:
        return None
    try:
        completed_at = parse_rfc3339(str(completed_raw))
    except ValueError:
        return None
    if completed_at < window_start or completed_at >= window_end:
        return None

    started_at = None
    started_raw = meta.get("started_at_rfc3339_utc") or nested_get(summary, ["startedAtRfc3339Utc"])
    if started_raw:
        try:
            started_at = parse_rfc3339(str(started_raw))
        except ValueError:
            started_at = None

    # An explicit null/empty status falls back to "unknown" rather than "None".
    status = str(meta.get("status") or nested_get(summary, ["status"]) or "unknown")
    sync_mode = str(meta.get("sync_mode") or "unknown")
    run_index = as_int(meta.get("run_index"), default=0)
    run_id = str(meta.get("run_id") or run_dir.name)

    # Sections with an unexpected JSON type are treated as absent instead of
    # crashing on ``.get``.
    stage = _dict_or_empty(nested_get(summary, ["stageTiming"]))
    process_metrics = _dict_or_empty(nested_get(summary, ["processMetrics"]))
    report_counts = _dict_or_empty(nested_get(summary, ["reportCounts"]))
    repo_sync_stats = _dict_or_empty(nested_get(summary, ["repoSyncStats"]))
    wall_ms = nested_get(summary, ["wallMs"])
    max_rss_kb = nested_get(process_metrics, ["maxRssKb"])

    artifact_sizes: dict[str, int] = {}
    artifacts = nested_get(summary, ["artifacts"])
    if isinstance(artifacts, list):
        for artifact in artifacts:
            if isinstance(artifact, dict):
                artifact_path = str(artifact.get("path", ""))
                artifact_sizes[Path(artifact_path).name] = as_int(artifact.get("sizeBytes"))

    parse_errors = []
    if summary is None:
        parse_errors.append("missing_or_invalid_run_summary")
    if not (run_dir / "stage-timing.json").exists():
        parse_errors.append("missing_stage_timing")
    if not (run_dir / "process-time.txt").exists():
        parse_errors.append("missing_process_time")

    record = RunRecord(
        run_id=run_id,
        run_index=run_index,
        run_dir=run_dir,
        status=status,
        sync_mode=sync_mode,
        started_at=started_at,
        completed_at=completed_at,
        wall_ms=as_int(wall_ms) if wall_ms is not None else None,
        validation_ms=as_int(stage.get("validation_ms")) if stage else None,
        repo_sync_ms_total=as_int(stage.get("repo_sync_ms_total")) if stage else None,
        rrdp_download_ms_total=as_int(stage.get("rrdp_download_ms_total")) if stage else None,
        rsync_download_ms_total=as_int(stage.get("rsync_download_ms_total")) if stage else None,
        download_bytes_total=as_int(stage.get("download_bytes_total")) if stage else None,
        max_rss_bytes=as_int(max_rss_kb) * 1024 if max_rss_kb is not None else None,
        vrps=as_int(report_counts.get("vrps")),
        vaps=as_int(report_counts.get("aspas")),
        publication_points=as_int(report_counts.get("publicationPoints")),
        warnings=as_int(report_counts.get("warnings")),
        repo_sync_stats=repo_sync_stats,
        artifact_sizes=artifact_sizes,
        parse_errors=parse_errors,
        incident_reasons=[],
    )
    record.incident_reasons = classify_incident(record, args)
    return record


def classify_incident(record: RunRecord, args: argparse.Namespace) -> list[str]:
    """Return the reasons *record* counts as an incident (empty if healthy).

    Thresholds are read from ``args``: ``wall_warn_secs``, ``vrp_min``,
    ``vaps_min``, ``pp_min`` and ``warning_max`` (negative disables the
    warning check). Parse errors recorded on the run are appended as reasons.
    """
    reasons = []
    if record.status != "success":
        reasons.append(f"status={record.status}")
    if record.wall_ms is not None and record.wall_ms > args.wall_warn_secs * 1000:
        reasons.append(f"wall>{args.wall_warn_secs}s")
    if record.vrps < args.vrp_min:
        reasons.append(f"vrps<{args.vrp_min}")
    if record.vaps < args.vaps_min:
        reasons.append(f"vaps<{args.vaps_min}")
    if record.publication_points < args.pp_min:
        reasons.append(f"pp<{args.pp_min}")
    if args.warning_max >= 0 and record.warnings > args.warning_max:
        reasons.append(f"warnings>{args.warning_max}")
    reasons.extend(record.parse_errors)
    return reasons


def discover_runs(
    run_root: Path,
    window_start: datetime,
    window_end: datetime,
    args: argparse.Namespace,
) -> list[RunRecord]:
    """Load every ``runs/run_NNNN`` directory under *run_root* that is in the window."""
    runs_root = run_root / "runs"
    records = []
    for run_dir in sorted(runs_root.glob("run_[0-9][0-9][0-9][0-9]")):
        if not run_dir.is_dir():
            continue
        record = load_run(run_dir, window_start, window_end, args)
        if record is not None:
            records.append(record)
    return records


def percentile(values: list[float], target_percentile: float) -> float | None:
    """Return the nearest-rank percentile of *values*, or ``None`` if empty.

    The computed rank is clamped to the valid index range so a percentile
    outside ``[0, 100]`` yields the minimum/maximum instead of wrapping around
    or raising ``IndexError``.
    """
    if not values:
        return None
    ordered = sorted(values)
    last_index = len(ordered) - 1
    index = int(round((target_percentile / 100.0) * last_index))
    return ordered[min(max(index, 0), last_index)]


def aggregate_count_duration(records: list[RunRecord], group_key: str) -> dict[str, dict[str, int]]:
    """Sum ``count`` and ``duration_ms_total`` per name across all records.

    For each record, ``repo_sync_stats[group_key]`` is expected to map a name to
    a dict carrying those two fields; entries of any other shape are skipped.
    """
    aggregate: dict[str, dict[str, int]] = {}
    for record in records:
        group = record.repo_sync_stats.get(group_key, {})
        if not isinstance(group, dict):
            continue
        for name, value in group.items():
            if not isinstance(value, dict):
                continue
            bucket = aggregate.setdefault(str(name), {"count": 0, "duration_ms_total": 0})
            bucket["count"] += as_int(value.get("count"))
            bucket["duration_ms_total"] += as_int(value.get("duration_ms_total"))
    return aggregate


def copy_tree_preserve(source_dir: Path, target_dir: Path) -> None:
    """Copy *source_dir* to *target_dir* unless the target already exists.

    Hard links are tried first; on ``OSError`` any partial target is removed and
    the tree is copied normally.
    """
    if target_dir.exists():
        return
    target_dir.parent.mkdir(parents=True, exist_ok=True)
    try:
        shutil.copytree(source_dir, target_dir, copy_function=os.link)
    except OSError:
        if target_dir.exists():
            shutil.rmtree(target_dir)
        shutil.copytree(source_dir, target_dir)


def preserve_incidents(records: list[RunRecord], incident_dir: Path) -> list[dict[str, Any]]:
    """Copy every incident run under *incident_dir* and describe what was kept."""
    preserved = []
    for record in records:
        if not record.incident_reasons:
            continue
        target_dir = incident_dir / record.run_id
        copy_tree_preserve(record.run_dir, target_dir)
        preserved.append(
            {
                "runId": record.run_id,
                "sourceDir": str(record.run_dir),
                "incidentDir": str(target_dir),
                "reasons": record.incident_reasons,
            }
        )
    return preserved


def disk_snapshot(path: Path) -> dict[str, Any]:
    """Return total/used/free bytes and used percentage for the disk holding *path*."""
    usage = shutil.disk_usage(path)
    return {
        "path": str(path),
        "totalBytes": usage.total,
        "usedBytes": usage.used,
        "freeBytes": usage.free,
        "usedPercent": (usage.used / usage.total) * 100 if usage.total else 0.0,
    }


def build_summary(
    run_root: Path,
    window_start: datetime,
    window_end: datetime,
    records: list[RunRecord],
    preserved: list[dict[str, Any]],
) -> dict[str, Any]:
    """Aggregate *records* into the JSON-serializable hourly summary dict."""
    wall_seconds = [record.wall_ms / 1000 for record in records if record.wall_ms is not None]
    validation_seconds = [
        record.validation_ms / 1000 for record in records if record.validation_ms is not None
    ]
    max_rss_values = [record.max_rss_bytes for record in records if record.max_rss_bytes is not None]
    success_count = sum(1 for record in records if record.status == "success")
    failed_count = sum(1 for record in records if record.status != "success")
    snapshot_count = sum(1 for record in records if record.sync_mode == "snapshot")
    delta_count = sum(1 for record in records if record.sync_mode == "delta")
    incident_count = sum(1 for record in records if record.incident_reasons)

    def range_for(values: list[int]) -> dict[str, int | None]:
        return {"min": min(values) if values else None, "max": max(values) if values else None}

    return {
        "schemaVersion": 1,
        "generatedAtUtc": format_rfc3339(datetime.now(timezone.utc)),
        "windowStartUtc": format_rfc3339(window_start),
        "windowEndUtc": format_rfc3339(window_end),
        "runRoot": str(run_root),
        "runs": {
            "total": len(records),
            "success": success_count,
            "failed": failed_count,
            "snapshot": snapshot_count,
            "delta": delta_count,
            "incidents": incident_count,
        },
        "wallSeconds": {
            "min": min(wall_seconds) if wall_seconds else None,
            "median": median(wall_seconds) if wall_seconds else None,
            "p95": percentile(wall_seconds, 95),
            "max": max(wall_seconds) if wall_seconds else None,
        },
        "validationSeconds": {
            "min": min(validation_seconds) if validation_seconds else None,
            "median": median(validation_seconds) if validation_seconds else None,
            "max": max(validation_seconds) if validation_seconds else None,
        },
        "maxRssBytes": {
            "max": max(max_rss_values) if max_rss_values else None,
        },
        "outputs": {
            "vrps": range_for([record.vrps for record in records]),
            "vaps": range_for([record.vaps for record in records]),
            "publicationPoints": range_for([record.publication_points for record in records]),
            "warnings": range_for([record.warnings for record in records]),
        },
        "repoSyncByPhase": aggregate_count_duration(records, "by_phase"),
        "repoSyncByTerminalState": aggregate_count_duration(records, "by_terminal_state"),
        "downloadBytesTotal": sum(record.download_bytes_total or 0 for record in records),
        "preservedIncidents": preserved,
        "disk": disk_snapshot(run_root),
    }


def render_markdown(summary: dict[str, Any], records: list[RunRecord]) -> str:
    """Render the hourly report (summary table, per-run table, aggregates) as Markdown."""
    lines = [
        "# Ours RP 24h Soak 小时级报告",
        "",
        f"- Window: `{summary['windowStartUtc']}` → `{summary['windowEndUtc']}`",
        f"- Generated: `{summary['generatedAtUtc']}`",
        f"- Run root: `{summary['runRoot']}`",
        "",
        "## 汇总",
        "",
        "| 指标 | 值 |",
        "|---|---:|",
        f"| Runs | {summary['runs']['total']} |",
        f"| Success / Failed | {summary['runs']['success']} / {summary['runs']['failed']} |",
        f"| Snapshot / Delta | {summary['runs']['snapshot']} / {summary['runs']['delta']} |",
        f"| Incidents | {summary['runs']['incidents']} |",
        f"| Wall min / median / p95 / max (s) | {seconds_text(summary['wallSeconds']['min'])} / {seconds_text(summary['wallSeconds']['median'])} / {seconds_text(summary['wallSeconds']['p95'])} / {seconds_text(summary['wallSeconds']['max'])} |",
        f"| Validation min / median / max (s) | {seconds_text(summary['validationSeconds']['min'])} / {seconds_text(summary['validationSeconds']['median'])} / {seconds_text(summary['validationSeconds']['max'])} |",
        f"| Max RSS max (MiB) | {bytes_to_mib(summary['maxRssBytes']['max'])} |",
        f"| VRPs range | {summary['outputs']['vrps']['min']} - {summary['outputs']['vrps']['max']} |",
        f"| VAPs range | {summary['outputs']['vaps']['min']} - {summary['outputs']['vaps']['max']} |",
        f"| PP range | {summary['outputs']['publicationPoints']['min']} - {summary['outputs']['publicationPoints']['max']} |",
        f"| Disk used | {summary['disk']['usedPercent']:.1f}% |",
        "",
        "## 每轮明细",
        "",
        "| Run | Mode | Status | Wall(s) | Validation(s) | RRDP(s) | Rsync(s) | Max RSS(MiB) | VRPs | VAPs | PP | Warnings | Incident |",
        "|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|",
    ]
    for record in records:
        incident = ", ".join(record.incident_reasons) if record.incident_reasons else ""
        lines.append(
            "| "
            + " | ".join(
                [
                    record.run_id,
                    record.sync_mode,
                    record.status,
                    millis_to_seconds_text(record.wall_ms),
                    millis_to_seconds_text(record.validation_ms),
                    millis_to_seconds_text(record.rrdp_download_ms_total),
                    millis_to_seconds_text(record.rsync_download_ms_total),
                    bytes_to_mib(record.max_rss_bytes),
                    str(record.vrps),
                    str(record.vaps),
                    str(record.publication_points),
                    str(record.warnings),
                    incident,
                ]
            )
            + " |"
        )

    lines.extend(["", "## Repo Sync 聚合", "", "### By Phase", "", "| Phase | Count | Duration(s) |", "|---|---:|---:|"])
    for phase, value in sorted(summary["repoSyncByPhase"].items()):
        lines.append(f"| {phase} | {value['count']} | {value['duration_ms_total'] / 1000:.3f} |")

    lines.extend(["", "### By Terminal State", "", "| State | Count | Duration(s) |", "|---|---:|---:|"])
    for state, value in sorted(summary["repoSyncByTerminalState"].items()):
        lines.append(f"| {state} | {value['count']} | {value['duration_ms_total'] / 1000:.3f} |")

    lines.extend(["", "## Incident 固化", "", "| Run | Incident Dir | Reasons |", "|---|---|---|"])
    for incident in summary["preservedIncidents"]:
        lines.append(
            f"| {incident['runId']} | `{incident['incidentDir']}` | {', '.join(incident['reasons'])} |"
        )
    if not summary["preservedIncidents"]:
        lines.append("| - | - | - |")
    lines.append("")
    return "\n".join(lines)


def append_jsonl(path: Path, summary: dict[str, Any]) -> None:
    """Append *summary* as one JSON line to *path*, creating parent dirs if needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(summary, ensure_ascii=False, sort_keys=True) + "\n")


def send_feishu_with_script(script_path: Path, message: str, dry_run: bool) -> int:
    """Pipe *message* to the helper script on stdin and return its exit code."""
    command = [sys.executable, str(script_path), "--stdin"]
    if dry_run:
        command.append("--dry-run")
    completed = subprocess.run(command, input=message, text=True, check=False)
    return completed.returncode


def send_feishu_with_webhook(message: str, timeout_seconds: float = 10.0) -> int:
    """POST *message* to the webhook URL in ``FEISHU_WEBHOOK_URL``.

    Returns:
        0 when the reply is a JSON object reporting ``code == 0`` or
        ``StatusCode == 0``; 2 when the environment variable is unset/empty;
        1 for every other outcome (invalid URL, network or timeout error,
        non-200 status, unparsable or unexpected reply).
    """
    # Local import: only this function needs it.
    from http.client import HTTPException

    webhook_url = os.environ.get("FEISHU_WEBHOOK_URL", "").strip()
    if not webhook_url:
        return 2

    final_text = message if message.lower().startswith("from codex:") else f"From codex: {message}"
    payload = json.dumps(
        {"msg_type": "text", "content": {"text": final_text}},
        ensure_ascii=False,
    ).encode("utf-8")

    try:
        # Request() itself raises ValueError for a malformed URL, so it must be
        # inside the try block as well.
        request = Request(
            webhook_url,
            data=payload,
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        with urlopen(request, timeout=timeout_seconds) as response:
            body = response.read().decode("utf-8", errors="replace")
            status = getattr(response, "status", 0)
        if status != 200:
            print(body, file=sys.stderr)
            return 1
        parsed = json.loads(body)
    except (HTTPError, URLError, HTTPException, OSError, ValueError) as error:
        # OSError also covers socket timeouts raised while reading the body;
        # ValueError covers json.JSONDecodeError and invalid URLs.
        print(f"feishu send failed: {error}", file=sys.stderr)
        return 1

    if isinstance(parsed, dict) and (parsed.get("code") == 0 or parsed.get("StatusCode") == 0):
        return 0
    print(body, file=sys.stderr)
    return 1


def build_feishu_message(summary: dict[str, Any], report_path: Path) -> str:
    """Build the one-line notification text for the hourly summary."""
    return (
        "#062 24h soak hourly "
        f"{summary['windowStartUtc']}..{summary['windowEndUtc']} "
        f"runs={summary['runs']['total']} ok={summary['runs']['success']} fail={summary['runs']['failed']} "
        f"incidents={summary['runs']['incidents']} "
        f"wall_s={seconds_text(summary['wallSeconds']['median'])}/{seconds_text(summary['wallSeconds']['max'])} "
        f"vrps={summary['outputs']['vrps']['min']}-{summary['outputs']['vrps']['max']} "
        f"vaps={summary['outputs']['vaps']['min']}-{summary['outputs']['vaps']['max']} "
        f"report={report_path}"
    )


def parse_args() -> argparse.Namespace:
    """Parse the command-line options."""
    parser = argparse.ArgumentParser(description="Generate hourly reports for portable ours RP soak runs.")
    parser.add_argument("--run-root", required=True, type=Path)
    parser.add_argument("--reports-dir", type=Path)
    parser.add_argument("--incident-dir", type=Path)
    parser.add_argument("--window-start", help="RFC3339 UTC timestamp. Defaults to now-1h.")
    parser.add_argument("--window-end", help="RFC3339 UTC timestamp. Defaults to now.")
    parser.add_argument("--report-name", help="Override report filename.")
    parser.add_argument("--wall-warn-secs", type=int, default=140)
    parser.add_argument("--vrp-min", type=int, default=900_000)
    parser.add_argument("--vaps-min", type=int, default=1_000)
    parser.add_argument("--pp-min", type=int, default=50_000)
    parser.add_argument(
        "--warning-max",
        type=int,
        default=-1,
        help="Incident threshold for warnings; negative disables warning-based incidents.",
    )
    parser.add_argument("--send-feishu", action="store_true")
    parser.add_argument("--dry-run-feishu", action="store_true")
    parser.add_argument("--feishu-script", type=Path)
    return parser.parse_args()


def main() -> int:
    """Run the report pipeline and return the process exit code."""
    args = parse_args()

    # Each bound defaults independently, as the option help states; a lone
    # --window-start or --window-end is honoured instead of silently dropped.
    default_start, default_end = default_window()
    try:
        window_start = parse_rfc3339(args.window_start) if args.window_start else default_start
        window_end = parse_rfc3339(args.window_end) if args.window_end else default_end
    except ValueError as error:
        raise SystemExit(f"invalid window timestamp: {error}") from error
    if window_end <= window_start:
        raise SystemExit("--window-end must be later than --window-start")

    run_root = args.run_root.resolve()
    reports_dir = (args.reports_dir or run_root / "hourly_reports").resolve()
    incident_dir = (args.incident_dir or run_root / "incident_runs").resolve()
    reports_dir.mkdir(parents=True, exist_ok=True)

    records = discover_runs(run_root, window_start, window_end, args)
    preserved = preserve_incidents(records, incident_dir)
    summary = build_summary(run_root, window_start, window_end, records, preserved)

    report_name = args.report_name or f"hour_{format_rfc3339(window_start).replace(':', '').replace('-', '')}.md"
    report_path = reports_dir / report_name
    report_path.write_text(render_markdown(summary, records), encoding="utf-8")
    append_jsonl(reports_dir / "hourly_summary.jsonl", summary)

    if args.send_feishu:
        message = build_feishu_message(summary, report_path)
        feishu_script = args.feishu_script
        if feishu_script is None:
            env_script = os.environ.get("FEISHU_WEBHOOK_SCRIPT", "").strip()
            feishu_script = Path(env_script) if env_script else None
        if feishu_script and feishu_script.exists():
            # The helper script's exit code becomes this process's exit code.
            return send_feishu_with_script(feishu_script, message, args.dry_run_feishu)
        if args.dry_run_feishu:
            print(f"DRY RUN FEISHU: {message}")
        else:
            result = send_feishu_with_webhook(message)
            if result == 2:
                # Only an unconfigured webhook warrants the configuration hint;
                # real send failures were already reported by the sender.
                print("Feishu not sent: configure --feishu-script or FEISHU_WEBHOOK_URL", file=sys.stderr)
            if result != 0:
                return result

    print(report_path)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())