diff --git a/monitor/grafana/dashboards/ours-rp-inter-rp.json b/monitor/grafana/dashboards/ours-rp-inter-rp.json index fd2e431..102a3ad 100644 --- a/monitor/grafana/dashboards/ours-rp-inter-rp.json +++ b/monitor/grafana/dashboards/ours-rp-inter-rp.json @@ -494,6 +494,174 @@ "refId": "A" } ] + }, + { + "id": 12, + "title": "Repo Sync Availability by RP", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "inter_rp_repo_sync_total{exported_instance=\"remote200-inter-rp\",state=~\"available|failed\",exported_rp=~\"ours-rp|routinator\"}", + "legendFormat": "{{exported_rp}} {{state}}", + "refId": "A" + } + ] + }, + { + "id": 13, + "title": "Repo Sync Overlap Classes", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}", + "legendFormat": "{{class}}", + "refId": "A" + } + ] + }, + { + "id": 14, + "title": "Repo Sync Diff URIs", + "type": "table", + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 44 + }, + "fieldConfig": 
# One `name="value"` pair inside the braces of an exposition line; the value
# may contain backslash escapes.
_PROM_LABEL_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')
# A single backslash escape inside a label value.
_PROM_ESCAPE_RE = re.compile(r"\\(.)")
# Escapes defined by the Prometheus text exposition format.
_PROM_UNESCAPES = {"\\": "\\", '"': '"', "n": "\n"}


def _unescape_prom_value(raw):
    """Decode the escapes in one label value in a single left-to-right pass."""
    # A single pass avoids the ordering pitfalls of chained str.replace();
    # unknown escapes are left exactly as written.
    return _PROM_ESCAPE_RE.sub(
        lambda m: _PROM_UNESCAPES.get(m.group(1), m.group(0)), raw
    )


def parse_prom_labels(label_text):
    """Parse the text between ``{`` and ``}`` of a sample line into a dict.

    Args:
        label_text: Comma-separated ``name="value"`` pairs.

    Returns:
        Mapping of label name to unescaped value. ``\\\\``, ``\\"`` and
        ``\\n`` are decoded. Empty input yields an empty dict.
    """
    return {
        key: _unescape_prom_value(raw)
        for key, raw in _PROM_LABEL_RE.findall(label_text)
    }


def parse_prometheus_samples(text, metric_name):
    """Yield ``(labels, value)`` for every labelled sample of *metric_name*.

    Args:
        text: Metrics in Prometheus text exposition format.
        metric_name: Exact metric name to select.

    Yields:
        Tuples of the parsed label dict and the sample value as ``float``.
        Samples without a label block do not match. Lines whose value does
        not parse as a float are skipped.
    """
    pattern = re.compile(
        r"^" + re.escape(metric_name)
        # Label block: quoted values are consumed as a unit so a `}` inside a
        # value (legal in URIs) does not terminate the block early.
        + r'\{((?:[^"}\n]|"(?:[^"\\\n]|\\.)*")*)\}'
        # Value, then the optional integer timestamp the format allows.
        + r"\s+([-+0-9.eE]+)(?:[ \t]+-?\d+)?\s*$",
        re.MULTILINE,
    )
    for match in pattern.finditer(text):
        try:
            value = float(match.group(2))
        except ValueError:
            continue
        yield parse_prom_labels(match.group(1)), value


def load_ours_repo_sets(errors, metrics_url="http://127.0.0.1:9556/metrics"):
    """Fetch the ours-rp metrics and classify repositories by URI.

    Args:
        errors: List that receives a message when the fetch fails.
        metrics_url: Metrics endpoint to read.

    Returns:
        Dict with ``total``, ``available`` and ``failed`` URI sets plus
        ``info`` (URI -> labels of its ``ours_rp_repository_info`` sample).
        A repository is available when the sum of its terminal-state counts,
        excluding ``failed_no_cache``, is positive. Every other known
        repository, including one with no state samples, is failed.
    """
    try:
        import urllib.request

        # The context manager closes the response even if read() raises.
        with urllib.request.urlopen(metrics_url, timeout=10) as response:
            text = response.read().decode("utf-8", "replace")
    except Exception as exc:  # best-effort boundary: record it, keep exporting
        errors.append(f"ours-rp repo metrics: {exc}")
        return {"total": set(), "available": set(), "failed": set(), "info": {}}

    info = {}
    for labels, _value in parse_prometheus_samples(text, "ours_rp_repository_info"):
        uri = labels.get("uri")
        if uri:
            info[uri] = labels

    # Index once instead of rescanning `info` for every state sample. The
    # first URI seen for a repo_id wins. Entries without a repo_id are not
    # indexed, so they can never be matched by a sample lacking that label.
    uri_by_repo_id = {}
    for uri, labels in info.items():
        repo_id = labels.get("repo_id")
        if repo_id is not None:
            uri_by_repo_id.setdefault(repo_id, uri)

    states = {}
    state_metric = "ours_rp_repository_terminal_state_publication_points"
    for labels, value in parse_prometheus_samples(text, state_metric):
        uri = uri_by_repo_id.get(labels.get("repo_id"))
        if uri is None:
            continue
        states.setdefault(uri, {})[labels.get("terminal_state", "unknown")] = value

    total = set(info)
    available = {
        uri
        for uri in total
        if sum(
            count
            for state, count in states.get(uri, {}).items()
            if state != "failed_no_cache"
        ) > 0
    }
    failed = total - available
    return {"total": total, "available": available, "failed": failed, "info": info}


def load_routinator_repo_sets(errors, path=None):
    """Read the synced Routinator metrics file and classify repositories.

    Args:
        errors: List that receives a message when the file is missing or
            cannot be read.
        path: Metrics file to read. Defaults to
            ``PEER_ROOT / "routinator" / "routinator-metrics.prom"``.

    Returns:
        Dict with ``total``, ``success`` and ``failed`` URI sets plus
        ``duration`` (URI -> largest reported duration). A status value of
        0, 200 or 304 counts as success.
    """
    if path is None:
        path = PEER_ROOT / "routinator" / "routinator-metrics.prom"
    try:
        # Open directly instead of exists()-then-read, so there is no window
        # between the check and the read.
        with open(path, encoding="utf-8", errors="replace") as handle:
            text = handle.read()
    except FileNotFoundError:
        errors.append(f"routinator repo metrics: missing {path}")
        return {"total": set(), "success": set(), "failed": set(), "duration": {}}
    except OSError as exc:
        errors.append(f"routinator repo metrics: read {path}: {exc}")
        return {"total": set(), "success": set(), "failed": set(), "duration": {}}

    # Keep one verdict per URI (last sample wins) so `success` and `failed`
    # stay disjoint and always add up to `total`.
    status_ok = {}
    for metric in ("routinator_rrdp_status", "routinator_rsync_status"):
        for labels, value in parse_prometheus_samples(text, metric):
            uri = labels.get("uri")
            if uri:
                status_ok[uri] = value in (0.0, 200.0, 304.0)

    duration = {}
    for metric in ("routinator_rrdp_duration", "routinator_rsync_duration"):
        for labels, value in parse_prometheus_samples(text, metric):
            uri = labels.get("uri")
            if uri:
                duration[uri] = max(duration.get(uri, 0.0), value)

    total = set(status_ok)
    success = {uri for uri, ok in status_ok.items() if ok}
    failed = total - success
    return {"total": total, "success": success, "failed": failed, "duration": duration}


def emit_repo_diff_metrics(out, errors, limit=50):
    """Append repository-sync comparison metrics to *out*.

    Args:
        out: List of metric lines, extended in place.
        errors: List that receives loader error messages.
        limit: Maximum number of URIs listed per diff class, both as
            ``inter_rp_repo_sync_diff_info`` series and in the summary.

    Returns:
        Summary dict with the totals, the available counts, the (capped)
        ``onlyOurs`` / ``onlyRoutinator`` URI lists and the overlap counts.
    """
    ours = load_ours_repo_sets(errors)
    routinator = load_routinator_repo_sets(errors)
    ours_ok = ours["available"]
    rout_ok = routinator["success"]
    only_ours = sorted(ours_ok - rout_ok)
    only_routinator = sorted(rout_ok - ours_ok)
    both = ours_ok & rout_ok
    neither = (ours["total"] | routinator["total"]) - (ours_ok | rout_ok)

    per_rp = [
        ("ours-rp", "total", ours["total"]),
        ("ours-rp", "available", ours_ok),
        ("ours-rp", "failed", ours["failed"]),
        ("routinator", "total", routinator["total"]),
        ("routinator", "available", rout_ok),
        ("routinator", "failed", routinator["failed"]),
    ]
    for rp, state, uris in per_rp:
        labels = {"instance": INSTANCE, "rp": rp, "state": state}
        out.append(metric_line("inter_rp_repo_sync_total", labels, len(uris)))

    base = {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}
    overlap = [
        ("both_available", both),
        ("only_ours", only_ours),
        ("only_routinator", only_routinator),
        ("neither_available", neither),
    ]
    for overlap_class, uris in overlap:
        labels = {**base, "class": overlap_class}
        out.append(metric_line("inter_rp_repo_sync_overlap_total", labels, len(uris)))

    # One info series per differing URI, capped to bound label cardinality.
    for diff_class, uris in (("only_ours", only_ours), ("only_routinator", only_routinator)):
        for rank, uri in enumerate(uris[:limit], start=1):
            labels = {**base, "class": diff_class, "rank": rank, "uri": uri}
            if uri in routinator["duration"]:
                labels["routinator_duration"] = f"{routinator['duration'][uri]:.3f}"
            out.append(metric_line("inter_rp_repo_sync_diff_info", labels, 1))

    return {
        "oursTotal": len(ours["total"]),
        "oursAvailable": len(ours_ok),
        "routinatorTotal": len(routinator["total"]),
        "routinatorAvailable": len(rout_ok),
        "onlyOurs": only_ours[:limit],
        "onlyRoutinator": only_routinator[:limit],
        "bothAvailable": len(both),
        "neitherAvailable": len(neither),
    }
time.time() diff --git a/scripts/inter_rp/sync_routinator_from_remote231.sh b/scripts/inter_rp/sync_routinator_from_remote231.sh index d541515..2d4c884 100755 --- a/scripts/inter_rp/sync_routinator_from_remote231.sh +++ b/scripts/inter_rp/sync_routinator_from_remote231.sh @@ -47,7 +47,13 @@ sync_once() { rm -rf "$tmp" return 2 fi - rm -rf "$PEER_ROOT/routinator/latest.next" "$PEER_ROOT/routinator/latest.prev" + if ! ssh -o BatchMode=yes -o ConnectTimeout=8 "$REMOTE231" "curl -fsS --max-time 20 http://127.0.0.1:9558/metrics" >"$tmp/routinator-metrics.prom"; then + rm -rf "$tmp" + return 3 + fi + rm -rf "$PEER_ROOT/routinator/latest.next" "$PEER_ROOT/routinator/latest.prev" "$PEER_ROOT/routinator/routinator-metrics.prom.next" + mv "$tmp/routinator-metrics.prom" "$PEER_ROOT/routinator/routinator-metrics.prom.next" + mv "$PEER_ROOT/routinator/routinator-metrics.prom.next" "$PEER_ROOT/routinator/routinator-metrics.prom" mv "$tmp/latest" "$PEER_ROOT/routinator/latest.next" if [[ -e "$PEER_ROOT/routinator/latest" ]]; then mv "$PEER_ROOT/routinator/latest" "$PEER_ROOT/routinator/latest.prev"