20260622 接入Routinator仓库同步指标

This commit is contained in:
yuyr 2026-06-22 16:27:19 +08:00
parent 4546d90c33
commit 92d184681b
3 changed files with 299 additions and 3 deletions

View File

@ -494,6 +494,174 @@
"refId": "A"
}
]
},
{
"id": 12,
"title": "Repo Sync Availability by RP",
"type": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 36
},
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"decimals": 0
},
"overrides": []
},
"options": {
"legend": {
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"expr": "inter_rp_repo_sync_total{exported_instance=\"remote200-inter-rp\",state=~\"available|failed\",exported_rp=~\"ours-rp|routinator\"}",
"legendFormat": "{{exported_rp}} {{state}}",
"refId": "A"
}
]
},
{
"id": 13,
"title": "Repo Sync Overlap Classes",
"type": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 36
},
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"decimals": 0
},
"overrides": []
},
"options": {
"legend": {
"calcs": [
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"expr": "inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}",
"legendFormat": "{{class}}",
"refId": "A"
}
]
},
{
"id": 14,
"title": "Repo Sync Diff URIs",
"type": "table",
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 44
},
"fieldConfig": {
"defaults": {
"unit": "none",
"decimals": 0
},
"overrides": []
},
"options": {
"showHeader": true,
"sortBy": []
},
"targets": [
{
"expr": "inter_rp_repo_sync_diff_info{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}",
"format": "table",
"instant": true,
"legendFormat": "{{class}} #{{rank}}",
"refId": "A"
}
]
},
{
"id": 15,
"title": "Only-Ours Repo Count",
"type": "stat",
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 32
},
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto",
"wideLayout": true
},
"targets": [
{
"expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_ours\"})",
"legendFormat": "only ours",
"refId": "A",
"instant": true
}
]
}
],
"refresh": "10s",
@ -513,5 +681,5 @@
"timezone": "browser",
"title": "Ours RP vs Routinator",
"uid": "ours-rp-inter-rp",
"version": 2
"version": 3
}

View File

@ -3,6 +3,7 @@ import csv
import datetime as dt
import json
import os
import re
import socket
import sys
import threading
@ -138,6 +139,126 @@ def sample_routinator(now, errors):
sample["vaps"] = count_unique_csv(latest / "vaps.csv", 2)
return sample
def parse_prom_labels(label_text):
    """Parse the inside of a Prometheus ``{...}`` label set into a dict.

    Args:
        label_text: The text between the braces of one sample line, e.g.
            ``uri="rsync://host/repo",repo_id="7"``.

    Returns:
        A dict mapping each label name to its unescaped value. When a label
        name occurs more than once the last occurrence wins. Text that does
        not look like ``name="value"`` is ignored.
    """
    # The Prometheus text format defines exactly three escapes inside a label
    # value: \\ (backslash), \" (double quote) and \n (line feed).
    escapes = {"\\": "\\", '"': '"', "n": "\n"}

    def unescape(match):
        # Unknown escape sequences are kept verbatim rather than guessed at.
        return escapes.get(match.group(1), match.group(0))

    # Unescape in a single left-to-right pass: chained str.replace() calls
    # cannot tell an escaped backslash followed by "n" from a "\n" escape.
    return {
        key: re.sub(r"\\(.)", unescape, raw_value)
        for key, raw_value in re.findall(r'(\w+)="((?:[^"\\]|\\.)*)"', label_text)
    }
def parse_prometheus_samples(text, metric_name):
    """Yield ``(labels, value)`` for every labelled sample of one metric.

    Args:
        text: A Prometheus text-format exposition (the body of ``/metrics``).
        metric_name: Exact metric name to extract. Only samples that carry a
            ``{...}`` label set are matched.

    Yields:
        Tuples of ``(labels, value)`` where ``labels`` is the dict produced by
        ``parse_prom_labels`` and ``value`` is the sample value as a float.
        Lines whose value cannot be converted to a float are skipped.
    """
    pattern = re.compile(
        r"^" + re.escape(metric_name)
        # A quoted label value may itself contain "}", so quoted strings are
        # consumed as a unit instead of stopping at the first closing brace.
        + r'\{((?:[^"}]|"(?:[^"\\]|\\.)*")*)\}'
        # Only blanks may separate the fields; "\s" would also cross newlines.
        + r"[ \t]+([-+0-9.eE]+)"
        # The text format allows an optional integer timestamp after the value.
        + r"(?:[ \t]+-?[0-9]+)?"
        + r"[ \t\r]*$",
        re.MULTILINE,
    )
    for match in pattern.finditer(text):
        try:
            value = float(match.group(2))
        except ValueError:
            # The character class admits strings such as "e" or "--"; skip them.
            continue
        yield parse_prom_labels(match.group(1)), value
def load_ours_repo_sets(errors):
    """Fetch the local ours-rp metrics and classify its repositories.

    Reads ``http://127.0.0.1:9556/metrics`` and combines two metrics:
    ``ours_rp_repository_info`` (one sample per repository, carrying ``uri``
    and ``repo_id`` labels) and
    ``ours_rp_repository_terminal_state_publication_points`` (per ``repo_id``
    and ``terminal_state``).

    Args:
        errors: List that receives a human-readable message when the metrics
            endpoint cannot be read. It is mutated in place.

    Returns:
        A dict with keys ``total``, ``available`` and ``failed`` (sets of
        repository URIs) and ``info`` (URI -> label dict of the info sample).
        A repository is ``available`` when the sum of its state values,
        excluding ``failed_no_cache``, is positive; every other repository,
        including one with no state samples at all, is ``failed``. On a fetch
        error all four containers are empty.
    """
    try:
        import urllib.request

        # The context manager closes the HTTP response (and its socket) even
        # when read() or decode() raises.
        with urllib.request.urlopen("http://127.0.0.1:9556/metrics", timeout=10) as response:
            text = response.read().decode("utf-8", "replace")
    except Exception as exc:
        # Best effort: record the problem and report "no repositories".
        errors.append(f"ours-rp repo metrics: {exc}")
        return {"total": set(), "available": set(), "failed": set(), "info": {}}

    info = {}
    for labels, _value in parse_prometheus_samples(text, "ours_rp_repository_info"):
        uri = labels.get("uri")
        if uri:
            info[uri] = labels

    # Index repo_id -> uri once instead of rescanning ``info`` for every state
    # sample. setdefault keeps the first URI seen for a repo_id, which is the
    # one a linear scan over ``info`` would have found.
    uri_by_repo_id = {}
    for uri, labels in info.items():
        uri_by_repo_id.setdefault(labels.get("repo_id"), uri)

    states = {}
    for labels, value in parse_prometheus_samples(text, "ours_rp_repository_terminal_state_publication_points"):
        uri = uri_by_repo_id.get(labels.get("repo_id"))
        if not uri:
            # State sample for a repository that has no info sample.
            continue
        states.setdefault(uri, {})[labels.get("terminal_state", "unknown")] = value

    total = set(info)
    available = set()
    failed = set()
    for uri in total:
        repo_states = states.get(uri, {})
        non_failed = sum(value for state, value in repo_states.items() if state != "failed_no_cache")
        if non_failed > 0:
            available.add(uri)
        else:
            failed.add(uri)
    return {"total": total, "available": available, "failed": failed, "info": info}
def load_routinator_repo_sets(errors):
    """Classify Routinator repositories from the synced metrics snapshot.

    Reads ``routinator/routinator-metrics.prom`` under ``PEER_ROOT`` and
    inspects the ``routinator_rrdp_status`` / ``routinator_rsync_status``
    samples plus the matching ``*_duration`` samples, keyed by ``uri`` label.

    Args:
        errors: List that receives a message when the snapshot is missing or
            unreadable. It is mutated in place.

    Returns:
        A dict with keys ``total``, ``success`` and ``failed`` (sets of URIs)
        and ``duration`` (URI -> largest duration value seen). A status value
        of 0, 200 or 304 counts as success; anything else as failed. When the
        file is missing or unreadable every container is empty.
    """

    def empty_result():
        return {"total": set(), "success": set(), "failed": set(), "duration": {}}

    metrics_file = PEER_ROOT / "routinator" / "routinator-metrics.prom"
    if not metrics_file.exists():
        errors.append(f"routinator repo metrics: missing {metrics_file}")
        return empty_result()
    try:
        body = metrics_file.read_text(encoding="utf-8", errors="replace")
    except Exception as exc:
        errors.append(f"routinator repo metrics: read {metrics_file}: {exc}")
        return empty_result()

    ok_statuses = (0.0, 200.0, 304.0)
    seen = set()
    succeeded = set()
    broken = set()
    for status_metric in ("routinator_rrdp_status", "routinator_rsync_status"):
        for labels, status in parse_prometheus_samples(body, status_metric):
            uri = labels.get("uri")
            if uri:
                seen.add(uri)
                bucket = succeeded if status in ok_statuses else broken
                bucket.add(uri)

    longest = {}
    for duration_metric in ("routinator_rrdp_duration", "routinator_rsync_duration"):
        for labels, elapsed in parse_prometheus_samples(body, duration_metric):
            uri = labels.get("uri")
            if not uri:
                continue
            # Keep the largest value when a URI is reported more than once.
            longest[uri] = max(longest.get(uri, 0.0), elapsed)

    return {"total": seen, "success": succeeded, "failed": broken, "duration": longest}
def emit_repo_diff_metrics(out, errors):
    """Append repository-sync comparison metrics for ours-rp vs Routinator.

    Emits, in order: six ``inter_rp_repo_sync_total`` lines (total, available
    and failed counts per RP), four ``inter_rp_repo_sync_overlap_total`` lines
    (one per overlap class) and up to 50 ``inter_rp_repo_sync_diff_info`` lines
    for each of the ``only_ours`` and ``only_routinator`` classes.

    Args:
        out: List of metric lines; new lines are appended in place.
        errors: List forwarded to both loaders to collect load failures.

    Returns:
        A summary dict with the counts and the first 50 sorted URIs of each
        one-sided class.
    """
    top_n = 50
    ours = load_ours_repo_sets(errors)
    routinator = load_routinator_repo_sets(errors)
    ours_ok = ours["available"]
    rout_ok = routinator["success"]

    only_ours = sorted(ours_ok - rout_ok)
    only_routinator = sorted(rout_ok - ours_ok)
    both = ours_ok & rout_ok
    # Known to at least one RP but successfully synced by neither.
    neither = (ours["total"] | routinator["total"]) - (ours_ok | rout_ok)

    per_rp_counts = (
        ("ours-rp", "total", len(ours["total"])),
        ("ours-rp", "available", len(ours_ok)),
        ("ours-rp", "failed", len(ours["failed"])),
        ("routinator", "total", len(routinator["total"])),
        ("routinator", "available", len(rout_ok)),
        ("routinator", "failed", len(routinator["failed"])),
    )
    for rp_name, state, count in per_rp_counts:
        out.append(metric_line("inter_rp_repo_sync_total", {"instance": INSTANCE, "rp": rp_name, "state": state}, count))

    base = {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}
    overlap_counts = (
        ("both_available", len(both)),
        ("only_ours", len(only_ours)),
        ("only_routinator", len(only_routinator)),
        ("neither_available", len(neither)),
    )
    for overlap_class, count in overlap_counts:
        out.append(metric_line("inter_rp_repo_sync_overlap_total", {**base, "class": overlap_class}, count))

    durations = routinator["duration"]
    for diff_class, uris in (("only_ours", only_ours), ("only_routinator", only_routinator)):
        for rank, uri in enumerate(uris[:top_n], start=1):
            labels = {**base, "class": diff_class, "rank": rank, "uri": uri}
            # The duration label is attached only when Routinator reported one.
            if uri in durations:
                labels["routinator_duration"] = f"{durations[uri]:.3f}"
            out.append(metric_line("inter_rp_repo_sync_diff_info", labels, 1))

    return {
        "oursTotal": len(ours["total"]),
        "oursAvailable": len(ours_ok),
        "routinatorTotal": len(routinator["total"]),
        "routinatorAvailable": len(rout_ok),
        "onlyOurs": only_ours[:top_n],
        "onlyRoutinator": only_routinator[:top_n],
        "bothAvailable": len(both),
        "neitherAvailable": len(neither),
    }
def sync_metrics(now):
path = PEER_ROOT / "sync-status.json"
if not path.exists():
@ -201,7 +322,8 @@ def build_metrics():
out.append(metric_line("inter_rp_vrps_diff", {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}, abs(int(ours["vrps"]) - int(rout["vrps"]))))
if ours.get("vaps") is not None and rout.get("vaps") is not None:
out.append(metric_line("inter_rp_vaps_diff", {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}, abs(int(ours["vaps"]) - int(rout["vaps"]))))
return "".join(out), {"errors": errors, "samples": samples, "sync": sync}
repo_diff = emit_repo_diff_metrics(out, errors)
return "".join(out), {"errors": errors, "samples": samples, "sync": sync, "repoDiff": repo_diff}
def get_metrics():
now = time.time()

View File

@ -47,7 +47,13 @@ sync_once() {
rm -rf "$tmp"
return 2
fi
rm -rf "$PEER_ROOT/routinator/latest.next" "$PEER_ROOT/routinator/latest.prev"
if ! ssh -o BatchMode=yes -o ConnectTimeout=8 "$REMOTE231" "curl -fsS --max-time 20 http://127.0.0.1:9558/metrics" >"$tmp/routinator-metrics.prom"; then
rm -rf "$tmp"
return 3
fi
rm -rf "$PEER_ROOT/routinator/latest.next" "$PEER_ROOT/routinator/latest.prev" "$PEER_ROOT/routinator/routinator-metrics.prom.next"
mv "$tmp/routinator-metrics.prom" "$PEER_ROOT/routinator/routinator-metrics.prom.next"
mv "$PEER_ROOT/routinator/routinator-metrics.prom.next" "$PEER_ROOT/routinator/routinator-metrics.prom"
mv "$tmp/latest" "$PEER_ROOT/routinator/latest.next"
if [[ -e "$PEER_ROOT/routinator/latest" ]]; then
mv "$PEER_ROOT/routinator/latest" "$PEER_ROOT/routinator/latest.prev"