20260622 接入Routinator仓库同步指标
This commit is contained in:
parent
4546d90c33
commit
92d184681b
@ -494,6 +494,174 @@
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Repo Sync Availability by RP",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 36
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"min": 0,
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_repo_sync_total{exported_instance=\"remote200-inter-rp\",state=~\"available|failed\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"legendFormat": "{{exported_rp}} {{state}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"title": "Repo Sync Overlap Classes",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 36
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"min": 0,
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}",
|
||||
"legendFormat": "{{class}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"title": "Repo Sync Diff URIs",
|
||||
"type": "table",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 44
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": []
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_repo_sync_diff_info{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{class}} #{{rank}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"title": "Only-Ours Repo Count",
|
||||
"type": "stat",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 32
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_ours\"})",
|
||||
"legendFormat": "only ours",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "10s",
|
||||
@ -513,5 +681,5 @@
|
||||
"timezone": "browser",
|
||||
"title": "Ours RP vs Routinator",
|
||||
"uid": "ours-rp-inter-rp",
|
||||
"version": 2
|
||||
"version": 3
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ import csv
|
||||
import datetime as dt
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
@ -138,6 +139,126 @@ def sample_routinator(now, errors):
|
||||
sample["vaps"] = count_unique_csv(latest / "vaps.csv", 2)
|
||||
return sample
|
||||
|
||||
|
||||
# One `name="value"` pair; the value may contain backslash escape pairs.
_PROM_LABEL_PAIR_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')
# A single backslash escape pair inside an already-extracted label value.
_PROM_LABEL_ESCAPE_RE = re.compile(r'\\(.)')
# Escapes the Prometheus text exposition format defines for label values.
_PROM_LABEL_UNESCAPES = {'"': '"', '\\': '\\', 'n': '\n'}


def _unescape_prom_label_value(raw):
    """Decode the backslash escapes in one raw label value.

    The value is decoded in a single left-to-right pass so that an escaped
    backslash followed by ``n`` stays a backslash plus ``n`` instead of
    being mistaken for a newline escape. Unknown escapes are kept verbatim.
    """

    def decode(match):
        return _PROM_LABEL_UNESCAPES.get(match.group(1), match.group(0))

    return _PROM_LABEL_ESCAPE_RE.sub(decode, raw)


def parse_prom_labels(label_text):
    """Parse the inside of a Prometheus ``{...}`` label set into a dict.

    Args:
        label_text: Text found between the braces of a sample line, e.g.
            ``uri="rsync://host/module",state="ok"``.

    Returns:
        A dict mapping each label name to its unescaped value. When a label
        name occurs more than once the last occurrence wins. Text that does
        not look like ``name="value"`` is ignored.
    """
    return {
        key: _unescape_prom_label_value(raw_value)
        for key, raw_value in _PROM_LABEL_PAIR_RE.findall(label_text)
    }
|
||||
|
||||
def parse_prometheus_samples(text, metric_name):
    """Yield ``(labels, value)`` for every labelled sample of one metric.

    Only lines of the form ``metric_name{...} value [timestamp]`` are
    considered; samples without a label set are not matched.

    Args:
        text: Metrics text in the Prometheus exposition format.
        metric_name: Exact metric name to extract (matched literally).

    Yields:
        Tuples of ``(labels_dict, float_value)``. Lines whose value cannot be
        converted with ``float()`` are skipped.
    """
    pattern = re.compile(
        r'^' + re.escape(metric_name)
        # Label set: skip over quoted values as a unit so that a `}` inside a
        # value (e.g. in a URI) does not end the label set early. Newlines
        # are excluded so a malformed line cannot swallow the following ones.
        + r'\{((?:[^"}\n]|"(?:[^"\\\n]|\\.)*")*)\}'
        + r'[ \t]+([-+0-9.eE]+)'
        # The exposition format allows an optional integer timestamp after
        # the value; accept it rather than silently dropping the sample.
        + r'(?:[ \t]+-?[0-9]+)?'
        + r'\s*$',
        re.MULTILINE,
    )
    for match in pattern.finditer(text):
        try:
            value = float(match.group(2))
        except ValueError:
            # The character class admits strings such as "1.2.3"; skip them.
            continue
        yield parse_prom_labels(match.group(1)), value
|
||||
|
||||
def load_ours_repo_sets(errors):
    """Fetch the ours-rp metrics endpoint and classify its repositories.

    Args:
        errors: List that receives a human-readable message when the metrics
            endpoint cannot be fetched or decoded.

    Returns:
        A dict with keys ``"total"``, ``"available"`` and ``"failed"`` (sets
        of repository URIs) and ``"info"`` (URI -> label dict of the
        ``ours_rp_repository_info`` sample). All four are empty when the
        fetch fails. A repository counts as available when the sum of its
        ``ours_rp_repository_terminal_state_publication_points`` values,
        excluding the ``failed_no_cache`` state, is greater than zero.
    """
    try:
        import urllib.request

        # The context manager closes the HTTP response (and its socket) even
        # when reading or decoding raises.
        with urllib.request.urlopen("http://127.0.0.1:9556/metrics", timeout=10) as response:
            text = response.read().decode("utf-8", "replace")
    except Exception as exc:
        errors.append(f"ours-rp repo metrics: {exc}")
        return {"total": set(), "available": set(), "failed": set(), "info": {}}

    info = {}
    for labels, _value in parse_prometheus_samples(text, "ours_rp_repository_info"):
        uri = labels.get("uri")
        if uri:
            info[uri] = labels

    # Index repo_id -> URI once instead of rescanning `info` for every state
    # sample. setdefault keeps the first URI (in insertion order) for each
    # repo_id, matching a first-match linear search.
    uri_by_repo_id = {}
    for uri, labels in info.items():
        uri_by_repo_id.setdefault(labels.get("repo_id"), uri)

    states = {}
    for labels, value in parse_prometheus_samples(text, "ours_rp_repository_terminal_state_publication_points"):
        uri = uri_by_repo_id.get(labels.get("repo_id"))
        if not uri:
            # State sample for a repository that has no info sample.
            continue
        states.setdefault(uri, {})[labels.get("terminal_state", "unknown")] = value

    total = set(info)
    available = set()
    failed = set()
    for uri in total:
        repo_states = states.get(uri, {})
        non_failed = sum(value for state, value in repo_states.items() if state != "failed_no_cache")
        if non_failed > 0:
            available.add(uri)
        else:
            # Also covers repositories with no state samples at all.
            failed.add(uri)
    return {"total": total, "available": available, "failed": failed, "info": info}
|
||||
|
||||
def load_routinator_repo_sets(errors):
    """Read the saved Routinator metrics file and classify its repositories.

    Args:
        errors: List that receives a message when the metrics file is
            missing or cannot be read.

    Returns:
        A dict with keys ``"total"``, ``"success"`` and ``"failed"`` (sets of
        repository URIs) and ``"duration"`` (URI -> largest duration value
        seen for that URI). Everything is empty when the file is unavailable.
    """
    prom_file = PEER_ROOT / "routinator" / "routinator-metrics.prom"

    def empty_result():
        # Fresh containers on every call so callers never share state.
        return {"total": set(), "success": set(), "failed": set(), "duration": {}}

    if not prom_file.exists():
        errors.append(f"routinator repo metrics: missing {prom_file}")
        return empty_result()

    try:
        text = prom_file.read_text(encoding="utf-8", errors="replace")
    except Exception as exc:
        errors.append(f"routinator repo metrics: read {prom_file}: {exc}")
        return empty_result()

    result = empty_result()
    seen = result["total"]
    succeeded = result["success"]
    unsuccessful = result["failed"]
    longest = result["duration"]

    # NOTE(review): 0 / 200 / 304 are presumably the rsync exit status and
    # HTTP status codes that mean success — confirm against Routinator docs.
    ok_values = (0.0, 200.0, 304.0)
    for status_metric in ("routinator_rrdp_status", "routinator_rsync_status"):
        for labels, status in parse_prometheus_samples(text, status_metric):
            uri = labels.get("uri")
            if not uri:
                continue
            seen.add(uri)
            bucket = succeeded if status in ok_values else unsuccessful
            bucket.add(uri)

    for duration_metric in ("routinator_rrdp_duration", "routinator_rsync_duration"):
        for labels, seconds in parse_prometheus_samples(text, duration_metric):
            uri = labels.get("uri")
            if not uri:
                continue
            # Keep the largest value when a URI is reported more than once.
            longest[uri] = max(longest.get(uri, 0.0), seconds)

    return result
|
||||
|
||||
def emit_repo_diff_metrics(out, errors):
    """Append repository-sync comparison metrics and return a summary.

    Loads the repository sets of ours-rp and Routinator, then appends to
    ``out`` (via ``metric_line``): per-RP total/available/failed counts,
    the four overlap class counts, and one info sample per URI for the first
    50 URIs that are available on only one side.

    Args:
        out: List of metric lines to append to.
        errors: List passed through to the loaders for error messages.

    Returns:
        A dict summarising the comparison: counts per side, the (at most 50)
        URIs available only on each side, and the both/neither counts.
    """
    top_n = 50

    ours = load_ours_repo_sets(errors)
    routinator = load_routinator_repo_sets(errors)
    ours_available = ours["available"]
    routinator_available = routinator["success"]

    ours_only = sorted(ours_available - routinator_available)
    routinator_only = sorted(routinator_available - ours_available)
    shared = ours_available & routinator_available
    known_anywhere = ours["total"] | routinator["total"]
    unavailable_everywhere = known_anywhere - (ours_available | routinator_available)

    pair_labels = {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}

    # Per-RP counts, emitted in a fixed order.
    per_rp_counts = (
        ("ours-rp", "total", len(ours["total"])),
        ("ours-rp", "available", len(ours_available)),
        ("ours-rp", "failed", len(ours["failed"])),
        ("routinator", "total", len(routinator["total"])),
        ("routinator", "available", len(routinator_available)),
        ("routinator", "failed", len(routinator["failed"])),
    )
    for rp_name, state, count in per_rp_counts:
        sample_labels = {"instance": INSTANCE, "rp": rp_name, "state": state}
        out.append(metric_line("inter_rp_repo_sync_total", sample_labels, count))

    # Overlap classes between the two available sets.
    overlap_counts = (
        ("both_available", len(shared)),
        ("only_ours", len(ours_only)),
        ("only_routinator", len(routinator_only)),
        ("neither_available", len(unavailable_everywhere)),
    )
    for overlap_class, count in overlap_counts:
        out.append(metric_line("inter_rp_repo_sync_overlap_total", {**pair_labels, "class": overlap_class}, count))

    # One info sample per differing URI, capped to bound label cardinality.
    durations = routinator["duration"]
    for diff_class, uris in (("only_ours", ours_only), ("only_routinator", routinator_only)):
        for rank, uri in enumerate(uris[:top_n], start=1):
            labels = {**pair_labels, "class": diff_class, "rank": rank, "uri": uri}
            if uri in durations:
                labels["routinator_duration"] = f"{durations[uri]:.3f}"
            out.append(metric_line("inter_rp_repo_sync_diff_info", labels, 1))

    summary = {
        "oursTotal": len(ours["total"]),
        "oursAvailable": len(ours_available),
        "routinatorTotal": len(routinator["total"]),
        "routinatorAvailable": len(routinator_available),
        "onlyOurs": ours_only[:top_n],
        "onlyRoutinator": routinator_only[:top_n],
        "bothAvailable": len(shared),
        "neitherAvailable": len(unavailable_everywhere),
    }
    return summary
|
||||
|
||||
def sync_metrics(now):
|
||||
path = PEER_ROOT / "sync-status.json"
|
||||
if not path.exists():
|
||||
@ -201,7 +322,8 @@ def build_metrics():
|
||||
out.append(metric_line("inter_rp_vrps_diff", {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}, abs(int(ours["vrps"]) - int(rout["vrps"]))))
|
||||
if ours.get("vaps") is not None and rout.get("vaps") is not None:
|
||||
out.append(metric_line("inter_rp_vaps_diff", {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}, abs(int(ours["vaps"]) - int(rout["vaps"]))))
|
||||
return "".join(out), {"errors": errors, "samples": samples, "sync": sync}
|
||||
repo_diff = emit_repo_diff_metrics(out, errors)
|
||||
return "".join(out), {"errors": errors, "samples": samples, "sync": sync, "repoDiff": repo_diff}
|
||||
|
||||
def get_metrics():
|
||||
now = time.time()
|
||||
|
||||
@ -47,7 +47,13 @@ sync_once() {
|
||||
rm -rf "$tmp"
|
||||
return 2
|
||||
fi
|
||||
rm -rf "$PEER_ROOT/routinator/latest.next" "$PEER_ROOT/routinator/latest.prev"
|
||||
if ! ssh -o BatchMode=yes -o ConnectTimeout=8 "$REMOTE231" "curl -fsS --max-time 20 http://127.0.0.1:9558/metrics" >"$tmp/routinator-metrics.prom"; then
|
||||
rm -rf "$tmp"
|
||||
return 3
|
||||
fi
|
||||
rm -rf "$PEER_ROOT/routinator/latest.next" "$PEER_ROOT/routinator/latest.prev" "$PEER_ROOT/routinator/routinator-metrics.prom.next"
|
||||
mv "$tmp/routinator-metrics.prom" "$PEER_ROOT/routinator/routinator-metrics.prom.next"
|
||||
mv "$PEER_ROOT/routinator/routinator-metrics.prom.next" "$PEER_ROOT/routinator/routinator-metrics.prom"
|
||||
mv "$tmp/latest" "$PEER_ROOT/routinator/latest.next"
|
||||
if [[ -e "$PEER_ROOT/routinator/latest" ]]; then
|
||||
mv "$PEER_ROOT/routinator/latest" "$PEER_ROOT/routinator/latest.prev"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user