20260622 恢复Routinator inter-RP监控
This commit is contained in:
parent
61d3e636ae
commit
4546d90c33
@ -7,12 +7,22 @@
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Metrics Reload OK",
|
||||
"type": "stat",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
@ -20,13 +30,6 @@
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
@ -42,22 +45,29 @@
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.1",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_service_last_reload_success",
|
||||
"expr": "max(inter_rp_service_last_reload_success{exported_instance=\"remote200-inter-rp\"})",
|
||||
"legendFormat": "reload",
|
||||
"refId": "A"
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Metrics Reload OK",
|
||||
"type": "stat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "231 Sync Age",
|
||||
"type": "stat",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
@ -65,13 +75,6 @@
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
@ -87,22 +90,29 @@
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.1",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_sync_age_seconds",
|
||||
"expr": "max(inter_rp_sync_age_seconds{exported_instance=\"remote200-inter-rp\"})",
|
||||
"legendFormat": "sync age",
|
||||
"refId": "A"
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Remote200 Sync Age",
|
||||
"type": "stat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Parse Errors",
|
||||
"type": "stat",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
@ -110,13 +120,6 @@
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
@ -132,36 +135,36 @@
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.1",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_parse_errors",
|
||||
"expr": "max(inter_rp_parse_errors{exported_instance=\"remote200-inter-rp\"})",
|
||||
"legendFormat": "errors",
|
||||
"refId": "A"
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Parse Errors",
|
||||
"type": "stat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Ours vs Routinator VRP Diff",
|
||||
"type": "stat",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"id": 4,
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
@ -177,22 +180,29 @@
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.1",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_ccr_digest_match{state=\"overall\"}",
|
||||
"legendFormat": "overall",
|
||||
"refId": "A"
|
||||
"expr": "max(inter_rp_vrps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"})",
|
||||
"legendFormat": "vrp diff",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Ours vs rpki-client CCR Match",
|
||||
"type": "stat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Wall Time by RP",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
@ -200,13 +210,6 @@
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
@ -223,19 +226,26 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_run_wall_seconds",
|
||||
"legendFormat": "{{rp}}",
|
||||
"expr": "inter_rp_run_wall_seconds{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"legendFormat": "{{exported_rp}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Wall Time by RP",
|
||||
"type": "timeseries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Max RSS Aggregate Peak by RP",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 4
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
@ -243,13 +253,6 @@
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 4
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
@ -266,34 +269,33 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_run_max_rss_bytes{kind=\"aggregate_peak\"}",
|
||||
"legendFormat": "{{rp}}",
|
||||
"expr": "inter_rp_run_max_rss_bytes{exported_instance=\"remote200-inter-rp\",kind=\"aggregate_peak\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"legendFormat": "{{exported_rp}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Max RSS Aggregate Peak by RP",
|
||||
"type": "timeseries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"decimals": 0,
|
||||
"min": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 7,
|
||||
"title": "VRPs by RP (unique ASN/Prefix/MaxLen)",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"min": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
@ -310,34 +312,33 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_vrps",
|
||||
"legendFormat": "{{rp}}",
|
||||
"expr": "inter_rp_vrps{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"legendFormat": "{{exported_rp}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "VRPs by RP (unique ASN/Prefix/MaxLen)",
|
||||
"type": "timeseries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "VAPs / ASPAs by RP (unique Customer/Providers)",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"decimals": 0,
|
||||
"min": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 12
|
||||
},
|
||||
"id": 8,
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"min": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
@ -354,54 +355,75 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_vaps",
|
||||
"legendFormat": "{{rp}}",
|
||||
"expr": "inter_rp_vaps{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"legendFormat": "{{exported_rp}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "VAPs / ASPAs by RP (unique Customer/Providers)",
|
||||
"type": "timeseries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Latest RP Runs",
|
||||
"type": "table",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"id": 9,
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": []
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_ccr_digest_match{left=\"ours-rp\",right=\"rpki-client\"}",
|
||||
"expr": "inter_rp_run_seq{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{state}}",
|
||||
"legendFormat": "{{exported_rp}} seq",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "inter_rp_run_success{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{exported_rp}} success",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "inter_rp_run_wall_seconds{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{exported_rp}} wall",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "CCR Digest Match States",
|
||||
"type": "table"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Output Count Diffs (unique)",
|
||||
"type": "table",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 20
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
@ -409,41 +431,41 @@
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 20
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": []
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_vrps_diff",
|
||||
"expr": "inter_rp_vrps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "vrps {{left}}-{{right}}",
|
||||
"legendFormat": "vrps ours-rp-routinator",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "inter_rp_vaps_diff",
|
||||
"expr": "inter_rp_vaps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "vaps {{left}}-{{right}}",
|
||||
"legendFormat": "vaps ours-rp-routinator",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Output Count Diffs (VRP/VAP unique)",
|
||||
"type": "table"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Artifact Age by RP",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 28
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
@ -451,13 +473,6 @@
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 28
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
@ -474,21 +489,19 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "inter_rp_artifact_age_seconds",
|
||||
"legendFormat": "{{rp}}",
|
||||
"expr": "inter_rp_artifact_age_seconds{exported_instance=\"remote200-inter-rp\",exported_rp=~\"ours-rp|routinator\"}",
|
||||
"legendFormat": "{{exported_rp}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Artifact Age by RP",
|
||||
"type": "timeseries"
|
||||
]
|
||||
}
|
||||
],
|
||||
"preload": false,
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 40,
|
||||
"tags": [
|
||||
"rpki",
|
||||
"inter-rp"
|
||||
"inter-rp",
|
||||
"routinator"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
@ -497,9 +510,8 @@
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Ours RP Inter-RP",
|
||||
"title": "Ours RP vs Routinator",
|
||||
"uid": "ours-rp-inter-rp",
|
||||
"version": 1
|
||||
"version": 2
|
||||
}
|
||||
|
||||
250
scripts/inter_rp/inter_rp_ours_routinator_exporter.py
Executable file
250
scripts/inter_rp/inter_rp_ours_routinator_exporter.py
Executable file
@ -0,0 +1,250 @@
|
||||
#!/usr/bin/env python3
|
||||
import csv
|
||||
import datetime as dt
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
from pathlib import Path
|
||||
|
||||
# Filesystem roots, exporter identity, listen address and cache lifetime.
# Each value can be overridden through the environment variable named in
# the lookup; the second argument is the built-in default.
RUN_ROOT = Path(os.getenv("OURS_RUN_ROOT", "/root/ours-rp-continuous/portable-soak"))
PEER_ROOT = Path(os.getenv("PEER_ROOT", "/root/ours-rp-continuous/portable-soak/inter-rp-peers"))
INSTANCE = os.getenv("INTER_RP_INSTANCE", "remote200-inter-rp")
LISTEN = os.getenv("INTER_RP_LISTEN", "0.0.0.0:9557")
SCAN_TTL = float(os.getenv("INTER_RP_SCAN_TTL_SECONDS", "10"))

# _cache holds the most recently rendered metrics text, the matching status
# dict and the time at which both expire; _cache_lock guards access to it.
_cache_lock = threading.Lock()
_cache = dict(deadline=0.0, metrics="", status={})
# Memoised unique-row counts, filled in by count_unique_csv.
_count_cache = dict()
|
||||
|
||||
def unix_now():
    """Return the current wall-clock time in seconds since the Unix epoch."""
    current = time.time()
    return current
|
||||
|
||||
def parse_rfc3339(value):
    """Convert an RFC 3339 / ISO 8601 timestamp into Unix epoch seconds.

    Args:
        value: The timestamp text (any object is passed through ``str``).
            A trailing ``Z`` or ``z`` is accepted as the UTC designator.

    Returns:
        The timestamp as a float number of seconds since the epoch, or
        ``None`` when ``value`` is empty/falsy or cannot be parsed.

    A timestamp that carries no UTC offset is interpreted as UTC rather
    than as the exporter host's local time, because the fields this is
    used for are all named ``...Rfc3339Utc``.
    """
    if not value:
        return None
    text = str(value).strip()
    # Only a *trailing* designator means UTC; rewrite it into the numeric
    # offset form that every supported fromisoformat() understands.
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    try:
        parsed = dt.datetime.fromisoformat(text)
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=dt.timezone.utc)
        return parsed.timestamp()
    except (ValueError, OverflowError, OSError):
        # Malformed text, or a date outside the platform's timestamp range.
        return None
|
||||
|
||||
def read_json(path):
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON value."""
    raw = Path(path).read_text(encoding="utf-8")
    return json.loads(raw)
|
||||
|
||||
def latest_ours_run():
    """Return the newest ``run_*`` directory under ``RUN_ROOT / "runs"``.

    Only directories that contain a ``run-summary.json`` file are
    considered.

    Returns:
        The ``Path`` of the highest-ordered candidate, or ``None`` when
        there is none.

    Candidates are ordered naturally: runs of digits in the directory
    name compare as integers, so ``run_10`` ranks after ``run_9``. A plain
    string sort would rank ``run_9`` last. Zero-padded names order exactly
    as they did under a string sort.
    """
    import re

    def natural_key(run_dir):
        # re.split with a capture group alternates text / digits, so odd
        # positions are always digit runs and compare as integers.
        pieces = re.split(r"(\d+)", run_dir.name)
        ordered = [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]
        return ordered, run_dir.name

    runs = RUN_ROOT / "runs"
    candidates = [p for p in runs.glob("run_*") if (p / "run-summary.json").exists()]
    return max(candidates, key=natural_key) if candidates else None
|
||||
|
||||
def count_unique_csv(path, cols):
    """Count the distinct data rows of a CSV file, comparing the first ``cols`` cells.

    Args:
        path: ``Path`` of the CSV file.
        cols: Number of leading columns that form a row's identity.

    Returns:
        The number of distinct rows, or ``None`` when the file does not
        exist.

    The first row is treated as a header and skipped. Empty rows and rows
    with fewer than ``cols`` cells are ignored, and cells are stripped of
    surrounding whitespace before comparison.

    Results are memoised in ``_count_cache`` keyed by path, modification
    time, size and ``cols``. When a file changes, only the superseded entry
    for that same file is dropped, so the other files counted in the same
    scan keep their cached value. The cache is bounded so entries for
    paths that are no longer queried cannot accumulate without limit.
    """
    max_entries = 32
    try:
        stat = path.stat()
    except (FileNotFoundError, NotADirectoryError):
        return None
    key = (str(path), stat.st_mtime_ns, stat.st_size, cols)
    if key in _count_cache:
        return _count_cache[key]
    seen = set()
    with open(path, "r", encoding="utf-8", newline="") as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row or len(row) < cols:
                continue
            seen.add(tuple(cell.strip() for cell in row[:cols]))
    value = len(seen)
    # Drop entries for older versions of this same file/column selection.
    for stale in [k for k in _count_cache if k[0] == key[0] and k[3] == cols]:
        del _count_cache[stale]
    # Evict the oldest entries (dicts keep insertion order) when full.
    while len(_count_cache) >= max_entries:
        del _count_cache[next(iter(_count_cache))]
    _count_cache[key] = value
    return value
|
||||
|
||||
def metric_line(name, labels, value):
    """Render one sample line in the Prometheus text exposition format.

    Args:
        name: Metric name.
        labels: Mapping of label name to label value; values are passed
            through ``str``.
        value: Sample value, interpolated as-is.

    Returns:
        ``name{k="v",...} value`` followed by a newline.

    Backslash, double quote and line feed in label values are escaped as
    the text format requires. An unescaped line feed would otherwise split
    the sample across two lines and corrupt the scrape.
    """
    def escape(raw):
        # Backslash must be escaped first so the escapes added afterwards
        # are not themselves doubled.
        return str(raw).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

    label_text = ",".join(f'{key}="{escape(val)}"' for key, val in labels.items())
    return f"{name}{{{label_text}}} {value}\n"
|
||||
|
||||
def bool_num(value):
    """Map a value's truthiness to the integer 1 (truthy) or 0 (falsy)."""
    return int(bool(value))
|
||||
|
||||
def sample_ours(now, errors):
    """Collect the sample for the newest "ours-rp" run.

    Args:
        now: Current Unix time in seconds, used to compute the artifact age.
        errors: List that receives one human-readable message per problem.

    Returns:
        A dict that always holds ``rp``, ``present``, ``success``, ``max``
        and ``errors``. When the run summary is readable it also holds
        ``run_id``, ``run_seq``, ``vrps`` and ``vaps``, plus ``wall``,
        ``finished`` and ``age`` when the summary provides them.

    Malformed summary values (non-numeric ``exitCode``/``wallMs``/
    ``maxRssKb``, a null ``processMetrics``, a directory name without a
    numeric suffix) and unreadable CSV files are recorded in ``errors``
    and the affected field is left unset. They no longer raise out of the
    scrape.
    """
    sample = {"rp": "ours-rp", "present": False, "success": False, "max": {}, "errors": 0}

    def fail(message):
        sample["errors"] += 1
        errors.append(f"ours-rp: {message}")

    def to_number(field, raw, cast):
        # Convert a summary value, recording (not raising) a bad one.
        try:
            return cast(raw)
        except (TypeError, ValueError):
            fail(f"invalid {field}: {raw!r}")
            return None

    run_dir = latest_ours_run()
    if run_dir is None:
        fail("no run-summary.json")
        return sample
    sample["present"] = True
    summary_path = run_dir / "run-summary.json"
    try:
        summary = read_json(summary_path)
    except Exception as exc:
        fail(f"read {summary_path}: {exc}")
        return sample
    if not isinstance(summary, dict):
        fail(f"read {summary_path}: top-level JSON value is not an object")
        return sample

    sample["run_id"] = summary.get("runId") or run_dir.name
    run_seq = summary.get("runSeq")
    if not run_seq:
        # Fall back to the numeric suffix of the run directory name.
        try:
            run_seq = int(run_dir.name.split("_")[-1])
        except ValueError:
            fail(f"cannot derive run sequence from directory name {run_dir.name!r}")
            run_seq = None
    sample["run_seq"] = run_seq

    if summary.get("status") == "success":
        exit_code = to_number("exitCode", summary.get("exitCode", 0), int)
        sample["success"] = exit_code == 0
    if summary.get("wallMs") is not None:
        wall_ms = to_number("wallMs", summary["wallMs"], float)
        if wall_ms is not None:
            sample["wall"] = wall_ms / 1000.0
    finished = parse_rfc3339(summary.get("finishedAtRfc3339Utc"))
    if finished is not None:
        sample["finished"] = finished
        sample["age"] = max(0.0, now - finished)

    process_metrics = summary.get("processMetrics")
    rss = process_metrics.get("maxRssKb") if isinstance(process_metrics, dict) else None
    if rss is not None:
        rss_kb = to_number("processMetrics.maxRssKb", rss, int)
        if rss_kb is not None:
            # The summary only has one RSS figure; it is reported under
            # both kinds.
            sample["max"]["parent"] = rss_kb * 1024
            sample["max"]["aggregate_peak"] = rss_kb * 1024

    for field, filename, cols in (("vrps", "vrps.csv", 3), ("vaps", "vaps.csv", 2)):
        try:
            sample[field] = count_unique_csv(run_dir / filename, cols)
        except (OSError, UnicodeDecodeError, csv.Error) as exc:
            sample[field] = None
            fail(f"read {run_dir / filename}: {exc}")
    return sample
|
||||
|
||||
def sample_routinator(now, errors):
    """Collect the sample for the synced Routinator run under ``PEER_ROOT``.

    Args:
        now: Current Unix time in seconds, used to compute the artifact age.
        errors: List that receives one human-readable message per problem.

    Returns:
        A dict that always holds ``rp``, ``present``, ``success``, ``max``
        and ``errors``. When ``run-meta.json`` is readable it also holds
        ``run_id``, ``run_seq``, ``vrps`` and ``vaps``, plus ``wall``,
        ``finished`` and ``age`` when the metadata provides them.

    Malformed metadata (a null or non-object ``maxRssKb``, non-numeric
    ``wallMs`` or RSS values) and unreadable CSV files are recorded in
    ``errors`` and the affected field is left unset. They no longer raise
    out of the scrape.
    """
    rp = "routinator"
    latest = PEER_ROOT / rp / "latest"
    sample = {"rp": rp, "present": False, "success": False, "max": {}, "errors": 0}

    def fail(message):
        sample["errors"] += 1
        errors.append(f"routinator: {message}")

    def to_number(field, raw, cast):
        # Convert a metadata value, recording (not raising) a bad one.
        try:
            return cast(raw)
        except (TypeError, ValueError):
            fail(f"invalid {field}: {raw!r}")
            return None

    if not latest.exists():
        fail(f"missing latest directory: {latest}")
        return sample
    sample["present"] = True
    meta_path = latest / "run-meta.json"
    try:
        meta = read_json(meta_path)
    except Exception as exc:
        fail(f"read {meta_path}: {exc}")
        return sample
    if not isinstance(meta, dict):
        fail(f"read {meta_path}: top-level JSON value is not an object")
        return sample

    sample["run_id"] = meta.get("runId")
    sample["run_seq"] = meta.get("runSeq")
    sample["success"] = bool(meta.get("success"))
    if meta.get("wallMs") is not None:
        wall_ms = to_number("wallMs", meta["wallMs"], float)
        if wall_ms is not None:
            sample["wall"] = wall_ms / 1000.0
    finished = parse_rfc3339(meta.get("finishedAtRfc3339Utc"))
    if finished is not None:
        sample["finished"] = finished
        sample["age"] = max(0.0, now - finished)

    max_rss = meta.get("maxRssKb")
    if not isinstance(max_rss, dict):
        # Covers both a missing key and an explicit JSON null.
        max_rss = {}
    for label, key in [("parent", "parent"), ("child_max", "childMax"), ("aggregate_peak", "aggregatePeak")]:
        if max_rss.get(key) is not None:
            size_kb = to_number(f"maxRssKb.{key}", max_rss[key], int)
            if size_kb is not None:
                sample["max"][label] = size_kb * 1024

    for field, filename, cols in (("vrps", "vrps.csv", 3), ("vaps", "vaps.csv", 2)):
        try:
            sample[field] = count_unique_csv(latest / filename, cols)
        except (OSError, UnicodeDecodeError, csv.Error) as exc:
            sample[field] = None
            fail(f"read {latest / filename}: {exc}")
    return sample
|
||||
|
||||
def sync_metrics(now):
    """Summarise ``PEER_ROOT / "sync-status.json"`` as a plain dict.

    Args:
        now: Current Unix time in seconds, used to compute the sync age.

    Returns:
        A dict with ``present``, ``success`` and ``message``. When the file
        was read successfully it also holds ``timestamp``, ``age`` (``None``
        if the timestamp could not be parsed) and ``remote``.
    """
    status_file = PEER_ROOT / "sync-status.json"
    if not status_file.exists():
        return dict(present=False, success=False, message=f"missing {status_file}")
    try:
        doc = read_json(status_file)
    except Exception as exc:
        return dict(present=True, success=False, message=str(exc))

    synced_at = parse_rfc3339(doc.get("lastSyncAtRfc3339Utc"))
    if synced_at is None:
        age = None
    else:
        age = max(0.0, now - synced_at)
    return dict(
        present=True,
        success=bool(doc.get("success")),
        timestamp=synced_at,
        age=age,
        remote=doc.get("remoteHost", ""),
        message=doc.get("message", ""),
    )
|
||||
|
||||
def build_metrics():
    """Scan both RPs and the sync status and render the metrics text.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the Prometheus exposition
        text. ``status`` is a dict with ``errors`` (list of messages),
        ``samples`` (the per-RP sample dicts) and ``sync`` (the sync-status
        summary).

    Sample lines are collected per metric name and written out one metric
    at a time, so every series of a metric forms a single contiguous
    group, as the text exposition format requires.
    """
    now = unix_now()
    errors = []
    start = time.time()
    samples = [sample_ours(now, errors), sample_routinator(now, errors)]
    sync = sync_metrics(now)
    if not sync.get("success"):
        errors.append("sync: " + str(sync.get("message", "failed")))

    # Metric name -> rendered lines; dict insertion order fixes the order in
    # which metrics appear in the output.
    families = {}

    def emit(name, labels, value):
        families.setdefault(name, []).append(metric_line(name, labels, value))

    base = {"instance": INSTANCE}
    emit("inter_rp_service_up", base, 1)
    emit("inter_rp_service_last_scan_timestamp_seconds", base, now)
    emit("inter_rp_service_last_scan_duration_seconds", base, time.time() - start)
    emit("inter_rp_service_last_reload_success", base, bool_num(len(errors) == 0))
    emit("inter_rp_parse_errors", base, len(errors))
    emit("inter_rp_sync_present", base, bool_num(sync.get("present")))
    emit("inter_rp_sync_last_success", base, bool_num(sync.get("success")))
    if sync.get("age") is not None:
        emit("inter_rp_sync_age_seconds", base, sync["age"])
    if sync.get("timestamp") is not None:
        emit("inter_rp_sync_last_timestamp_seconds", base, sync["timestamp"])

    # Optional per-RP fields and the metric each one is exported as.
    optional_fields = [
        ("run_seq", "inter_rp_run_seq"),
        ("wall", "inter_rp_run_wall_seconds"),
        ("age", "inter_rp_artifact_age_seconds"),
        ("vrps", "inter_rp_vrps"),
        ("vaps", "inter_rp_vaps"),
    ]
    by_rp = {s["rp"]: s for s in samples}
    for s in samples:
        labels = {"instance": INSTANCE, "rp": s["rp"]}
        emit("inter_rp_run_present", labels, bool_num(s.get("present")))
        emit("inter_rp_run_success", labels, bool_num(s.get("success")))
        emit("inter_rp_sample_parse_errors", labels, s.get("errors", 0))
        for field, metric in optional_fields:
            if s.get(field) is not None:
                emit(metric, labels, s[field])
        for kind, value in s.get("max", {}).items():
            labels2 = dict(labels)
            labels2["kind"] = kind
            emit("inter_rp_run_max_rss_bytes", labels2, value)

    ours, rout = by_rp.get("ours-rp", {}), by_rp.get("routinator", {})
    pair = {"instance": INSTANCE, "left": "ours-rp", "right": "routinator"}
    if ours.get("vrps") is not None and rout.get("vrps") is not None:
        emit("inter_rp_vrps_diff", pair, abs(int(ours["vrps"]) - int(rout["vrps"])))
    if ours.get("vaps") is not None and rout.get("vaps") is not None:
        emit("inter_rp_vaps_diff", pair, abs(int(ours["vaps"]) - int(rout["vaps"])))

    text = "".join(line for lines in families.values() for line in lines)
    return text, {"errors": errors, "samples": samples, "sync": sync}
|
||||
|
||||
def get_metrics():
    """Return the metrics text, rebuilding it when the cached copy has expired.

    The cached text is reused while it is non-empty and its deadline lies
    after the time of this call. Otherwise ``build_metrics`` runs and the
    cache (text, status and a deadline ``SCAN_TTL`` seconds ahead) is
    replaced. The whole check-and-rebuild happens under ``_cache_lock``.
    """
    requested_at = time.time()
    with _cache_lock:
        still_fresh = bool(_cache["metrics"]) and _cache["deadline"] > requested_at
        if not still_fresh:
            rendered, status = build_metrics()
            _cache["metrics"] = rendered
            _cache["status"] = status
            _cache["deadline"] = requested_at + SCAN_TTL
        return _cache["metrics"]
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
    """HTTP handler serving ``/metrics`` (text) and ``/status`` (JSON).

    Any other path gets an empty 404 response. A query string on the
    request path is ignored when routing.
    """

    def _send(self, content_type, body):
        # Write a 200 response with the given content type and byte body.
        self.send_response(200)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Route a GET request to the metrics or status response."""
        route = self.path.split("?", 1)[0]
        if route == "/metrics":
            body = get_metrics().encode("utf-8")
            self._send("text/plain; version=0.0.4; charset=utf-8", body)
            return
        if route == "/status":
            # get_metrics() rebuilds and stores the status when the cache is
            # empty or expired. It takes _cache_lock itself, so it must be
            # called before (not inside) the locked read below.
            get_metrics()
            with _cache_lock:
                body = json.dumps(_cache["status"], indent=2).encode("utf-8")
            self._send("application/json", body)
            return
        self.send_response(404)
        self.end_headers()

    def log_message(self, fmt, *args):
        """Suppress the default per-request logging."""
        return
|
||||
|
||||
def main():
    """Bind the HTTP server to the ``LISTEN`` address and serve until stopped."""
    host, port = LISTEN.rsplit(":", 1)
    address = (host, int(port))
    httpd = ThreadingHTTPServer(address, Handler)
    banner = f"inter-rp ours+routinator exporter listen={LISTEN} instance={INSTANCE}"
    print(banner, flush=True)
    httpd.serve_forever()


if __name__ == "__main__":
    main()
|
||||
73
scripts/inter_rp/sync_routinator_from_remote231.sh
Executable file
73
scripts/inter_rp/sync_routinator_from_remote231.sh
Executable file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail

# Settings; each keeps the value from the environment when one is set
# (and non-empty) and otherwise takes the default shown.
: "${REMOTE231:=root@47.251.127.231}"
: "${REMOTE_ROOT:=/var/lib/inter-rp-runners}"
: "${PEER_ROOT:=/root/ours-rp-continuous/portable-soak/inter-rp-peers}"
: "${SYNC_INTERVAL_SECS:=60}"
# A value that is not a non-negative integer (such as the default) means
# "sync forever".
: "${MAX_SYNCS:=-1}"
LOG_PREFIX="[inter-rp-sync]"

mkdir -p "${PEER_ROOT}/routinator"
|
||||
# write_status SUCCESS MESSAGE
#   Records the outcome of a sync attempt in "$PEER_ROOT/sync-status.json".
#   SUCCESS is the literal string "true" for a successful sync; any other
#   value is stored as false. MESSAGE is stored verbatim.
#
#   The file is written to a temporary sibling and then renamed over the
#   target, so a reader never sees a half-written document.
#   datetime.timezone.utc is used instead of datetime.UTC, which only
#   exists on Python 3.11+; a failure there would abort the loop under
#   `set -e`.
write_status() {
  local success="$1"
  local message="$2"
  env SYNC_SUCCESS="$success" SYNC_MESSAGE="$message" SYNC_REMOTE="$REMOTE231" SYNC_REMOTE_ROOT="$REMOTE_ROOT" python3 - "$PEER_ROOT/sync-status.json" <<'PY'
import datetime, json, os, socket, sys
path = sys.argv[1]
payload = {
    "schemaVersion": 1,
    "success": os.environ["SYNC_SUCCESS"] == "true",
    "lastSyncAtRfc3339Utc": datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"),
    "remoteHost": os.environ["SYNC_REMOTE"],
    "remoteRoot": os.environ["SYNC_REMOTE_ROOT"],
    "localHost": socket.gethostname(),
    "message": os.environ["SYNC_MESSAGE"],
}
tmp_path = f"{path}.tmp.{os.getpid()}"
with open(tmp_path, "w", encoding="utf-8") as handle:
    json.dump(payload, handle, indent=2)
    handle.write("\n")
os.replace(tmp_path, path)
PY
}
|
||||
# sync_once
#   Copies the selected files of the remote Routinator "latest" directory
#   into a scratch directory and, once complete, swaps it in as
#   "$PEER_ROOT/routinator/latest".
#
#   Returns: 0 on success
#            1 when rsync fails
#            2 when the copy lacks run-meta.json
#            3 when a local step (mkdir / mv) fails
#
#   This function is called as an `if` condition, where `set -e` is not in
#   effect, so every step that can fail is checked explicitly. If the final
#   rename fails, the previous "latest" is put back.
sync_once() {
  local tmp="$PEER_ROOT/.sync-routinator-$$"
  local dest="$PEER_ROOT/routinator"

  rm -rf "$tmp"
  if ! mkdir -p "$tmp"; then
    return 3
  fi

  if ! rsync -aL --delete \
      --include='run-meta.json' \
      --include='result.ccr' \
      --include='vrps.csv' \
      --include='vaps.csv' \
      --include='stdout.log' \
      --include='stderr.log' \
      --exclude='*' \
      "$REMOTE231:$REMOTE_ROOT/routinator/latest/" "$tmp/latest/"; then
    rm -rf "$tmp"
    return 1
  fi

  if [[ ! -f "$tmp/latest/run-meta.json" ]]; then
    rm -rf "$tmp"
    return 2
  fi

  rm -rf "$dest/latest.next" "$dest/latest.prev"
  if ! mv "$tmp/latest" "$dest/latest.next"; then
    rm -rf "$tmp"
    return 3
  fi

  if [[ -e "$dest/latest" ]]; then
    if ! mv "$dest/latest" "$dest/latest.prev"; then
      rm -rf "$dest/latest.next" "$tmp"
      return 3
    fi
  fi

  if ! mv "$dest/latest.next" "$dest/latest"; then
    # Put the previous snapshot back so readers are not left without one.
    if [[ -e "$dest/latest.prev" ]]; then
      mv "$dest/latest.prev" "$dest/latest"
    fi
    rm -rf "$dest/latest.next" "$tmp"
    return 3
  fi

  rm -rf "$dest/latest.prev" "$tmp"
  return 0
}
|
||||
# Main loop: sync, record the outcome, then wait SYNC_INTERVAL_SECS.
# Stops after MAX_SYNCS attempts when MAX_SYNCS is a non-negative integer.
completed=0
while :; do
  code=0
  sync_once || code=$?

  if (( code == 0 )); then
    echo "$LOG_PREFIX $(date -u +%Y-%m-%dT%H:%M:%SZ) ok"
    write_status true "ok"
  else
    echo "$LOG_PREFIX $(date -u +%Y-%m-%dT%H:%M:%SZ) sync failed code=$code" >&2
    write_status false "routinator rsync failed code=$code"
  fi

  completed=$(( completed + 1 ))
  if [[ "$MAX_SYNCS" =~ ^[0-9]+$ ]]; then
    if (( completed >= MAX_SYNCS )); then
      break
    fi
  fi

  sleep "$SYNC_INTERVAL_SECS"
done
|
||||
Loading…
x
Reference in New Issue
Block a user