diff --git a/monitor/grafana/dashboards/ours-rp-inter-rp.json b/monitor/grafana/dashboards/ours-rp-inter-rp.json index 102a3ad..ee430ce 100644 --- a/monitor/grafana/dashboards/ours-rp-inter-rp.json +++ b/monitor/grafana/dashboards/ours-rp-inter-rp.json @@ -11,7 +11,7 @@ "panels": [ { "id": 1, - "title": "Metrics Reload OK", + "title": "Ours Only Repo Count", "type": "stat", "datasource": { "type": "prometheus", @@ -25,7 +25,7 @@ }, "fieldConfig": { "defaults": { - "unit": "none", + "unit": "short", "decimals": 0 }, "overrides": [] @@ -47,8 +47,8 @@ }, "targets": [ { - "expr": "max(inter_rp_service_last_reload_success{exported_instance=\"remote200-inter-rp\"})", - "legendFormat": "reload", + "expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_ours\"})", + "legendFormat": "only ours", "refId": "A", "instant": true } @@ -56,7 +56,7 @@ }, { "id": 2, - "title": "231 Sync Age", + "title": "Routinator Only Repo Count", "type": "stat", "datasource": { "type": "prometheus", @@ -70,7 +70,7 @@ }, "fieldConfig": { "defaults": { - "unit": "s", + "unit": "short", "decimals": 0 }, "overrides": [] @@ -92,8 +92,8 @@ }, "targets": [ { - "expr": "max(inter_rp_sync_age_seconds{exported_instance=\"remote200-inter-rp\"})", - "legendFormat": "sync age", + "expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_routinator\"})", + "legendFormat": "only routinator", "refId": "A", "instant": true } @@ -101,7 +101,7 @@ }, { "id": 3, - "title": "Parse Errors", + "title": "Ours vs Routinator VAP Diff", "type": "stat", "datasource": { "type": "prometheus", @@ -137,8 +137,8 @@ }, "targets": [ { - "expr": "max(inter_rp_parse_errors{exported_instance=\"remote200-inter-rp\"})", - "legendFormat": "errors", + "expr": "max(inter_rp_vaps_diff{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\"})", + "legendFormat": "vap diff", "refId": "A", "instant": true } @@ -592,7 +592,7 @@ "uid": "Prometheus" }, "gridPos": { - "h": 8, + "h": 9, "w": 24, "x": 0, "y": 44 @@ -602,7 +602,48 @@ "unit": "none", "decimals": 0 }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "uri" + }, + "properties": [ + { + "id": "custom.width", + "value": 760 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "class" + }, + "properties": [ + { + "id": "custom.width", + "value": 140 + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "^(mft|crl|crt|roa|aspa)$" + }, + "properties": [ + { + "id": "custom.align", + "value": "right" + }, + { + "id": "custom.width", + "value": 80 + } + ] + } + ] }, "options": { "showHeader": true, @@ -616,50 +657,42 @@ "legendFormat": "{{class}} #{{rank}}", "refId": "A" } - ] - }, - { - "id": 15, - "title": "Only-Ours Repo Count", - "type": "stat", - "datasource": { - "type": "prometheus", - "uid": "Prometheus" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 32 - }, - "fieldConfig": { - "defaults": { - "unit": "short", - "decimals": 0 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "targets": [ + ], + "transformations": [ { - "expr": "max(inter_rp_repo_sync_overlap_total{exported_instance=\"remote200-inter-rp\",left=\"ours-rp\",right=\"routinator\",class=\"only_ours\"})", - "legendFormat": "only ours", - "refId": "A", - "instant": true + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "exported_instance": true, + "instance": true, + "job": true, + "left": true, + "right": true, + "rank": true, + "routinator_duration": true + }, + "indexByName": { + "class": 0, + "uri": 1, + "mft": 2, + "crl": 3, + "crt": 4, + "roa": 5, + "aspa": 6 + }, + "renameByName": { + "class": "class", + "uri": "uri", + "mft": "mft", + "crl": "crl", + "crt": "crt", + "roa": "roa", + "aspa": "aspa" + } + } } ] } @@ -681,5 +714,5 @@ "timezone": "browser", "title": "Ours RP vs Routinator", "uid": "ours-rp-inter-rp", - "version": 3 + "version": 4 } diff --git a/scripts/inter_rp/inter_rp_ours_routinator_exporter.py b/scripts/inter_rp/inter_rp_ours_routinator_exporter.py index bd333a0..64448ef 100755 --- a/scripts/inter_rp/inter_rp_ours_routinator_exporter.py +++ b/scripts/inter_rp/inter_rp_ours_routinator_exporter.py @@ -155,15 +155,44 @@ def parse_prometheus_samples(text, metric_name): continue yield parse_prom_labels(match.group(1)), value +def canonical_object_type(value): + value = (value or "").strip().lower() + if value in ("manifest", "mft"): + return "mft" + if value == "crl": + return "crl" + if value in ("certificate", "cert", "ca_cert", "router_cert", "ee_cert", "crt"): + return "crt" + if value == "roa": + return "roa" + if value == "aspa": + return "aspa" + return None + +def empty_object_counts(): + return {"mft": 0, "crl": 0, "crt": 0, "roa": 0, "aspa": 0} + +def add_object_count(counts_by_uri, uri, object_type, value): + canonical = canonical_object_type(object_type) + if not uri or canonical is None: + return + counts = counts_by_uri.setdefault(uri, empty_object_counts()) + counts[canonical] = counts.get(canonical, 0) + int(value) + +def object_count_labels(counts_by_uri, uri): + counts = counts_by_uri.get(uri, empty_object_counts()) + return {key: str(int(counts.get(key, 0))) for key in ["mft", "crl", "crt", "roa", "aspa"]} + def load_ours_repo_sets(errors): try: import urllib.request text = urllib.request.urlopen("http://127.0.0.1:9556/metrics", timeout=10).read().decode("utf-8", "replace") except Exception as exc: errors.append(f"ours-rp repo metrics: {exc}") - return {"total": set(), "available": set(), "failed": set(), "info": {}} + return {"total": set(), "available": set(), "failed": set(), "info": {}, "object_counts": {}} info = {} states = {} + object_counts = {} for labels, value in parse_prometheus_samples(text, "ours_rp_repository_info"): uri = labels.get("uri") if uri: @@ -178,6 +207,15 @@ def load_ours_repo_sets(errors): if not uri: continue states.setdefault(uri, {})[labels.get("terminal_state", "unknown")] = value + for labels, value in parse_prometheus_samples(text, "ours_rp_repository_objects_by_type"): + uri = labels.get("uri") + if not uri: + repo_id = labels.get("repo_id") + for candidate_uri, candidate in info.items(): + if candidate.get("repo_id") == repo_id: + uri = candidate_uri + break + add_object_count(object_counts, uri, labels.get("object_type"), value) total = set(info) available = set() failed = set() @@ -188,22 +226,23 @@ def load_ours_repo_sets(errors): available.add(uri) else: failed.add(uri) - return {"total": total, "available": available, "failed": failed, "info": info} + return {"total": total, "available": available, "failed": failed, "info": info, "object_counts": object_counts} def load_routinator_repo_sets(errors): path = PEER_ROOT / "routinator" / "routinator-metrics.prom" if not path.exists(): errors.append(f"routinator repo metrics: missing {path}") - return {"total": set(), "success": set(), "failed": set(), "duration": {}} + return {"total": set(), "success": set(), "failed": set(), "duration": {}, "object_counts": {}} try: text = path.read_text(encoding="utf-8", errors="replace") except Exception as exc: errors.append(f"routinator repo metrics: read {path}: {exc}") - return {"total": set(), "success": set(), "failed": set(), "duration": {}} + return {"total": set(), "success": set(), "failed": set(), "duration": {}, "object_counts": {}} total = set() success = set() failed = set() duration = {} + object_counts = {} for metric in ["routinator_rrdp_status", "routinator_rsync_status"]: for labels, value in parse_prometheus_samples(text, metric): uri = labels.get("uri") @@ -220,7 +259,9 @@ def load_routinator_repo_sets(errors): uri = labels.get("uri") if uri: duration[uri] = max(duration.get(uri, 0.0), value) - return {"total": total, "success": success, "failed": failed, "duration": duration} + for labels, value in parse_prometheus_samples(text, "routinator_repository_objects_total"): + add_object_count(object_counts, labels.get("uri"), labels.get("type"), value) + return {"total": total, "success": success, "failed": failed, "duration": duration, "object_counts": object_counts} def emit_repo_diff_metrics(out, errors): ours = load_ours_repo_sets(errors) @@ -242,9 +283,18 @@ def emit_repo_diff_metrics(out, errors): out.append(metric_line("inter_rp_repo_sync_overlap_total", {**base, "class": "only_ours"}, len(only_ours))) out.append(metric_line("inter_rp_repo_sync_overlap_total", {**base, "class": "only_routinator"}, len(only_routinator))) out.append(metric_line("inter_rp_repo_sync_overlap_total", {**base, "class": "neither_available"}, len(neither))) - for diff_class, uris in [("only_ours", only_ours), ("only_routinator", only_routinator)]: + for diff_class, uris, counts_by_uri in [ + ("only_ours", only_ours, ours["object_counts"]), + ("only_routinator", only_routinator, routinator["object_counts"]), + ]: for rank, uri in enumerate(uris[:50], start=1): - labels = {**base, "class": diff_class, "rank": rank, "uri": uri} + labels = { + **base, + "class": diff_class, + "rank": rank, + "uri": uri, + **object_count_labels(counts_by_uri, uri), + } if uri in routinator["duration"]: labels["routinator_duration"] = f"{routinator['duration'][uri]:.3f}" out.append(metric_line("inter_rp_repo_sync_diff_info", labels, 1)) diff --git a/src/tools/rpki_artifact_metrics.rs b/src/tools/rpki_artifact_metrics.rs index ee72b15..d9b13f1 100644 --- a/src/tools/rpki_artifact_metrics.rs +++ b/src/tools/rpki_artifact_metrics.rs @@ -258,6 +258,7 @@ struct RepoMetrics { duration_seconds_avg: f64, phase_counts: BTreeMap, terminal_state_counts: BTreeMap, + object_counts: BTreeMap, } #[derive(Clone, Debug, Default, Serialize)] @@ -859,7 +860,8 @@ fn extract_publication_point_metrics( let result = json_str(object, &["result"]) .unwrap_or("unknown") .to_string(); - *object_counts.entry((kind, result)).or_default() += 1; + *object_counts.entry((kind.clone(), result)).or_default() += 1; + *repo.object_counts.entry(kind).or_default() += 1; } } @@ -1531,6 +1533,21 @@ fn render_repo_metrics(writer: &mut PromWriter<'_>, instance: &str, repos: &[Rep *count as f64, ); } + for kind in ["manifest", "crl", "certificate", "roa", "aspa"] { + let labels = [ + label("instance", instance), + label("repo_id", &repo.repo_id), + label("host", &repo.host), + label("uri", &repo.uri), + label("object_type", kind), + ]; + writer.gauge( + "ours_rp_repository_objects_by_type", + "Repository object count by object type from latest run report", + &labels, + repo.object_counts.get(kind).copied().unwrap_or(0) as f64, + ); + } } } @@ -2298,6 +2315,8 @@ mod tests { assert_eq!(snapshot.repo_stats.len(), 1); assert!(snapshot.repo_stats[0].sync_success); assert_eq!(snapshot.repo_stats[0].download_bytes, 333); + assert_eq!(snapshot.repo_stats[0].object_counts["roa"], 2); + assert_eq!(snapshot.repo_stats[0].object_counts["manifest"], 1); assert_eq!(snapshot.top_pp_by_object_count[0].object_count, 2); assert_eq!(snapshot.cir.as_ref().unwrap().objects, 2); assert_eq!(snapshot.ccr.as_ref().unwrap().state_items["tas"], 1); @@ -2305,6 +2324,10 @@ mod tests { assert!(metrics.contains("ours_rp_repository_info")); assert!(metrics.contains("ours_rp_repository_sync_success")); assert!(metrics.contains("ours_rp_repository_download_bytes")); + assert!(metrics.contains("ours_rp_repository_objects_by_type")); + assert!(metrics.contains(r#"ours_rp_repository_objects_by_type{instance="test",repo_id="#)); + assert!(metrics.contains(r#"object_type="roa"} 2"#)); + assert!(metrics.contains(r#"object_type="manifest"} 1"#)); assert!(metrics.contains("ours_rp_large_publication_points")); assert!(metrics.contains("ours_rp_cir_objects")); assert!(metrics.contains("ours_rp_cir_objects_by_source"));