From 89fba8114abf416320b81b3bafdd1365a3634965 Mon Sep 17 00:00:00 2001 From: "sundapeng.sdp" Date: Fri, 17 Oct 2025 15:13:28 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=88=9B=E5=BB=BA=20=E9=9B=86=E7=BE=A4?= =?UTF-8?q?=20Dashboard=20=E6=A8=A1=E7=89=88=EF=BC=8Cdefault=5Fcluster=5Fd?= =?UTF-8?q?ashboard.json=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit refs #12 --- .../dashboards/default_cluster_dashboard.json | 570 ++++++++++++++++++ 1 file changed, 570 insertions(+) create mode 100644 src/metric/grafana/build/dashboards/default_cluster_dashboard.json diff --git a/src/metric/grafana/build/dashboards/default_cluster_dashboard.json b/src/metric/grafana/build/dashboards/default_cluster_dashboard.json new file mode 100644 index 0000000..06ef418 --- /dev/null +++ b/src/metric/grafana/build/dashboards/default_cluster_dashboard.json @@ -0,0 +1,570 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "100 * (1 - 
avg(rate(node_cpu_seconds_total{mode='idle'}[5m])))", + "refId": "A" + } + ], + "title": "CPU 平均利用率(%)", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "refId": "A" + } + ], + "title": "内存平均利用率(%)", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "count(count by(hostname) (up{job='node'} == 1))", + "refId": "A" + } + ], + "title": "节点在线数", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { +
"color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg by (hostname) (DCGM_FI_DEV_GPU_UTIL)", + "refId": "A" + } + ], + "title": "GPU 平均利用率 (%)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 5 + }, + "id": 12, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "round(avg(DCGM_FI_DEV_FB_USED{job='dcgm'}/(DCGM_FI_DEV_FB_USED{job='dcgm'} + DCGM_FI_DEV_FB_FREE{job='dcgm'})) * 100)", + "refId": "A" + } + ], + "title": "显存平均利用率 (%)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 5 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg by (hostname) (DCGM_FI_DEV_GPU_TEMP)", + "refId": "A" + } + ], + "title": "GPU 温度 (℃)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 300, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "orange", + "value": 200 + }, + { + "color": "red", + "value": 300 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg by (hostname) (DCGM_FI_DEV_POWER_USAGE)", + "refId": "A" + } + ], + "title": "GPU 平均实时功耗 (W)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": { + "align": "center", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 11, + "options": { + "cellHeight": "sm", + "cellLinks": [ + { + "title": "跳转至节点详情", + "url": "http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics?orgId=1&refresh=15s&var-hostname=${__data.fields.hostname}" + } + ], + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": 
true, + "displayName": "GPU 使用率" + } + ] + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "up{job=\"dcgm\"} + on(hostname) group_left(ip, node_id) up{job=\"dcgm\"}*0", + "format": "table", + "instant": true, + "refId": "node_info" + }, + { + "expr": "round(100 - avg by(hostname)(rate(node_cpu_seconds_total{job=\"node\",mode=\"idle\"}[5m])) * 100, 0.1)", + "format": "table", + "instant": true, + "refId": "CPU" + }, + { + "expr": "round((1 - avg by(hostname)(node_memory_MemAvailable_bytes{job=\"node\"} / node_memory_MemTotal_bytes{job=\"node\"})) * 100, 0.1)", + "format": "table", + "instant": true, + "refId": "MEM" + }, + { + "expr": "round(avg by(hostname)(DCGM_FI_DEV_GPU_UTIL{job=\"dcgm\"}), 0.1)", + "format": "table", + "instant": true, + "refId": "GPU_UTIL" + }, + { + "expr": "round(avg by(hostname)(DCGM_FI_DEV_FB_USED{job=\"dcgm\"} / (DCGM_FI_DEV_FB_USED{job=\"dcgm\"} + DCGM_FI_DEV_FB_FREE{job=\"dcgm\"}) * 100), 0.1)", + "format": "table", + "instant": true, + "refId": "GPU_MEM" + } + ], + "title": "节点列表(CPU / 内存 / GPU)", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "hostname" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value #node_info": true, + "hostname_1": true, + "hostname_2": true, + "hostname_3": true, + "instance": true, + "ip_1": true, + "job": true, + "node_id_1": true + }, + "indexByName": { + "CPU 使用率": 3, + "GPU 使用率": 5, + "GPU 显存占用": 6, + "IP 地址": 1, + "主机名": 0, + "内存使用率": 4, + "节点 ID": 2 + }, + "renameByName": { + "Value #CPU": "CPU 使用率", + "Value #GPU_MEM": "GPU 显存占用", + "Value #GPU_UTIL": "GPU 使用率", + "Value #MEM": "内存使用率", + "hostname": "主机名", + "ip": "IP 地址", + "node_id": "节点 ID", + "user_id": "用户ID" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [ + "cluster", + "gpu", + "system" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + 
"timezone": "browser", + "title": "Cluster Dashboard", + "uid": "cluster-dashboard", + "version": 34, + "weekStart": "" +} \ No newline at end of file