dev_1.0.0_sundp_2 优化Argus-metric模块的e2e部署测试流程 #27

Merged
sundapeng merged 7 commits from dev_1.0.0_sundp_2 into dev_1.0.0 2025-10-17 17:15:55 +08:00
Showing only changes of commit 89fba8114a - Show all commits

View File

@ -0,0 +1,570 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 3,
"links": [],
"panels": [
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode='idle'}[5m])))",
"refId": "A"
}
],
"title": "CPU 平均利用率(%",
"type": "stat"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 8,
"y": 0
},
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg(1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes)) * 100",
"refId": "A"
}
],
"title": "内存平均利用率(%",
"type": "stat"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 16,
"y": 0
},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "count(count by(hostname) (up{job='node'} == 1))",
"refId": "A"
}
],
"title": "节点在线数",
"type": "stat"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 5
},
"id": 6,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg by (hostname) (DCGM_FI_DEV_GPU_UTIL)",
"refId": "A"
}
],
"title": "GPU 平均利用率 (%)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 5
},
"id": 12,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "round(avg(DCGM_FI_DEV_FB_USED{job='dcgm'}/(DCGM_FI_DEV_FB_USED{job='dcgm'} + DCGM_FI_DEV_FB_FREE{job='dcgm'})) * 100)",
"refId": "A"
}
],
"title": "显存平均利用率 (%)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 5
},
"id": 7,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg by (hostname) (DCGM_FI_DEV_GPU_TEMP)",
"refId": "A"
}
],
"title": "GPU 温度 (℃)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 300,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "orange",
"value": 200
},
{
"color": "red",
"value": 300
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 5
},
"id": 8,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": true,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg by (hostname) (DCGM_FI_DEV_POWER_USAGE)",
"refId": "A"
}
],
"title": "GPU 平均实时功耗 (W)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"custom": {
"align": "center",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"decimals": 1,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 12,
"w": 24,
"x": 0,
"y": 11
},
"id": 11,
"options": {
"cellHeight": "sm",
"cellLinks": [
{
"title": "跳转至节点详情",
"url": "http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics?orgId=1&refresh=15s&var-hostname=${__data.fields.hostname}"
}
],
"footer": {
"countRows": false,
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "GPU 使用率"
}
]
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "up{job=\"dcgm\"} + on(hostname) group_left(ip, node_id) up{job=\"dcgm\"}*0",
"format": "table",
"instant": true,
"refId": "node_info"
},
{
"expr": "round(100 - avg by(hostname)(rate(node_cpu_seconds_total{job=\"node\",mode=\"idle\"}[5m])) * 100, 0.1)",
"format": "table",
"instant": true,
"refId": "CPU"
},
{
"expr": "round((1 - avg by(hostname)(node_memory_MemAvailable_bytes{job=\"node\"} / node_memory_MemTotal_bytes{job=\"node\"})) * 100, 0.1)",
"format": "table",
"instant": true,
"refId": "MEM"
},
{
"expr": "round(avg by(hostname)(DCGM_FI_DEV_GPU_UTIL{job=\"dcgm\"}), 0.1)",
"format": "table",
"instant": true,
"refId": "GPU_UTIL"
},
{
"expr": "round(avg by(hostname)(DCGM_FI_DEV_FB_USED{job=\"dcgm\"} / (DCGM_FI_DEV_FB_USED{job=\"dcgm\"} + DCGM_FI_DEV_FB_FREE{job=\"dcgm\"}) * 100), 0.1)",
"format": "table",
"instant": true,
"refId": "GPU_MEM"
}
],
"title": "节点列表CPU / 内存 / GPU",
"transformations": [
{
"id": "seriesToColumns",
"options": {
"byField": "hostname"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value #node_info": true,
"hostname_1": true,
"hostname_2": true,
"hostname_3": true,
"instance": true,
"ip_1": true,
"job": true,
"node_id_1": true
},
"indexByName": {
"CPU 使用率": 3,
"GPU 使用率": 5,
"GPU 显存占用": 6,
"IP 地址": 1,
"主机名": 0,
"内存使用率": 4,
"节点 ID": 2
},
"renameByName": {
"Value #CPU": "CPU 使用率",
"Value #GPU_MEM": "GPU 显存占用",
"Value #GPU_UTIL": "GPU 使用率",
"Value #MEM": "内存使用率",
"hostname": "主机名",
"ip": "IP 地址",
"node_id": "节点 ID",
"user_id": "用户ID"
}
}
}
],
"type": "table"
}
],
"refresh": "5s",
"schemaVersion": 39,
"tags": [
"cluster",
"gpu",
"system"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Cluster Dashboard",
"uid": "cluster-dashboard",
"version": 34,
"weekStart": ""
}