diff --git a/src/metric/client-plugins/all-in-one-full/config/VERSION b/src/metric/client-plugins/all-in-one-full/config/VERSION index 7aa332e..2aeaa11 100644 --- a/src/metric/client-plugins/all-in-one-full/config/VERSION +++ b/src/metric/client-plugins/all-in-one-full/config/VERSION @@ -1 +1 @@ -1.33.0 +1.35.0 diff --git a/src/metric/grafana/build/dashboards/default_dashboard_by_hostname.json b/src/metric/grafana/build/dashboards/default_dashboard_by_hostname.json index 4a09e80..4ee370d 100644 --- a/src/metric/grafana/build/dashboards/default_dashboard_by_hostname.json +++ b/src/metric/grafana/build/dashboards/default_dashboard_by_hostname.json @@ -581,6 +581,372 @@ ], "title": "Node Process Count", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "GPU Utilization (%)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "red", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 301, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_UTIL{hostname=~\"$hostname\"}", + "legendFormat": "{{hostname}} GPU{{gpu}}", + "refId": "A" + } + ], + "title": "GPU 利用率 (单卡)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Memory Used (%)", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "red", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 403, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "round(DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} / (DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} + DCGM_FI_DEV_FB_FREE{hostname=~\"$hostname\"}) * 100)", + "legendFormat": "{{hostname}} GPU{{gpu}}", + "refId": "A" + } + ], + "title": "GPU 显存使用率 (单卡)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Temperature (℃)", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 501, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_TEMP{hostname=~\"$hostname\"}", + "legendFormat": "{{hostname}} GPU{{gpu}}", + "refId": "A" + } + ], + "title": "GPU 温度(单卡)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Power (W)", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 300, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 200 + }, + { + "color": "red", + "value": 300 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 502, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "DCGM_FI_DEV_POWER_USAGE{hostname=~\"$hostname\"}", + "legendFormat": "{{hostname}} GPU{{gpu}}", + "refId": "A" + } + ], + "title": "GPU 功率 (单卡)", + "type": "timeseries" } ], "refresh": "15s", @@ -589,11 +955,6 @@ "templating": { "list": [ { - "current": { - "selected": true, - "text": "node-exporter-A1", - "value": "node-exporter-A1" - }, "datasource": { "type": "prometheus" }, @@ -623,7 +984,7 @@ }, "timepicker": {}, "timezone": "", - "title": "Node and GPU Metrics", - "uid": "node_gpu_metrics", + "title": "Node and GPU Metrics (by hostname)", + "uid": "node_gpu_metrics_by_hostname", "weekStart": "" -} \ No newline at end of file +} diff --git a/src/metric/grafana/build/dashboards/default_dashboard_by_instance.json b/src/metric/grafana/build/dashboards/default_dashboard_by_instance.json index 78f0c43..c56b846 100644 --- a/src/metric/grafana/build/dashboards/default_dashboard_by_instance.json +++ b/src/metric/grafana/build/dashboards/default_dashboard_by_instance.json @@ -622,7 +622,7 @@ }, "timepicker": {}, "timezone": "", - "title": "Node and GPU Metrics", - "uid": "node_gpu_metrics", + "title": "Node and GPU Metrics (by instance)", + "uid": "node_gpu_metrics_by_instance", "weekStart": "" - } \ No newline at end of file + }