[#1] 增加部署测试prom&grafana

This commit is contained in:
yuyr 2025-12-02 11:13:36 +08:00
parent 6a0a4755b6
commit 79afe4fbfc
5 changed files with 527 additions and 0 deletions

View File

@ -0,0 +1,41 @@
# Deploy-test monitoring stack: Prometheus plus Grafana on one bridge network.
#
# Optional overrides, resolved by Compose variable interpolation. Every default
# equals the value that used to be hard-coded, so an unset environment behaves
# exactly as before:
#   PROMETHEUS_IMAGE_TAG        Prometheus image tag (default: latest)
#   GRAFANA_IMAGE_TAG           Grafana image tag (default: latest)
#   GF_SECURITY_ADMIN_USER      Grafana admin login (default: admin)
#   GF_SECURITY_ADMIN_PASSWORD  Grafana admin password (default: admin)
# Pin the image tags for reproducible runs, and override the admin password
# whenever port 13000 is reachable by anyone other than the tester.
version: "3.8"

networks:
  monitor_net:
    driver: bridge

services:
  prometheus:
    image: "prom/prometheus:${PROMETHEUS_IMAGE_TAG:-latest}"
    container_name: deploytest-prometheus
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # Optional: uncomment to mount alerting rules.
      # - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      # Turns on the HTTP reload/shutdown endpoints; they are reachable
      # through the published port below.
      - "--web.enable-lifecycle"
    ports:
      - "19090:9090"
    networks:
      - monitor_net
    extra_hosts:
      # Linux compatibility: make host.docker.internal inside the container
      # resolve to the Docker host.
      - "host.docker.internal:host-gateway"

  grafana:
    image: "grafana/grafana:${GRAFANA_IMAGE_TAG:-latest}"
    container_name: deploytest-grafana
    depends_on:
      - prometheus
    ports:
      - "13000:3000"
    networks:
      - monitor_net
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    environment:
      - "GF_SECURITY_ADMIN_USER=${GF_SECURITY_ADMIN_USER:-admin}"
      - "GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}"
      - "GF_USERS_ALLOW_SIGN_UP=false"

View File

@ -0,0 +1,450 @@
{
"id": null,
"uid": "netconf-transceiver-exporter",
"title": "NETCONF Transceiver Exporter",
"tags": ["netconf", "transceiver", "exporter"],
"timezone": "browser",
"schemaVersion": 38,
"version": 1,
"refresh": "30s",
"panels": [
{
"type": "stat",
"title": "Exporter Scrape Success Ratio",
"description": "含义：当前选择的所有 $device 的 netconf_scrape_success 平均值，介于 0~1。用法：一眼看整体抓取健康度（≈1 表示基本都成功，<1 表示有失败）；多选多个设备时是平均值，可快速知道整体是否在抖。",
"id": 1,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "avg(netconf_scrape_success{job=~\"$job\", device=~\"$device\"})",
"legendFormat": "success_ratio",
"refId": "A"
}
],
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "horizontal",
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }
},
{
"type": "timeseries",
"title": "Exporter Build Info (TODO)",
"description": "含义:预留的 exporter_build_info{version,python_version} 指标,可查看 exporter 版本与 Python 运行环境。用法:目前 exporter 尚未实现该指标,显示 No data 属正常;未来实现后,可以用来确认线上跑的是哪个版本/解释器,排查“是不是在用最新代码”。",
"id": 9,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "exporter_build_info{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{version}} ({{python_version}})",
"refId": "A"
}
],
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }
},
{
"type": "stat",
"title": "Exporter Devices Total (TODO)",
"description": "含义：预留的 exporter_devices_total，表示 exporter 配置或注册的设备总数。用法：当前无数据属预期；未来实现后，可对比 total 与 enabled（Panel 11），例如 total=10 但 enabled=8，用来找出被禁用或注册失败的设备。",
"id": 10,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "exporter_devices_total{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{device}} total",
"refId": "A"
}
],
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "horizontal",
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }
},
{
"type": "stat",
"title": "Exporter Devices Enabled (TODO)",
"description": "含义：预留的 exporter_devices_enabled，表示实际启用抓取的设备数量。用法：当前无数据属预期；未来实现后，可快速判断是否有设备被标记为 disabled 或注册失败，并与 Panel 10 对比排查配置问题。",
"id": 11,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "exporter_devices_enabled{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{device}} enabled",
"refId": "A"
}
],
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "horizontal",
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }
},
{
"type": "timeseries",
"title": "Scrape Success by Device",
"description": "含义：每个设备的 netconf_scrape_success 随时间的 0/1 曲线（按 $device 过滤）。用法：查看“哪个设备最近在频繁失败”：曲线掉到 0 的时间段就是失败窗口；多设备同时查看时，每条线代表一个 device，可以直观看出谁最不稳定。",
"id": 2,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "netconf_scrape_success{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{device}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "bool" },
"overrides": []
},
"gridPos": { "h": 8, "w": 9, "x": 6, "y": 0 }
},
{
"type": "timeseries",
"title": "Scrape Duration per Device",
"description": "含义：每个设备的 netconf_scrape_duration_seconds（单位：秒）。用法：判断哪台设备抓取最慢（曲线越高，抓取耗时越长，可能 RPC 多或设备性能差）；观察趋势变化，例如某设备 duration 从 2s 突然升到 10s，提示设备压力或链路问题。",
"id": 3,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "netconf_scrape_duration_seconds{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{device}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "s" },
"overrides": []
},
"gridPos": { "h": 8, "w": 9, "x": 15, "y": 0 }
},
{
"type": "table",
"title": "Scrape Errors by Device & Type (5m)",
"description": "含义:最近 5 分钟内 netconf_scrape_errors_total 的增长量,按 (device,error_type) 聚合。用法:出现异常时优先查看 error_type如 AuthError、RpcTimeout、UnknownError 等);配合 Panel 2/3 使用,当某设备 success 掉、duration 拉长时,看对应的 error_type 是什么以定位问题。",
"id": 4,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "sum(increase(netconf_scrape_errors_total{job=~\"$job\", device=~\"$device\"}[5m])) by (device, error_type)",
"format": "table",
"refId": "A"
}
],
"fieldConfig": { "defaults": {}, "overrides": [] },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 4 }
},
{
"type": "timeseries",
"title": "Transceiver Supply Voltage",
"description": "含义：transceiver_supply_voltage_volts，按 {device,port,component_name} 展示各光模块电压。用法：$port 选 All 时可对比不同端口/模块电压是否在合理区间；当某个 transceiver 电压明显过低或过高时，通过 legend 中的 device/port/component_name 精确定位异常模块。",
"id": 12,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "transceiver_supply_voltage_volts{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
"legendFormat": "{{device}}/{{port}}/{{component_name}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "volt" },
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }
},
{
"type": "timeseries",
"title": "Transceiver Temperature (with vendor/serial)",
"description": "含义：transceiver_temperature_celsius 与 transceiver_info_info join 后的温度曲线，legend 中包含 vendor 与 serial 等元数据。用法：在同一设备下对比多个模块温度，快速发现温度显著偏高的模块；带 serial 的 legend 方便截图或记录，用于资产与维护（如“序列号为 X 的模块长期偏高”）。",
"id": 13,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "transceiver_temperature_celsius{job=~\"$job\", device=~\"$device\", port=~\"$port\"} * on(job, device, port, component_name) group_left(vendor, serial, part_number, hardware_rev) transceiver_info_info{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{device}}/{{port}} {{vendor}}({{serial}})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "celsius" },
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }
},
{
"type": "bargauge",
"title": "Transceiver Present & Oper Status",
"description": "含义：transceiver_present 表示模块是否在位 (1=在位，0=不在位)，同时预留展示 transceiver_oper_status（运行状态，例如 up/down），当前无该数据属预期。用法：快速确认某设备端口上光模块是否插着（present=1）；未来实现 oper_status 后，可在同一面板中发现“模块在位但状态为 down”这类异常。",
"id": 14,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "transceiver_present{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
"legendFormat": "{{device}}/{{port}}/{{component_name}}",
"refId": "A"
},
{
"expr": "transceiver_oper_status{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
"legendFormat": "oper: {{device}}/{{port}}/{{component_name}}",
"refId": "B"
}
],
"fieldConfig": {
"defaults": { "unit": "bool" },
"overrides": []
},
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 14 }
},
{
"type": "timeseries",
"title": "Channel Tx Power (dBm)",
"description": "含义：transceiver_channel_tx_power_dbm，按 channel 维度展示发射光功率。用法：$device + $port 指定后，$component 可选单个或多个模块，查看各通道的 TX 功率；用于对比多通道是否功率均衡，以及定位某个通道发射功率突然下降的情况。",
"id": 15,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "transceiver_channel_tx_power_dbm{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
"legendFormat": "{{component_name}}:{{channel}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "dBm" },
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }
},
{
"type": "timeseries",
"title": "Channel Rx Power (dBm)",
"description": "含义：transceiver_channel_rx_power_dbm，按 channel 展示接收光功率。用法：配合 TX 面板一起查看链路健康度（TX 正常但 RX 很低可能是远端/链路问题）；多 device、多 port 对比场景下，可快速识别接收功率异常低的通道。",
"id": 16,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "transceiver_channel_rx_power_dbm{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
"legendFormat": "{{component_name}}:{{channel}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "dBm" },
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }
},
{
"type": "timeseries",
"title": "Channel Bias Current (mA)",
"description": "含义：transceiver_channel_bias_current_ma，每个 channel 对应激光器偏置电流。用法：检查不同通道的电流是否在合理范围内，识别某个通道电流特别高的情况（可能模块老化）；结合历史趋势观察电流缓慢爬升，用于预判光模块老化风险。",
"id": 17,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "transceiver_channel_bias_current_ma{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
"legendFormat": "{{component_name}}:{{channel}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "mA" },
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }
},
{
"type": "timeseries",
"title": "Channel Laser Temperature (C, device-dependent)",
"description": "含义：transceiver_channel_laser_temperature_celsius，按通道展示激光器温度。用法：当前 H3C 设备通常不返回该字段，因此本面板往往无数据，这是设备特性而非 bug；若未来设备或其他厂商实现该字段，本面板可用于更细粒度地观察每个通道的激光温度。",
"id": 18,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "transceiver_channel_laser_temperature_celsius{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
"legendFormat": "{{component_name}}:{{channel}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "celsius" },
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 }
},
{
"type": "table",
"title": "Channel Info (Instant)",
"description": "含义：transceiver_channel_info_info 与 transceiver_info_info join 后形成的静态资产表，每行对应一个 channel，并附带 vendor/serial/part_number/hardware_rev 等上游光模块信息。用法：使用 Instant 查询和表格展示，仅保留 label 列；作为“光模块资产清单”，可按 port、component_name、channel_index 排序，清晰查看每个端口有哪些模块及其通道数量，并通过 vendor/serial 等字段核对现场资产或反查某个 channel 属于哪个物理模块。",
"id": 19,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "transceiver_channel_info_info{job=~\"$job\", device=~\"$device\"} * on(job, device, port, component_name) group_left(vendor, serial, part_number, hardware_rev) transceiver_info_info{job=~\"$job\", device=~\"$device\"}",
"format": "table",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {},
"overrides": [
{
"matcher": { "id": "byName", "options": "Time" },
"properties": [
{ "id": "custom.hidden", "value": true }
]
},
{
"matcher": { "id": "byName", "options": "Value" },
"properties": [
{ "id": "custom.hidden", "value": true }
]
}
]
},
"options": {
"showHeader": true
},
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 36 }
},
{
"type": "stat",
"title": "Time Since Last Scrape (s)",
"description": "含义：time() - netconf_last_scrape_timestamp_seconds，表示每个设备距离最近一次 NETCONF 抓取已过去的秒数。用法：如果数值一直在几十秒以内，说明抓取在持续正常执行；若某设备数值突然飙高且长期不回落，说明它很久没有被成功抓取，需检查连接、认证或设备状态。",
"id": 20,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"targets": [
{
"expr": "time() - netconf_last_scrape_timestamp_seconds{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{device}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": { "unit": "s" },
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "horizontal",
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 46 }
},
{
"type": "stat",
"title": "Transceiver Data Staleness (TODO)",
"description": "含义：设计中的 transceiver_data_staleness_seconds，计划表示 Transceiver 业务数据（最近一次成功抓取）相对当前时间的滞后程度。用法：目前仅注册 metric，逻辑尚未实现，因此无数据属预期；未来实现后，将比 Panel 20 更直接地体现“业务数据本身有多旧”，用于监控数据新鲜度。",
"id": 21,
"datasource": { "type": "prometheus", "uid": "prometheus" },
"targets": [
{
"expr": "transceiver_data_staleness_seconds{job=~\"$job\", device=~\"$device\"}",
"legendFormat": "{{device}}",
"refId": "A"
}
],
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "horizontal",
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 46 }
}
],
"templating": {
"list": [
{
"name": "job",
"label": "Job",
"type": "query",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"query": "label_values(netconf_scrape_success, job)",
"refresh": 1,
"multi": true,
"includeAll": true,
"current": {
"text": "All",
"value": "$__all"
}
},
{
"name": "device",
"label": "Device",
"type": "query",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"query": "label_values(netconf_scrape_success{job=~\"$job\"}, device)",
"refresh": 2,
"multi": true,
"includeAll": true
},
{
"name": "port",
"label": "Port",
"type": "query",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"query": "label_values(transceiver_channel_info_info{job=~\"$job\",device=~\"$device\"}, port)",
"refresh": 2,
"multi": true,
"includeAll": true
},
{
"name": "component",
"label": "Component",
"type": "query",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"query": "label_values(transceiver_channel_info_info{job=~\"$job\",device=~\"$device\",port=~\"$port\"}, component_name)",
"refresh": 2,
"multi": true,
"includeAll": true
}
]
},
"time": {
"from": "now-1h",
"to": "now"
}
}

View File

@ -0,0 +1,12 @@
---
# Grafana dashboard provisioning: a single file-based provider.
apiVersion: 1

providers:
  - name: 'netconf-transceiver-exporter'
    type: file
    orgId: 1
    # Empty folder name, as in the original configuration.
    folder: ''
    editable: true
    disableDeletion: false
    options:
      # Directory inside the Grafana container that holds the dashboard JSON.
      path: /var/lib/grafana/dashboards

View File

@ -0,0 +1,11 @@
---
# Grafana datasource provisioning: one Prometheus datasource.
apiVersion: 1

datasources:
  # The uid is what dashboard panels reference in their "datasource" objects,
  # so it must stay "prometheus".
  - uid: prometheus
    name: Prometheus
    type: prometheus
    access: proxy
    url: 'http://prometheus:9090'
    isDefault: true
    editable: false

View File

@ -0,0 +1,13 @@
---
# Prometheus configuration for the deploy-test stack.
global:
  scrape_interval: 15s
  evaluation_interval: 30s

scrape_configs:
  - job_name: 'netconf_transceiver_exporter'
    # Same value as the global interval, stated explicitly for this job.
    scrape_interval: 15s
    static_configs:
      - labels:
          env: 'deploytest'
        targets:
          - 'host.docker.internal:19100'