{
  "id": null,
  "uid": "netconf-transceiver-exporter",
  "title": "NETCONF Transceiver Exporter",
  "tags": ["netconf", "transceiver", "exporter"],
  "timezone": "browser",
  "schemaVersion": 38,
  "version": 1,
  "refresh": "30s",
  "panels": [
    {
      "type": "stat",
      "title": "Exporter Scrape Success Ratio",
      "description": "含义:当前选择的所有 $device 的 netconf_scrape_success 平均值,介于 0–1。用法:一眼看整体抓取健康度(≈1 表示基本都成功,<1 表示有失败);多选多个设备时是平均值,可快速知道整体是否在抖。",
      "id": 1,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "avg(netconf_scrape_success{job=~\"$job\", device=~\"$device\"})",
          "legendFormat": "success_ratio",
          "refId": "A"
        }
      ],
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "orientation": "horizontal",
        "textMode": "auto"
      },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }
    },
    {
      "type": "timeseries",
      "title": "Exporter Build Info (TODO)",
      "description": "含义:预留的 exporter_build_info{version,python_version} 指标,可查看 exporter 版本与 Python 运行环境。用法:目前 exporter 尚未实现该指标,显示 No data 属正常;未来实现后,可以用来确认线上跑的是哪个版本/解释器,排查“是不是在用最新代码”。",
      "id": 9,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "exporter_build_info{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{version}} ({{python_version}})",
          "refId": "A"
        }
      ],
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }
    },
    {
      "type": "stat",
      "title": "Exporter Devices Total (TODO)",
      "description": "含义:预留的 exporter_devices_total,表示 exporter 配置或注册的设备总数。用法:当前无数据属预期;未来实现后,可对比 total 与 enabled(Panel 11),例如 total=10 但 enabled=8,用来找出被禁用或注册失败的设备。",
      "id": 10,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "exporter_devices_total{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{device}} total",
          "refId": "A"
        }
      ],
      "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "orientation": "horizontal",
        "textMode": "auto"
      },
      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }
    },
    {
      "type": "stat",
      "title": "Exporter Devices Enabled (TODO)",
      "description": "含义:预留的 exporter_devices_enabled,表示实际启用抓取的设备数量。用法:当前无数据属预期;未来实现后,可快速判断是否有设备被标记为 disabled 或注册失败,并与 Panel 10 对比排查配置问题。",
      "id": 11,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "exporter_devices_enabled{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{device}} enabled",
          "refId": "A"
        }
      ],
      "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "orientation": "horizontal",
        "textMode": "auto"
      },
      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }
    },
    {
      "type": "timeseries",
      "title": "Scrape Success by Device",
      "description": "含义:每个设备的 netconf_scrape_success 随时间的 0/1 曲线(按 $device 过滤)。用法:查看“哪个设备最近在频繁失败”:曲线掉到 0 的时间段就是失败窗口;多设备同时查看时,每条线代表一个 device,可以直观看出谁最不稳定。",
      "id": 2,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "netconf_scrape_success{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{device}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "bool" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }
    },
    {
      "type": "timeseries",
      "title": "Scrape Duration per Device",
      "description": "含义:每个设备的 netconf_scrape_duration_seconds,单位秒。用法:判断哪台设备抓取最慢(曲线越高,抓取耗时越长,可能 RPC 多或设备性能差);观察趋势变化,例如某设备 duration 从 2s 突然升到 10s,提示设备压力或链路问题。",
      "id": 3,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "netconf_scrape_duration_seconds{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{device}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "s" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }
    },
    {
      "type": "table",
      "title": "Scrape Errors by Device & Type (5m)",
      "description": "含义:最近 5 分钟内 netconf_scrape_errors_total 的增长量,按 (device,error_type) 聚合。用法:出现异常时优先查看 error_type(如 AuthError、RpcTimeout、UnknownError 等);配合 Panel 2/3 使用,当某设备 success 掉、duration 拉长时,看对应的 error_type 是什么以定位问题。",
      "id": 4,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "sum(increase(netconf_scrape_errors_total{job=~\"$job\", device=~\"$device\"}[5m])) by (device, error_type)",
          "format": "table",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": { "defaults": {}, "overrides": [] },
      "gridPos": { "h": 6, "w": 12, "x": 0, "y": 12 }
    },
    {
      "type": "timeseries",
      "title": "Transceiver Supply Voltage",
      "description": "含义:transceiver_supply_voltage_volts,按 {device,port,component_name} 展示各光模块电压。用法:$port 选 All 时可对比不同端口/模块电压是否在合理区间;当某个 transceiver 电压明显过低或过高时,通过 legend 中的 device/port/component_name 精确定位异常模块。",
      "id": 12,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "transceiver_supply_voltage_volts{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
          "legendFormat": "{{device}}/{{port}}/{{component_name}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "volt" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
    },
    {
      "type": "timeseries",
      "title": "Transceiver Temperature (with vendor/serial)",
      "description": "含义:transceiver_temperature_celsius 与 transceiver_info_info join 后的温度曲线,legend 中包含 vendor 与 serial 等元数据。用法:在同一设备下对比多个模块温度,快速发现温度显著偏高的模块;带 serial 的 legend 方便截图或记录,用于资产与维护(如“序列号为 X 的模块长期偏高”)。",
      "id": 13,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "transceiver_temperature_celsius{job=~\"$job\", device=~\"$device\", port=~\"$port\"} * on(job, device, port, component_name) group_left(vendor, serial, part_number, hardware_rev) transceiver_info_info{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{device}}/{{port}} {{vendor}}({{serial}})",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "celsius" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
    },
    {
      "type": "bargauge",
      "title": "Transceiver Present & Oper Status",
      "description": "含义:transceiver_present 表示模块是否在位 (1=在位,0=不在位),同时预留展示 transceiver_oper_status(运行状态,例如 up/down),当前无该数据属预期。用法:快速确认某设备端口上光模块是否插着(present=1);未来实现 oper_status 后,可在同一面板中发现“模块在位但状态为 down”这类异常。",
      "id": 14,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "transceiver_present{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
          "legendFormat": "{{device}}/{{port}}/{{component_name}}",
          "refId": "A"
        },
        {
          "expr": "transceiver_oper_status{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
          "legendFormat": "oper: {{device}}/{{port}}/{{component_name}}",
          "refId": "B"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "bool" },
        "overrides": []
      },
      "gridPos": { "h": 6, "w": 12, "x": 0, "y": 26 }
    },
    {
      "type": "timeseries",
      "title": "Channel Tx Power (dBm)",
      "description": "含义:transceiver_channel_tx_power_dbm,按 channel 维度展示发射光功率。用法:$device + $port 指定后,$component 可选单个或多个模块,查看各通道的 TX 功率;用于对比多通道是否功率均衡,以及定位某个通道发射功率突然下降的情况。",
      "id": 15,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "transceiver_channel_tx_power_dbm{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
          "legendFormat": "{{component_name}}:{{channel}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "dBm" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }
    },
    {
      "type": "timeseries",
      "title": "Channel Rx Power (dBm)",
      "description": "含义:transceiver_channel_rx_power_dbm,按 channel 展示接收光功率。用法:配合 TX 面板一起查看链路健康度(TX 正常但 RX 很低可能是远端/链路问题);多 device、多 port 对比场景下,可快速识别接收功率异常低的通道。",
      "id": 16,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "transceiver_channel_rx_power_dbm{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
          "legendFormat": "{{component_name}}:{{channel}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "dBm" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }
    },
    {
      "type": "timeseries",
      "title": "Channel Bias Current (mA)",
      "description": "含义:transceiver_channel_bias_current_ma,每个 channel 对应激光器偏置电流。用法:检查不同通道的电流是否在合理范围内,识别某个通道电流特别高的情况(可能模块老化);结合历史趋势观察电流缓慢爬升,用于预判光模块老化风险。",
      "id": 17,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "transceiver_channel_bias_current_ma{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
          "legendFormat": "{{component_name}}:{{channel}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "mA" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
    },
    {
      "type": "timeseries",
      "title": "Channel Laser Temperature (C, device-dependent)",
      "description": "含义:transceiver_channel_laser_temperature_celsius,按通道展示激光器温度。用法:当前 H3C 设备通常不返回该字段,因此本面板往往无数据,这是设备特性而非 bug;若未来设备或其他厂商实现该字段,本面板可用于更细粒度地观察每个通道的激光温度。",
      "id": 18,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "transceiver_channel_laser_temperature_celsius{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
          "legendFormat": "{{component_name}}:{{channel}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "celsius" },
        "overrides": []
      },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
    },
    {
      "type": "table",
      "title": "Channel Info (Instant)",
      "description": "含义:transceiver_channel_info_info 与 transceiver_info_info join 后形成的静态资产表,每行对应一个 channel,并附带 vendor/serial/part_number/hardware_rev 等上游光模块信息。用法:使用 Instant 查询和表格展示,仅保留 label 列;作为“光模块资产清单”,可按 port、component_name、channel_index 排序,清晰查看每个端口有哪些模块及其通道数量,并通过 vendor/serial 等字段核对现场资产或反查某个 channel 属于哪个物理模块。",
      "id": 19,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "transceiver_channel_info_info{job=~\"$job\", device=~\"$device\"} * on(job, device, port, component_name) group_left(vendor, serial, part_number, hardware_rev) transceiver_info_info{job=~\"$job\", device=~\"$device\"}",
          "format": "table",
          "instant": true,
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {},
        "overrides": [
          {
            "matcher": { "id": "byName", "options": "Time" },
            "properties": [
              { "id": "custom.hidden", "value": true }
            ]
          },
          {
            "matcher": { "id": "byName", "options": "Value" },
            "properties": [
              { "id": "custom.hidden", "value": true }
            ]
          }
        ]
      },
      "options": {
        "showHeader": true
      },
      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 48 }
    },
    {
      "type": "stat",
      "title": "Time Since Last Scrape (s)",
      "description": "含义:time() - netconf_last_scrape_timestamp_seconds,表示每个设备距离最近一次 NETCONF 抓取已过去的秒数。用法:如果数值一直在几十秒以内,说明抓取在持续正常执行;若某设备数值突然飙高且长期不回落,说明它很久没有被成功抓取,需检查连接、认证或设备状态。",
      "id": 20,
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "targets": [
        {
          "expr": "time() - netconf_last_scrape_timestamp_seconds{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{device}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": { "unit": "s" },
        "overrides": []
      },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "orientation": "horizontal",
        "textMode": "auto"
      },
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 58 }
    },
    {
      "type": "stat",
      "title": "Transceiver Data Staleness (TODO)",
      "description": "含义:设计中的 transceiver_data_staleness_seconds,计划表示 Transceiver 业务数据(最近一次成功抓取)相对当前时间的滞后程度。用法:目前仅注册 metric,逻辑尚未实现,因此无数据属预期;未来实现后,将比 Panel 20 更直接地体现“业务数据本身有多旧”,用于监控数据新鲜度。",
      "id": 21,
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "expr": "transceiver_data_staleness_seconds{job=~\"$job\", device=~\"$device\"}",
          "legendFormat": "{{device}}",
          "refId": "A"
        }
      ],
      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "orientation": "horizontal",
        "textMode": "auto"
      },
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 58 }
    }
  ],
  "templating": {
    "list": [
      {
        "name": "job",
        "label": "Job",
        "type": "query",
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "query": "label_values(netconf_scrape_success, job)",
        "refresh": 1,
        "multi": true,
        "includeAll": true,
        "current": {
          "text": "All",
          "value": "$__all"
        }
      },
      {
        "name": "device",
        "label": "Device",
        "type": "query",
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "query": "label_values(netconf_scrape_success{job=~\"$job\"}, device)",
        "refresh": 2,
        "multi": true,
        "includeAll": true
      },
      {
        "name": "port",
        "label": "Port",
        "type": "query",
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "query": "label_values(transceiver_channel_info_info{job=~\"$job\",device=~\"$device\"}, port)",
        "refresh": 2,
        "multi": true,
        "includeAll": true
      },
      {
        "name": "component",
        "label": "Component",
        "type": "query",
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "query": "label_values(transceiver_channel_info_info{job=~\"$job\",device=~\"$device\",port=~\"$port\"}, component_name)",
        "refresh": 2,
        "multi": true,
        "includeAll": true
      }
    ]
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  }
}