diff --git a/deploytest/docker-compose.yml b/deploytest/docker-compose.yml
new file mode 100644
index 0000000..a3f7e0e
--- /dev/null
+++ b/deploytest/docker-compose.yml
@@ -0,0 +1,46 @@
+# Local monitoring stack for deploy testing: Prometheus scrapes the exporter
+# through the Docker host and Grafana is provisioned with the bundled dashboard.
+version: "3.8"
+
+networks:
+  monitor_net:
+    driver: bridge
+
+services:
+  prometheus:
+    # Set PROMETHEUS_TAG to pin a release; the default stays "latest".
+    image: prom/prometheus:${PROMETHEUS_TAG:-latest}
+    container_name: deploytest-prometheus
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      # Optional: uncomment to mount alerting rules.
+      # - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--web.enable-lifecycle"
+    ports:
+      - "19090:9090"
+    networks:
+      - monitor_net
+    extra_hosts:
+      # Linux compatibility: make host.docker.internal resolve to the Docker host.
+      - "host.docker.internal:host-gateway"
+
+  grafana:
+    image: grafana/grafana:${GRAFANA_TAG:-latest}
+    container_name: deploytest-grafana
+    depends_on:
+      - prometheus
+    ports:
+      - "13000:3000"
+    networks:
+      - monitor_net
+    volumes:
+      - ./grafana/provisioning:/etc/grafana/provisioning:ro
+      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      # Test-only default; set GRAFANA_ADMIN_PASSWORD to override it.
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
+      - GF_USERS_ALLOW_SIGN_UP=false
+
diff --git a/deploytest/grafana/dashboards/netconf-transceiver-exporter.json b/deploytest/grafana/dashboards/netconf-transceiver-exporter.json
new file mode 100644
index 0000000..4727ada
--- /dev/null
+++ b/deploytest/grafana/dashboards/netconf-transceiver-exporter.json
@@ -0,0 +1,450 @@
+{
+  "id": null,
+  "uid": "netconf-transceiver-exporter",
+  "title": "NETCONF Transceiver Exporter",
+  "tags": ["netconf", "transceiver", "exporter"],
+  "timezone": "browser",
+  "schemaVersion": 38,
+  "version": 1,
+  "refresh": "30s",
+  "panels": [
+    {
+      "type": "stat",
+      "title": "Exporter Scrape Success Ratio",
+      "description": "含义:当前选择的所有 $device 的 netconf_scrape_success 平均值,介于 0–1。用法:一眼看整体抓取健康度(≈1 表示基本都成功,<1 表示有失败);多选多个设备时是平均值,可快速知道整体是否在抖。",
+      "id": 1,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "avg(netconf_scrape_success{job=~\"$job\", device=~\"$device\"})",
+          "legendFormat": "success_ratio",
+          "refId": "A"
+        }
+      ],
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "orientation": "horizontal",
+        "textMode": "auto"
+      },
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Exporter Build Info (TODO)",
+      "description": "含义:预留的 exporter_build_info{version,python_version} 指标,可查看 exporter 版本与 Python 运行环境。用法:目前 exporter 尚未实现该指标,显示 No data 属正常;未来实现后,可以用来确认线上跑的是哪个版本/解释器,排查“是不是在用最新代码”。",
+      "id": 9,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "exporter_build_info{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{version}} ({{python_version}})",
+          "refId": "A"
+        }
+      ],
+      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }
+    },
+    {
+      "type": "stat",
+      "title": "Exporter Devices Total (TODO)",
+      "description": "含义:预留的 exporter_devices_total,表示 exporter 配置或注册的设备总数。用法:当前无数据属预期;未来实现后,可对比 total 与 enabled(Panel 11),例如 total=10 但 enabled=8,用来找出被禁用或注册失败的设备。",
+      "id": 10,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "exporter_devices_total{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{device}} total",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "orientation": "horizontal",
+        "textMode": "auto"
+      },
+      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }
+    },
+    {
+      "type": "stat",
+      "title": "Exporter Devices Enabled (TODO)",
+      "description": "含义:预留的 exporter_devices_enabled,表示实际启用抓取的设备数量。用法:当前无数据属预期;未来实现后,可快速判断是否有设备被标记为 disabled 或注册失败,并与 Panel 10 对比排查配置问题。",
+      "id": 11,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "exporter_devices_enabled{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{device}} enabled",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "orientation": "horizontal",
+        "textMode": "auto"
+      },
+      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Scrape Success by Device",
+      "description": "含义:每个设备的 netconf_scrape_success 随时间的 0/1 曲线(按 $device 过滤)。用法:查看“哪个设备最近在频繁失败”:曲线掉到 0 的时间段就是失败窗口;多设备同时查看时,每条线代表一个 device,可以直观看出谁最不稳定。",
+      "id": 2,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "netconf_scrape_success{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{device}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "bool" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 9, "x": 6, "y": 4 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Scrape Duration per Device",
+      "description": "含义:每个设备的 netconf_scrape_duration_seconds,单位秒。用法:判断哪台设备抓取最慢(曲线越高,抓取耗时越长,可能 RPC 多或设备性能差);观察趋势变化,例如某设备 duration 从 2s 突然升到 10s,提示设备压力或链路问题。",
+      "id": 3,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "netconf_scrape_duration_seconds{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{device}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "s" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 9, "x": 15, "y": 4 }
+    },
+    {
+      "type": "table",
+      "title": "Scrape Errors by Device & Type (5m)",
+      "description": "含义:最近 5 分钟内 netconf_scrape_errors_total 的增长量,按 (device,error_type) 聚合。用法:出现异常时优先查看 error_type(如 AuthError、RpcTimeout、UnknownError 等);配合 Panel 2/3 使用,当某设备 success 掉、duration 拉长时,看对应的 error_type 是什么以定位问题。",
+      "id": 4,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(netconf_scrape_errors_total{job=~\"$job\", device=~\"$device\"}[5m])) by (device, error_type)",
+          "format": "table",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": { "defaults": {}, "overrides": [] },
+      "gridPos": { "h": 8, "w": 6, "x": 0, "y": 4 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Transceiver Supply Voltage",
+      "description": "含义:transceiver_supply_voltage_volts,按 {device,port,component_name} 展示各光模块电压。用法:$port 选 All 时可对比不同端口/模块电压是否在合理区间;当某个 transceiver 电压明显过低或过高时,通过 legend 中的 device/port/component_name 精确定位异常模块。",
+      "id": 12,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "transceiver_supply_voltage_volts{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
+          "legendFormat": "{{device}}/{{port}}/{{component_name}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "volt" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Transceiver Temperature (with vendor/serial)",
+      "description": "含义:transceiver_temperature_celsius 与 transceiver_info_info join 后的温度曲线,legend 中包含 vendor 与 serial 等元数据。用法:在同一设备下对比多个模块温度,快速发现温度显著偏高的模块;带 serial 的 legend 方便截图或记录,用于资产与维护(如“序列号为 X 的模块长期偏高”)。",
+      "id": 13,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "transceiver_temperature_celsius{job=~\"$job\", device=~\"$device\", port=~\"$port\"} * on(job, device, port, component_name) group_left(vendor, serial, part_number, hardware_rev) transceiver_info_info{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{device}}/{{port}} {{vendor}}({{serial}})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "celsius" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }
+    },
+    {
+      "type": "bargauge",
+      "title": "Transceiver Present & Oper Status",
+      "description": "含义:transceiver_present 表示模块是否在位 (1=在位,0=不在位),同时预留展示 transceiver_oper_status(运行状态,例如 up/down),当前无该数据属预期。用法:快速确认某设备端口上光模块是否插着(present=1);未来实现 oper_status 后,可在同一面板中发现“模块在位但状态为 down”这类异常。",
+      "id": 14,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "transceiver_present{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
+          "legendFormat": "{{device}}/{{port}}/{{component_name}}",
+          "refId": "A"
+        },
+        {
+          "expr": "transceiver_oper_status{job=~\"$job\", device=~\"$device\", port=~\"$port\"}",
+          "legendFormat": "oper: {{device}}/{{port}}/{{component_name}}",
+          "refId": "B"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "bool" },
+        "overrides": []
+      },
+      "gridPos": { "h": 6, "w": 24, "x": 0, "y": 20 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Channel Tx Power (dBm)",
+      "description": "含义:transceiver_channel_tx_power_dbm,按 channel 维度展示发射光功率。用法:$device + $port 指定后,$component 可选单个或多个模块,查看各通道的 TX 功率;用于对比多通道是否功率均衡,以及定位某个通道发射功率突然下降的情况。",
+      "id": 15,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "transceiver_channel_tx_power_dbm{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
+          "legendFormat": "{{component_name}}:{{channel}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "dBm" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Channel Rx Power (dBm)",
+      "description": "含义:transceiver_channel_rx_power_dbm,按 channel 展示接收光功率。用法:配合 TX 面板一起查看链路健康度(TX 正常但 RX 很低可能是远端/链路问题);多 device、多 port 对比场景下,可快速识别接收功率异常低的通道。",
+      "id": 16,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "transceiver_channel_rx_power_dbm{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
+          "legendFormat": "{{component_name}}:{{channel}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "dBm" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Channel Bias Current (mA)",
+      "description": "含义:transceiver_channel_bias_current_ma,每个 channel 对应激光器偏置电流。用法:检查不同通道的电流是否在合理范围内,识别某个通道电流特别高的情况(可能模块老化);结合历史趋势观察电流缓慢爬升,用于预判光模块老化风险。",
+      "id": 17,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "transceiver_channel_bias_current_ma{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
+          "legendFormat": "{{component_name}}:{{channel}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "mA" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
+    },
+    {
+      "type": "timeseries",
+      "title": "Channel Laser Temperature (C, device-dependent)",
+      "description": "含义:transceiver_channel_laser_temperature_celsius,按通道展示激光器温度。用法:当前 H3C 设备通常不返回该字段,因此本面板往往无数据,这是设备特性而非 bug;若未来设备或其他厂商实现该字段,本面板可用于更细粒度地观察每个通道的激光温度。",
+      "id": 18,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "transceiver_channel_laser_temperature_celsius{job=~\"$job\", device=~\"$device\", port=~\"$port\", component_name=~\"$component\"}",
+          "legendFormat": "{{component_name}}:{{channel}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "celsius" },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
+    },
+    {
+      "type": "table",
+      "title": "Channel Info (Instant)",
+      "description": "含义:transceiver_channel_info_info 与 transceiver_info_info join 后形成的静态资产表,每行对应一个 channel,并附带 vendor/serial/part_number/hardware_rev 等上游光模块信息。用法:使用 Instant 查询和表格展示,仅保留 label 列;作为“光模块资产清单”,可按 port、component_name、channel_index 排序,清晰查看每个端口有哪些模块及其通道数量,并通过 vendor/serial 等字段核对现场资产或反查某个 channel 属于哪个物理模块。",
+      "id": 19,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "transceiver_channel_info_info{job=~\"$job\", device=~\"$device\"} * on(job, device, port, component_name) group_left(vendor, serial, part_number, hardware_rev) transceiver_info_info{job=~\"$job\", device=~\"$device\"}",
+          "format": "table",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {},
+        "overrides": [
+          {
+            "matcher": { "id": "byName", "options": "Time" },
+            "properties": [
+              { "id": "custom.hidden", "value": true }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value" },
+            "properties": [
+              { "id": "custom.hidden", "value": true }
+            ]
+          }
+        ]
+      },
+      "options": {
+        "showHeader": true
+      },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 42 }
+    },
+    {
+      "type": "stat",
+      "title": "Time Since Last Scrape (s)",
+      "description": "含义:time() - netconf_last_scrape_timestamp_seconds,表示每个设备距离最近一次 NETCONF 抓取已过去的秒数。用法:如果数值一直在几十秒以内,说明抓取在持续正常执行;若某设备数值突然飙高且长期不回落,说明它很久没有被成功抓取,需检查连接、认证或设备状态。",
+      "id": 20,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "targets": [
+        {
+          "expr": "time() - netconf_last_scrape_timestamp_seconds{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{device}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "s" },
+        "overrides": []
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "orientation": "horizontal",
+        "textMode": "auto"
+      },
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 52 }
+    },
+    {
+      "type": "stat",
+      "title": "Transceiver Data Staleness (TODO)",
+      "description": "含义:设计中的 transceiver_data_staleness_seconds,计划表示 Transceiver 业务数据(最近一次成功抓取)相对当前时间的滞后程度。用法:目前仅注册 metric,逻辑尚未实现,因此无数据属预期;未来实现后,将比 Panel 20 更直接地体现“业务数据本身有多旧”,用于监控数据新鲜度。",
+      "id": 21,
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "transceiver_data_staleness_seconds{job=~\"$job\", device=~\"$device\"}",
+          "legendFormat": "{{device}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "orientation": "horizontal",
+        "textMode": "auto"
+      },
+      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 52 }
+    }
+  ],
+  "templating": {
+    "list": [
+      {
+        "name": "job",
+        "label": "Job",
+        "type": "query",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "query": "label_values(netconf_scrape_success, job)",
+        "refresh": 1,
+        "multi": true,
+        "includeAll": true,
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        }
+      },
+      {
+        "name": "device",
+        "label": "Device",
+        "type": "query",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "query": "label_values(netconf_scrape_success{job=~\"$job\"}, device)",
+        "refresh": 2,
+        "multi": true,
+        "includeAll": true
+      },
+      {
+        "name": "port",
+        "label": "Port",
+        "type": "query",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "query": "label_values(transceiver_channel_info_info{job=~\"$job\",device=~\"$device\"}, port)",
+        "refresh": 2,
+        "multi": true,
+        "includeAll": true
+      },
+      {
+        "name": "component",
+        "label": "Component",
+        "type": "query",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "query": "label_values(transceiver_channel_info_info{job=~\"$job\",device=~\"$device\",port=~\"$port\"}, component_name)",
+        "refresh": 2,
+        "multi": true,
+        "includeAll": true
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  }
+}
diff --git a/deploytest/grafana/provisioning/dashboards/dashboards.yml b/deploytest/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..9671ff3
--- /dev/null
+++ b/deploytest/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,13 @@
+# Loads dashboard JSON files from the directory mounted by docker-compose.yml.
+apiVersion: 1
+
+providers:
+  - name: "netconf-transceiver-exporter"
+    orgId: 1
+    folder: ""
+    type: file
+    disableDeletion: false
+    editable: true
+    options:
+      path: /var/lib/grafana/dashboards
+
diff --git a/deploytest/grafana/provisioning/datasources/datasource.yml b/deploytest/grafana/provisioning/datasources/datasource.yml
new file mode 100644
index 0000000..b4ec358
--- /dev/null
+++ b/deploytest/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,13 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    # Must match the datasource uid referenced by the dashboard panels.
+    uid: prometheus
+    type: prometheus
+    access: proxy
+    # Compose service name and container port of the prometheus service.
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+
diff --git a/deploytest/prometheus/prometheus.yml b/deploytest/prometheus/prometheus.yml
new file mode 100644
index 0000000..51d8710
--- /dev/null
+++ b/deploytest/prometheus/prometheus.yml
@@ -0,0 +1,14 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 30s
+
+scrape_configs:
+  - job_name: "netconf_transceiver_exporter"
+    scrape_interval: 15s
+    static_configs:
+      - targets:
+          # Reached through the Docker host (see extra_hosts in docker-compose.yml).
+          - "host.docker.internal:19100"
+        labels:
+          env: "deploytest"
+