parent
8e2d751c2c
commit
46c34f3de6
40
src/metric/prometheus/Dockerfile → src/metric/prometheus/build/Dockerfile
Normal file → Executable file
40
src/metric/prometheus/Dockerfile → src/metric/prometheus/build/Dockerfile
Normal file → Executable file
@ -11,13 +11,32 @@ RUN if [ "$USE_INTRANET" = "true" ]; then \
|
|||||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||||
|
else \
|
||||||
|
echo "Configuring fast apt sources for external network..." && \
|
||||||
|
# 查找并替换sources.list文件
|
||||||
|
find /etc/apt -name "sources.list*" -exec sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' {} \; && \
|
||||||
|
find /etc/apt -name "sources.list*" -exec sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' {} \; && \
|
||||||
|
# 使用阿里云源
|
||||||
|
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \
|
||||||
|
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||||
|
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 常用工具
|
# 验证源配置并安装常用工具
|
||||||
RUN apt-get update && \
|
RUN echo "=== Current apt sources ===" && \
|
||||||
apt-get install -y supervisor net-tools inetutils-ping vim && \
|
cat /etc/apt/sources.list && \
|
||||||
|
echo "=== Updating package list ===" && \
|
||||||
|
apt-get update && \
|
||||||
|
echo "=== Installing packages ===" && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
supervisor \
|
||||||
|
net-tools \
|
||||||
|
inetutils-ping \
|
||||||
|
vim \
|
||||||
|
python3 \
|
||||||
|
python3-pip && \
|
||||||
apt-get clean && \
|
apt-get clean && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
# 如果是部署环境替换 apt 源
|
# 如果是部署环境替换 apt 源
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||||
@ -57,9 +76,22 @@ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
|||||||
COPY start-prometheus-supervised.sh /usr/local/bin/start-prometheus-supervised.sh
|
COPY start-prometheus-supervised.sh /usr/local/bin/start-prometheus-supervised.sh
|
||||||
RUN chmod +x /usr/local/bin/start-prometheus-supervised.sh
|
RUN chmod +x /usr/local/bin/start-prometheus-supervised.sh
|
||||||
|
|
||||||
|
# targets 更新脚本
|
||||||
|
COPY start-targets-updater.sh /usr/local/bin/start-targets-updater.sh
|
||||||
|
RUN chmod +x /usr/local/bin/start-targets-updater.sh
|
||||||
|
|
||||||
|
# targets 更新 Python 脚本
|
||||||
|
COPY update_targets.py /usr/local/bin/update_targets.py
|
||||||
|
RUN chmod +x /usr/local/bin/update_targets.py
|
||||||
|
|
||||||
|
# exporter 配置文件
|
||||||
|
COPY exporter_config.json ${PROMETHEUS_BASE_PATH}/exporter_config.json
|
||||||
|
|
||||||
# 自定义 prometheus 配置文件
|
# 自定义 prometheus 配置文件
|
||||||
COPY prometheus.yml /etc/prometheus/prometheus.yml
|
COPY prometheus.yml /etc/prometheus/prometheus.yml
|
||||||
|
|
||||||
|
RUN chown nobody:nogroup ${PROMETHEUS_BASE_PATH}/exporter_config.json /etc/prometheus/prometheus.yml
|
||||||
|
|
||||||
USER root
|
USER root
|
||||||
|
|
||||||
EXPOSE 9090
|
EXPOSE 9090
|
96
src/metric/prometheus/README.md → src/metric/prometheus/build/README.md
Normal file → Executable file
96
src/metric/prometheus/README.md → src/metric/prometheus/build/README.md
Normal file → Executable file
@ -13,60 +13,6 @@
|
|||||||
- 规则文件路径: `${PROMETHEUS_BASE_PATH}/rules/*.yml`
|
- 规则文件路径: `${PROMETHEUS_BASE_PATH}/rules/*.yml`
|
||||||
- 监控目标文件路径: `${PROMETHEUS_BASE_PATH}/targets/`
|
- 监控目标文件路径: `${PROMETHEUS_BASE_PATH}/targets/`
|
||||||
|
|
||||||
## 使用示例
|
|
||||||
|
|
||||||
### 1. 使用默认路径
|
|
||||||
```bash
|
|
||||||
docker run -d \
|
|
||||||
--name prometheus \
|
|
||||||
-p 9090:9090 \
|
|
||||||
-v /host/prometheus/data:/private/argus/metric/prometheus \
|
|
||||||
prometheus:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. 自定义基础路径
|
|
||||||
```bash
|
|
||||||
docker run -d \
|
|
||||||
--name prometheus \
|
|
||||||
-p 9090:9090 \
|
|
||||||
-e PROMETHEUS_BASE_PATH=/custom/prometheus/path \
|
|
||||||
-v /host/prometheus/data:/custom/prometheus/path \
|
|
||||||
prometheus:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Kubernetes 部署示例
|
|
||||||
```yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: prometheus
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: prometheus
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: prometheus
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: prometheus
|
|
||||||
image: prometheus:latest
|
|
||||||
env:
|
|
||||||
- name: PROMETHEUS_BASE_PATH
|
|
||||||
value: "/data/prometheus"
|
|
||||||
ports:
|
|
||||||
- containerPort: 9090
|
|
||||||
volumeMounts:
|
|
||||||
- name: prometheus-data
|
|
||||||
mountPath: /data/prometheus
|
|
||||||
volumes:
|
|
||||||
- name: prometheus-data
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: prometheus-pvc
|
|
||||||
```
|
|
||||||
|
|
||||||
## 目录结构
|
## 目录结构
|
||||||
|
|
||||||
容器启动后会在 `${PROMETHEUS_BASE_PATH}` 下创建以下目录结构:
|
容器启动后会在 `${PROMETHEUS_BASE_PATH}` 下创建以下目录结构:
|
||||||
@ -118,9 +64,51 @@ chown -R 2133:2015 /path/to/prometheus/data
|
|||||||
chmod -R 755 /path/to/prometheus/data
|
chmod -R 755 /path/to/prometheus/data
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 动态 Targets 配置
|
||||||
|
|
||||||
|
### 配置流程
|
||||||
|
|
||||||
|
1. **节点资源清单**: `nodes.json` 包含所有监控节点的基本信息
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"node_id": "A1",
|
||||||
|
"user_id": "user01",
|
||||||
|
"ip": "1.2.3.4",
|
||||||
|
"hostname": "dev-node-1",
|
||||||
|
"labels": ["production", "us-west-1"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Exporter 配置**: `exporter_config.json` 定义各类型 exporter 的端口和标签模板
|
||||||
|
- 支持 dcgm (GPU监控) 和 node (系统监控) 两种类型
|
||||||
|
- 配置端口映射和标签模板规则
|
||||||
|
|
||||||
|
3. **自动拆分生成**: `update_targets.py` 脚本根据节点清单自动生成对应的 targets 文件
|
||||||
|
- 读取 `nodes.json` 获取节点信息
|
||||||
|
- 按 exporter 类型拆分生成 `targets/*_exporter.json`
|
||||||
|
- 应用标签模板,生成完整的监控目标配置
|
||||||
|
|
||||||
|
4. **热加载机制**:
|
||||||
|
- 脚本支持守护进程模式,定期检查 `nodes.json` 变化
|
||||||
|
- 文件内容变化时自动重新生成 targets 配置
|
||||||
|
- Prometheus 自动发现并重新加载新的监控目标
|
||||||
|
|
||||||
|
### 使用方式
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 单次更新(仅用于测试;请注意用户权限,此方式生成的文件属主为 root)
|
||||||
|
python3 update_targets.py --config nodes.json --targets-dir targets/
|
||||||
|
|
||||||
|
# 守护进程模式(该进程由 supervisor 托管)
|
||||||
|
python3 update_targets.py --daemon --check-interval 30
|
||||||
|
```
|
||||||
|
|
||||||
## 注意事项
|
## 注意事项
|
||||||
|
|
||||||
1. 确保挂载的目录有适当的读写权限
|
1. 确保挂载的目录有适当的读写权限
|
||||||
2. 配置文件会在容器启动时自动生成,无需手动创建
|
2. 配置文件会在容器启动时自动生成,无需手动创建
|
||||||
3. 可以通过修改环境变量 `PROMETHEUS_BASE_PATH` 来改变所有相关路径,无需重新构建镜像
|
3. 可以通过修改环境变量 `PROMETHEUS_BASE_PATH` 来改变所有相关路径,无需重新构建镜像
|
||||||
4. 自定义路径的目录会在启动时自动创建并设置权限
|
4. 自定义路径的目录会在启动时自动创建并设置权限
|
||||||
|
5. `nodes.json` 文件变化后,targets 配置会自动更新,无需手动干预
|
41
src/metric/prometheus/build/exporter_config.json
Executable file
41
src/metric/prometheus/build/exporter_config.json
Executable file
@ -0,0 +1,41 @@
|
|||||||
|
{
|
||||||
|
"exporters": {
|
||||||
|
"dcgm": {
|
||||||
|
"port": 9400,
|
||||||
|
"job_name": "dcgm",
|
||||||
|
"instance_prefix": "dcgm-exporter",
|
||||||
|
"description": "DCGM GPU 监控 exporter"
|
||||||
|
},
|
||||||
|
"node": {
|
||||||
|
"port": 9100,
|
||||||
|
"job_name": "node",
|
||||||
|
"instance_prefix": "node-exporter",
|
||||||
|
"description": "Node 系统监控 exporter"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"label_templates": {
|
||||||
|
"dcgm": {
|
||||||
|
"job": "dcgm",
|
||||||
|
"instance": "dcgm-exporter-{node_id}",
|
||||||
|
"node_id": "{node_id}",
|
||||||
|
"ip": "{ip}",
|
||||||
|
"hostname": "{hostname}",
|
||||||
|
"user_id": "{user_id}",
|
||||||
|
"tag": "{tag}"
|
||||||
|
},
|
||||||
|
"node": {
|
||||||
|
"job": "node",
|
||||||
|
"instance": "node-exporter-{node_id}",
|
||||||
|
"node_id": "{node_id}",
|
||||||
|
"ip": "{ip}",
|
||||||
|
"hostname": "{hostname}",
|
||||||
|
"user_id": "{user_id}",
|
||||||
|
"tag": "{tag}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"settings": {
|
||||||
|
"backup_retention_days": 7,
|
||||||
|
"log_retention_days": 30,
|
||||||
|
"refresh_interval": "30s"
|
||||||
|
}
|
||||||
|
}
|
0
src/metric/prometheus/prometheus.yml → src/metric/prometheus/build/prometheus.yml
Normal file → Executable file
0
src/metric/prometheus/prometheus.yml → src/metric/prometheus/build/prometheus.yml
Normal file → Executable file
0
src/metric/prometheus/start-prometheus-supervised.sh → src/metric/prometheus/build/start-prometheus-supervised.sh
Normal file → Executable file
0
src/metric/prometheus/start-prometheus-supervised.sh → src/metric/prometheus/build/start-prometheus-supervised.sh
Normal file → Executable file
29
src/metric/prometheus/build/start-targets-updater.sh
Executable file
29
src/metric/prometheus/build/start-targets-updater.sh
Executable file
@ -0,0 +1,29 @@
|
|||||||
|
#!/bin/bash
# Supervisor entry point for the Prometheus targets updater.
# Resolves the runtime paths (each overridable through the environment, except
# the targets directory, which always lives under PROMETHEUS_BASE_PATH), prints
# the effective settings, then replaces this shell with update_targets.py
# running in daemon mode.
set -euo pipefail

# Emit one informational line in the same "[INFO] ..." format used throughout.
log_info() {
    echo "[INFO] $*"
}

log_info "Starting Prometheus Targets Updater under supervisor..."

# Settings: keep a value already present in the environment, otherwise fall
# back to the default derived from the base path.
: "${PROMETHEUS_BASE_PATH:=/private/argus/metric/prometheus}"
: "${NODES_CONFIG_FILE:=${PROMETHEUS_BASE_PATH}/nodes.json}"
TARGETS_DIR="${PROMETHEUS_BASE_PATH}/targets"
: "${EXPORTER_CONFIG_FILE:=${PROMETHEUS_BASE_PATH}/exporter_config.json}"
: "${CHECK_INTERVAL:=30}"
: "${LOG_LEVEL:=INFO}"

log_info "Prometheus base path: ${PROMETHEUS_BASE_PATH}"
log_info "Nodes config file: ${NODES_CONFIG_FILE}"
log_info "Targets directory: ${TARGETS_DIR}"
log_info "Exporter config file: ${EXPORTER_CONFIG_FILE}"
log_info "Check interval: ${CHECK_INTERVAL}s"
log_info "Log level: ${LOG_LEVEL}"

# The updater writes its output (and its log file) into this directory.
mkdir -p "${TARGETS_DIR}"

# exec so that supervisor signals reach the Python process directly.
updater_args=(
    --config "${NODES_CONFIG_FILE}"
    --targets-dir "${TARGETS_DIR}"
    --exporter-config "${EXPORTER_CONFIG_FILE}"
    --log-level "${LOG_LEVEL}"
    --daemon
    --check-interval "${CHECK_INTERVAL}"
)
exec python3 /usr/local/bin/update_targets.py "${updater_args[@]}"
|
12
src/metric/prometheus/supervisord.conf → src/metric/prometheus/build/supervisord.conf
Normal file → Executable file
12
src/metric/prometheus/supervisord.conf → src/metric/prometheus/build/supervisord.conf
Normal file → Executable file
@ -16,6 +16,18 @@ stopwaitsecs=30
|
|||||||
killasgroup=true
|
killasgroup=true
|
||||||
stopasgroup=true
|
stopasgroup=true
|
||||||
|
|
||||||
|
[program:targets-updater]
|
||||||
|
command=/usr/local/bin/start-targets-updater.sh
|
||||||
|
user=nobody
|
||||||
|
stdout_logfile=/var/log/supervisor/targets_updater.log
|
||||||
|
stderr_logfile=/var/log/supervisor/targets_updater_error.log
|
||||||
|
autorestart=true
|
||||||
|
startretries=3
|
||||||
|
startsecs=10
|
||||||
|
stopwaitsecs=30
|
||||||
|
killasgroup=true
|
||||||
|
stopasgroup=true
|
||||||
|
|
||||||
[unix_http_server]
|
[unix_http_server]
|
||||||
file=/var/run/supervisor.sock
|
file=/var/run/supervisor.sock
|
||||||
chmod=0700
|
chmod=0700
|
416
src/metric/prometheus/build/update_targets.py
Executable file
416
src/metric/prometheus/build/update_targets.py
Executable file
@ -0,0 +1,416 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Prometheus Targets 动态更新脚本
|
||||||
|
|
||||||
|
脚本从节点配置文件读取节点信息,并动态生成对应的 Prometheus targets 文件。
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
import hashlib
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Any
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class PrometheusTargetsManager:
    """Generate Prometheus file-based service-discovery targets from a node list.

    The manager reads a JSON array of nodes (``config_file``) and a JSON
    exporter configuration (``exporter_config_file``) and writes one
    ``<exporter_type>_exporter.json`` file per configured exporter into
    ``targets_dir``. It can run once or poll the node list for content changes.
    """

    def __init__(self, config_file: str, targets_dir: str, exporter_config_file: str = None, log_level: str = "INFO"):
        """
        Initialise the manager.

        Args:
            config_file: Path of the nodes configuration file.
            targets_dir: Directory the targets files are written to.
            exporter_config_file: Path of the exporter configuration file.
            log_level: Name of the logging level (e.g. "INFO").

        Raises:
            FileNotFoundError: The exporter configuration is unspecified or missing.
            json.JSONDecodeError: The exporter configuration is not valid JSON.
            ValueError: The exporter configuration has an invalid structure.
        """
        self.config_file = Path(config_file)
        self.targets_dir = Path(targets_dir)
        self.exporter_config_file = Path(exporter_config_file) if exporter_config_file else None
        self.log_level = log_level
        self.last_mtime = 0  # modification time of the last seen config (currently unused)
        self.last_content_hash = None  # content hash of the last seen config

        # The log file lives inside targets_dir, so the directory has to exist
        # BEFORE logging is configured; otherwise opening the FileHandler fails.
        self.targets_dir.mkdir(parents=True, exist_ok=True)

        self._setup_logging()

        # The exporter configuration is mandatory: any failure aborts construction.
        try:
            full_config = self._load_exporter_config()
            self.exporter_configs = full_config.get('exporters', {})
            self.label_templates = full_config.get('label_templates', {})
        except Exception as e:
            self.logger.error(f"初始化失败,无法加载 exporter 配置: {e}")
            raise

    def _setup_logging(self):
        """Configure root logging to stdout and to ``targets_update.log`` in ``targets_dir``."""
        logging.basicConfig(
            level=getattr(logging, self.log_level.upper()),
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(sys.stdout),
                logging.FileHandler(f'{self.targets_dir}/targets_update.log')
            ]
        )
        self.logger = logging.getLogger(__name__)

    def _load_exporter_config(self) -> Dict[str, Any]:
        """
        Load and validate the exporter configuration file.

        Returns:
            The parsed configuration dictionary (the whole document, not only
            its ``exporters`` section).

        Raises:
            FileNotFoundError: No path was given or the file does not exist.
            json.JSONDecodeError: The file is not valid JSON.
            ValueError: The document is not an object, or ``exporters`` is not
                a non-empty object.
        """
        if not self.exporter_config_file:
            raise FileNotFoundError("Exporter 配置文件路径未指定")

        if not self.exporter_config_file.exists():
            raise FileNotFoundError(f"Exporter 配置文件不存在: {self.exporter_config_file}")

        try:
            with open(self.exporter_config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)

            if not isinstance(config, dict):
                raise ValueError("Exporter 配置文件必须是 JSON 对象格式")

            exporters = config.get('exporters', {})
            if not isinstance(exporters, dict):
                raise ValueError("exporters 配置必须是对象格式")

            if not exporters:
                raise ValueError("exporters 配置不能为空")

            self.logger.info(f"成功加载 exporter 配置: {len(exporters)} 个 exporter")
            return config

        except json.JSONDecodeError as e:
            self.logger.error(f"Exporter 配置文件 JSON 解析错误: {e}")
            raise
        except Exception as e:
            self.logger.error(f"加载 exporter 配置失败: {e}")
            raise

    def load_nodes_config(self) -> List[Dict[str, Any]]:
        """
        Load the nodes configuration file.

        Returns:
            The list of node entries, or an empty list when the file is
            missing, unreadable, not valid JSON, or not a JSON array (each of
            these cases is logged rather than raised).
        """
        try:
            if not self.config_file.exists():
                self.logger.warning(f"节点配置文件不存在: {self.config_file}")
                return []

            with open(self.config_file, 'r', encoding='utf-8') as f:
                nodes = json.load(f)

            if not isinstance(nodes, list):
                self.logger.error("节点配置必须是数组格式")
                return []

            self.logger.info(f"成功加载 {len(nodes)} 个节点配置")
            return nodes

        except json.JSONDecodeError as e:
            self.logger.error(f"JSON 解析错误: {e}")
            return []
        except Exception as e:
            self.logger.error(f"加载节点配置失败: {e}")
            return []

    def generate_targets(self, nodes: List[Dict[str, Any]], exporter_type: str) -> List[Dict[str, Any]]:
        """
        Build the targets entries for one exporter type.

        Args:
            nodes: Node entries as loaded from the nodes configuration.
            exporter_type: Key of the exporter in the exporter configuration.

        Returns:
            One ``{"targets": [...], "labels": {...}}`` entry per valid node.
            Entries that are not objects or lack ``node_id``/``ip`` are skipped
            with a warning. An unknown exporter type, or an exporter without a
            ``port``, yields an empty list.
        """
        if exporter_type not in self.exporter_configs:
            self.logger.error(f"不支持的 exporter 类型: {exporter_type}")
            return []

        config = self.exporter_configs[exporter_type]
        # Validate once up front instead of raising KeyError in the node loop.
        if not isinstance(config, dict) or 'port' not in config:
            self.logger.error(f"exporter 配置缺少 port 字段: {exporter_type}")
            return []
        port = config['port']

        # The template is the same for every node, so look it up only once.
        label_template = self.label_templates.get(exporter_type, {})
        targets = []

        for node in nodes:
            # Required fields; a non-dict entry cannot provide them.
            if not isinstance(node, dict) or not all(key in node for key in ['node_id', 'ip']):
                self.logger.warning(f"节点配置缺少必要字段,跳过: {node}")
                continue

            target_address = f"{node['ip']}:{port}"

            # Variables available to the label templates.
            context = {
                'node_id': node['node_id'],
                'ip': node['ip'],
                'hostname': node.get('hostname', ''),
                'user_id': node.get('user_id', ''),
                'tag': self._join_labels(node.get('labels', []))
            }

            labels = {}
            for label_key, template_value in label_template.items():
                if isinstance(template_value, str):
                    if '{' in template_value:
                        # Template string: substitute the context variables.
                        labels[label_key] = self._render_label_template(template_value, context)
                    else:
                        # Fixed string value.
                        labels[label_key] = template_value
                else:
                    # file_sd label values are strings, so a fixed non-string
                    # value (number, bool) is stringified before it is written.
                    labels[label_key] = str(template_value)

            targets.append({
                "targets": [target_address],
                "labels": labels
            })

        self.logger.info(f"为 {exporter_type} exporter 生成了 {len(targets)} 个 targets")
        return targets

    def write_targets_file(self, targets: List[Dict[str, Any]], exporter_type: str) -> None:
        """
        Write the targets of one exporter type to ``<exporter_type>_exporter.json``.

        The content is first written to a hidden ``.tmp`` sibling and then moved
        into place with ``os.replace``, so a reader of the targets file never
        observes a partially written document.

        Args:
            targets: Targets entries to serialise.
            exporter_type: Exporter type, used to derive the file name.

        Raises:
            Exception: Any error raised while writing or renaming is logged and
                re-raised.
        """
        filename = f"{exporter_type}_exporter.json"
        filepath = self.targets_dir / filename
        # Same directory as the destination so the final rename stays on one filesystem.
        tmp_path = self.targets_dir / f".{filename}.tmp"

        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(targets, f, indent=2, ensure_ascii=False)
            os.replace(tmp_path, filepath)

            self.logger.info(f"成功写入 targets 文件: {filepath}")

        except Exception as e:
            self.logger.error(f"写入 targets 文件失败: {e}")
            # Best-effort cleanup of the leftover temporary file.
            try:
                tmp_path.unlink()
            except OSError:
                pass
            raise

    def update_all_targets(self) -> None:
        """
        Regenerate the targets file of every configured exporter type.

        Nothing is written when no nodes are loaded, and an exporter whose
        generated list is empty is not written either, so existing targets
        files are left untouched in those cases.
        NOTE(review): this means removing every node leaves the previous
        targets on disk — confirm that this is the intended safeguard.

        Raises:
            Exception: Any error during generation or writing is logged and
                re-raised.
        """
        try:
            nodes = self.load_nodes_config()

            if not nodes:
                self.logger.warning("没有找到任何节点配置")
                return

            for exporter_type in self.exporter_configs:
                targets = self.generate_targets(nodes, exporter_type)
                if targets:  # only write when there is something to write
                    self.write_targets_file(targets, exporter_type)

            self.logger.info("所有 targets 文件更新完成")

        except Exception as e:
            self.logger.error(f"更新 targets 失败: {e}")
            raise

    def _calculate_file_hash(self, file_path: Path) -> str:
        """
        Compute the MD5 hex digest of a file's content.

        The digest is only used to detect content changes.

        Args:
            file_path: File to hash.

        Returns:
            The hex digest, or an empty string when the file cannot be read
            (the error is logged).
        """
        try:
            with open(file_path, 'rb') as f:
                content = f.read()
            return hashlib.md5(content).hexdigest()
        except Exception as e:
            self.logger.error(f"计算文件哈希失败: {e}")
            return ""

    def _render_label_template(self, template: str, context: Dict[str, str]) -> str:
        """
        Render a label template with ``str.format``.

        Args:
            template: Template string such as "dcgm-exporter-{node_id}".
            context: Variables available to the template.

        Returns:
            The rendered string, or the unmodified template when rendering
            fails (a warning is logged).
        """
        try:
            return template.format(**context)
        except KeyError as e:
            self.logger.warning(f"模板渲染失败,缺少变量 {e}: {template}")
            return template
        except Exception as e:
            self.logger.warning(f"模板渲染失败: {e}")
            return template

    def _join_labels(self, labels_list: List[str]) -> str:
        """
        Join a node's labels into one comma-separated string.

        Args:
            labels_list: Labels of a node. A bare string is treated as a single
                label; falsy entries (``None``, ``""``) are dropped and other
                entries are converted with ``str`` and stripped.

        Returns:
            The joined labels, or an empty string when there are none or the
            value is neither a string nor a list/tuple.
        """
        if not labels_list:
            return ""

        if isinstance(labels_list, str):
            # Iterating a string would split it into single characters.
            labels_list = [labels_list]
        elif not isinstance(labels_list, (list, tuple)):
            self.logger.warning(f"labels 字段格式无效,已忽略: {labels_list!r}")
            return ""

        stripped = (str(label).strip() for label in labels_list if label)
        return ",".join(text for text in stripped if text)

    def check_file_changed(self) -> bool:
        """
        Check whether the content of the nodes configuration changed.

        The first successful check records the content hash and reports a
        change so that an initial update is triggered.

        Returns:
            True when the content differs from the last recorded hash (or on
            the first check); False when unchanged, when the file is missing,
            or when it cannot be hashed.
        """
        try:
            if not self.config_file.exists():
                return False

            current_hash = self._calculate_file_hash(self.config_file)
            if not current_hash:
                return False

            if self.last_content_hash is None:
                self.last_content_hash = current_hash
                self.logger.info("首次检查,记录文件内容哈希并触发初始更新")
                return True

            if current_hash != self.last_content_hash:
                self.last_content_hash = current_hash
                self.logger.info("检测到文件内容变化")
                return True

            return False

        except Exception as e:
            self.logger.error(f"检查文件变化失败: {e}")
            return False

    def run_daemon(self, check_interval: int = 30) -> None:
        """
        Poll the nodes configuration forever and regenerate targets on change.

        Args:
            check_interval: Seconds to sleep between two checks.

        Raises:
            Exception: Any error other than ``KeyboardInterrupt`` is logged and
                re-raised, which ends the loop.
        """
        self.logger.info(f"启动守护进程模式,检查间隔: {check_interval}秒")

        try:
            while True:
                if self.check_file_changed():
                    self.logger.info("检测到配置文件变化,开始更新 targets")
                    self.update_all_targets()
                else:
                    self.logger.debug("配置文件无变化,跳过更新")

                time.sleep(check_interval)

        except KeyboardInterrupt:
            self.logger.info("收到中断信号,正在退出...")
        except Exception as e:
            self.logger.error(f"守护进程运行错误: {e}")
            raise
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Parses the options, builds a PrometheusTargetsManager and either performs
    a single update or runs the polling daemon. Any exception is reported on
    stderr and turns into exit status 1.
    """
    cli = argparse.ArgumentParser(description="Prometheus Targets 动态更新脚本 (精简版)")

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = [
        ("--config", {
            "default": "/private/argus/metric/prometheus/nodes.json",
            "help": "节点配置文件路径 (默认: /private/argus/metric/prometheus/nodes.json)",
        }),
        ("--targets-dir", {
            "default": "/private/argus/metric/prometheus/targets",
            "help": "targets 文件输出目录 (默认: /private/argus/metric/prometheus/targets)",
        }),
        ("--exporter-config", {
            "default": "/private/argus/metric/prometheus/exporter_config.json",
            "help": "exporter 配置文件路径 (默认: /private/argus/metric/prometheus/exporter_config.json)",
        }),
        ("--log-level", {
            "choices": ["DEBUG", "INFO", "WARNING", "ERROR"],
            "default": "INFO",
            "help": "日志级别 (默认: INFO)",
        }),
        ("--daemon", {
            "action": "store_true",
            "help": "以守护进程模式运行",
        }),
        ("--check-interval", {
            "type": int,
            "default": 30,
            "help": "守护进程模式下的检查间隔(秒,默认: 30)",
        }),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    try:
        manager = PrometheusTargetsManager(
            config_file=opts.config,
            targets_dir=opts.targets_dir,
            exporter_config_file=opts.exporter_config,
            log_level=opts.log_level,
        )

        if not opts.daemon:
            # One-shot mode: update once and report success.
            manager.update_all_targets()
            print("成功更新所有 exporter targets")
        else:
            # Daemon mode: poll until interrupted.
            manager.run_daemon(opts.check_interval)

    except Exception as e:
        print(f"错误: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user