diff --git a/src/metric/prometheus/Dockerfile b/src/metric/prometheus/Dockerfile
new file mode 100644
index 0000000..08fbb41
--- /dev/null
+++ b/src/metric/prometheus/Dockerfile
@@ -0,0 +1,79 @@
+FROM ubuntu/prometheus:3-24.04_stable
+
+# Root is needed for package installation and for remapping the nobody UID/GID below.
+USER root
+
+# Build with --build-arg USE_INTRANET=true to pull packages from the intranet apt mirrors.
+ARG USE_INTRANET=false
+
+# Intranet apt source used while building the image.
+# NOTE(review): the mirror path is ubuntu2204/jammy while the base image tag says 24.04;
+# confirm the mirror matches the release of the base image.
+RUN if [ "$USE_INTRANET" = "true" ]; then \
+        echo "Configuring intranet apt sources..." && \
+        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
+        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
+        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
+        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
+    fi
+
+# Runtime tooling: supervisor runs Prometheus; net-tools provides the ifconfig used by the start script.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        inetutils-ping \
+        net-tools \
+        supervisor \
+        vim && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Point apt at the deployment-environment mirror for use inside the running container.
+RUN if [ "$USE_INTRANET" = "true" ]; then \
+        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
+    fi
+
+# Supervisor log directory
+RUN mkdir -p /var/log/supervisor
+
+# Base path for the Prometheus configuration, rules and targets
+ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
+
+# UID/GID assigned to the nobody/nogroup account that runs Prometheus
+ARG PROMETHEUS_UID=2133
+ARG PROMETHEUS_GID=2015
+ENV PROMETHEUS_UID=${PROMETHEUS_UID} \
+    PROMETHEUS_GID=${PROMETHEUS_GID}
+
+# Create the directory layout; /prometheus becomes a symlink to the base path
+RUN mkdir -p \
+        "${PROMETHEUS_BASE_PATH}/rules" \
+        "${PROMETHEUS_BASE_PATH}/targets" \
+        /private/argus/etc \
+    && rm -rf /prometheus \
+    && ln -s "${PROMETHEUS_BASE_PATH}" /prometheus
+
+# Remap nobody/nogroup to the requested UID/GID and hand over every path the start script
+# touches. /private/argus/etc is included because the script, which supervisor runs as
+# nobody, writes the container IP into that directory.
+RUN usermod -u "${PROMETHEUS_UID}" nobody && \
+    groupmod -g "${PROMETHEUS_GID}" nogroup && \
+    chown -h nobody:nogroup /prometheus && \
+    chown -R nobody:nogroup /private/argus/metric /private/argus/etc /etc/prometheus && \
+    chown -R nobody:nogroup "${PROMETHEUS_BASE_PATH}"
+
+# Supervisor configuration
+COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+
+# Start script
+COPY start-prometheus-supervised.sh /usr/local/bin/start-prometheus-supervised.sh
+RUN chmod +x /usr/local/bin/start-prometheus-supervised.sh
+
+# Prometheus configuration template; the start script substitutes ${PROMETHEUS_BASE_PATH} in it
+COPY prometheus.yml /etc/prometheus/prometheus.yml
+
+# supervisord itself runs as root and starts Prometheus as nobody (see supervisord.conf)
+USER root
+
+EXPOSE 9090
+
+ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf", "-n"]
diff --git a/src/metric/prometheus/README.md b/src/metric/prometheus/README.md
new file mode 100644
index 0000000..e9ba826
--- /dev/null
+++ b/src/metric/prometheus/README.md
@@ -0,0 +1,126 @@
+# Prometheus Docker 镜像配置
+
+## 环境变量配置
+
+### PROMETHEUS_BASE_PATH
+
+设置 Prometheus 配置和数据的基础路径。
+
+**默认值**: `/private/argus/metric/prometheus`
+
+**用途**:
+- 配置文件存储路径: `${PROMETHEUS_BASE_PATH}/prometheus.yml`
+- 规则文件路径: `${PROMETHEUS_BASE_PATH}/rules/*.yml`
+- 监控目标文件路径: `${PROMETHEUS_BASE_PATH}/targets/`
+
+## 使用示例
+
+### 1. 使用默认路径
+```bash
+docker run -d \
+  --name prometheus \
+  -p 9090:9090 \
+  -v /host/prometheus/data:/private/argus/metric/prometheus \
+  prometheus:latest
+```
+
+### 2. 自定义基础路径
+```bash
+docker run -d \
+  --name prometheus \
+  -p 9090:9090 \
+  -e PROMETHEUS_BASE_PATH=/custom/prometheus/path \
+  -v /host/prometheus/data:/custom/prometheus/path \
+  prometheus:latest
+```
+
+### 3. 
Kubernetes 部署示例
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      containers:
+      - name: prometheus
+        image: prometheus:latest
+        env:
+        - name: PROMETHEUS_BASE_PATH
+          value: "/data/prometheus"
+        ports:
+        - containerPort: 9090
+        volumeMounts:
+        - name: prometheus-data
+          mountPath: /data/prometheus
+      volumes:
+      - name: prometheus-data
+        persistentVolumeClaim:
+          claimName: prometheus-pvc
+```
+
+## 目录结构
+
+容器运行时 `${PROMETHEUS_BASE_PATH}` 下的目录结构如下:
+
+```
+${PROMETHEUS_BASE_PATH}/
+├── prometheus.yml          # 主配置文件
+├── rules/                  # 告警规则目录
+│   └── *.yml
+└── targets/                # 监控目标目录
+    ├── node_exporter.json
+    └── dcgm_exporter.json
+```
+
+## 动态配置
+
+- **规则文件**: 在 `rules/` 目录下添加 `.yml` 文件后,通过 `/-/reload` 端点重新加载即可生效
+- **监控目标**: 修改 `targets/` 目录下的 JSON 文件即可动态更新监控目标
+- **主配置**: 修改 `prometheus.yml` 后可通过 Prometheus 的 `/-/reload` 端点重新加载配置
+
+## 权限管理
+
+### 默认路径权限
+- 默认路径 `/private/argus/metric/prometheus` 在 Dockerfile 中已设置正确的权限
+- nobody 用户(UID: 2133, GID: 2015)拥有完全读写权限
+
+### 自定义路径权限
+- 当使用自定义 `PROMETHEUS_BASE_PATH` 时,启动脚本以 nobody 用户运行,无法在该用户无权写入的路径下创建目录或修改权限
+- 确保 nobody 用户对自定义路径有读写权限
+
+### 挂载卷注意事项
+1. **主机目录权限**: 确保挂载的主机目录对 nobody 用户(UID: 2133)可写
+2. **SELinux**: 如果使用 SELinux,可能需要设置适当的上下文
+3. **Docker 用户映射**: 确保容器内的 nobody 用户与主机用户权限匹配
+
+## 故障排除
+
+### 权限问题
+如果遇到权限错误,可以检查:
+```bash
+# 检查目录权限
+ls -la /path/to/prometheus/data
+
+# 检查用户映射
+id nobody
+
+# 手动修复权限
+chown -R 2133:2015 /path/to/prometheus/data
+chmod -R 755 /path/to/prometheus/data
+```
+
+## 注意事项
+
+1. 确保挂载的目录有适当的读写权限
+2. 配置文件会在容器启动时自动生成,无需手动创建
+3. 可以通过修改环境变量 `PROMETHEUS_BASE_PATH` 来改变配置、规则和监控目标文件的路径(TSDB 数据仍写入 `/prometheus`),无需重新构建镜像
+4. 
启动脚本会在基础路径下创建 `rules/` 和 `targets/` 子目录,但不会修改权限,需保证 nobody 用户对该路径可写
diff --git a/src/metric/prometheus/targets/dcgm_exporter.json b/src/metric/prometheus/demo-targets/dcgm_exporter.json
similarity index 100%
rename from src/metric/prometheus/targets/dcgm_exporter.json
rename to src/metric/prometheus/demo-targets/dcgm_exporter.json
diff --git a/src/metric/prometheus/targets/node_exporter.json b/src/metric/prometheus/demo-targets/node_exporter.json
similarity index 100%
rename from src/metric/prometheus/targets/node_exporter.json
rename to src/metric/prometheus/demo-targets/node_exporter.json
diff --git a/src/metric/prometheus/prometheus.yml b/src/metric/prometheus/prometheus.yml
index ebb503e..e3e4403 100644
--- a/src/metric/prometheus/prometheus.yml
+++ b/src/metric/prometheus/prometheus.yml
@@ -1,15 +1,27 @@
 global:
   scrape_interval: 15s
+  evaluation_interval: 15s
+  scrape_timeout: 10s
+
+# AlertManager integration (no targets configured yet)
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: []
+
+# Rule files; ${PROMETHEUS_BASE_PATH} is substituted by the start script
+rule_files:
+  - "${PROMETHEUS_BASE_PATH}/rules/*.yml"
 
 scrape_configs:
   - job_name: "node"
     file_sd_configs:
       - files:
-          - "targets/node_exporter.json"
+          - "${PROMETHEUS_BASE_PATH}/targets/node_exporter.json"
         refresh_interval: 30s
 
   - job_name: "dcgm"
     file_sd_configs:
       - files:
-          - "targets/dcgm_exporter.json"
+          - "${PROMETHEUS_BASE_PATH}/targets/dcgm_exporter.json"
         refresh_interval: 30s
diff --git a/src/metric/prometheus/start-prometheus-supervised.sh b/src/metric/prometheus/start-prometheus-supervised.sh
new file mode 100644
index 0000000..75d9a39
--- /dev/null
+++ b/src/metric/prometheus/start-prometheus-supervised.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Supervisor entry script: renders prometheus.yml for the configured base path,
+# records the container IP, then replaces itself with the Prometheus process.
+set -euo pipefail
+
+echo "[INFO] Starting Prometheus under supervisor..."
+
+PROMETHEUS_BASE_PATH=${PROMETHEUS_BASE_PATH:-/private/argus/metric/prometheus}
+DOMAIN=prom.metric.argus.com
+
+echo "[INFO] Prometheus base path: ${PROMETHEUS_BASE_PATH}"
+
+# Make sure the rule and target directories referenced by prometheus.yml exist.
+# The script runs as the user configured in supervisord.conf, so the base path must be writable by it.
+mkdir -p "${PROMETHEUS_BASE_PATH}/rules" "${PROMETHEUS_BASE_PATH}/targets"
+
+# Render the configuration file from the template shipped in the image.
+# The path is escaped first so that '&', '|' or '\' in it cannot corrupt the sed replacement.
+echo "[INFO] Generating prometheus.yml with base path: ${PROMETHEUS_BASE_PATH}"
+ESCAPED_BASE_PATH=$(printf '%s' "${PROMETHEUS_BASE_PATH}" | sed 's/[&|\\]/\\&/g')
+sed "s|\${PROMETHEUS_BASE_PATH}|${ESCAPED_BASE_PATH}|g" \
+    /etc/prometheus/prometheus.yml > "${PROMETHEUS_BASE_PATH}/prometheus.yml"
+
+# Record the container IP. With pipefail, a failing `ifconfig eth0` aborts the script here.
+IP=$(ifconfig eth0 | awk '/inet /{print $2}')
+echo "current IP: ${IP}"
+echo "${IP}" > "/private/argus/etc/${DOMAIN}"
+
+# exec replaces this shell, so Prometheus becomes the process supervisor manages.
+exec /bin/prometheus \
+    --config.file="${PROMETHEUS_BASE_PATH}/prometheus.yml" \
+    --storage.tsdb.path=/prometheus \
+    --web.enable-lifecycle \
+    --web.console.libraries=/usr/share/prometheus/console_libraries \
+    --web.console.templates=/usr/share/prometheus/consoles
diff --git a/src/metric/prometheus/supervisord.conf b/src/metric/prometheus/supervisord.conf
new file mode 100644
index 0000000..3aca877
--- /dev/null
+++ b/src/metric/prometheus/supervisord.conf
@@ -0,0 +1,29 @@
+; supervisord stays in the foreground (nodaemon) and runs as root.
+[supervisord]
+nodaemon=true
+logfile=/var/log/supervisor/supervisord.log
+pidfile=/var/run/supervisord.pid
+user=root
+
+; Prometheus is launched through the wrapper script as the nobody user.
+[program:prometheus]
+command=/usr/local/bin/start-prometheus-supervised.sh
+user=nobody
+stdout_logfile=/var/log/supervisor/prometheus.log
+stderr_logfile=/var/log/supervisor/prometheus_error.log
+autorestart=true
+startretries=3
+startsecs=30
+stopwaitsecs=30
+killasgroup=true
+stopasgroup=true
+
+[unix_http_server]
+file=/var/run/supervisor.sock
+chmod=0700
+
+[supervisorctl]
+serverurl=unix:///var/run/supervisor.sock
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface