feat: 基于算力平台的Prometheus镜像改造,supervisor自启应用;调整Prometheus.yml结构;

refs #9
This commit is contained in:
sundapeng.sdp 2025-09-23 17:37:19 +08:00
parent d9d937f5d6
commit 68b265624c
7 changed files with 260 additions and 2 deletions

View File

@ -0,0 +1,67 @@
FROM ubuntu/prometheus:3-24.04_stable

# The remaining build steps install packages and change users/ownership.
USER root

# Pass --build-arg USE_INTRANET=true to build against the intranet apt mirror.
ARG USE_INTRANET=false

# Intranet apt source (build-time only).
# The stock sources.list is backed up to sources.list.bak before being replaced.
# NOTE(review): the mirror path is ubuntu2204/jammy while the base image tag is
# 24.04 - confirm that mixing releases is intended.
# NOTE(review): [trusted=yes] and the Verify-Peer/Verify-Host overrides turn off
# package signature and TLS checks; 99disable-ssl-check stays in the final image.
RUN [ "$USE_INTRANET" != "true" ] || { \
      echo "Configuring intranet apt sources..." && \
      cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
      echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
      printf '%s\n' \
        'Acquire::https::Verify-Peer "false";' \
        'Acquire::https::Verify-Host "false";' \
        > /etc/apt/apt.conf.d/99disable-ssl-check; \
    }
# Common tools.
#   supervisor     - process manager used as the container entrypoint
#   net-tools      - provides ifconfig, used by start-prometheus-supervised.sh
#   inetutils-ping - ping, for network troubleshooting
#   vim            - editor, for in-container debugging
# --no-install-recommends keeps optional packages out of the image, and
# DEBIAN_FRONTEND=noninteractive prevents any package from prompting during
# the build. The apt lists are removed in the same layer so they never ship.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
      supervisor \
      net-tools \
      inetutils-ping \
      vim && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# For intranet builds, leave the image pointing at the deployment-environment
# apt mirror so packages can still be installed inside a running container.
RUN [ "$USE_INTRANET" != "true" ] || \
    echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list
# Base path for Prometheus; later build steps and the start script read it.
ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus

# UID/GID are overridable at build time (--build-arg) and are also exported
# into the image environment.
ARG PROMETHEUS_UID=2133
ARG PROMETHEUS_GID=2015
ENV PROMETHEUS_UID=${PROMETHEUS_UID} \
    PROMETHEUS_GID=${PROMETHEUS_GID}

# Directory that supervisord.conf points its log files at.
RUN mkdir -p /var/log/supervisor
# Directory layout:
#   ${PROMETHEUS_BASE_PATH}/rules    - rule files
#   ${PROMETHEUS_BASE_PATH}/targets  - file_sd target files
#   /private/argus/etc               - where the start script records the IP
# /prometheus is replaced by a symlink to the base path.
RUN mkdir -p \
      "${PROMETHEUS_BASE_PATH}/rules" \
      "${PROMETHEUS_BASE_PATH}/targets" \
      /private/argus/etc && \
    rm -rf /prometheus && \
    ln -s "${PROMETHEUS_BASE_PATH}" /prometheus
# Remap nobody/nogroup to the requested UID/GID and hand over every path the
# Prometheus program writes to. supervisord runs the start script as "nobody",
# and that script writes the generated config under the base path and the
# container IP under /private/argus/etc, so both trees must be owned by nobody.
# `chown -h` changes the /prometheus symlink itself rather than its target.
# /private/argus/metric already contains ${PROMETHEUS_BASE_PATH}' default
# location, so no separate recursive chown of the base path is needed.
RUN usermod -u "${PROMETHEUS_UID}" nobody && \
    groupmod -g "${PROMETHEUS_GID}" nogroup && \
    chown -h nobody:nogroup /prometheus && \
    chown -R nobody:nogroup \
      /private/argus/metric \
      /private/argus/etc \
      /etc/prometheus
# Prometheus configuration template; the start script renders it at runtime.
COPY prometheus.yml /etc/prometheus/prometheus.yml

# supervisord configuration (the file passed to -c in ENTRYPOINT).
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Wrapper script that supervisord runs for the prometheus program.
COPY start-prometheus-supervised.sh /usr/local/bin/start-prometheus-supervised.sh
RUN chmod +x /usr/local/bin/start-prometheus-supervised.sh

# supervisord itself runs as root; supervisord.conf sets the program's user.
USER root

EXPOSE 9090

# -n keeps supervisord in the foreground.
ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf", "-n"]

View File

@ -0,0 +1,126 @@
# Prometheus Docker 镜像配置
## 环境变量配置
### PROMETHEUS_BASE_PATH
设置 Prometheus 配置和数据的基础路径。
**默认值**: `/private/argus/metric/prometheus`
**用途**:
- 配置文件存储路径: `${PROMETHEUS_BASE_PATH}/prometheus.yml`
- 规则文件路径: `${PROMETHEUS_BASE_PATH}/rules/*.yml`
- 监控目标文件路径: `${PROMETHEUS_BASE_PATH}/targets/`
## 使用示例
### 1. 使用默认路径
```bash
docker run -d \
--name prometheus \
-p 9090:9090 \
-v /host/prometheus/data:/private/argus/metric/prometheus \
prometheus:latest
```
### 2. 自定义基础路径
```bash
docker run -d \
--name prometheus \
-p 9090:9090 \
-e PROMETHEUS_BASE_PATH=/custom/prometheus/path \
-v /host/prometheus/data:/custom/prometheus/path \
prometheus:latest
```
### 3. Kubernetes 部署示例
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
containers:
- name: prometheus
image: prometheus:latest
env:
- name: PROMETHEUS_BASE_PATH
value: "/data/prometheus"
ports:
- containerPort: 9090
volumeMounts:
- name: prometheus-data
mountPath: /data/prometheus
volumes:
- name: prometheus-data
persistentVolumeClaim:
claimName: prometheus-pvc
```
## 目录结构
容器启动后会在 `${PROMETHEUS_BASE_PATH}` 下创建以下目录结构:
```
${PROMETHEUS_BASE_PATH}/
├── prometheus.yml # 主配置文件
├── rules/ # 告警规则目录
│ └── *.yml
└── targets/ # 监控目标目录
├── node_exporter.json
└── dcgm_exporter.json
```
## 动态配置
- **规则文件**: 在 `rules/` 目录下添加 `.yml` 文件即可自动加载
- **监控目标**: 修改 `targets/` 目录下的 JSON 文件即可动态更新监控目标
- **主配置**: 修改 `prometheus.yml` 后可通过 Prometheus 的 `/-/reload` 端点重新加载配置
## 权限管理
### 默认路径权限
- 默认路径 `/private/argus/metric/prometheus` 在 Dockerfile 中已设置正确的权限
- nobody 用户(UID: 2133,GID: 2015)拥有完全读写权限
### 自定义路径权限
- 当使用自定义 `PROMETHEUS_BASE_PATH` 时,启动脚本以 nobody 用户运行,无法修改目录属主;请预先创建该目录(或通过挂载卷提供)并授予 nobody 用户读写权限
- 确保 nobody 用户对自定义路径有读写权限
### 挂载卷注意事项
1. **主机目录权限**: 确保挂载的主机目录对 nobody 用户(UID: 2133)可写
2. **SELinux**: 如果使用 SELinux,可能需要设置适当的上下文
3. **Docker 用户映射**: 确保容器内的 nobody 用户与主机用户权限匹配
## 故障排除
### 权限问题
如果遇到权限错误,可以检查:
```bash
# 检查目录权限
ls -la /path/to/prometheus/data
# 检查用户映射
id nobody
# 手动修复权限
chown -R 2133:2015 /path/to/prometheus/data
chmod -R 755 /path/to/prometheus/data
```
## 注意事项
1. 确保挂载的目录有适当的读写权限
2. 配置文件会在容器启动时自动生成,无需手动创建
3. 可以通过修改环境变量 `PROMETHEUS_BASE_PATH` 来改变所有相关路径,无需重新构建镜像
4. 自定义路径的目录需预先创建(或通过挂载卷提供),并确保 nobody 用户(UID: 2133)对其可读写

View File

@ -1,15 +1,27 @@
# Prometheus configuration template.
# Every literal ${PROMETHEUS_BASE_PATH} below is a placeholder that
# start-prometheus-supervised.sh substitutes with sed before Prometheus starts.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  scrape_timeout: 10s

# AlertManager integration (no targets configured yet)
alerting:
  alertmanagers:
    - static_configs:
        - targets: []

# Rule directory
rule_files:
  - "${PROMETHEUS_BASE_PATH}/rules/*.yml"

scrape_configs:
  - job_name: "node"
    file_sd_configs:
      - files:
          # Replaces the earlier relative path "targets/node_exporter.json".
          - "${PROMETHEUS_BASE_PATH}/targets/node_exporter.json"
        refresh_interval: 30s
  - job_name: "dcgm"
    file_sd_configs:
      - files:
          # Replaces the earlier relative path "targets/dcgm_exporter.json".
          - "${PROMETHEUS_BASE_PATH}/targets/dcgm_exporter.json"
        refresh_interval: 30s

View File

@ -0,0 +1,26 @@
#!/bin/bash
#
# Launch Prometheus in the foreground on behalf of supervisord.
#
# Steps:
#   1. Render /etc/prometheus/prometheus.yml into
#      ${PROMETHEUS_BASE_PATH}/prometheus.yml, replacing every literal
#      "${PROMETHEUS_BASE_PATH}" placeholder with the effective base path.
#   2. Record this container's IPv4 address in /private/argus/etc/${DOMAIN}.
#   3. exec /bin/prometheus so it replaces this shell.
#
# Environment:
#   PROMETHEUS_BASE_PATH  Base directory for the generated config, rules,
#                         targets and TSDB data.
#                         Default: /private/argus/metric/prometheus
#
# Exits non-zero with an [ERROR] message on stderr if a step fails.
set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf '[ERROR] %s\n' "$*" >&2
  exit 1
}

echo "[INFO] Starting Prometheus under supervisor..."

PROMETHEUS_BASE_PATH=${PROMETHEUS_BASE_PATH:-/private/argus/metric/prometheus}
readonly DOMAIN=prom.metric.argus.com
readonly CONFIG_TEMPLATE=/etc/prometheus/prometheus.yml
readonly CONFIG_FILE="${PROMETHEUS_BASE_PATH}/prometheus.yml"
readonly IP_RECORD_DIR=/private/argus/etc

echo "[INFO] Prometheus base path: ${PROMETHEUS_BASE_PATH}"

# A custom base path may not exist yet; create it (and the rules/targets
# sub-directories) when this user is allowed to, so the redirect below works.
mkdir -p -- "${PROMETHEUS_BASE_PATH}/rules" "${PROMETHEUS_BASE_PATH}/targets" \
  || die "cannot create ${PROMETHEUS_BASE_PATH}; pre-create it and make it writable for $(id -un)"

# Generate the configuration file.
echo "[INFO] Generating prometheus.yml with base path: ${PROMETHEUS_BASE_PATH}"
# Escape the characters that are special in a sed replacement when '|' is the
# delimiter (backslash, '&', '|') so any path is substituted verbatim.
escaped_base_path=$(printf '%s' "${PROMETHEUS_BASE_PATH}" | sed -e 's/[\\&|]/\\&/g')
sed -e "s|\${PROMETHEUS_BASE_PATH}|${escaped_base_path}|g" \
  "${CONFIG_TEMPLATE}" > "${CONFIG_FILE}" \
  || die "cannot render ${CONFIG_TEMPLATE} to ${CONFIG_FILE}"

# Record the container IP. Prefer eth0; fall back to the first address
# reported by `hostname -I` when eth0 is missing or has no IPv4 address.
ip_addr=$(ifconfig eth0 2>/dev/null | awk '/inet / && !seen++ {print $2}') \
  || ip_addr=""
if [[ -z "${ip_addr}" ]]; then
  echo "[WARN] no IPv4 address found on eth0, falling back to hostname -I" >&2
  ip_addr=$(hostname -I 2>/dev/null | awk '{print $1}') || ip_addr=""
fi
[[ -n "${ip_addr}" ]] || die "unable to determine the container IP address"

echo "current IP: ${ip_addr}"
printf '%s\n' "${ip_addr}" > "${IP_RECORD_DIR}/${DOMAIN}" \
  || die "cannot write ${IP_RECORD_DIR}/${DOMAIN} as $(id -un)"

# Store TSDB data under the effective base path. With the default base path
# this is the same directory the image's /prometheus symlink points to; with a
# custom PROMETHEUS_BASE_PATH the data now follows the (mounted) custom path.
exec /bin/prometheus \
  --config.file="${CONFIG_FILE}" \
  --storage.tsdb.path="${PROMETHEUS_BASE_PATH}" \
  --web.enable-lifecycle \
  --web.console.libraries=/usr/share/prometheus/console_libraries \
  --web.console.templates=/usr/share/prometheus/consoles

View File

@ -0,0 +1,27 @@
; supervisord daemon: stays in the foreground and runs as root.
[supervisord]
user=root
nodaemon=true
pidfile=/var/run/supervisord.pid
logfile=/var/log/supervisor/supervisord.log

; Control socket plus the RPC interface and client settings that use it.
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

; Prometheus, started through the wrapper script as the "nobody" user.
[program:prometheus]
user=nobody
command=/usr/local/bin/start-prometheus-supervised.sh
; Restart policy and start/stop timing.
autorestart=true
startsecs=30
startretries=3
stopwaitsecs=30
; Stop and kill signals go to the whole process group.
stopasgroup=true
killasgroup=true
; Program output is captured to files.
stdout_logfile=/var/log/supervisor/prometheus.log
stderr_logfile=/var/log/supervisor/prometheus_error.log