parent
5fa6e80578
commit
586740a17a
67
src/metric/prometheus/Dockerfile
Normal file
67
src/metric/prometheus/Dockerfile
Normal file
@ -0,0 +1,67 @@
|
||||
# Base image, pinned to the 3-24.04_stable tag.
FROM ubuntu/prometheus:3-24.04_stable

# The package installation and account changes below need root.
USER root

# Pass --build-arg USE_INTRANET=true to build against the intranet apt mirror.
ARG USE_INTRANET=false

# Intranet apt source configuration (applied only when USE_INTRANET=true):
# back up the stock sources.list, point apt at the internal mirror, and turn
# off apt's HTTPS peer/host verification.
# NOTE(review): the mirror path is ubuntu2204/jammy while the base image tag
# says 24.04 -- confirm the suite matches the base image.
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        printf '%s\n' \
            'Acquire::https::Verify-Peer "false";' \
            'Acquire::https::Verify-Host "false";' \
            > /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi
|
||||
|
||||
# Common tooling. supervisor is the container entrypoint; net-tools provides
# the ifconfig command that start-prometheus-supervised.sh calls;
# inetutils-ping and vim are presumably for in-container troubleshooting.
# --no-install-recommends keeps optional packages out of the image, and the
# apt lists are deleted in the same layer so they never persist.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        inetutils-ping \
        net-tools \
        supervisor \
        vim && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# For deployment builds (USE_INTRANET=true), leave the image pointing at the
# deployment-side apt mirror instead of the build-side one.
RUN case "$USE_INTRANET" in \
        true) echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list ;; \
    esac

# Directory that holds the supervisor log files.
RUN mkdir -p /var/log/supervisor
|
||||
|
||||
# UID/GID that the nobody/nogroup account is renumbered to further down;
# override at build time with --build-arg.
ARG PROMETHEUS_UID=2133
ARG PROMETHEUS_GID=2015

# Runtime environment: the Prometheus base path plus the UID/GID chosen above.
ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus \
    PROMETHEUS_UID=${PROMETHEUS_UID} \
    PROMETHEUS_GID=${PROMETHEUS_GID}
|
||||
|
||||
# Create the directory layout. /prometheus (the TSDB path the start script
# passes to Prometheus) is replaced by a symlink to the base path.
RUN mkdir -p \
        "${PROMETHEUS_BASE_PATH}/rules" \
        "${PROMETHEUS_BASE_PATH}/targets" \
        /private/argus/etc && \
    rm -rf /prometheus && \
    ln -s "${PROMETHEUS_BASE_PATH}" /prometheus

# Renumber nobody/nogroup to the configured UID/GID and hand the working
# directories over to that account.
# /private/argus/etc is included because start-prometheus-supervised.sh, which
# supervisor runs as nobody, writes the container IP file into it; left owned
# by root, that write fails and the script aborts under `set -e`.
RUN usermod -u "${PROMETHEUS_UID}" nobody && \
    groupmod -g "${PROMETHEUS_GID}" nogroup && \
    chown -h nobody:nogroup /prometheus && \
    chown -R nobody:nogroup \
        /private/argus/metric \
        /private/argus/etc \
        /etc/prometheus \
        "${PROMETHEUS_BASE_PATH}"
|
||||
|
||||
# Prometheus configuration template. The start script renders it into
# ${PROMETHEUS_BASE_PATH}/prometheus.yml when the container starts.
COPY prometheus.yml /etc/prometheus/prometheus.yml

# supervisor configuration.
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Start script that supervisor launches; it must be executable.
COPY start-prometheus-supervised.sh /usr/local/bin/start-prometheus-supervised.sh
RUN chmod +x /usr/local/bin/start-prometheus-supervised.sh

# supervisord itself starts as root; supervisord.conf runs the Prometheus
# program as nobody.
USER root

# Documented service port.
EXPOSE 9090

# Run supervisord in the foreground (-n) as the container's main process.
ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf", "-n"]
|
126
src/metric/prometheus/README.md
Normal file
126
src/metric/prometheus/README.md
Normal file
@ -0,0 +1,126 @@
|
||||
# Prometheus Docker 镜像配置
|
||||
|
||||
## 环境变量配置
|
||||
|
||||
### PROMETHEUS_BASE_PATH
|
||||
|
||||
设置 Prometheus 配置和数据的基础路径。
|
||||
|
||||
**默认值**: `/private/argus/metric/prometheus`
|
||||
|
||||
**用途**:
|
||||
- 配置文件存储路径: `${PROMETHEUS_BASE_PATH}/prometheus.yml`
|
||||
- 规则文件路径: `${PROMETHEUS_BASE_PATH}/rules/*.yml`
|
||||
- 监控目标文件路径: `${PROMETHEUS_BASE_PATH}/targets/`
|
||||
|
||||
## 使用示例
|
||||
|
||||
### 1. 使用默认路径
|
||||
```bash
|
||||
docker run -d \
|
||||
--name prometheus \
|
||||
-p 9090:9090 \
|
||||
-v /host/prometheus/data:/private/argus/metric/prometheus \
|
||||
prometheus:latest
|
||||
```
|
||||
|
||||
### 2. 自定义基础路径
|
||||
```bash
|
||||
docker run -d \
|
||||
--name prometheus \
|
||||
-p 9090:9090 \
|
||||
-e PROMETHEUS_BASE_PATH=/custom/prometheus/path \
|
||||
-v /host/prometheus/data:/custom/prometheus/path \
|
||||
prometheus:latest
|
||||
```
|
||||
|
||||
### 3. Kubernetes 部署示例
|
||||
```yaml
|
||||
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - name: prometheus
        image: prometheus:latest
        env:
        - name: PROMETHEUS_BASE_PATH
          value: "/data/prometheus"
        ports:
        - containerPort: 9090
        volumeMounts:
        - name: prometheus-data
          mountPath: /data/prometheus
      volumes:
      - name: prometheus-data
        persistentVolumeClaim:
          claimName: prometheus-pvc
|
||||
```
|
||||
|
||||
## 目录结构
|
||||
|
||||
容器启动后会在 `${PROMETHEUS_BASE_PATH}` 下创建以下目录结构:
|
||||
|
||||
```
|
||||
${PROMETHEUS_BASE_PATH}/
├── prometheus.yml          # 主配置文件
├── rules/                  # 告警规则目录
│   └── *.yml
└── targets/                # 监控目标目录
    ├── node_exporter.json
    └── dcgm_exporter.json
|
||||
```
|
||||
|
||||
## 动态配置
|
||||
|
||||
- **规则文件**: 在 `rules/` 目录下添加 `.yml` 文件即可自动加载
|
||||
- **监控目标**: 修改 `targets/` 目录下的 JSON 文件即可动态更新监控目标
|
||||
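- **主配置**: 修改 `prometheus.yml` 后可通过 Prometheus 的 `/-/reload` 端点重新加载配置;注意启动脚本每次启动都会根据模板 `/etc/prometheus/prometheus.yml` 重新生成 `${PROMETHEUS_BASE_PATH}/prometheus.yml`,手动修改会在容器重启后被覆盖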
|
||||
|
||||
## 权限管理
|
||||
|
||||
### 默认路径权限
|
||||
- 默认路径 `/private/argus/metric/prometheus` 在 Dockerfile 中已设置正确的权限
|
||||
- `nobody` 用户(UID: 2133)和 `nogroup` 组(GID: 2015)拥有完全读写权限
|
||||
|
||||
### 自定义路径权限
|
||||
- 当使用自定义 `PROMETHEUS_BASE_PATH` 时,启动脚本以 nobody 用户运行,不会修改目录的属主或权限;请确保该路径(通常是挂载卷)已存在
|
||||
- 确保 nobody 用户对自定义路径有读写权限
|
||||
|
||||
### 挂载卷注意事项
|
||||
1. **主机目录权限**: 确保挂载的主机目录对 nobody 用户(UID: 2133)可写
|
||||
2. **SELinux**: 如果使用 SELinux,可能需要设置适当的上下文
|
||||
3. **Docker 用户映射**: 确保容器内的 nobody 用户与主机用户权限匹配
|
||||
|
||||
## 故障排除
|
||||
|
||||
### 权限问题
|
||||
如果遇到权限错误,可以检查:
|
||||
```bash
|
||||
# 检查目录权限
|
||||
ls -la /path/to/prometheus/data
|
||||
|
||||
# 检查用户映射
|
||||
id nobody
|
||||
|
||||
# 手动修复权限
|
||||
chown -R 2133:2015 /path/to/prometheus/data
|
||||
chmod -R 755 /path/to/prometheus/data
|
||||
```
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. 确保挂载的目录有适当的读写权限
|
||||
2. 配置文件会在容器启动时自动生成,无需手动创建
|
||||
3. 可以通过修改环境变量 `PROMETHEUS_BASE_PATH` 来改变配置、规则和监控目标文件的路径,无需重新构建镜像;但 TSDB 数据目录固定为 `/prometheus`,它在构建镜像时被链接到默认基础路径 `/private/argus/metric/prometheus`
|
||||
4. 自定义路径的目录会在启动时自动创建并设置权限
|
@ -1,15 +1,27 @@
|
||||
# Prometheus configuration template.
# The literal text ${PROMETHEUS_BASE_PATH} is a placeholder: the start script
# (start-prometheus-supervised.sh) substitutes the real base path with sed and
# writes the result to <base path>/prometheus.yml before Prometheus starts.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  scrape_timeout: 10s

# AlertManager integration (no targets configured yet).
alerting:
  alertmanagers:
    - static_configs:
        - targets: []

# Rule files directory.
rule_files:
  - "${PROMETHEUS_BASE_PATH}/rules/*.yml"

# Scrape targets come from file-based service discovery. Only the absolute,
# base-path-qualified files are listed; the earlier relative "targets/..."
# entries pointed at the same files and are dropped.
scrape_configs:
  - job_name: "node"
    file_sd_configs:
      - files:
          - "${PROMETHEUS_BASE_PATH}/targets/node_exporter.json"
        refresh_interval: 30s

  - job_name: "dcgm"
    file_sd_configs:
      - files:
          - "${PROMETHEUS_BASE_PATH}/targets/dcgm_exporter.json"
        refresh_interval: 30s
|
||||
|
26
src/metric/prometheus/start-prometheus-supervised.sh
Normal file
26
src/metric/prometheus/start-prometheus-supervised.sh
Normal file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Start script that supervisor runs to launch Prometheus.
#
# Steps:
#   1. Resolve PROMETHEUS_BASE_PATH (defaults to /private/argus/metric/prometheus).
#   2. Render the template /etc/prometheus/prometheus.yml into
#      ${PROMETHEUS_BASE_PATH}/prometheus.yml, replacing the literal
#      ${PROMETHEUS_BASE_PATH} placeholder with the real path.
#   3. Write this container's eth0 address to /private/argus/etc/${DOMAIN}.
#   4. exec Prometheus so it replaces this shell as the supervised process.
set -euo pipefail

echo "[INFO] Starting Prometheus under supervisor..."

PROMETHEUS_BASE_PATH=${PROMETHEUS_BASE_PATH:-/private/argus/metric/prometheus}
DOMAIN=prom.metric.argus.com
CONFIG_TEMPLATE=/etc/prometheus/prometheus.yml
CONFIG_FILE="${PROMETHEUS_BASE_PATH}/prometheus.yml"

echo "[INFO] Prometheus base path: ${PROMETHEUS_BASE_PATH}"

# Make sure the base path and its rules/ and targets/ subdirectories exist so
# a custom PROMETHEUS_BASE_PATH works without manual preparation. A failure is
# only warned about here: if the path is truly unusable, the config write
# below fails and stops the script.
if ! mkdir -p "${PROMETHEUS_BASE_PATH}/rules" "${PROMETHEUS_BASE_PATH}/targets"; then
    echo "[WARN] Could not create rules/ and targets/ under ${PROMETHEUS_BASE_PATH}" >&2
fi

# Generate the configuration file.
echo "[INFO] Generating prometheus.yml with base path: ${PROMETHEUS_BASE_PATH}"
# Escape the characters that are special in a sed replacement using '|' as the
# delimiter (&, | and \) so an unusual path cannot corrupt the output.
escaped_base_path=$(printf '%s' "${PROMETHEUS_BASE_PATH}" | sed 's/[&|\\]/\\&/g')
sed "s|\${PROMETHEUS_BASE_PATH}|${escaped_base_path}|g" \
    "${CONFIG_TEMPLATE}" > "${CONFIG_FILE}"

# Record the container IP (requires ifconfig and an eth0 interface).
IP=$(ifconfig eth0 | awk '/inet /{print $2}')
echo "current IP: ${IP}"
echo "${IP}" > "/private/argus/etc/${DOMAIN}"

# --web.enable-lifecycle turns on the HTTP reload endpoint mentioned in the README.
exec /bin/prometheus \
    --config.file="${CONFIG_FILE}" \
    --storage.tsdb.path=/prometheus \
    --web.enable-lifecycle \
    --web.console.libraries=/usr/share/prometheus/console_libraries \
    --web.console.templates=/usr/share/prometheus/consoles
|
27
src/metric/prometheus/supervisord.conf
Normal file
27
src/metric/prometheus/supervisord.conf
Normal file
@ -0,0 +1,27 @@
|
||||
; supervisor configuration for the Prometheus container.

; Control socket used by supervisorctl.
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

; The supervisord daemon itself: stays in the foreground and runs as root.
[supervisord]
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root

; RPC interface that supervisorctl talks to over the socket above.
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

; Prometheus, launched through the start script as the nobody account.
; stdout and stderr go to separate files under /var/log/supervisor.
[program:prometheus]
command=/usr/local/bin/start-prometheus-supervised.sh
user=nobody
stdout_logfile=/var/log/supervisor/prometheus.log
stderr_logfile=/var/log/supervisor/prometheus_error.log
autorestart=true
startretries=3
startsecs=30
stopwaitsecs=30
killasgroup=true
stopasgroup=true
|
Loading…
x
Reference in New Issue
Block a user