[#37] 修复dcgm exporter启动

This commit is contained in:
yuyr 2025-11-07 17:29:06 +08:00
parent 1819fb9c46
commit dac180f12b
8 changed files with 193 additions and 12 deletions

View File

@ -1 +1 @@
1.42.0
1.43.0

View File

@ -14,6 +14,16 @@ log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
# 运行时开关(可通过环境变量覆盖)
# 1) 是否自动启动 nv-hostengine(容器内通常没有 systemd)
AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
# 2) 是否默认禁用 Profiling 指标(避免在部分环境触发 DCGM Profiling 崩溃)
DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
# 3) 自定义 collectors 文件;若为空且禁用 Profiling,则自动生成 no-prof 清单
DCGM_EXPORTER_COLLECTORS="${DCGM_EXPORTER_COLLECTORS:-}"
# 4) 监听地址
DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
@ -160,10 +170,21 @@ check_dcgm_service() {
elif pgrep -f nv-hostengine > /dev/null; then
log_success "nv-hostengine 进程已在运行"
else
log_warning "DCGM 服务未运行,需要手动启动"
log_info "启动 DCGM 服务的方法:"
log_info " 1. 使用 systemd: sudo systemctl start dcgm"
log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &"
log_warning "DCGM 服务未运行"
if [[ "${AUTO_START_DCGM}" == "1" ]]; then
log_info "尝试自动启动 nv-hostengine容器内无 systemd 场景)..."
nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &
sleep 2
if pgrep -f nv-hostengine >/dev/null; then
log_success "nv-hostengine 已启动"
else
log_error "nv-hostengine 启动失败,请手动检查 /var/log/nv-hostengine.log"
fi
else
log_info "启动 DCGM 服务的方法:"
log_info " 1. 使用 systemd: sudo systemctl start dcgm"
log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &"
fi
fi
# 测试 DCGM 连接
@ -172,7 +193,7 @@ check_dcgm_service() {
if dcgmi discovery -l > /dev/null 2>&1; then
log_success "DCGM 连接测试成功"
else
log_warning "DCGM 连接测试失败,请检查服务状态"
log_warning "DCGM 连接测试失败,请检查服务状态(驱动/权限/设备可见性)"
fi
fi
}
@ -269,6 +290,7 @@ start_dcgm_exporter() {
local binary_path="/usr/local/bin/dcgm-exporter"
local log_file="/var/log/dcgm-exporter.log"
local pid_file="/var/run/dcgm-exporter.pid"
local collectors_arg=""
# 检查服务是否已经在运行
if [[ -f "$pid_file" ]]; then
@ -282,15 +304,48 @@ start_dcgm_exporter() {
fi
fi
# 计算 collectors 参数
if [[ -n "${DCGM_EXPORTER_COLLECTORS}" ]]; then
if [[ -f "${DCGM_EXPORTER_COLLECTORS}" ]]; then
collectors_arg=(--collectors "${DCGM_EXPORTER_COLLECTORS}")
log_info "使用自定义 collectors: ${DCGM_EXPORTER_COLLECTORS}"
else
log_warning "指定的 DCGM_EXPORTER_COLLECTORS 文件不存在: ${DCGM_EXPORTER_COLLECTORS}(将忽略)"
fi
elif [[ "${DCGM_EXPORTER_DISABLE_PROFILING}" == "1" ]]; then
local cfg_dir="/etc/dcgm-exporter"
local default_cfg="${cfg_dir}/default-counters.csv"
local no_prof_cfg="${cfg_dir}/no-prof.csv"
mkdir -p "${cfg_dir}"
if [[ -f "${default_cfg}" ]]; then
grep -v 'DCGM_FI_PROF_' "${default_cfg}" > "${no_prof_cfg}" || true
collectors_arg=(--collectors "${no_prof_cfg}")
log_info "已生成无 Profiling 的 collectors: ${no_prof_cfg}"
else
log_warning "未找到默认 collectors 文件: ${default_cfg}"
fi
fi
# 检查端口是否被占用
if netstat -tuln 2>/dev/null | grep -q ":9400 "; then
if netstat -tuln 2>/dev/null | grep -q ":${DCGM_EXPORTER_LISTEN#:} "; then
log_warning "端口 9400 已被占用,请检查是否有其他服务在运行"
return 1
fi
# 启动前再校验一次 DCGM 主机引擎
if ! (systemctl is-active --quiet dcgm 2>/dev/null || pgrep -f nv-hostengine >/dev/null); then
log_warning "nv-hostengine 未运行,尝试自动启动"
nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &
sleep 2
fi
# 启动服务
log_info "正在启动 DCGM Exporter..."
nohup "$binary_path" --address=:9400 > "$log_file" 2>&1 &
if [[ ${#collectors_arg[@]} -gt 0 ]]; then
nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" "${collectors_arg[@]}" > "$log_file" 2>&1 &
else
nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" > "$log_file" 2>&1 &
fi
local pid=$!
# 保存 PID
@ -310,6 +365,20 @@ start_dcgm_exporter() {
else
log_error "DCGM Exporter 服务启动失败"
rm -f "$pid_file"
# 失败回退:若未禁用 Profiling,也未指定 collectors,则尝试自动回退到 no-prof 再起一次
if [[ -z "${DCGM_EXPORTER_COLLECTORS}" && "${DCGM_EXPORTER_DISABLE_PROFILING}" != "1" ]]; then
log_warning "尝试以无 Profiling 清单回退启动"
local cfg_dir="/etc/dcgm-exporter"; local default_cfg="${cfg_dir}/default-counters.csv"; local no_prof_cfg="${cfg_dir}/no-prof.csv"
if [[ -f "${default_cfg}" ]]; then
grep -v 'DCGM_FI_PROF_' "${default_cfg}" > "${no_prof_cfg}" || true
nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" --collectors "${no_prof_cfg}" > "$log_file" 2>&1 &
sleep 2
if pgrep -f dcgm-exporter >/dev/null; then
log_success "DCGM Exporter 已用无 Profiling 清单启动"
return 0
fi
fi
fi
return 1
fi
}

View File

@ -40,7 +40,15 @@ else
# run component installer within version dir
if [[ -f "$version_dir/install.sh" ]]; then
chmod +x "$version_dir/install.sh" 2>/dev/null || true
(cd "$version_dir" && ./install.sh "$version_dir")
# 传递运行时开关:容器内缺省启用 AUTO_START_DCGM=1、禁用 Profiling可通过环境变量覆盖
# 注意:不能用 `VAR=.. VAR2=.. (cmd)` 前缀到子 shell;bash 不允许 env 赋值直接修饰 `(` 复合命令。
# 因此改为在子 subshell 中 export 后再执行。
(
export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
cd "$version_dir" && ./install.sh "$version_dir"
)
echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
@ -75,7 +83,19 @@ if ! pgrep -x argus-agent >/dev/null 2>&1; then
setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
fi
# 5) post-install selfcheck (best-effort) and wait for node.json
# 5) 若 dcgm-exporter 未监听(可能因 Profiling 崩溃),尝试无 Profiling 清单回退启动
if ! ss -tlnp 2>/dev/null | grep -q ":9400 "; then
echo "[BOOT] dcgm-exporter not listening; trying no-prof fallback"
pgrep -f nv-hostengine >/dev/null || (nohup nv-hostengine >/var/log/nv-hostengine.log 2>&1 & sleep 2)
cfg_dir="/etc/dcgm-exporter"; default_cfg="$cfg_dir/default-counters.csv"; no_prof_cfg="$cfg_dir/no-prof.csv"
if [[ -f "$default_cfg" ]]; then
grep -v 'DCGM_FI_PROF_' "$default_cfg" > "$no_prof_cfg" || true
pkill -f dcgm-exporter >/dev/null 2>&1 || true
nohup /usr/local/bin/dcgm-exporter --address="${DCGM_EXPORTER_LISTEN:-:9400}" --collectors "$no_prof_cfg" >/var/log/dcgm-exporter.log 2>&1 &
fi
fi
# 6) post-install selfcheck (best-effort) and wait for node.json
for i in {1..30}; do
if compgen -G "$INSTALL_DIR/versions/*/check_health.sh" > /dev/null; then
bash "$INSTALL_DIR"/versions/*/check_health.sh || true

View File

@ -3,6 +3,8 @@ FTPIP=10.0.4.29
MASTER_ENDPOINT=http://master.argus.com:3000
FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234!
AGENT_ENV=dev2
AGENT_ENV=lm1
AGENT_USER=yuyr
AGENT_INSTANCE=node001sX
NODE_HOSTNAME=lm1
GPU_NODE_HOSTNAME=lm1

View File

@ -50,3 +50,45 @@ bash scripts/99_down.sh
- Grafana/Kibana 启动报权限:检查 `configs/build_user.local.conf``00_bootstrap.sh` 的输出 UID/GID 是否一致;必要时设置 `SWARM_FIX_PERMS=1` 重新 `01_server_up.sh`
- 节点容器 fallback 到 FTP通常为 bundle 结构异常或健康检查失败(早期脚本在 `sh` 下执行)。当前 `node-bootstrap.sh` 已使用 `bash` 执行健康检查,并在本地安装成功后跳过 FTP。
- 代理 502查看容器 `argus-web-proxy``/var/log/nginx/error.log` 与启动日志中 `upstream check` 行;若后端未就绪(尤其 Kibana等待 `02_wait_ready.sh` 通过后再访问。
### 在 worker 上用 compose 起 GPU 节点的网络预热(overlay not found)
在多机 Swarm 场景,如果在 worker`lm1`)上直接运行 `05_gpu_node_up.sh``docker compose` 对 external overlay `argus-sys-net` 的本地预检查可能报错 `network ... not found`。这是因为 worker 尚未在本地“加入”该 overlay。
Workaround:先在 worker 启一个临时容器加入 overlay 进行“网络预热”,随后再运行 GPU compose。
```
# 在 worker 节点lm1
cd src/sys/swarm_tests
set -a; source .env; source .env.nodes; set +a
# 预热 overlay默认 600s 超时自动退出,可重复执行)
bash scripts/05a_net_warmup.sh
# 然后再启动 GPU 节点
bash scripts/05_gpu_node_up.sh
```
清理时 `scripts/99_down.sh` 会顺带移除预热容器 `argus-net-warmup`
更推荐的做法是改用 `docker stack deploy` 由 manager 调度 GPU 节点(支持渐进式扩容与节点约束),详见 `specs/issues/2025-11-07-swarm-compose-worker-overlay-network-not-found-lm1.md`
### 可选Stack 部署 GPU 节点manager 上执行)
前置:已在 manager(lm2)完成 `00_bootstrap.sh` 与 `01_server_up.sh`,并通过 `02_wait_ready.sh` 生成 `.env.nodes`;给目标 GPU 节点打标签 `argus.gpu=true`。
```
cd src/sys/swarm_tests
# 给 GPU 节点打标签(示例)
docker node update --label-add argus.gpu=true lm1
# 可按需覆盖挂载路径(每个 GPU 节点都需存在同一路径)
export AGENT_VOLUME_PATH=/data1/yuyr/dev/argus/src/sys/swarm_tests/private-gpu-nodes/argus/agent
# 在 manager 上部署global 模式,自动在打标节点各拉起 1 副本)
bash scripts/05b_gpu_stack_deploy.sh
# 查看
docker stack services argus-swarm-gpu
docker stack ps argus-swarm-gpu
```
移除 stack`docker stack rm argus-swarm-gpu`(不会删除 overlay 网络与数据目录)。

View File

@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Warm up an external Swarm overlay network on a worker node.
#
# Problem: `docker compose` on a worker fails its local pre-check for an
# external overlay (`network ... not found`) until the worker's engine has
# actually joined that overlay. Starting any container attached to the
# network forces the join.
#
# Env vars (all optional):
#   NET_NAME        overlay network to join      (default: argus-sys-net)
#   WARMUP_NAME     name of the warmup container (default: argus-net-warmup)
#   WARMUP_IMAGE    image to run                 (default: busybox:1.36)
#   WARMUP_SECONDS  how long the container lives (default: 600)
#   BINDIP          if set, used as --dns for the warmup container
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Best-effort: source project env files so NET_NAME / BINDIP etc. are picked up.
ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; }
ENV_NODES_FILE="$ROOT/.env.nodes"; [[ -f "$ENV_NODES_FILE" ]] && { set -a; source "$ENV_NODES_FILE"; set +a; }

NET_NAME="${NET_NAME:-argus-sys-net}"
WARMUP_NAME="${WARMUP_NAME:-argus-net-warmup}"
WARMUP_IMAGE="${WARMUP_IMAGE:-busybox:1.36}"
WARMUP_SECONDS="${WARMUP_SECONDS:-600}"

echo "[NET] warming up overlay network on worker: ${NET_NAME}"

# -F/-x: literal whole-line match, so regex metacharacters in WARMUP_NAME
# (e.g. dots) cannot cause false positives against other container names.
if docker ps --format '{{.Names}}' | grep -Fxq "${WARMUP_NAME}"; then
  echo "[NET] warmup container already running: ${WARMUP_NAME}"
else
  docker image inspect "$WARMUP_IMAGE" >/dev/null 2>&1 || docker pull "$WARMUP_IMAGE"
  # Capture docker's exit status without tripping `set -e`.
  set +e
  docker run -d --rm \
    --name "$WARMUP_NAME" \
    --network "$NET_NAME" \
    ${BINDIP:+--dns "$BINDIP"} \
    "$WARMUP_IMAGE" sleep "$WARMUP_SECONDS"
  rc=$?
  set -e
  if [[ $rc -ne 0 ]]; then
    echo "[ERR] failed to start warmup container on network ${NET_NAME}. Is the overlay created with --attachable on manager?" >&2
    exit 1
  fi
fi

# Poll until the local engine can inspect the overlay (joining is async).
echo "[NET] waiting for local engine to see network (${NET_NAME})"
for _ in {1..60}; do
  if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
    echo "[NET] overlay visible locally now. You can run GPU compose."
    docker network ls | grep -E "\b${NET_NAME}\b" || true
    exit 0
  fi
  sleep 1
done

# Non-fatal: the warmup container is attached, so compose will usually succeed
# even if `network inspect` still lags on this engine.
echo "[WARN] network still not inspectable locally after 60s, but warmup container is running. Compose may still pass; proceed to run GPU compose and retry if needed." >&2
exit 0

View File

@ -11,6 +11,9 @@ docker compose -p "${NODES_PROJECT:-argus-swarm-nodes}" -f "$ROOT/docker-compose
echo "[DOWN] stopping server compose"
docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true
echo "[DOWN] removing warmup container (if any)"
docker rm -f argus-net-warmup >/dev/null 2>&1 || true
echo "[DOWN] removing overlay network"
docker network rm argus-sys-net >/dev/null 2>&1 || true
@ -18,4 +21,3 @@ echo "[DOWN] cleanup temp files"
rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true
echo "[DOWN] done"

0
src/sys/tests/scripts/15_alert_verify.sh Normal file → Executable file
View File