[#37] 修复dcgm exporter启动

This commit is contained in:
yuyr 2025-11-07 17:29:06 +08:00
parent 1819fb9c46
commit dac180f12b
8 changed files with 193 additions and 12 deletions

View File

@ -1 +1 @@
1.42.0
1.43.0

View File

@ -14,6 +14,16 @@ log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
# 运行时开关(可通过环境变量覆盖)
# 1) 是否自动启动 nv-hostengine(容器内通常没有 systemd)
AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
# 2) 是否默认禁用 Profiling 指标(避免在部分环境触发 DCGM Profiling 崩溃)
DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
# 3) 自定义 collectors 文件;若为空且禁用 Profiling,则自动生成 no-prof 清单
DCGM_EXPORTER_COLLECTORS="${DCGM_EXPORTER_COLLECTORS:-}"
# 4) 监听地址
DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
@ -160,10 +170,21 @@ check_dcgm_service() {
elif pgrep -f nv-hostengine > /dev/null; then
log_success "nv-hostengine 进程已在运行"
else
log_warning "DCGM 服务未运行,需要手动启动"
log_info "启动 DCGM 服务的方法:"
log_info " 1. 使用 systemd: sudo systemctl start dcgm"
log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &"
log_warning "DCGM 服务未运行"
if [[ "${AUTO_START_DCGM}" == "1" ]]; then
log_info "尝试自动启动 nv-hostengine容器内无 systemd 场景)..."
nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &
sleep 2
if pgrep -f nv-hostengine >/dev/null; then
log_success "nv-hostengine 已启动"
else
log_error "nv-hostengine 启动失败,请手动检查 /var/log/nv-hostengine.log"
fi
else
log_info "启动 DCGM 服务的方法:"
log_info " 1. 使用 systemd: sudo systemctl start dcgm"
log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &"
fi
fi
# 测试 DCGM 连接
@ -172,7 +193,7 @@ check_dcgm_service() {
if dcgmi discovery -l > /dev/null 2>&1; then
log_success "DCGM 连接测试成功"
else
log_warning "DCGM 连接测试失败,请检查服务状态"
log_warning "DCGM 连接测试失败,请检查服务状态(驱动/权限/设备可见性)"
fi
fi
}
@ -269,6 +290,7 @@ start_dcgm_exporter() {
local binary_path="/usr/local/bin/dcgm-exporter"
local log_file="/var/log/dcgm-exporter.log"
local pid_file="/var/run/dcgm-exporter.pid"
local collectors_arg=""
# 检查服务是否已经在运行
if [[ -f "$pid_file" ]]; then
@ -282,15 +304,48 @@ start_dcgm_exporter() {
fi
fi
# 计算 collectors 参数
if [[ -n "${DCGM_EXPORTER_COLLECTORS}" ]]; then
if [[ -f "${DCGM_EXPORTER_COLLECTORS}" ]]; then
collectors_arg=(--collectors "${DCGM_EXPORTER_COLLECTORS}")
log_info "使用自定义 collectors: ${DCGM_EXPORTER_COLLECTORS}"
else
log_warning "指定的 DCGM_EXPORTER_COLLECTORS 文件不存在: ${DCGM_EXPORTER_COLLECTORS}(将忽略)"
fi
elif [[ "${DCGM_EXPORTER_DISABLE_PROFILING}" == "1" ]]; then
local cfg_dir="/etc/dcgm-exporter"
local default_cfg="${cfg_dir}/default-counters.csv"
local no_prof_cfg="${cfg_dir}/no-prof.csv"
mkdir -p "${cfg_dir}"
if [[ -f "${default_cfg}" ]]; then
grep -v 'DCGM_FI_PROF_' "${default_cfg}" > "${no_prof_cfg}" || true
collectors_arg=(--collectors "${no_prof_cfg}")
log_info "已生成无 Profiling 的 collectors: ${no_prof_cfg}"
else
log_warning "未找到默认 collectors 文件: ${default_cfg}"
fi
fi
# 检查端口是否被占用
if netstat -tuln 2>/dev/null | grep -q ":9400 "; then
if netstat -tuln 2>/dev/null | grep -q ":${DCGM_EXPORTER_LISTEN#:} "; then
log_warning "端口 9400 已被占用,请检查是否有其他服务在运行"
return 1
fi
# 启动前再校验一次 DCGM 主机引擎
if ! (systemctl is-active --quiet dcgm 2>/dev/null || pgrep -f nv-hostengine >/dev/null); then
log_warning "nv-hostengine 未运行,尝试自动启动"
nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &
sleep 2
fi
# 启动服务
log_info "正在启动 DCGM Exporter..."
nohup "$binary_path" --address=:9400 > "$log_file" 2>&1 &
if [[ ${#collectors_arg[@]} -gt 0 ]]; then
nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" "${collectors_arg[@]}" > "$log_file" 2>&1 &
else
nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" > "$log_file" 2>&1 &
fi
local pid=$!
# 保存 PID
@ -310,6 +365,20 @@ start_dcgm_exporter() {
else
log_error "DCGM Exporter 服务启动失败"
rm -f "$pid_file"
# 失败回退:若未禁用 Profiling,也未指定 collectors,则尝试自动回退到 no-prof 再起一次
if [[ -z "${DCGM_EXPORTER_COLLECTORS}" && "${DCGM_EXPORTER_DISABLE_PROFILING}" != "1" ]]; then
log_warning "尝试以无 Profiling 清单回退启动"
local cfg_dir="/etc/dcgm-exporter"; local default_cfg="${cfg_dir}/default-counters.csv"; local no_prof_cfg="${cfg_dir}/no-prof.csv"
if [[ -f "${default_cfg}" ]]; then
grep -v 'DCGM_FI_PROF_' "${default_cfg}" > "${no_prof_cfg}" || true
nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" --collectors "${no_prof_cfg}" > "$log_file" 2>&1 &
sleep 2
if pgrep -f dcgm-exporter >/dev/null; then
log_success "DCGM Exporter 已用无 Profiling 清单启动"
return 0
fi
fi
fi
return 1
fi
}

View File

@ -40,7 +40,15 @@ else
# run component installer within version dir
if [[ -f "$version_dir/install.sh" ]]; then
chmod +x "$version_dir/install.sh" 2>/dev/null || true
(cd "$version_dir" && ./install.sh "$version_dir")
# 传递运行时开关:容器内缺省启用 AUTO_START_DCGM=1、禁用 Profiling可通过环境变量覆盖
# 注意:不能用 `VAR=.. VAR2=.. (cmd)` 前缀到子 shell;bash 不允许 env 赋值直接修饰 `(` 复合命令。
# 因此改为在子 subshell 中 export 后再执行。
(
export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
cd "$version_dir" && ./install.sh "$version_dir"
)
echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
@ -75,7 +83,19 @@ if ! pgrep -x argus-agent >/dev/null 2>&1; then
setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
fi
# 5) post-install selfcheck (best-effort) and wait for node.json
# 5) 若 dcgm-exporter 未监听(可能因 Profiling 崩溃),尝试无 Profiling 清单回退启动
if ! ss -tlnp 2>/dev/null | grep -q ":9400 "; then
echo "[BOOT] dcgm-exporter not listening; trying no-prof fallback"
pgrep -f nv-hostengine >/dev/null || (nohup nv-hostengine >/var/log/nv-hostengine.log 2>&1 & sleep 2)
cfg_dir="/etc/dcgm-exporter"; default_cfg="$cfg_dir/default-counters.csv"; no_prof_cfg="$cfg_dir/no-prof.csv"
if [[ -f "$default_cfg" ]]; then
grep -v 'DCGM_FI_PROF_' "$default_cfg" > "$no_prof_cfg" || true
pkill -f dcgm-exporter >/dev/null 2>&1 || true
nohup /usr/local/bin/dcgm-exporter --address="${DCGM_EXPORTER_LISTEN:-:9400}" --collectors "$no_prof_cfg" >/var/log/dcgm-exporter.log 2>&1 &
fi
fi
# 6) post-install selfcheck (best-effort) and wait for node.json
for i in {1..30}; do
if compgen -G "$INSTALL_DIR/versions/*/check_health.sh" > /dev/null; then
bash "$INSTALL_DIR"/versions/*/check_health.sh || true

View File

@ -3,6 +3,8 @@ FTPIP=10.0.4.29
MASTER_ENDPOINT=http://master.argus.com:3000
FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234!
AGENT_ENV=dev2
AGENT_ENV=lm1
AGENT_USER=yuyr
AGENT_INSTANCE=node001sX
NODE_HOSTNAME=lm1
GPU_NODE_HOSTNAME=lm1

View File

@ -50,3 +50,45 @@ bash scripts/99_down.sh
- Grafana/Kibana 启动报权限:检查 `configs/build_user.local.conf``00_bootstrap.sh` 的输出 UID/GID 是否一致;必要时设置 `SWARM_FIX_PERMS=1` 重新 `01_server_up.sh`
- 节点容器 fallback 到 FTP通常为 bundle 结构异常或健康检查失败(早期脚本在 `sh` 下执行)。当前 `node-bootstrap.sh` 已使用 `bash` 执行健康检查,并在本地安装成功后跳过 FTP。
- 代理 502查看容器 `argus-web-proxy``/var/log/nginx/error.log` 与启动日志中 `upstream check` 行;若后端未就绪(尤其 Kibana等待 `02_wait_ready.sh` 通过后再访问。
### 在 worker 上用 compose 起 GPU 节点的网络预热(overlay not found)
在多机 Swarm 场景,如果在 worker`lm1`)上直接运行 `05_gpu_node_up.sh``docker compose` 对 external overlay `argus-sys-net` 的本地预检查可能报错 `network ... not found`。这是因为 worker 尚未在本地“加入”该 overlay。
Workaround:先在 worker 启一个临时容器加入 overlay 进行“网络预热”,随后再运行 GPU compose。
```
# 在 worker 节点lm1
cd src/sys/swarm_tests
set -a; source .env; source .env.nodes; set +a
# 预热 overlay默认 600s 超时自动退出,可重复执行)
bash scripts/05a_net_warmup.sh
# 然后再启动 GPU 节点
bash scripts/05_gpu_node_up.sh
```
清理时 `scripts/99_down.sh` 会顺带移除预热容器 `argus-net-warmup`
更推荐的做法是改用 `docker stack deploy` 由 manager 调度 GPU 节点(支持渐进式扩容与节点约束),详见 `specs/issues/2025-11-07-swarm-compose-worker-overlay-network-not-found-lm1.md`
### 可选Stack 部署 GPU 节点manager 上执行)
前置:已在 manager(lm2)完成 `00_bootstrap.sh` 与 `01_server_up.sh`,并通过 `02_wait_ready.sh` 生成 `.env.nodes`;给目标 GPU 节点打标签 `argus.gpu=true`。
```
cd src/sys/swarm_tests
# 给 GPU 节点打标签(示例)
docker node update --label-add argus.gpu=true lm1
# 可按需覆盖挂载路径(每个 GPU 节点都需存在同一路径)
export AGENT_VOLUME_PATH=/data1/yuyr/dev/argus/src/sys/swarm_tests/private-gpu-nodes/argus/agent
# 在 manager 上部署global 模式,自动在打标节点各拉起 1 副本)
bash scripts/05b_gpu_stack_deploy.sh
# 查看
docker stack services argus-swarm-gpu
docker stack ps argus-swarm-gpu
```
移除 stack`docker stack rm argus-swarm-gpu`(不会删除 overlay 网络与数据目录)。

View File

@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Warm up an external Swarm overlay network on a worker node.
#
# Problem: `docker compose` on a worker fails its local pre-check for an
# external overlay (`network ... not found`) until the worker's engine has
# actually joined that overlay. Starting any container attached to the
# network forces the join.
#
# Env vars (all optional):
#   NET_NAME        overlay network to join      (default: argus-sys-net)
#   WARMUP_NAME     name of the warmup container (default: argus-net-warmup)
#   WARMUP_IMAGE    image to run                 (default: busybox:1.36)
#   WARMUP_SECONDS  how long the container lives (default: 600)
#   BINDIP          if set, used as --dns for the warmup container
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Best-effort: source project env files so NET_NAME / BINDIP etc. are picked up.
ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; }
ENV_NODES_FILE="$ROOT/.env.nodes"; [[ -f "$ENV_NODES_FILE" ]] && { set -a; source "$ENV_NODES_FILE"; set +a; }

NET_NAME="${NET_NAME:-argus-sys-net}"
WARMUP_NAME="${WARMUP_NAME:-argus-net-warmup}"
WARMUP_IMAGE="${WARMUP_IMAGE:-busybox:1.36}"
WARMUP_SECONDS="${WARMUP_SECONDS:-600}"

echo "[NET] warming up overlay network on worker: ${NET_NAME}"

# -F/-x: literal whole-line match, so regex metacharacters in WARMUP_NAME
# (e.g. dots) cannot cause false positives against other container names.
if docker ps --format '{{.Names}}' | grep -Fxq "${WARMUP_NAME}"; then
  echo "[NET] warmup container already running: ${WARMUP_NAME}"
else
  docker image inspect "$WARMUP_IMAGE" >/dev/null 2>&1 || docker pull "$WARMUP_IMAGE"
  # Capture docker's exit status without tripping `set -e`.
  set +e
  docker run -d --rm \
    --name "$WARMUP_NAME" \
    --network "$NET_NAME" \
    ${BINDIP:+--dns "$BINDIP"} \
    "$WARMUP_IMAGE" sleep "$WARMUP_SECONDS"
  rc=$?
  set -e
  if [[ $rc -ne 0 ]]; then
    echo "[ERR] failed to start warmup container on network ${NET_NAME}. Is the overlay created with --attachable on manager?" >&2
    exit 1
  fi
fi

# Poll until the local engine can inspect the overlay (joining is async).
echo "[NET] waiting for local engine to see network (${NET_NAME})"
for _ in {1..60}; do
  if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
    echo "[NET] overlay visible locally now. You can run GPU compose."
    docker network ls | grep -E "\b${NET_NAME}\b" || true
    exit 0
  fi
  sleep 1
done

# Non-fatal: the warmup container is attached, so compose will usually succeed
# even if `network inspect` still lags on this engine.
echo "[WARN] network still not inspectable locally after 60s, but warmup container is running. Compose may still pass; proceed to run GPU compose and retry if needed." >&2
exit 0

View File

@ -11,6 +11,9 @@ docker compose -p "${NODES_PROJECT:-argus-swarm-nodes}" -f "$ROOT/docker-compose
echo "[DOWN] stopping server compose"
docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true
echo "[DOWN] removing warmup container (if any)"
docker rm -f argus-net-warmup >/dev/null 2>&1 || true
echo "[DOWN] removing overlay network"
docker network rm argus-sys-net >/dev/null 2>&1 || true
@ -18,4 +21,3 @@ echo "[DOWN] cleanup temp files"
rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true
echo "[DOWN] done"

0
src/sys/tests/scripts/15_alert_verify.sh Normal file → Executable file
View File