diff --git a/src/metric/client-plugins/all-in-one-full/config/VERSION b/src/metric/client-plugins/all-in-one-full/config/VERSION index a50908c..b978278 100644 --- a/src/metric/client-plugins/all-in-one-full/config/VERSION +++ b/src/metric/client-plugins/all-in-one-full/config/VERSION @@ -1 +1 @@ -1.42.0 +1.43.0 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh index 7c97d6b..93bde99 100755 --- a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh @@ -14,6 +14,16 @@ log_info() { echo -e "${BLUE}[INFO]${NC} $1" } +# Runtime switches (each can be overridden through an environment variable) +# 1) Whether to auto-start nv-hostengine (containers usually have no systemd) +AUTO_START_DCGM="${AUTO_START_DCGM:-1}" +# 2) Whether Profiling metrics are disabled by default (avoids DCGM Profiling crashes seen in some environments) +DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}" +# 3) Custom collectors file; when empty and Profiling is disabled, a no-prof list is generated automatically +DCGM_EXPORTER_COLLECTORS="${DCGM_EXPORTER_COLLECTORS:-}" +# 4) Listen address +DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}" + log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" } @@ -160,10 +170,21 @@ check_dcgm_service() { elif pgrep -f nv-hostengine > /dev/null; then log_success "nv-hostengine 进程已在运行" else - log_warning "DCGM 服务未运行,需要手动启动" - log_info "启动 DCGM 服务的方法:" - log_info " 1. 使用 systemd: sudo systemctl start dcgm" - log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &" + log_warning "DCGM 服务未运行" + if [[ "${AUTO_START_DCGM}" == "1" ]]; then + log_info "尝试自动启动 nv-hostengine(容器内无 systemd 场景)..." + nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 & + sleep 2 + if pgrep -f nv-hostengine >/dev/null; then + log_success "nv-hostengine 已启动" + else + log_error "nv-hostengine 启动失败,请手动检查 /var/log/nv-hostengine.log" + fi + else + log_info "启动 DCGM 服务的方法:" + log_info " 1. 使用 systemd: sudo systemctl start dcgm" + log_info " 2. 
手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &" + fi fi # 测试 DCGM 连接 @@ -172,7 +193,7 @@ check_dcgm_service() { if dcgmi discovery -l > /dev/null 2>&1; then log_success "DCGM 连接测试成功" else - log_warning "DCGM 连接测试失败,请检查服务状态" + log_warning "DCGM 连接测试失败,请检查服务状态(驱动/权限/设备可见性)" fi fi } @@ -269,6 +290,7 @@ start_dcgm_exporter() { local binary_path="/usr/local/bin/dcgm-exporter" local log_file="/var/log/dcgm-exporter.log" local pid_file="/var/run/dcgm-exporter.pid" + local -a collectors_arg=() # real array: holds 0 elements until a collectors file is chosen, so the length test below is meaningful # 检查服务是否已经在运行 if [[ -f "$pid_file" ]]; then @@ -282,15 +304,48 @@ start_dcgm_exporter() { fi fi + # Work out the --collectors argument + if [[ -n "${DCGM_EXPORTER_COLLECTORS}" ]]; then + if [[ -f "${DCGM_EXPORTER_COLLECTORS}" ]]; then + collectors_arg=(--collectors "${DCGM_EXPORTER_COLLECTORS}") + log_info "使用自定义 collectors: ${DCGM_EXPORTER_COLLECTORS}" + else + log_warning "指定的 DCGM_EXPORTER_COLLECTORS 文件不存在: ${DCGM_EXPORTER_COLLECTORS}(将忽略)" + fi + elif [[ "${DCGM_EXPORTER_DISABLE_PROFILING}" == "1" ]]; then + local cfg_dir="/etc/dcgm-exporter" + local default_cfg="${cfg_dir}/default-counters.csv" + local no_prof_cfg="${cfg_dir}/no-prof.csv" + mkdir -p "${cfg_dir}" + if [[ -f "${default_cfg}" ]]; then + grep -v 'DCGM_FI_PROF_' "${default_cfg}" > "${no_prof_cfg}" || true + collectors_arg=(--collectors "${no_prof_cfg}") + log_info "已生成无 Profiling 的 collectors: ${no_prof_cfg}" + else + log_warning "未找到默认 collectors 文件: ${default_cfg}" + fi + fi + # 检查端口是否被占用 - if netstat -tuln 2>/dev/null | grep -q ":9400 "; then + if netstat -tuln 2>/dev/null | grep -q ":${DCGM_EXPORTER_LISTEN##*:} "; then # keep only the text after the last ':' so host:port listen values match too log_warning "端口 9400 已被占用,请检查是否有其他服务在运行" return 1 fi + # Re-check the DCGM host engine right before launching the exporter + if ! (systemctl is-active --quiet dcgm 2>/dev/null || pgrep -f nv-hostengine >/dev/null); then + log_warning "nv-hostengine 未运行,尝试自动启动" + nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 & + sleep 2 + fi + # 启动服务 log_info "正在启动 DCGM Exporter..." 
- nohup "$binary_path" --address=:9400 > "$log_file" 2>&1 & + if [[ ${#collectors_arg[@]} -gt 0 ]]; then + nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" "${collectors_arg[@]}" > "$log_file" 2>&1 & + else + nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" > "$log_file" 2>&1 & + fi local pid=$! # 保存 PID @@ -310,6 +365,22 @@ start_dcgm_exporter() { else log_error "DCGM Exporter 服务启动失败" rm -f "$pid_file" + # Failure fallback: if Profiling was not disabled and no collectors file was specified, retry once with the no-prof list + if [[ -z "${DCGM_EXPORTER_COLLECTORS}" && "${DCGM_EXPORTER_DISABLE_PROFILING}" != "1" ]]; then + log_warning "尝试以无 Profiling 清单回退启动" + local cfg_dir="/etc/dcgm-exporter"; local default_cfg="${cfg_dir}/default-counters.csv"; local no_prof_cfg="${cfg_dir}/no-prof.csv" + if [[ -f "${default_cfg}" ]]; then + grep -v 'DCGM_FI_PROF_' "${default_cfg}" > "${no_prof_cfg}" || true + nohup "$binary_path" --address="${DCGM_EXPORTER_LISTEN}" --collectors "${no_prof_cfg}" > "$log_file" 2>&1 & + local fallback_pid=$! # PID of the relaunched exporter + sleep 2 + if kill -0 "$fallback_pid" 2>/dev/null; then # probe that exact PID; pgrep -f would also match any other command line containing "dcgm-exporter" + echo "$fallback_pid" > "$pid_file" # the pid file was removed above; NOTE(review): assumes it holds just the PID, confirm against the main start path + log_success "DCGM Exporter 已用无 Profiling 清单启动" + return 0 + fi + fi + fi return 1 fi } diff --git a/src/sys/build/node-bundle/node-bootstrap.sh b/src/sys/build/node-bundle/node-bootstrap.sh index ab1a501..0b2db6f 100644 --- a/src/sys/build/node-bundle/node-bootstrap.sh +++ b/src/sys/build/node-bundle/node-bootstrap.sh @@ -40,7 +40,15 @@ else # run component installer within version dir if [[ -f "$version_dir/install.sh" ]]; then chmod +x "$version_dir/install.sh" 2>/dev/null || true - (cd "$version_dir" && ./install.sh "$version_dir") + # Pass the runtime switches: by default AUTO_START_DCGM=1 and Profiling disabled inside containers (overridable via environment variables) + # 注意:不能用 `VAR=.. VAR2=.. 
(cmd)` 前缀到子 shell;bash 不允许 env 赋值直接修饰 `(` 复合命令。 + # 因此改为在子 subshell 中 export 后再执行。 + ( + export AUTO_START_DCGM="${AUTO_START_DCGM:-1}" + export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}" + export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}" + cd "$version_dir" && ./install.sh "$version_dir" + ) echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then @@ -75,7 +83,20 @@ if ! pgrep -x argus-agent >/dev/null 2>&1; then setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null & fi -# 5) post-install selfcheck (best-effort) and wait for node.json +# 5) If dcgm-exporter is not listening (possibly crashed by Profiling), relaunch it with the no-Profiling collectors list +dcgm_port="${DCGM_EXPORTER_LISTEN:-:9400}"; dcgm_port="${dcgm_port##*:}" # probe the same port the relaunch below binds to; falls back to netstat when ss is not installed +if ! { ss -tln 2>/dev/null || netstat -tln 2>/dev/null; } | grep -q ":${dcgm_port} "; then + echo "[BOOT] dcgm-exporter not listening; trying no-prof fallback" + pgrep -f nv-hostengine >/dev/null || (nohup nv-hostengine >/var/log/nv-hostengine.log 2>&1 & sleep 2) + cfg_dir="/etc/dcgm-exporter"; default_cfg="$cfg_dir/default-counters.csv"; no_prof_cfg="$cfg_dir/no-prof.csv" + if [[ -f "$default_cfg" ]]; then + grep -v 'DCGM_FI_PROF_' "$default_cfg" > "$no_prof_cfg" || true + pkill -f dcgm-exporter >/dev/null 2>&1 || true + nohup /usr/local/bin/dcgm-exporter --address="${DCGM_EXPORTER_LISTEN:-:9400}" --collectors "$no_prof_cfg" >/var/log/dcgm-exporter.log 2>&1 & + fi +fi + +# 6) post-install selfcheck (best-effort) and wait for node.json for i in {1..30}; do if compgen -G "$INSTALL_DIR/versions/*/check_health.sh" > /dev/null; then bash "$INSTALL_DIR"/versions/*/check_health.sh || true diff --git a/src/sys/swarm_tests/.env.nodes.template b/src/sys/swarm_tests/.env.nodes.template index 7004b30..b28e9bf 100644 --- a/src/sys/swarm_tests/.env.nodes.template +++ b/src/sys/swarm_tests/.env.nodes.template @@ -3,6 +3,8 @@ FTPIP=10.0.4.29 MASTER_ENDPOINT=http://master.argus.com:3000 FTP_USER=ftpuser 
FTP_PASSWORD=ZGClab1234! -AGENT_ENV=dev2 +AGENT_ENV=lm1 AGENT_USER=yuyr AGENT_INSTANCE=node001sX +NODE_HOSTNAME=lm1 +GPU_NODE_HOSTNAME=lm1 \ No newline at end of file diff --git a/src/sys/swarm_tests/README.md b/src/sys/swarm_tests/README.md index 0d82f33..abbec3e 100644 --- a/src/sys/swarm_tests/README.md +++ b/src/sys/swarm_tests/README.md @@ -50,3 +50,45 @@ bash scripts/99_down.sh - Grafana/Kibana 启动报权限:检查 `configs/build_user.local.conf` 与 `00_bootstrap.sh` 的输出 UID/GID 是否一致;必要时设置 `SWARM_FIX_PERMS=1` 重新 `01_server_up.sh`。 - 节点容器 fallback 到 FTP:通常为 bundle 结构异常或健康检查失败(早期脚本在 `sh` 下执行)。当前 `node-bootstrap.sh` 已使用 `bash` 执行健康检查,并在本地安装成功后跳过 FTP。 - 代理 502:查看容器 `argus-web-proxy` 的 `/var/log/nginx/error.log` 与启动日志中 `upstream check` 行;若后端未就绪(尤其 Kibana),等待 `02_wait_ready.sh` 通过后再访问。 + +### 在 worker 上用 compose 起 GPU 节点的网络预热(overlay not found) +在多机 Swarm 场景,如果在 worker(如 `lm1`)上直接运行 `05_gpu_node_up.sh`,`docker compose` 对 external overlay `argus-sys-net` 的本地预检查可能报错 `network ... not found`。这是因为 worker 尚未在本地“加入”该 overlay。 + +Workaround:先在 worker 启一个临时容器加入 overlay 进行“网络预热”,随后再运行 GPU compose。 + +``` +# 在 worker 节点(lm1) +cd src/sys/swarm_tests +set -a; source .env; source .env.nodes; set +a + +# 预热 overlay(默认 600s 超时自动退出,可重复执行) +bash scripts/05a_net_warmup.sh + +# 然后再启动 GPU 节点 +bash scripts/05_gpu_node_up.sh +``` + +清理时 `scripts/99_down.sh` 会顺带移除预热容器 `argus-net-warmup`。 + +更推荐的做法是改用 `docker stack deploy` 由 manager 调度 GPU 节点(支持渐进式扩容与节点约束),详见 `specs/issues/2025-11-07-swarm-compose-worker-overlay-network-not-found-lm1.md`。 + +### (可选)Stack 部署 GPU 节点(manager 上执行) +前置:已在 manager(lm2)完成 `00_bootstrap.sh` 与 `01_server_up.sh`,并通过 `02_wait_ready.sh` 生成 `.env.nodes`;给目标 GPU 节点打标签 `argus.gpu=true`。 + +``` +cd src/sys/swarm_tests +# 给 GPU 节点打标签(示例) +docker node update --label-add argus.gpu=true lm1 + +# 可按需覆盖挂载路径(每个 GPU 节点都需存在同一路径) +export AGENT_VOLUME_PATH=/data1/yuyr/dev/argus/src/sys/swarm_tests/private-gpu-nodes/argus/agent + +# 在 manager 上部署(global 模式,自动在打标节点各拉起 1 副本) +bash 
scripts/05b_gpu_stack_deploy.sh + +# 查看 +docker stack services argus-swarm-gpu +docker stack ps argus-swarm-gpu +``` + +移除 stack:`docker stack rm argus-swarm-gpu`(不会删除 overlay 网络与数据目录)。 diff --git a/src/sys/swarm_tests/scripts/05a_net_warmup.sh b/src/sys/swarm_tests/scripts/05a_net_warmup.sh new file mode 100755 index 0000000..4048c8e --- /dev/null +++ b/src/sys/swarm_tests/scripts/05a_net_warmup.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Attach a short-lived container to the overlay network so the local Docker engine on this worker can see it +# before docker compose runs its external-network pre-check. Env: NET_NAME, WARMUP_NAME, WARMUP_IMAGE, WARMUP_SECONDS, BINDIP. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; } +ENV_NODES_FILE="$ROOT/.env.nodes"; [[ -f "$ENV_NODES_FILE" ]] && { set -a; source "$ENV_NODES_FILE"; set +a; } + +NET_NAME="${NET_NAME:-argus-sys-net}" +WARMUP_NAME="${WARMUP_NAME:-argus-net-warmup}" +WARMUP_IMAGE="${WARMUP_IMAGE:-busybox:1.36}" +WARMUP_SECONDS="${WARMUP_SECONDS:-600}" + +echo "[NET] warming up overlay network on worker: ${NET_NAME}" + +if docker ps --format '{{.Names}}' | grep -qxF -- "${WARMUP_NAME}"; then + echo "[NET] warmup container already running: ${WARMUP_NAME}" +else + docker image inspect "$WARMUP_IMAGE" >/dev/null 2>&1 || docker pull "$WARMUP_IMAGE" + set +e + docker run -d --rm \ + --name "$WARMUP_NAME" \ + --network "$NET_NAME" \ + ${BINDIP:+--dns "$BINDIP"} \ + "$WARMUP_IMAGE" sleep "$WARMUP_SECONDS" + rc=$? + set -e + if [[ $rc -ne 0 ]]; then + echo "[ERR] failed to start warmup container on network ${NET_NAME}. Is the overlay created with --attachable on manager?" >&2 + exit 1 + fi +fi + +echo "[NET] waiting for local engine to see network (${NET_NAME})" +for i in {1..60}; do + if docker network inspect "$NET_NAME" >/dev/null 2>&1; then + echo "[NET] overlay visible locally now. You can run GPU compose." + docker network ls | grep -wF -- "${NET_NAME}" || true + exit 0 + fi + sleep 1 +done + +echo "[WARN] network still not inspectable locally after 60s, but warmup container is running. 
Compose may still pass; proceed to run GPU compose and retry if needed." >&2 +exit 0 + diff --git a/src/sys/swarm_tests/scripts/99_down.sh b/src/sys/swarm_tests/scripts/99_down.sh index 95d8392..28e96e2 100755 --- a/src/sys/swarm_tests/scripts/99_down.sh +++ b/src/sys/swarm_tests/scripts/99_down.sh @@ -11,6 +11,9 @@ docker compose -p "${NODES_PROJECT:-argus-swarm-nodes}" -f "$ROOT/docker-compose echo "[DOWN] stopping server compose" docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true +echo "[DOWN] removing warmup container (if any)" +docker rm -f "${WARMUP_NAME:-argus-net-warmup}" >/dev/null 2>&1 || true # same name/default as 05a_net_warmup.sh, so an overridden WARMUP_NAME is cleaned up too + echo "[DOWN] removing overlay network" docker network rm argus-sys-net >/dev/null 2>&1 || true @@ -18,4 +21,3 @@ echo "[DOWN] cleanup temp files" rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true echo "[DOWN] done" - diff --git a/src/sys/tests/scripts/15_alert_verify.sh b/src/sys/tests/scripts/15_alert_verify.sh old mode 100644 new mode 100755