当前部署情况 - h1: 部署server & client - h2: 部署client - 部署2025-11-25 - 部署目录: /home2/argus/server , /home2/argus/client - 部署使用账号:argus 网络拓扑: - h1 作为docker swarm manager - h2 作为worker加入docker swarm - docker swarm 上创建overlay network 访问方式: - 通过ssh到h1服务器,端口转发 20006-20011 端口到笔记本本地; - 门户网址:http://localhost:20006/dashboard 部署截图:    注意事项: - server各容器使用域名作为overlay network上alias别名,实现域名访问,当前版本禁用bind作为域名解析,原因是容器重启后IP变化场景bind机制复杂且不稳定。 - client 构建是内置安装包,容器启动时执行安装流程,后续重启容器跳过安装步骤。 - UID/GID:部署使用 argus账号 uid=2133, gid=2015。 Reviewed-on: #51 Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn> Reviewed-by: xuxt <xuxt@zgclab.edu.cn> Reviewed-by: huhy <husteryezi@163.com>
73 lines
3.9 KiB
Bash
73 lines
3.9 KiB
Bash
#!/usr/bin/env bash
#
# Install and start the GPU metric node on this host.
# Steps performed by this script, in order:
#   1. check required tools (docker, nvidia-smi, a Compose implementation)
#   2. make sure the overlay network is visible, warming it up if needed
#   3. load the GPU bundle image from images/
#   4. prepare the host log directories
#   5. start the compose project and follow the node container's logs
set -euo pipefail

# The package root is the parent of the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"

# Compose inputs, both located under <package root>/compose.
ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"

# Print an informational message to stdout, prefixed with a blue
# [INSTALL-GPU] tag.
# Arguments: message words; they are joined with single spaces.
# The message is passed through '%s' so that backslashes inside it are
# printed verbatim (echo -e would expand them as escape sequences).
info() { printf '\033[34m[INSTALL-GPU]\033[0m %s\n' "$*"; }
# Print an error message to stderr, prefixed with a red [ERROR] tag.
# Arguments: message words; they are joined with single spaces.
# The message is passed through '%s' so that backslashes inside it are
# printed verbatim (echo -e would expand them as escape sequences).
err() { printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2; }
# Check that every named command can be resolved by the shell (command -v).
# Arguments: one or more command names.
# Outputs:   one error line (via err) for each command that is missing.
# Returns:   0 when all commands are found, 1 when at least one is missing.
require() {
  # Both variables are local so the loop does not overwrite caller globals.
  local cmd missing=0
  for cmd in "$@"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      err "缺少依赖: $cmd"
      missing=1
    fi
  done
  [[ $missing -eq 0 ]]
}
# Compose detection: prefer "docker compose" (v2), fall back to
# "docker-compose" (v1).
# Outputs: an error message (via err) when neither variant is usable.
# Returns: 0 as soon as one variant answers its "version" subcommand;
#          otherwise terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1 \
    && docker-compose version >/dev/null 2>&1; then
    return 0
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
# Preflight: the docker CLI, nvidia-smi and a Compose implementation are
# required. Failures are made explicit instead of relying on set -e alone.
require docker nvidia-smi || exit 1
require_compose

# The compose environment file must already exist; this script only reads it.
if [[ ! -f "$ENV_FILE" ]]; then
  err "缺少 compose/.env,请先运行 scripts/config.sh"
  exit 1
fi
info "使用环境文件: $ENV_FILE"

# Warm up the overlay network (when config ran long ago, or the containers
# were cleaned up since, the warmup container may no longer exist).
# Export every variable defined in the env file to child processes.
set -a; source "$ENV_FILE"; set +a
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
info "检查 overlay 网络可见性: $NET_NAME"
if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
  # The overlay is not visible: try to warm it up with a busybox container
  # (only to make sure this worker node has joined the overlay).
  if ! docker image inspect busybox:latest >/dev/null 2>&1; then
    if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
      docker load -i "$PKG_ROOT/images/busybox.tar"
    else
      err "缺少 busybox 镜像(images/busybox.tar 或本地 busybox:latest)"
      exit 1
    fi
  fi
  # Best effort on purpose: a stale warmup container may or may not exist,
  # and a failed "docker run" is detected by the visibility poll below.
  docker rm -f argus-net-warmup >/dev/null 2>&1 || true
  docker run -d --rm --name argus-net-warmup --network "$NET_NAME" busybox:latest sleep 600 >/dev/null 2>&1 || true
  # Poll for up to 60 seconds until the network can be inspected.
  for _ in {1..60}; do
    docker network inspect "$NET_NAME" >/dev/null 2>&1 && break
    sleep 1
  done
  if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
    err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"
    exit 1
  fi
  info "overlay 已可见(warmup=argus-net-warmup)"
fi

# If a warmup container is running (for example because it was just
# recreated), also test the alias data path from inside it.
# The container list is captured first and matched afterwards: piping
# "docker ps" straight into "grep -q" can fail under pipefail when grep exits
# early and the producer receives SIGPIPE.
running_names="$(docker ps --format '{{.Names}}' || true)"
if grep -qxF -- 'argus-net-warmup' <<<"$running_names"; then
  if ! docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 master.argus.com >/dev/null 2>&1"; then
    err "GPU install 阶段:warmup 容器内无法通过别名访问 master.argus.com;请检查 overlay $NET_NAME 与 server 状态"
    exit 1
  fi
  info "GPU install 阶段:warmup 容器内可达 master.argus.com"
fi

# Import the GPU bundle image.
# Expand the glob into an array instead of parsing "ls" output; when nothing
# matches, element 0 is the unexpanded pattern and the -f test below fails.
bundle_candidates=("$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.gz)
IMG_TGZ="${bundle_candidates[0]}"
if [[ ! -f "$IMG_TGZ" ]]; then
  err "找不到 GPU bundle 镜像 tar.gz"
  exit 1
fi
info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")"
# Stream the decompressed archive into "docker load" (it reads STDIN when -i
# is not given). No temporary file is created, so nothing is left behind when
# either command fails; pipefail makes a gunzip failure abort the script.
gunzip -c "$IMG_TGZ" | docker load >/dev/null

# Make sure the host-side log directories exist (they are mapped to
# /logs/infer and /logs/train) and set mode 1777 (sticky bit) on them.
mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train"
# chmod is best effort: a failure here does not abort the installation.
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
info "日志目录已准备并赋权 1777: logs/infer logs/train"
# Show the resulting mode and ownership; purely informational.
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true

# Start the compose project (the node container's logs are followed at the
# end of the script).
PROJECT="${COMPOSE_PROJECT_NAME:-argus-client}"
# Use the same preference order as the Compose detection earlier in this
# script: "docker compose" (v2) when it works, otherwise "docker-compose" (v1).
if docker compose version >/dev/null 2>&1; then
  compose_cmd=(docker compose)
else
  compose_cmd=(docker-compose)
fi
info "启动 GPU 节点 (${compose_cmd[*]} -p $PROJECT up -d)"
"${compose_cmd[@]}" -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
"${compose_cmd[@]}" -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps

# Re-apply the host log directory permissions, in case scripts inside the
# container reverted the permissions of the bind mounts.
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true

# Follow the node container's logs until interrupted. A non-zero status from
# "docker logs" (for example after Ctrl+C) must not fail the script.
info "跟踪节点容器日志(按 Ctrl+C 退出)"
docker logs -f argus-metric-gpu-node-swarm || true