当前部署情况
- h1: 部署 server & client
- h2: 部署 client
- 部署日期: 2025-11-25
- 部署目录: /home2/argus/server , /home2/argus/client
- 部署使用账号: argus

网络拓扑:
- h1 作为 docker swarm manager
- h2 作为 worker 加入 docker swarm
- docker swarm 上创建 overlay network

访问方式:
- 通过 ssh 到 h1 服务器,端口转发 20006-20011 端口到笔记本本地;
- 门户网址: http://localhost:20006/dashboard

部署截图:

注意事项:
- server 各容器使用域名作为 overlay network 上的 alias 别名,实现域名访问;当前版本禁用 bind 做域名解析,原因是在容器重启后 IP 变化的场景下,bind 机制复杂且不稳定。
- client 构建时内置安装包,容器启动时执行安装流程,后续重启容器时跳过安装步骤。
- UID/GID: 部署使用 argus 账号,uid=2133, gid=2015。

Reviewed-on: #51
Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn>
Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
Reviewed-by: huhy <husteryezi@163.com>
91 lines
4.5 KiB
Bash
91 lines
4.5 KiB
Bash
#!/usr/bin/env bash
#
# Pre-install configuration step (log tag: CONFIG-GPU).
# Verifies host prerequisites, joins the Docker Swarm described by
# cluster-info.env, checks overlay connectivity and fills in compose/.env.
#
# Environment:
#   CLUSTER_INFO       optional path to cluster-info.env
#                      (default: <package root>/cluster-info.env)
#   ARGUS_OVERLAY_NET  optional overlay network name (default: argus-sys-net)
set -euo pipefail

# Package root = parent of the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
# Template env file and the generated/updated env file.
ENV_EX="$PKG_ROOT/compose/.env.example"
ENV_OUT="$PKG_ROOT/compose/.env"

# Print an informational message to stdout, prefixed with a blue
# [CONFIG-GPU] tag.
# Arguments: message words; they are joined by the first character of IFS
#            (a space by default) and printed verbatim, i.e. backslash
#            sequences inside the message are not interpreted.
info() { printf '\033[34m[CONFIG-GPU]\033[0m %s\n' "$*"; }
# Print an error message to stderr, prefixed with a red [ERROR] tag.
# Arguments: message words; they are joined by the first character of IFS
#            (a space by default) and printed verbatim, i.e. backslash
#            sequences inside the message are not interpreted.
err() { printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2; }
# Check that every named command is available on PATH.
# Arguments: one or more command names
# Outputs:   one error line (via err) per missing command
# Returns:   0 when all commands exist, 1 when at least one is missing
require() {
  # Both variables are local so the caller's globals are never clobbered.
  local cmd ok=1
  for cmd in "$@"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      err "缺少依赖: $cmd"
      ok=0
    fi
  done
  [[ $ok -eq 1 ]]
}
# Ensure Docker Compose is usable: prefer `docker compose` (v2) and fall back
# to the standalone `docker-compose` binary (v1).
# Returns: 0 when either variant answers `version` successfully; otherwise
#          logs an error and terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1 \
    && docker-compose version >/dev/null 2>&1; then
    return 0
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
# Fail fast when a required tool or Docker Compose is missing: under the
# script's `set -e`, a non-zero status from `require` aborts the run.
require docker curl jq awk sed tar gzip
require_compose

# Check the free disk space (in MB) of the filesystem holding a path.
# Arguments:
#   $1 - path to check
#   $2 - required free space in MB (optional, default 10240)
# Outputs: an error line (via err) when space is insufficient or cannot be
#          determined
# Returns: 0 when enough space is free, 1 otherwise
check_disk() {
  local p="$1"
  local need="${2:-10240}"
  local free
  # Row 2 of `df -P` is the data row; column 4 is the available space, which
  # -m reports in MB. If df fails, fall through with an empty value so the
  # caller gets the error message below instead of a silent `set -e` abort.
  free=$(df -Pm "$p" | awk 'NR==2{print $4+0}') || free=""
  if [[ -z "$free" || "$free" -lt "$need" ]]; then
    err "磁盘空间不足: $p 剩余 ${free:-0}MB (<${need}MB)"
    return 1
  fi
}
# The package root must have enough free space (fatal under `set -e`); the
# Docker data directory is checked best-effort only, so a shortfall there is
# reported but does not stop the script.
check_disk "$PKG_ROOT"
check_disk "/var/lib/docker" || true

# Load cluster-info.env. It is looked up in the package root by default; set
# the CLUSTER_INFO environment variable to point at another path.
CI_IN="${CLUSTER_INFO:-$PKG_ROOT/cluster-info.env}"
info "读取 cluster-info.env: $CI_IN"
if [[ ! -f "$CI_IN" ]]; then
  err "找不到 cluster-info.env(默认当前包根,或设置环境变量 CLUSTER_INFO 指定绝对路径)"
  exit 1
fi
# `set -a` exports every variable assigned while the file is being sourced.
set -a
# shellcheck source=/dev/null
source "$CI_IN"
set +a
# Both the manager address and the worker join token are mandatory.
if [[ -z "${SWARM_MANAGER_ADDR:-}" || -z "${SWARM_JOIN_TOKEN_WORKER:-}" ]]; then
  err "cluster-info.env 缺少 SWARM 信息(SWARM_MANAGER_ADDR/SWARM_JOIN_TOKEN_WORKER)"
  exit 1
fi

# Join the Swarm as a worker. The step is meant to be re-runnable: a failed
# join never aborts the script (presumably the common cause is that the node
# is already a member).
info "加入 Swarm(幂等):$SWARM_MANAGER_ADDR"
if ! docker swarm join --token "$SWARM_JOIN_TOKEN_WORKER" "$SWARM_MANAGER_ADDR":2377 >/dev/null 2>&1; then
  # Stay non-fatal, but do not hide a real failure: report it when the node
  # is still not an active swarm member after the attempt.
  swarm_state="$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || true)"
  if [[ "$swarm_state" != "active" ]]; then
    err "docker swarm join 失败(当前 Swarm 状态: ${swarm_state:-unknown}),后续 overlay 检查可能失败"
  fi
fi

# Overlay warm-up and connectivity checks always run. The network name can be
# overridden with ARGUS_OVERLAY_NET.
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
# Make sure busybox:latest is available locally; load it from the package's
# images/busybox.tar when it is not.
if ! docker image inspect busybox:latest >/dev/null 2>&1; then
  if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
    info "加载 busybox.tar 以预热 overlay"
    docker load -i "$PKG_ROOT/images/busybox.tar" >/dev/null
  else
    err "缺少 busybox 镜像(包内 images/busybox.tar 或本地 busybox:latest),无法预热 overlay $NET_NAME"
    exit 1
  fi
fi

# Warm-up container: attaching a container to the overlay on this worker is
# what makes the network visible locally. Any leftover container from an
# earlier run is removed first.
docker rm -f argus-net-warmup >/dev/null 2>&1 || true
info "启动 warmup 容器加入 overlay: $NET_NAME"
# Best-effort start; a failure is caught by the network checks that follow.
docker run -d --rm --name argus-net-warmup --network "$NET_NAME" busybox:latest sleep 600 >/dev/null 2>&1 || true
# Poll once per second, for at most 60 attempts, until the overlay shows up.
for i in {1..60}; do
  if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
    info "overlay 可见 (t=${i}s)"
    break
  fi
  sleep 1
done
if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
  err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"
  exit 1
fi

# Probe the real data path from inside the warm-up container by pinging the
# master.argus.com alias once (2 s timeout).
if ! docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 master.argus.com >/dev/null 2>&1"; then
  err "warmup 容器内无法通过别名访问 master.argus.com;请确认 server compose 已启动并加入 overlay $NET_NAME"
  exit 1
fi
info "warmup 容器内可达 master.argus.com(Docker DNS + alias 正常)"

# Create compose/.env from the example on first run. An existing file is kept
# so manually filled values survive re-runs; only the keys written through
# set_kv later in the script are refreshed.
if [[ ! -f "$ENV_OUT" ]]; then
  cp "$ENV_EX" "$ENV_OUT"
fi

# Set KEY=VALUE in the env file named by the global ENV_OUT.
# An existing `KEY=` line is rewritten in place; otherwise the pair is
# appended as a new line.
# Arguments:
#   $1 - key (used as-is inside a regular expression, so keep it a plain
#        identifier)
#   $2 - value, written verbatim
set_kv() {
  local k="$1" v="$2" esc
  if grep -q "^${k}=" "$ENV_OUT"; then
    # Escape the characters that are special on the replacement side of the
    # sed `s#…#…#` command: backslash, `&` (whole match) and the `#` delimiter.
    esc=$(printf '%s' "$v" | sed -e 's/[\\&#]/\\&/g')
    sed -i -E "s#^${k}=.*#${k}=${esc}#" "$ENV_OUT"
  else
    # If the file does not end with a newline, add one first so the new pair
    # is not glued onto the last existing line.
    if [[ -s "$ENV_OUT" && -n "$(tail -c 1 "$ENV_OUT")" ]]; then
      printf '\n' >>"$ENV_OUT"
    fi
    printf '%s\n' "${k}=${v}" >>"$ENV_OUT"
  fi
}
# Copy the Swarm settings loaded from cluster-info.env into compose/.env.
# Values that are unset are written as empty strings.
set_kv SWARM_MANAGER_ADDR "${SWARM_MANAGER_ADDR:-}"
set_kv SWARM_JOIN_TOKEN_WORKER "${SWARM_JOIN_TOKEN_WORKER:-}"
set_kv SWARM_JOIN_TOKEN_MANAGER "${SWARM_JOIN_TOKEN_MANAGER:-}"

# Keys that must carry a non-empty value in compose/.env before installing.
REQ_VARS=(AGENT_ENV AGENT_USER AGENT_INSTANCE GPU_NODE_HOSTNAME)
missing=()
for v in "${REQ_VARS[@]}"; do
  # grep exits 1 when the key is absent from the file. Under
  # `set -e -o pipefail` that would end the script right here, before the
  # explanatory message below is printed, so the status is neutralised and an
  # absent key is treated like an empty one.
  val=$(grep -E "^$v=" "$ENV_OUT" | head -1 | cut -d= -f2- || true)
  if [[ -z "$val" ]]; then
    missing+=("$v")
  fi
done
if [[ ${#missing[@]} -gt 0 ]]; then
  err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"
  exit 1
fi

info "已生成 compose/.env;可执行 scripts/install.sh"

# Create the host log directories and open up their permissions. The step is
# safe to repeat, and lets an operator inspect or pre-create the directories
# before installing.
mkdir -p "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer"
# 1777 = world-writable with the sticky bit; failures are tolerated.
chmod 1777 "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" || true
info "日志目录权限(期待 1777,含粘滞位):"
# Show mode, owner:group and path for a quick visual check (best-effort).
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" 2>/dev/null || true