yuyr 34cb239bf4 完成H20服务器部署及重启测试 (#51)
当前部署情况
- h1: 部署server & client
- h2: 部署client
- 部署日期:2025-11-25
- 部署目录:  /home2/argus/server  ,  /home2/argus/client
- 部署使用账号:argus

网络拓扑:
- h1 作为docker swarm manager
- h2 作为worker加入docker swarm
- docker swarm 上创建overlay network

访问方式:
- 通过ssh到h1服务器,端口转发 20006-20011 端口到笔记本本地;
- 门户网址:http://localhost:20006/dashboard

部署截图:
![image.png](/attachments/86c1a7af-dacc-4ba7-a182-f7cefd4e6427)
![image.png](/attachments/06f20852-771c-4264-b031-e6acd0f6ea1c)
![image.png](/attachments/091ab5a8-95bf-466f-a394-3255dcb49735)

注意事项:
- server 各容器使用域名作为 overlay network 上的 alias(别名),从而实现域名访问;当前版本禁用 bind 做域名解析,原因是在容器重启后 IP 变化的场景下,bind 机制复杂且不稳定。
- client 镜像在构建时内置安装包,容器首次启动时执行安装流程,后续重启容器时跳过安装步骤。
- UID/GID:部署使用 argus账号 uid=2133, gid=2015。

Reviewed-on: #51
Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn>
Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
Reviewed-by: huhy <husteryezi@163.com>
2025-11-25 15:54:29 +08:00

91 lines
4.5 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Pre-install configuration script: validates the host, joins the Docker Swarm
# described by cluster-info.env, warms up the overlay network and generates
# compose/.env.
# Strict mode: abort on command failure, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail
# Package root = parent of the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
# Template .env and the generated .env, both under compose/.
ENV_EX="$PKG_ROOT/compose/.env.example"
ENV_OUT="$PKG_ROOT/compose/.env"
# Print an informational message on stdout, prefixed with a blue [CONFIG-GPU] tag.
# Arguments: message words; joined by the first IFS character (a space by default).
# The message is passed through printf '%s' so that backslashes it contains
# (e.g. in paths or values) are printed verbatim instead of being interpreted
# as escape sequences.
info() {
  printf '\033[34m[CONFIG-GPU]\033[0m %s\n' "$*"
}
# Print an error message on stderr, prefixed with a red [ERROR] tag.
# Arguments: message words; joined by the first IFS character (a space by default).
# The message is passed through printf '%s' so that backslashes it contains
# are printed verbatim instead of being interpreted as escape sequences.
err() {
  printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2
}
# Verify that every named command can be resolved by the shell (command -v).
# Arguments: one or more command names.
# Returns:   0 when all commands are found; non-zero otherwise. Every missing
#            command is reported through err before returning, so the user
#            sees the complete list in one run.
require() {
  local cmd        # local, so a caller's variable is never clobbered by the loop
  local all_found=1
  for cmd in "$@"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      err "缺少依赖: $cmd"
      all_found=0
    fi
  done
  [[ $all_found -eq 1 ]]
}
# Compose detection: prefer `docker compose` (v2), fall back to `docker-compose` (v1).
# Returns 0 when either flavour answers `version`; otherwise logs an error and
# terminates the script with exit status 1.
require_compose() {
  local compose_found=0
  if docker compose version >/dev/null 2>&1; then
    # v2 plugin is usable.
    compose_found=1
  elif command -v docker-compose >/dev/null 2>&1 && docker-compose version >/dev/null 2>&1; then
    # v1 standalone command exists and responds.
    compose_found=1
  fi
  if [[ $compose_found -eq 1 ]]; then
    return 0
  fi
  err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
# Fail fast when a required CLI tool is missing: require returns non-zero,
# which aborts the script because `set -e` is active.
require docker curl jq awk sed tar gzip
# Make sure some flavour of Docker Compose is available (exits otherwise).
require_compose
# Disk-space check (all values in MB).
# Arguments: $1 - path whose filesystem is inspected
#            $2 - optional minimum free space in MB (default: 10240)
# Returns:   0 when the free space is at least the minimum; 1 otherwise, after
#            reporting the shortfall through err.
check_disk() {
  local path="$1"
  local need="${2:-10240}"
  local free
  # -P forces the one-line-per-filesystem layout, so column 4 of the second
  # line is the available space. A failing df must not trip `set -e` /
  # pipefail at this assignment: fall through with an empty value so the
  # error below is still reported.
  free=$(df -Pm "$path" | awk 'NR==2{print $4+0}') || free=""
  if [[ -z "$free" || "$free" -lt "$need" ]]; then
    err "磁盘空间不足: $path 剩余 ${free:-0}MB (<${need}MB)"
    return 1
  fi
}
# The package root must have enough free space; a shortfall aborts the script.
check_disk "$PKG_ROOT"
# The Docker data directory is checked as well, but a failure there is tolerated.
check_disk "/var/lib/docker" || true

# Load cluster-info.env (defaults to the package root; CLUSTER_INFO overrides the path).
CI_IN="${CLUSTER_INFO:-$PKG_ROOT/cluster-info.env}"
info "读取 cluster-info.env: $CI_IN"
if [[ ! -f "$CI_IN" ]]; then
  err "找不到 cluster-info.env默认当前包根或设置环境变量 CLUSTER_INFO 指定绝对路径)"
  exit 1
fi

# allexport is switched on only while sourcing, so every variable assigned by
# the file is exported.
set -a
source "$CI_IN"
set +a

# Both the manager address and the worker join token are mandatory.
if [[ -z "${SWARM_MANAGER_ADDR:-}" || -z "${SWARM_JOIN_TOKEN_WORKER:-}" ]]; then
  err "cluster-info.env 缺少 SWARM 信息SWARM_MANAGER_ADDR/SWARM_JOIN_TOKEN_WORKER"
  exit 1
fi
# Join the Swarm as a worker (idempotent). The join stays best-effort so that
# re-running the script on a node that is already a member does not fail.
info "加入 Swarm幂等$SWARM_MANAGER_ADDR"
if ! docker swarm join --token "$SWARM_JOIN_TOKEN_WORKER" "$SWARM_MANAGER_ADDR":2377 >/dev/null 2>&1; then
  # A failed join is expected when the node is already part of a swarm. When
  # it is not, report the failure instead of swallowing it; execution still
  # continues, exactly as before.
  swarm_state="$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || true)"
  if [[ "$swarm_state" != "active" ]]; then
    err "docker swarm join 失败, 且本节点未处于 Swarm 中 (state=${swarm_state:-unknown}); 请检查 token 以及到 ${SWARM_MANAGER_ADDR}:2377 的连通性"
  fi
fi
# Make a busybox image available, then warm up the overlay network and verify
# connectivity through it (always executed).
# Overlay name: ARGUS_OVERLAY_NET if set and non-empty, otherwise argus-sys-net.
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
# Prepare busybox: use a local busybox:latest when present, otherwise load the
# tarball from images/ in the package; abort when neither exists.
if ! docker image inspect busybox:latest >/dev/null 2>&1; then
if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
info "加载 busybox.tar 以预热 overlay"
docker load -i "$PKG_ROOT/images/busybox.tar" >/dev/null
else
err "缺少 busybox 镜像(包内 images/busybox.tar 或本地 busybox:latest无法预热 overlay $NET_NAME"; exit 1
fi
fi
# Warm-up container: joining the overlay from the worker side makes the
# network visible locally.
# Remove any leftover warm-up container first (errors ignored).
docker rm -f argus-net-warmup >/dev/null 2>&1 || true
info "启动 warmup 容器加入 overlay: $NET_NAME"
# Detached, auto-removed (--rm) container that sleeps for 600 seconds. A start
# failure is tolerated here; the checks below catch it.
docker run -d --rm --name argus-net-warmup --network "$NET_NAME" busybox:latest sleep 600 >/dev/null 2>&1 || true
# Poll up to 60 times, one second apart, until the network can be inspected.
for i in {1..60}; do docker network inspect "$NET_NAME" >/dev/null 2>&1 && { info "overlay 可见 (t=${i}s)"; break; }; sleep 1; done
# Final check after polling: abort if the overlay is still not visible.
docker network inspect "$NET_NAME" >/dev/null 2>&1 || { err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"; exit 1; }
# Test the actual data path (alias -> master) from inside the warm-up
# container: a single ping to master.argus.com with a 2-second timeout.
if ! docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 master.argus.com >/dev/null 2>&1"; then
err "warmup 容器内无法通过别名访问 master.argus.com请确认 server compose 已启动并加入 overlay $NET_NAME"
exit 1
fi
info "warmup 容器内可达 master.argus.comDocker DNS + alias 正常)"
# Generate/update .env: an existing file (with its manually filled entries) is
# kept as is; the example template is copied only when no .env exists yet.
[[ -f "$ENV_OUT" ]] || cp "$ENV_EX" "$ENV_OUT"
# Set KEY=VALUE in the file named by $ENV_OUT: the existing line for KEY is
# replaced in place, or a new line is appended when KEY is absent.
# Globals:   ENV_OUT (read; the file it names is modified)
# Arguments: $1 - key
#            $2 - value (written verbatim)
set_kv() {
  local key="$1" value="$2"
  local escaped
  if grep -q "^${key}=" "$ENV_OUT"; then
    # In a sed replacement, '\' and '&' are special and '#' is the delimiter
    # used below; escape all three so the value lands in the file unchanged.
    escaped=$(printf '%s' "$value" | sed -e 's/[\\&#]/\\&/g')
    sed -i -E "s#^${key}=.*#${key}=${escaped}#" "$ENV_OUT"
  else
    printf '%s\n' "${key}=${value}" >> "$ENV_OUT"
  fi
}
# Write the Swarm settings into .env. An existing entry for one of these keys
# is replaced; a variable that is unset is written as an empty value.
set_kv SWARM_MANAGER_ADDR "${SWARM_MANAGER_ADDR:-}"
set_kv SWARM_JOIN_TOKEN_WORKER "${SWARM_JOIN_TOKEN_WORKER:-}"
set_kv SWARM_JOIN_TOKEN_MANAGER "${SWARM_JOIN_TOKEN_MANAGER:-}"
# Keys that must carry a non-empty value in compose/.env before installation.
REQ_VARS=(AGENT_ENV AGENT_USER AGENT_INSTANCE GPU_NODE_HOSTNAME)
missing=()
for v in "${REQ_VARS[@]}"; do
  # Value of the first "KEY=" line (everything after the first '=').
  # `|| true` is required: with `set -o pipefail` grep's exit status 1 for an
  # absent key fails the whole pipeline, and `set -e` would then terminate the
  # script right here, silently, before the missing keys are reported.
  val=$(grep -E "^$v=" "$ENV_OUT" | head -1 | cut -d= -f2- || true)
  if [[ -z "$val" ]]; then
    missing+=("$v")
  fi
done
# Report every missing key in one message, then stop.
if [[ ${#missing[@]} -gt 0 ]]; then
  err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"
  exit 1
fi
info "已生成 compose/.env可执行 scripts/install.sh"
# Prepare the host log directories and set their permissions (idempotent; lets
# the operator inspect or pre-create them before installing).
log_dirs=("$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer")
mkdir -p "${log_dirs[@]}"
# A chmod failure is tolerated.
chmod 1777 "${log_dirs[@]}" || true
info "日志目录权限(期待 1777含粘滞位:"
# Show mode, owner:group and path of each directory; stat errors are ignored.
stat -c '%a %U:%G %n' "${log_dirs[@]}" 2>/dev/null || true