#!/usr/bin/env bash
#
# Install and start the GPU metric node from this package.
#
# Steps:
#   1. Verify required tools and that compose/.env exists.
#   2. Make sure the overlay network named by ARGUS_OVERLAY_NET is visible on
#      this host, attaching a short-lived busybox "warmup" container if not.
#   3. From inside the warmup container, ping BINDIP and FTPIP (when set).
#   4. Load the GPU bundle image from images/.
#   5. Prepare the host log directories and bring the compose project up.
#   6. Follow the node container's logs.
#
# Inputs (read from compose/.env): ARGUS_OVERLAY_NET, BINDIP, FTPIP (all
# optional here).
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
WARMUP_NAME="argus-net-warmup"
NODE_CONTAINER="argus-metric-gpu-node-swarm"

# Print an informational message (blue tag) to stdout.
info() {
  printf '\033[34m[INSTALL-GPU]\033[0m %s\n' "$*"
}

# Print an error message (red tag) to stderr.
err() {
  printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2
}

# Check that every named command is on PATH. Reports all missing commands
# before returning non-zero, so the user sees the full list in one run.
require() {
  local cmd missing=0
  for cmd in "$@"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      err "缺少依赖: $cmd"
      missing=1
    fi
  done
  (( missing == 0 ))
}

# Succeeds when the overlay network can be inspected from this host.
network_visible() {
  docker network inspect "$NET_NAME" >/dev/null 2>&1
}

# Succeeds when the warmup container exists and is currently running.
warmup_running() {
  local state
  state="$(docker inspect -f '{{.State.Running}}' "$WARMUP_NAME" 2>/dev/null)" \
    || return 1
  [[ "$state" == "true" ]]
}

# Make busybox:latest available locally, loading it from the packaged tarball
# when it is not already present. Exits if neither source exists.
ensure_busybox_image() {
  if docker image inspect busybox:latest >/dev/null 2>&1; then
    return 0
  fi
  if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
    docker load -i "$PKG_ROOT/images/busybox.tar"
  else
    err "缺少 busybox 镜像(images/busybox.tar 或本地 busybox:latest)"
    exit 1
  fi
}

# (Re)start the warmup container attached to the overlay network.
# Returns the exit status of `docker run`; callers decide whether a failure
# is fatal.
start_warmup() {
  local -a run_args

  # Remove any stale container with the same name; absence is not an error.
  docker rm -f "$WARMUP_NAME" >/dev/null 2>&1 || true

  run_args=(run -d --rm --name "$WARMUP_NAME" --network "$NET_NAME")
  if [[ -n "${BINDIP:-}" ]]; then
    run_args+=(--dns "$BINDIP")
  fi
  run_args+=(busybox:latest sleep 600)

  docker "${run_args[@]}" >/dev/null 2>&1
}

# Ensure the overlay network is visible. If it is not, attach a warmup
# container and poll for up to 60 seconds before giving up.
prewarm_overlay() {
  info "检查 overlay 网络可见性: $NET_NAME"
  if network_visible; then
    return 0
  fi

  ensure_busybox_image
  # Best effort: the visibility poll below is the real success criterion.
  start_warmup || true

  for _ in {1..60}; do
    if network_visible; then
      break
    fi
    sleep 1
  done

  if ! network_visible; then
    err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"
    exit 1
  fi
  info "overlay 已可见(warmup=$WARMUP_NAME)"
}

# Ping one address from inside the warmup container. The address is passed as
# an argv element rather than interpolated into a shell command string.
ping_ok() {
  docker exec "$WARMUP_NAME" ping -c 1 -W 2 "$1" >/dev/null 2>&1
}

# Verify that BINDIP and FTPIP (when set) are reachable from inside a
# container on the overlay network.
check_connectivity() {
  if [[ -z "${BINDIP:-}" && -z "${FTPIP:-}" ]]; then
    return 0
  fi

  # The warmup container is only started by prewarm_overlay when the network
  # was not yet visible, so it may be absent here; start it when needed
  # instead of reporting a misleading ping failure.
  if ! warmup_running; then
    ensure_busybox_image
    if ! start_warmup; then
      err "无法启动 warmup 容器: $WARMUP_NAME"
      exit 1
    fi
  fi

  if [[ -n "${BINDIP:-}" ]]; then
    if ping_ok "$BINDIP"; then
      info "warmup 内可达 BINDIP=$BINDIP"
    else
      err "容器内无法 ping 通 BINDIP=$BINDIP"
      exit 1
    fi
  fi

  if [[ -n "${FTPIP:-}" ]]; then
    if ping_ok "$FTPIP"; then
      info "warmup 内可达 FTPIP=$FTPIP"
    else
      err "容器内无法 ping 通 FTPIP=$FTPIP"
      exit 1
    fi
  fi
}

# Load the GPU bundle image. When several bundles match, the first in glob
# (sorted) order is used.
import_gpu_bundle() {
  local -a bundles
  local img_tgz

  shopt -s nullglob
  bundles=("$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.gz)
  shopt -u nullglob

  if (( ${#bundles[@]} == 0 )); then
    err "找不到 GPU bundle 镜像 tar.gz"
    exit 1
  fi
  img_tgz="${bundles[0]}"

  info "导入 GPU bundle 镜像: $(basename "$img_tgz")"
  # Stream the decompressed archive straight into docker; no temp file is
  # left behind on failure, and pipefail surfaces a gunzip error.
  gunzip -c "$img_tgz" | docker load >/dev/null
}

# Set mode 1777 (world-writable with sticky bit) on the host log directories
# and print the resulting mode/owner. Best effort: failures are ignored.
fix_log_dir_perms() {
  chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
  stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" \
    2>/dev/null || true
}

main() {
  require docker nvidia-smi gunzip || exit 1
  # `compose` is a docker CLI plugin, not a standalone executable, so it has
  # to be probed through docker itself.
  if ! docker compose version >/dev/null 2>&1; then
    err "缺少依赖: docker compose"
    exit 1
  fi

  if [[ ! -f "$ENV_FILE" ]]; then
    err "缺少 compose/.env,请先运行 scripts/config.sh"
    exit 1
  fi
  info "使用环境文件: $ENV_FILE"

  # Export everything the env file defines so docker compose also sees it.
  set -a
  # shellcheck source=/dev/null
  source "$ENV_FILE"
  set +a
  NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"

  # Warm up the overlay (the warmup container may no longer exist if config
  # ran long ago or the container was cleaned up).
  prewarm_overlay

  # In-container connectivity check: BINDIP and FTPIP must be reachable.
  check_connectivity

  import_gpu_bundle

  # Host-side log directories, used as bind mounts for /logs/infer and
  # /logs/train.
  mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train"
  chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
  info "日志目录已准备并赋权 1777: logs/infer logs/train"
  stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" \
    2>/dev/null || true

  info "启动 GPU 节点 (docker compose up -d)"
  docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
  docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps

  # Re-apply the permissions in case scripts inside the container reverted
  # them on the bind mount.
  fix_log_dir_perms

  info "跟踪节点容器日志(按 Ctrl+C 退出)"
  # Ctrl+C or a missing container must not turn into a failing exit status.
  docker logs -f "$NODE_CONTAINER" || true
}

main "$@"