58 lines
3.0 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# GPU node installer.
# Resolves the package root (the parent of the directory holding this script)
# and derives the compose env-file / compose-file locations from it.

# Strict mode: abort on errors, on unset variables and on failing pipelines.
set -o errexit -o nounset -o pipefail

# Parent directory of the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# The package root is the same directory as ROOT_DIR.
PKG_ROOT="${ROOT_DIR}"
# Files consumed by `docker compose` further down in the script.
ENV_FILE="${PKG_ROOT}/compose/.env"
COMPOSE_FILE="${PKG_ROOT}/compose/docker-compose.yml"
#######################################
# Print an informational message with a blue "[INSTALL-GPU]" tag.
# Arguments: all arguments are joined with spaces to form the message
# Outputs:   writes the tagged message to stdout
#######################################
info() {
  # printf with %s prints the message verbatim; `echo -e` would re-interpret
  # backslash sequences contained in the message itself (e.g. "\n", "\t").
  printf '\033[34m[INSTALL-GPU]\033[0m %s\n' "$*"
}
#######################################
# Print an error message with a red "[ERROR]" tag.
# Arguments: all arguments are joined with spaces to form the message
# Outputs:   writes the tagged message to stderr
#######################################
err() {
  # printf with %s prints the message verbatim; `echo -e` would re-interpret
  # backslash sequences contained in the message itself (e.g. "\n", "\t").
  printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2
}
#######################################
# Check that every named command can be resolved with `command -v`.
# Every missing command is reported through err before returning, so the
# caller sees the complete list in one run.
# Arguments: names of the commands to look up
# Returns:   0 if all commands were found, 1 if at least one is missing
#######################################
require() {
  # `c` is declared local so the loop does not clobber a caller's variable.
  local ok=1 c
  for c in "$@"; do
    if ! command -v "$c" >/dev/null 2>&1; then
      err "缺少依赖: $c"
      ok=0
    fi
  done
  [[ $ok -eq 1 ]]
}
# --- Dependency check ---------------------------------------------------------
# The script invokes Compose as the `docker compose` subcommand, so it is probed
# by running that subcommand; looking up a standalone `compose` executable with
# `command -v` would reject hosts that only have the Docker CLI plugin.
deps_ok=1
require docker nvidia-smi || deps_ok=0
if ! docker compose version >/dev/null 2>&1; then
  err "缺少依赖: docker compose"
  deps_ok=0
fi
[[ $deps_ok -eq 1 ]] || exit 1

# --- Environment file ---------------------------------------------------------
[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env请先运行 scripts/config.sh"; exit 1; }
info "使用环境文件: $ENV_FILE"

# `set -a` exports every variable assigned while the env file is sourced.
set -a
# shellcheck disable=SC1090
source "$ENV_FILE"
set +a

# --- Overlay warm-up ----------------------------------------------------------
# Warm up the overlay (when config ran long ago or the container has been
# cleaned up, the warmup container may no longer exist).
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
info "检查 overlay 网络可见性: $NET_NAME"

# A warm-up container is needed when the overlay network is not visible, or
# when BINDIP/FTPIP is set (the connectivity checks later in the script run
# `docker exec argus-net-warmup`) and that container is not currently running.
need_warmup=0
if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
  need_warmup=1
elif [[ -n "${BINDIP:-}${FTPIP:-}" ]]; then
  warmup_state="$(docker container inspect -f '{{.State.Running}}' argus-net-warmup 2>/dev/null || true)"
  if [[ "$warmup_state" != "true" ]]; then
    need_warmup=1
  fi
fi

if [[ $need_warmup -eq 1 ]]; then
  # The warm-up container uses busybox; load it from the bundled tarball when
  # the image is not available locally.
  if ! docker image inspect busybox:latest >/dev/null 2>&1; then
    if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
      docker load -i "$PKG_ROOT/images/busybox.tar"
    else
      err "缺少 busybox 镜像images/busybox.tar 或本地 busybox:latest"
      exit 1
    fi
  fi

  # Best effort: remove any stale container, then start a fresh one. Failures
  # are tolerated here because the network check below is the real gate.
  docker rm -f argus-net-warmup >/dev/null 2>&1 || true
  # ${BINDIP:+...} is intentionally unquoted on the outside: it expands to the
  # two words `--dns <ip>` when BINDIP is set and to nothing otherwise.
  docker run -d --rm --name argus-net-warmup --network "$NET_NAME" ${BINDIP:+--dns "$BINDIP"} busybox:latest sleep 600 >/dev/null 2>&1 || true

  # Poll once per second, for up to 60 attempts, until the network is visible.
  for _ in {1..60}; do
    if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
      break
    fi
    sleep 1
  done
  docker network inspect "$NET_NAME" >/dev/null 2>&1 || { err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"; exit 1; }
  info "overlay 已可见warmup=argus-net-warmup"
fi
# In-container connectivity check: BINDIP and FTPIP must be reachable.
#######################################
# Send a single ping (2 s timeout) to an address from inside the
# argus-net-warmup container.
# Arguments: $1 - address to ping
# Returns:   exit status of `docker exec` (0 when the ping succeeded)
#######################################
ping_ok() {
  # The address is handed to the in-container shell as positional parameter $1
  # instead of being spliced into the command string, so whitespace or shell
  # metacharacters in the value cannot alter the command that is executed.
  docker exec argus-net-warmup sh -lc 'ping -c 1 -W 2 "$1" >/dev/null 2>&1' sh "$1"
}
# Reachability of BINDIP from inside the warmup container (skipped when unset
# or empty); an unreachable address aborts the script.
if [[ -n "${BINDIP:-}" ]]; then
  ping_ok "$BINDIP" || { err "容器内无法 ping 通 BINDIP=$BINDIP"; exit 1; }
  info "warmup 内可达 BINDIP=$BINDIP"
fi

# Same check for FTPIP.
if [[ -n "${FTPIP:-}" ]]; then
  ping_ok "$FTPIP" || { err "容器内无法 ping 通 FTPIP=$FTPIP"; exit 1; }
  info "warmup 内可达 FTPIP=$FTPIP"
fi
# Import the GPU bundle image.
# The first archive matching the bundle pattern is selected with a glob rather
# than by parsing `ls` output; an unmatched glob stays literal and is filtered
# out by the -f test, leaving IMG_TGZ empty.
IMG_TGZ=""
for bundle in "$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.gz; do
  if [[ -f "$bundle" ]]; then
    IMG_TGZ="$bundle"
    break
  fi
done
[[ -n "$IMG_TGZ" ]] || { err "找不到 GPU bundle 镜像 tar.gz"; exit 1; }
info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")"
# Stream the decompressed archive straight into `docker load` (which reads the
# archive from stdin when -i is not given). This avoids writing the whole
# uncompressed image to a temporary file, which was left behind whenever
# gunzip or docker load failed. With `pipefail` enabled at the top of the
# script, a gunzip failure still aborts the run.
gunzip -c "$IMG_TGZ" | docker load >/dev/null
# Ensure the log directories exist (host side, used for mapping /logs/infer
# and /logs/train).
log_dirs=("$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train")
mkdir -p "${log_dirs[@]}"
info "日志目录已准备: logs/infer logs/train"

# Start compose and follow the logs.
# Both compose invocations share the same env file and compose file.
compose_cmd=(docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE")
info "启动 GPU 节点 (docker compose up -d)"
"${compose_cmd[@]}" up -d
"${compose_cmd[@]}" ps

info "跟踪节点容器日志(按 Ctrl+C 退出)"
# A non-zero status from `docker logs -f` is deliberately ignored so that it
# does not fail the script.
docker logs -f argus-metric-gpu-node-swarm || true