完成H20服务器部署及重启测试 #51
@ -73,6 +73,9 @@ PKG_VERSION=$VERSION
|
||||
|
||||
NODE_GPU_BUNDLE_IMAGE_TAG=${REPO}:${VERSION}
|
||||
|
||||
# Compose project name (isolation from server stack)
|
||||
COMPOSE_PROJECT_NAME=argus-client
|
||||
|
||||
# Required (no defaults). Must be filled before install.
|
||||
AGENT_ENV=
|
||||
AGENT_USER=
|
||||
|
||||
@ -113,6 +113,9 @@ FTP_PASSWORD=NASPlab1234!
|
||||
# UID/GID for volume ownership
|
||||
ARGUS_BUILD_UID=2133
|
||||
ARGUS_BUILD_GID=2015
|
||||
|
||||
# Compose project name (isolation from other stacks on same host)
|
||||
COMPOSE_PROJECT_NAME=argus-server
|
||||
EOF
|
||||
|
||||
# 3) Docs (from deployment_new templates)
|
||||
|
||||
@ -9,7 +9,14 @@ ENV_OUT="$PKG_ROOT/compose/.env"
|
||||
# Print an informational message to stdout, prefixed with a blue [CONFIG-GPU] tag.
info() {
  local tag="\033[34m[CONFIG-GPU]\033[0m"
  echo -e "$tag $*"
}
|
||||
# Print an error message to stderr, prefixed with a red [ERROR] tag.
err() {
  local tag="\033[31m[ERROR]\033[0m"
  echo -e "$tag $*" >&2
}
|
||||
# Check that every command named in the arguments can be resolved by the shell.
# Each missing command is reported through err; all arguments are checked
# before returning. Returns 0 only when nothing was missing.
require() {
  local all_found=1
  for c in "$@"; do
    if ! command -v "$c" >/dev/null 2>&1; then
      err "缺少依赖: $c"
      all_found=0
    fi
  done
  [[ $all_found -eq 1 ]]
}
|
||||
require docker curl jq awk sed tar gzip nvidia-smi
|
||||
# Compose 检测:优先 docker compose(v2),回退 docker-compose(v1)
|
||||
# Verify that a usable Docker Compose is present.
# Accepts the v2 plugin (`docker compose`) first, then the standalone v1
# binary (`docker-compose`). When neither responds to `version`, reports the
# problem through err and terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1; then
    if docker-compose version >/dev/null 2>&1; then
      return 0
    fi
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
|
||||
require docker curl jq awk sed tar gzip
|
||||
require_compose
|
||||
|
||||
# 磁盘空间检查(MB)
|
||||
check_disk(){ local p="$1"; local need=10240; local free
|
||||
|
||||
@ -9,7 +9,14 @@ COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
|
||||
# Print an informational message to stdout, prefixed with a blue [INSTALL-GPU] tag.
info() {
  local tag="\033[34m[INSTALL-GPU]\033[0m"
  echo -e "$tag $*"
}
|
||||
# Print an error message to stderr, prefixed with a red [ERROR] tag.
err() {
  local tag="\033[31m[ERROR]\033[0m"
  echo -e "$tag $*" >&2
}
|
||||
# Check that every command named in the arguments can be resolved by the shell.
# Each missing command is reported through err; all arguments are checked
# before returning. Returns 0 only when nothing was missing.
require() {
  local all_found=1
  for c in "$@"; do
    if ! command -v "$c" >/dev/null 2>&1; then
      err "缺少依赖: $c"
      all_found=0
    fi
  done
  [[ $all_found -eq 1 ]]
}
|
||||
require docker docker compose nvidia-smi
|
||||
# Compose 检测:优先 docker compose(v2),回退 docker-compose(v1)
|
||||
# Verify that a usable Docker Compose is present.
# Accepts the v2 plugin (`docker compose`) first, then the standalone v1
# binary (`docker-compose`). When neither responds to `version`, reports the
# problem through err and terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1; then
    if docker-compose version >/dev/null 2>&1; then
      return 0
    fi
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
|
||||
require docker nvidia-smi
|
||||
require_compose
|
||||
|
||||
[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env,请先运行 scripts/config.sh"; exit 1; }
|
||||
info "使用环境文件: $ENV_FILE"
|
||||
@ -52,9 +59,10 @@ info "日志目录已准备并赋权 1777: logs/infer logs/train"
|
||||
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true
|
||||
|
||||
# 启动 compose 并跟踪日志
|
||||
info "启动 GPU 节点 (docker compose up -d)"
|
||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
|
||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
||||
PROJECT="${COMPOSE_PROJECT_NAME:-argus-client}"
|
||||
info "启动 GPU 节点 (docker compose -p $PROJECT up -d)"
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
||||
|
||||
# 再次校准宿主日志目录权限,避免容器内脚本对 bind mount 权限回退
|
||||
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
|
||||
|
||||
@ -6,11 +6,23 @@ PKG_ROOT="$ROOT_DIR"
|
||||
ENV_FILE="$PKG_ROOT/compose/.env"
|
||||
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
|
||||
|
||||
# load COMPOSE_PROJECT_NAME if provided in compose/.env
|
||||
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
|
||||
PROJECT="${COMPOSE_PROJECT_NAME:-argus-client}"
|
||||
|
||||
# Print an informational message to stdout, prefixed with a blue [UNINSTALL-GPU] tag.
info() {
  local tag="\033[34m[UNINSTALL-GPU]\033[0m"
  echo -e "$tag $*"
}
|
||||
# Print an error message to stderr, prefixed with a red [ERROR] tag.
err() {
  local tag="\033[31m[ERROR]\033[0m"
  echo -e "$tag $*" >&2
}
|
||||
# Compose 检测:优先 docker compose(v2),回退 docker-compose(v1)
|
||||
# Verify that a usable Docker Compose is present.
# Accepts the v2 plugin (`docker compose`) first, then the standalone v1
# binary (`docker-compose`). When neither responds to `version`, reports the
# problem through err and terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1; then
    if docker-compose version >/dev/null 2>&1; then
      return 0
    fi
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
|
||||
require_compose
|
||||
|
||||
if [[ -f "$ENV_FILE" ]]; then
|
||||
info "stopping compose project"
|
||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true
|
||||
info "stopping compose project (project=$PROJECT)"
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true
|
||||
else
|
||||
info "compose/.env not found; attempting to remove container by name"
|
||||
fi
|
||||
@ -22,4 +34,3 @@ docker rm -f argus-net-warmup >/dev/null 2>&1 || true
|
||||
docker rm -f argus-metric-gpu-node-swarm >/dev/null 2>&1 || true
|
||||
|
||||
info "uninstall completed"
|
||||
|
||||
|
||||
@ -9,8 +9,15 @@ ENV_OUT="$PKG_ROOT/compose/.env"
|
||||
# Print an informational message to stdout, prefixed with a blue [CONFIG] tag.
info() {
  local tag="\033[34m[CONFIG]\033[0m"
  echo -e "$tag $*"
}
|
||||
# Print an error message to stderr, prefixed with a red [ERROR] tag.
err() {
  local tag="\033[31m[ERROR]\033[0m"
  echo -e "$tag $*" >&2
}
|
||||
# Check that every command named in the arguments can be resolved by the shell.
# Each missing command is reported through err; all arguments are checked
# before returning. Returns 0 only when nothing was missing.
require() {
  local all_found=1
  for c in "$@"; do
    if ! command -v "$c" >/dev/null 2>&1; then
      err "缺少依赖: $c"
      all_found=0
    fi
  done
  [[ $all_found -eq 1 ]]
}
|
||||
# Compose 检测:优先 docker compose(v2),回退 docker-compose(v1)
|
||||
# Verify that a usable Docker Compose is present.
# Accepts the v2 plugin (`docker compose`) first, then the standalone v1
# binary (`docker-compose`). When neither responds to `version`, reports the
# problem through err and terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1; then
    if docker-compose version >/dev/null 2>&1; then
      return 0
    fi
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
|
||||
|
||||
require docker curl jq awk sed tar gzip
|
||||
require_compose
|
||||
|
||||
# 磁盘空间检查(MB)
|
||||
check_disk(){ local p="$1"; local need=10240; local free
|
||||
|
||||
@ -9,6 +9,11 @@ ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FI
|
||||
ts="$(date -u +%Y%m%d-%H%M%SZ)"
|
||||
LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true
|
||||
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then LOG_DIR="/tmp/argus-logs"; mkdir -p "$LOG_DIR" || true; fi
|
||||
|
||||
# load compose project for accurate ps output
|
||||
ENV_FILE="$ROOT/compose/.env"
|
||||
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
|
||||
PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
|
||||
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"; ERRORS="$LOG_DIR/diagnose_error_${ts}.log"; : > "$DETAILS"; : > "$ERRORS"
|
||||
|
||||
# Append one line to the file named by $DETAILS, formatted as
# "YYYY-MM-DD HH:MM:SS <message>", where the timestamp comes from `date`
# (no -u, so local time) and the message is all arguments joined by spaces.
logd() {
  local stamp
  stamp="$(date '+%F %T')"
  echo "$stamp $*" >> "$DETAILS"
}
|
||||
@ -85,7 +90,7 @@ docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share
|
||||
section SYSTEM
|
||||
logd "uname -a:"; uname -a >> "$DETAILS"
|
||||
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
|
||||
logd "compose ps:"; (cd "$ROOT/compose" && docker compose ps) >> "$DETAILS" 2>&1 || true
|
||||
logd "compose ps (project=$PROJECT):"; (cd "$ROOT/compose" && docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f docker-compose.yml ps) >> "$DETAILS" 2>&1 || true
|
||||
|
||||
section SUMMARY
|
||||
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
|
||||
|
||||
@ -9,7 +9,14 @@ COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
|
||||
# Print an informational message to stdout, prefixed with a blue [INSTALL] tag.
info() {
  local tag="\033[34m[INSTALL]\033[0m"
  echo -e "$tag $*"
}
|
||||
# Print an error message to stderr, prefixed with a red [ERROR] tag.
err() {
  local tag="\033[31m[ERROR]\033[0m"
  echo -e "$tag $*" >&2
}
|
||||
# Check that every command named in the arguments can be resolved by the shell.
# Each missing command is reported through err; all arguments are checked
# before returning. Returns 0 only when nothing was missing.
require() {
  local all_found=1
  for c in "$@"; do
    if ! command -v "$c" >/dev/null 2>&1; then
      err "缺少依赖: $c"
      all_found=0
    fi
  done
  [[ $all_found -eq 1 ]]
}
|
||||
# Compose 检测:优先 docker compose(v2),回退 docker-compose(v1)
|
||||
# Verify that a usable Docker Compose is present.
# Accepts the v2 plugin (`docker compose`) first, then the standalone v1
# binary (`docker-compose`). When neither responds to `version`, reports the
# problem through err and terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  # The probe output must go to /dev/null. The previous target "/devnull"
  # made the redirection itself fail for unprivileged users (so the v1
  # fallback was never accepted) and created a stray file when run as root.
  if command -v docker-compose >/dev/null 2>&1 \
    && docker-compose version >/dev/null 2>&1; then
    return 0
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
|
||||
require docker curl jq awk sed tar gzip
|
||||
require_compose
|
||||
|
||||
[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env,请先运行 scripts/config.sh"; exit 1; }
|
||||
info "使用环境文件: $ENV_FILE"
|
||||
@ -52,9 +59,10 @@ done
|
||||
shopt -u nullglob
|
||||
|
||||
# Compose up
|
||||
info "启动服务栈 (docker compose up -d)"
|
||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
|
||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
||||
PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
|
||||
info "启动服务栈 (docker compose -p $PROJECT up -d)"
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
||||
|
||||
# Wait readiness (best-effort)
|
||||
# Print the HTTP status code returned by a GET of URL $1 (IPv4 only, silent,
# response body discarded). Prints 000 when curl produced no status at all.
#
# curl's "%{http_code}" write-out already emits 000 when no response was
# received and then exits non-zero, so the output is captured first and the
# fallback is applied only when it is empty. Appending a second "000" on
# failure would yield "000000".
code() {
  local status
  status="$(curl -4 -s -o /dev/null -w "%{http_code}" "$1")" || true
  printf '%s' "${status:-000}"
}
|
||||
|
||||
@ -4,4 +4,6 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
PKG_ROOT="$ROOT_DIR"
|
||||
ENV_FILE="$PKG_ROOT/compose/.env"
|
||||
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
|
||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
||||
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
|
||||
PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
||||
|
||||
@ -4,6 +4,20 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
PKG_ROOT="$ROOT_DIR"
|
||||
ENV_FILE="$PKG_ROOT/compose/.env"
|
||||
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
|
||||
echo "[UNINSTALL] stopping compose"
|
||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true
|
||||
|
||||
# load COMPOSE_PROJECT_NAME from env file if present
|
||||
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
|
||||
PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
|
||||
|
||||
# Print an error message to stderr, prefixed with a red [ERROR] tag.
err() {
  local tag="\033[31m[ERROR]\033[0m"
  echo -e "$tag $*" >&2
}
|
||||
# Compose 检测:优先 docker compose(v2),回退 docker-compose(v1)
|
||||
# Verify that a usable Docker Compose is present.
# Accepts the v2 plugin (`docker compose`) first, then the standalone v1
# binary (`docker-compose`). When neither responds to `version`, reports the
# problem through err and terminates the script with exit status 1.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1; then
    if docker-compose version >/dev/null 2>&1; then
      return 0
    fi
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
|
||||
require_compose
|
||||
|
||||
echo "[UNINSTALL] stopping compose (project=$PROJECT)"
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true
|
||||
echo "[UNINSTALL] done"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user