完成H20服务器部署及重启测试 #51

Merged
yuyr merged 27 commits from dev_1.1.0_yuyr_nobind into dev_1.0.0 2025-11-25 15:54:30 +08:00
10 changed files with 83 additions and 15 deletions
Showing only changes of commit 5b617f62a8 - Show all commits

View File

@ -73,6 +73,9 @@ PKG_VERSION=$VERSION
NODE_GPU_BUNDLE_IMAGE_TAG=${REPO}:${VERSION} NODE_GPU_BUNDLE_IMAGE_TAG=${REPO}:${VERSION}
# Compose project name (isolation from server stack)
COMPOSE_PROJECT_NAME=argus-client
# Required (no defaults). Must be filled before install. # Required (no defaults). Must be filled before install.
AGENT_ENV= AGENT_ENV=
AGENT_USER= AGENT_USER=

View File

@ -113,6 +113,9 @@ FTP_PASSWORD=NASPlab1234!
# UID/GID for volume ownership # UID/GID for volume ownership
ARGUS_BUILD_UID=2133 ARGUS_BUILD_UID=2133
ARGUS_BUILD_GID=2015 ARGUS_BUILD_GID=2015
# Compose project name (isolation from other stacks on same host)
COMPOSE_PROJECT_NAME=argus-server
EOF EOF
# 3) Docs (from deployment_new templates) # 3) Docs (from deployment_new templates)

View File

@ -9,7 +9,14 @@ ENV_OUT="$PKG_ROOT/compose/.env"
info(){ echo -e "\033[34m[CONFIG-GPU]\033[0m $*"; } info(){ echo -e "\033[34m[CONFIG-GPU]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }
require docker curl jq awk sed tar gzip nvidia-smi # Compose 检测:优先 docker composev2回退 docker-composev1
# Verify that a usable Docker Compose is present.
# Tries the v2 plugin (`docker compose`) first, then the standalone v1 binary
# (`docker-compose`). Returns 0 as soon as one answers `version`; otherwise
# reports through err and terminates the script with status 1.
require_compose() {
  docker compose version >/dev/null 2>&1 && return 0
  if command -v docker-compose >/dev/null 2>&1; then
    docker-compose version >/dev/null 2>&1 && return 0
  fi
  err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
require docker curl jq awk sed tar gzip
require_compose
# 磁盘空间检查MB # 磁盘空间检查MB
check_disk(){ local p="$1"; local need=10240; local free check_disk(){ local p="$1"; local need=10240; local free

View File

@ -9,7 +9,14 @@ COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
info(){ echo -e "\033[34m[INSTALL-GPU]\033[0m $*"; } info(){ echo -e "\033[34m[INSTALL-GPU]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }
require docker docker compose nvidia-smi # Compose 检测:优先 docker composev2回退 docker-composev1
# Guard: make sure some flavour of Docker Compose responds before installing.
# Succeeds (returns 0) when either `docker compose version` or an installed
# `docker-compose version` exits cleanly; otherwise logs via err and exits 1.
require_compose() {
  if docker compose version &>/dev/null ||
    { command -v docker-compose &>/dev/null && docker-compose version &>/dev/null; }; then
    return 0
  fi
  err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
require docker nvidia-smi
require_compose
[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env请先运行 scripts/config.sh"; exit 1; } [[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env请先运行 scripts/config.sh"; exit 1; }
info "使用环境文件: $ENV_FILE" info "使用环境文件: $ENV_FILE"
@ -52,9 +59,10 @@ info "日志目录已准备并赋权 1777: logs/infer logs/train"
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true
# 启动 compose 并跟踪日志 # 启动 compose 并跟踪日志
info "启动 GPU 节点 (docker compose up -d)" PROJECT="${COMPOSE_PROJECT_NAME:-argus-client}"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d info "启动 GPU 节点 (docker compose -p $PROJECT up -d)"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
# 再次校准宿主日志目录权限,避免容器内脚本对 bind mount 权限回退 # 再次校准宿主日志目录权限,避免容器内脚本对 bind mount 权限回退
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true

View File

@ -6,11 +6,23 @@ PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env" ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
# load COMPOSE_PROJECT_NAME if provided in compose/.env
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
PROJECT="${COMPOSE_PROJECT_NAME:-argus-client}"
info(){ echo -e "\033[34m[UNINSTALL-GPU]\033[0m $*"; } info(){ echo -e "\033[34m[UNINSTALL-GPU]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
# Compose detection: prefer `docker compose` (v2), fall back to
# `docker-compose` (v1).
# Returns 0 when one of them answers `version`; otherwise prints an error via
# err and exits the script with status 1.
require_compose() {
  local found=0
  if docker compose version >/dev/null 2>&1; then
    found=1
  elif command -v docker-compose >/dev/null 2>&1 && docker-compose version >/dev/null 2>&1; then
    found=1
  fi
  if [[ $found -eq 0 ]]; then
    err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
    exit 1
  fi
  return 0
}
require_compose
if [[ -f "$ENV_FILE" ]]; then if [[ -f "$ENV_FILE" ]]; then
info "stopping compose project" info "stopping compose project (project=$PROJECT)"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true
else else
info "compose/.env not found; attempting to remove container by name" info "compose/.env not found; attempting to remove container by name"
fi fi
@ -22,4 +34,3 @@ docker rm -f argus-net-warmup >/dev/null 2>&1 || true
docker rm -f argus-metric-gpu-node-swarm >/dev/null 2>&1 || true docker rm -f argus-metric-gpu-node-swarm >/dev/null 2>&1 || true
info "uninstall completed" info "uninstall completed"

View File

@ -9,8 +9,15 @@ ENV_OUT="$PKG_ROOT/compose/.env"
info(){ echo -e "\033[34m[CONFIG]\033[0m $*"; } info(){ echo -e "\033[34m[CONFIG]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }
# Compose detection: prefer `docker compose` (v2), fall back to
# `docker-compose` (v1).
# Falls through to `return 0` when either probe succeeds; when both fail it
# logs through err and aborts the script with exit status 1.
require_compose() {
  if ! docker compose version >/dev/null 2>&1; then
    # v2 plugin missing: the v1 binary must both exist and respond.
    if ! command -v docker-compose >/dev/null 2>&1 || ! docker-compose version >/dev/null 2>&1; then
      err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
      exit 1
    fi
  fi
  return 0
}
require docker curl jq awk sed tar gzip require docker curl jq awk sed tar gzip
require_compose
# 磁盘空间检查MB # 磁盘空间检查MB
check_disk(){ local p="$1"; local need=10240; local free check_disk(){ local p="$1"; local need=10240; local free

View File

@ -9,6 +9,11 @@ ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FI
ts="$(date -u +%Y%m%d-%H%M%SZ)" ts="$(date -u +%Y%m%d-%H%M%SZ)"
LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then LOG_DIR="/tmp/argus-logs"; mkdir -p "$LOG_DIR" || true; fi if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then LOG_DIR="/tmp/argus-logs"; mkdir -p "$LOG_DIR" || true; fi
# load compose project for accurate ps output
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"; ERRORS="$LOG_DIR/diagnose_error_${ts}.log"; : > "$DETAILS"; : > "$ERRORS" DETAILS="$LOG_DIR/diagnose_details_${ts}.log"; ERRORS="$LOG_DIR/diagnose_error_${ts}.log"; : > "$DETAILS"; : > "$ERRORS"
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; } logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
@ -85,7 +90,7 @@ docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share
section SYSTEM section SYSTEM
logd "uname -a:"; uname -a >> "$DETAILS" logd "uname -a:"; uname -a >> "$DETAILS"
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
logd "compose ps:"; (cd "$ROOT/compose" && docker compose ps) >> "$DETAILS" 2>&1 || true logd "compose ps (project=$PROJECT):"; (cd "$ROOT/compose" && docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f docker-compose.yml ps) >> "$DETAILS" 2>&1 || true
section SUMMARY section SUMMARY
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS" [[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"

View File

@ -9,7 +9,14 @@ COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
info(){ echo -e "\033[34m[INSTALL]\033[0m $*"; } info(){ echo -e "\033[34m[INSTALL]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }
# Compose detection: prefer `docker compose` (v2), fall back to
# `docker-compose` (v1).
# Returns: 0 when one of the two answers `version`.
# Exits:   1 (after logging via err) when neither is usable.
# NOTE(review): the visible call sites in this script invoke `docker compose`
# (v2 syntax) directly; confirm that v1-only hosts are really supported.
require_compose() {
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  # Probe output must go to /dev/null. The previous `>/devnull` target made
  # the redirection fail for unprivileged users (so the v1 fallback was never
  # detected) and created a stray /devnull file when run as root.
  if command -v docker-compose >/dev/null 2>&1 && docker-compose version >/dev/null 2>&1; then
    return 0
  fi
  err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
require docker curl jq awk sed tar gzip require docker curl jq awk sed tar gzip
require_compose
[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env请先运行 scripts/config.sh"; exit 1; } [[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env请先运行 scripts/config.sh"; exit 1; }
info "使用环境文件: $ENV_FILE" info "使用环境文件: $ENV_FILE"
@ -52,9 +59,10 @@ done
shopt -u nullglob shopt -u nullglob
# Compose up # Compose up
info "启动服务栈 (docker compose up -d)" PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d info "启动服务栈 (docker compose -p $PROJECT up -d)"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
# Wait readiness (best-effort) # Wait readiness (best-effort)
code(){ curl -4 -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } code(){ curl -4 -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }

View File

@ -4,4 +4,6 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR" PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env" ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps

View File

@ -4,6 +4,20 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR" PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env" ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"
echo "[UNINSTALL] stopping compose"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true # load COMPOSE_PROJECT_NAME from env file if present
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}"
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
# Compose detection: prefer `docker compose` (v2), fall back to
# `docker-compose` (v1).
# Returns 0 on the first probe that succeeds; if none does, reports through
# err and exits the script with status 1.
require_compose() {
  docker compose version >/dev/null 2>&1 && return 0
  command -v docker-compose >/dev/null 2>&1 \
    && docker-compose version >/dev/null 2>&1 \
    && return 0
  err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
require_compose
echo "[UNINSTALL] stopping compose (project=$PROJECT)"
docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true
echo "[UNINSTALL] done" echo "[UNINSTALL] done"