完成H20服务器部署及重启测试 #51

Merged
yuyr merged 27 commits from dev_1.1.0_yuyr_nobind into dev_1.0.0 2025-11-25 15:54:30 +08:00
4 changed files with 39 additions and 6 deletions
Showing only changes of commit 06131a268a - Show all commits

View File

@ -79,3 +79,9 @@ if [[ ${#missing[@]} -gt 0 ]]; then
err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"; exit 1; fi
# Report that compose/.env was generated and point at scripts/install.sh.
info "已生成 compose/.env可执行 scripts/install.sh"
# Prepare the host log directories and set their permissions (idempotent;
# convenient for manual inspection / pre-creation before installation).
mkdir -p "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer"
# Mode 1777 = world-writable plus the sticky bit. A chmod failure is
# deliberately ignored (`|| true`) so the script keeps going.
chmod 1777 "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" || true
info "日志目录权限(期待 1777含粘滞位:"
# Print "<octal mode> <owner>:<group> <path>" for each directory so the result
# can be checked by eye; stat errors are silenced and ignored.
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" 2>/dev/null || true

View File

@ -45,13 +45,20 @@ IMG_TGZ=$(ls -1 "$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.g
# Announce which GPU bundle image archive is about to be imported.
info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")"
# Decompress the archive into a temporary file, load it into Docker (stdout
# discarded), then remove the temporary file.
tmp=$(mktemp); gunzip -c "$IMG_TGZ" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp"
# NOTE(review): the next two comments (and the two "log directories prepared"
# info lines below) look like the pre-change and post-change versions of the
# same line shown side by side with diff markers stripped — confirm.
# Ensure the log directories exist (host side, used to map /logs/infer and /logs/train)
# Ensure the log directories exist (host side, used to map /logs/infer and /logs/train) and set mode 1777 (sticky bit)
mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train"
info "日志目录已准备: logs/infer logs/train"
# World-writable with sticky bit; a chmod failure is ignored on purpose.
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
info "日志目录已准备并赋权 1777: logs/infer logs/train"
# Show mode, owner:group and path for a visual check; errors are suppressed.
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true
# Start compose and follow the logs
info "启动 GPU 节点 (docker compose up -d)"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
# Re-apply the host log directory permissions once more, so that scripts
# running inside the container cannot leave the bind mount with reverted permissions
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true
info "跟踪节点容器日志(按 Ctrl+C 退出)"
# Follow the node container's logs; a non-zero exit from `docker logs`
# is ignored via `|| true`.
docker logs -f argus-metric-gpu-node-swarm || true

View File

@ -77,7 +77,20 @@ cp -r /tmp/flb/etc/* /etc/fluent-bit/
# Create logs/buffers dirs
mkdir -p /logs/train /logs/infer /buffers
# NOTE(review): this unconditional chmod 755 looks like the pre-change line that
# the conditional block below replaces (diff markers appear stripped) — confirm.
chmod 755 /logs/train /logs/infer /buffers
# Log directory permissions: by default use 1777 for the directories that are
# bind-mounted from the host; this can be turned off through an environment variable
# `:=` assigns the default "1" when the variable is unset or empty.
: "${ARGUS_LOGS_WORLD_WRITABLE:=1}"
if [[ "${ARGUS_LOGS_WORLD_WRITABLE}" == "1" ]]; then
chmod 1777 /logs/train /logs/infer || true
else
chmod 755 /logs/train /logs/infer || true
fi
# The buffer directory is for the process only; it is not opened for outside writes
chmod 770 /buffers || true
# Set the directory owner to fluent-bit (does not affect the 1777 sticky bit)
# Errors are silenced and ignored, so a missing fluent-bit user/group does not abort.
chown -R fluent-bit:fluent-bit /logs /buffers 2>/dev/null || true
# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency
echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..."

View File

@ -171,9 +171,16 @@ fi
# Create the log and buffer directories
log_info "Creating log and buffer directories..."
mkdir -p /logs/train /logs/infer /buffers
# NOTE(review): the next three commands look like the pre-change lines that the
# guarded (`|| true`) versions further down replace (diff markers appear
# stripped) — confirm before treating both sets as live code.
chmod 755 /logs/train /logs/infer
chmod 770 /buffers
chown -R fluent-bit:fluent-bit /logs /buffers
# Use mode 1777 (with sticky bit) on the shared log directories so that any
# host account can create files/directories in them
# The switch defaults to "1" when ARGUS_LOGS_WORLD_WRITABLE is unset or empty.
if [[ "${ARGUS_LOGS_WORLD_WRITABLE:-1}" == "1" ]]; then
chmod 1777 /logs/train /logs/infer || true
else
chmod 755 /logs/train /logs/infer || true
fi
# The buffer directory is restricted to the process
chmod 770 /buffers || true
# Set directory ownership; this does not affect the 1777 sticky bit
chown -R fluent-bit:fluent-bit /logs /buffers 2>/dev/null || true
# Start Fluent Bit
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"