完成H20服务器部署及重启测试 #51
@ -79,3 +79,9 @@ if [[ ${#missing[@]} -gt 0 ]]; then
|
|||||||
err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"; exit 1; fi
|
err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"; exit 1; fi
|
||||||
|
|
||||||
# NOTE(review): this span looks like a rendered side-by-side diff. A statement
# that appears twice is presumably the before/after copy of an unchanged line,
# and the bare `|` rows are presumably table residue. Confirm against the real script.
info "已生成 compose/.env;可执行 scripts/install.sh"
|
info "已生成 compose/.env;可执行 scripts/install.sh"
|
||||||
|
|
||||||
|
# Prepare the host log directories and set their permissions (idempotent; convenient for manual inspection or pre-creation before installation).
|
||||||
|
mkdir -p "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer"
|
||||||
|
# Mode 1777 = rwx for everyone plus the sticky bit; `|| true` turns a chmod failure into a zero status.
chmod 1777 "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" || true
|
||||||
|
info "日志目录权限(期待 1777,含粘滞位):"
|
||||||
|
# Print "<mode> <owner>:<group> <path>" for each directory; stat's stderr is discarded and its failure ignored.
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" 2>/dev/null || true
|
||||||
|
|||||||
@ -45,13 +45,20 @@ IMG_TGZ=$(ls -1 "$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.g
|
|||||||
# NOTE(review): this span looks like a rendered side-by-side diff. A statement
# that appears twice is presumably the before/after copy of an unchanged line,
# and the bare `|` rows are presumably table residue. Confirm against the real script.
info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")"
|
info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")"
|
||||||
# Decompress the archive into a temp file, load it into Docker (stdout discarded), then remove the temp file.
tmp=$(mktemp); gunzip -c "$IMG_TGZ" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp"
|
tmp=$(mktemp); gunzip -c "$IMG_TGZ" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp"
|
||||||
|
|
||||||
# Make sure the log directories exist (host side; they are mapped to /logs/infer and /logs/train).
|
# Make sure the log directories exist (host side; they are mapped to /logs/infer and /logs/train), and set mode 1777 (sticky bit).
|
||||||
mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train"
|
mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train"
|
||||||
info "日志目录已准备: logs/infer logs/train"
|
# `|| true` turns a chmod failure into a zero status, so this step is best effort.
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
|
||||||
|
info "日志目录已准备并赋权 1777: logs/infer logs/train"
|
||||||
|
# Print "<mode> <owner>:<group> <path>" for each directory; stat's stderr is discarded and its failure ignored.
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true
|
||||||
|
|
||||||
# Start compose and follow the logs.
|
# Start compose and follow the logs.
|
||||||
info "启动 GPU 节点 (docker compose up -d)"
|
info "启动 GPU 节点 (docker compose up -d)"
|
||||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
|
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
|
||||||
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
|
||||||
|
|
||||||
|
# Re-apply the host log directory permissions, in case a script inside the container reverted the bind-mount permissions.
|
||||||
|
chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true
|
||||||
|
stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true
|
||||||
|
|
||||||
info "跟踪节点容器日志(按 Ctrl+C 退出)"
|
info "跟踪节点容器日志(按 Ctrl+C 退出)"
|
||||||
# Follow the container log; `|| true` turns a non-zero exit of `docker logs` into a zero status.
docker logs -f argus-metric-gpu-node-swarm || true
|
docker logs -f argus-metric-gpu-node-swarm || true
|
||||||
|
|||||||
@ -77,7 +77,20 @@ cp -r /tmp/flb/etc/* /etc/fluent-bit/
|
|||||||
|
|
||||||
# NOTE(review): this span looks like a rendered side-by-side diff. A statement
# that appears twice is presumably the before/after copy of an unchanged line,
# and the bare `|` rows are presumably table residue. Confirm against the real script.
# Create logs/buffers dirs
|
# Create logs/buffers dirs
|
||||||
mkdir -p /logs/train /logs/infer /buffers
|
mkdir -p /logs/train /logs/infer /buffers
|
||||||
chmod 755 /logs/train /logs/infer /buffers
|
|
||||||
|
# Control log directory permissions: by default the host bind-mounted directories get mode 1777 (can be turned off via an environment variable).
|
||||||
|
# Assign the default value 1 when ARGUS_LOGS_WORLD_WRITABLE is unset or empty.
: "${ARGUS_LOGS_WORLD_WRITABLE:=1}"
|
||||||
|
if [[ "${ARGUS_LOGS_WORLD_WRITABLE}" == "1" ]]; then
|
||||||
|
# World-writable with the sticky bit; a chmod failure is ignored.
chmod 1777 /logs/train /logs/infer || true
|
||||||
|
else
|
||||||
|
# Any value other than "1" selects mode 755; a chmod failure is ignored.
chmod 755 /logs/train /logs/infer || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# The buffer directory is for the process only; it is not opened up for outside writes.
|
||||||
|
chmod 770 /buffers || true
|
||||||
|
|
||||||
|
# Set the directory owner to fluent-bit (does not affect the 1777 sticky bit).
|
||||||
|
# Recursive chown of /logs and /buffers; errors are silenced and ignored.
# NOTE(review): if /logs is a host bind mount, -R presumably also re-owns existing host files. Confirm this is intended.
chown -R fluent-bit:fluent-bit /logs /buffers 2>/dev/null || true
|
||||||
|
|
||||||
# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency
|
# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency
|
||||||
echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..."
|
echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..."
|
||||||
|
|||||||
@ -171,9 +171,16 @@ fi
|
|||||||
# NOTE(review): this span looks like a rendered side-by-side diff in which removed
# and added lines share rows, so adjacent statements are presumably not one
# executable sequence. Confirm against the real script.
# Create the log and buffer directories.
|
# Create the log and buffer directories.
|
||||||
log_info "Creating log and buffer directories..."
|
log_info "Creating log and buffer directories..."
|
||||||
mkdir -p /logs/train /logs/infer /buffers
|
mkdir -p /logs/train /logs/infer /buffers
|
||||||
chmod 755 /logs/train /logs/infer
|
# Use mode 1777 (with sticky bit) on the shared log directories so that any host account can create files/directories.
|
||||||
chmod 770 /buffers
|
# ARGUS_LOGS_WORLD_WRITABLE is treated as "1" when it is unset or empty.
if [[ "${ARGUS_LOGS_WORLD_WRITABLE:-1}" == "1" ]]; then
|
||||||
chown -R fluent-bit:fluent-bit /logs /buffers
|
chmod 1777 /logs/train /logs/infer || true
|
||||||
|
else
|
||||||
|
chmod 755 /logs/train /logs/infer || true
|
||||||
|
fi
|
||||||
|
# The buffer directory is restricted to the process.
|
||||||
|
chmod 770 /buffers || true
|
||||||
|
# Set the directory ownership; this does not affect the 1777 sticky bit.
|
||||||
|
# Recursive chown of /logs and /buffers; errors are silenced and ignored.
chown -R fluent-bit:fluent-bit /logs /buffers 2>/dev/null || true
|
||||||
|
|
||||||
# Start Fluent Bit.
|
# Start Fluent Bit.
|
||||||
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"
|
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user