diff --git a/deployment_new/templates/client_gpu/scripts/config.sh b/deployment_new/templates/client_gpu/scripts/config.sh index 1f7b1fe..773e29c 100644 --- a/deployment_new/templates/client_gpu/scripts/config.sh +++ b/deployment_new/templates/client_gpu/scripts/config.sh @@ -79,3 +79,9 @@ if [[ ${#missing[@]} -gt 0 ]]; then err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"; exit 1; fi info "已生成 compose/.env;可执行 scripts/install.sh" + +# 准备并赋权宿主日志目录(幂等,便于安装前人工检查/预创建) +mkdir -p "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" +chmod 1777 "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" || true +info "日志目录权限(期待 1777,含粘滞位):" +stat -c '%a %U:%G %n' "$PKG_ROOT/logs/train" "$PKG_ROOT/logs/infer" 2>/dev/null || true diff --git a/deployment_new/templates/client_gpu/scripts/install.sh b/deployment_new/templates/client_gpu/scripts/install.sh index 27950c0..dd7b66e 100644 --- a/deployment_new/templates/client_gpu/scripts/install.sh +++ b/deployment_new/templates/client_gpu/scripts/install.sh @@ -45,13 +45,20 @@ IMG_TGZ=$(ls -1 "$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.g info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")" tmp=$(mktemp); gunzip -c "$IMG_TGZ" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp" -# 确保日志目录存在(宿主侧,用于映射 /logs/infer 与 /logs/train) +# 确保日志目录存在(宿主侧,用于映射 /logs/infer 与 /logs/train),并赋权 1777(粘滞位) mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" -info "日志目录已准备: logs/infer logs/train" +chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true +info "日志目录已准备并赋权 1777: logs/infer logs/train" +stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true # 启动 compose 并跟踪日志 info "启动 GPU 节点 (docker compose up -d)" docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps + +# 再次校准宿主日志目录权限,避免容器内脚本对 bind mount 权限回退 +chmod 1777 "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" || true +stat -c '%a %U:%G %n' "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" 2>/dev/null || true + info "跟踪节点容器日志(按 Ctrl+C 退出)" docker logs -f argus-metric-gpu-node-swarm || true diff --git a/src/log/fluent-bit/build/start-fluent-bit.sh b/src/log/fluent-bit/build/start-fluent-bit.sh index 5b4cd35..953549a 100755 --- a/src/log/fluent-bit/build/start-fluent-bit.sh +++ b/src/log/fluent-bit/build/start-fluent-bit.sh @@ -77,7 +77,20 @@ cp -r /tmp/flb/etc/* /etc/fluent-bit/ # Create logs/buffers dirs mkdir -p /logs/train /logs/infer /buffers -chmod 755 /logs/train /logs/infer /buffers + +# 控制日志目录权限:默认对宿主 bind mount 目录采用 1777(可由环境变量关闭) +: "${ARGUS_LOGS_WORLD_WRITABLE:=1}" +if [[ "${ARGUS_LOGS_WORLD_WRITABLE}" == "1" ]]; then + chmod 1777 /logs/train /logs/infer || true +else + chmod 755 /logs/train /logs/infer || true +fi + +# 缓冲目录仅供进程使用,不对外开放写入 +chmod 770 /buffers || true + +# 目录属主设置为 fluent-bit(不影响 1777 粘滞位) +chown -R fluent-bit:fluent-bit /logs /buffers 2>/dev/null || true # Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..." diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh index 1d5f371..5137152 100755 --- a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh @@ -171,9 +171,16 @@ fi # 创建日志和缓冲区目录 log_info "Creating log and buffer directories..." mkdir -p /logs/train /logs/infer /buffers -chmod 755 /logs/train /logs/infer -chmod 770 /buffers -chown -R fluent-bit:fluent-bit /logs /buffers +# 对共享日志目录采用 1777(含粘滞位),便于宿主任意账号创建文件/目录 +if [[ "${ARGUS_LOGS_WORLD_WRITABLE:-1}" == "1" ]]; then + chmod 1777 /logs/train /logs/infer || true +else + chmod 755 /logs/train /logs/infer || true +fi +# 缓冲目录限进程使用 +chmod 770 /buffers || true +# 目录属主设置,不影响 1777 粘滞位 +chown -R fluent-bit:fluent-bit /logs /buffers 2>/dev/null || true # 启动 Fluent Bit log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"