From a6d88ddcec511bf61428edab19bcd2f6b02490e7 Mon Sep 17 00:00:00 2001 From: yuyr Date: Tue, 25 Nov 2025 15:33:37 +0800 Subject: [PATCH] =?UTF-8?q?[#49]=20=E5=A2=9E=E5=8A=A0no=20bind=E7=89=88?= =?UTF-8?q?=E6=9C=ACserver=20=20&=20client=20pkg,=20=E5=AE=8C=E6=88=90h20?= =?UTF-8?q?=20=E9=83=A8=E7=BD=B2=E6=B5=8B=E8=AF=95=E5=92=8C=E9=87=8D?= =?UTF-8?q?=E5=90=AF=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/build_images.sh | 35 +++++++++++++------ .../build/make_client_gpu_package.sh | 9 ++--- deployment_new/build/make_server_package.sh | 9 ----- .../server/docs/INSTALL_SERVER_zh.md | 8 ++--- .../templates/server/scripts/config.sh | 3 -- .../templates/server/scripts/diagnose.sh | 5 --- .../templates/server/scripts/selfcheck.sh | 6 ---- 7 files changed, 32 insertions(+), 43 deletions(-) diff --git a/build/build_images.sh b/build/build_images.sh index 898f715..fcbdfb6 100755 --- a/build/build_images.sh +++ b/build/build_images.sh @@ -40,6 +40,8 @@ build_gpu_bundle=false build_cpu_bundle=false build_server_pkg=false build_client_pkg=false +need_bind_image=true +need_metric_ftp=true no_cache=false bundle_date="" @@ -124,6 +126,11 @@ while [[ $# -gt 0 ]]; do esac done +if [[ "$build_server_pkg" == true ]]; then + need_bind_image=false + need_metric_ftp=false +fi + root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" . "$root/scripts/common/build_user.sh" @@ -463,11 +470,11 @@ build_server_pkg_bundle() { return 1 fi local repos=( - argus-bind9 argus-master argus-elasticsearch argus-kibana \ - argus-metric-ftp argus-metric-prometheus argus-metric-grafana \ + argus-master argus-elasticsearch argus-kibana \ + argus-metric-prometheus argus-metric-grafana \ argus-alertmanager argus-web-frontend argus-web-proxy ) - echo "\n🔖 Verifying server images with :$date_tag and collecting digests" + echo "\n🔖 Verifying server images with :$date_tag and collecting digests (Bind/FTP excluded; relying on Docker DNS aliases)" for repo in "${repos[@]}"; do if ! docker image inspect "$repo:$date_tag" >/dev/null 2>&1; then echo "❌ required image missing: $repo:$date_tag (build phase should have produced it)" >&2 @@ -638,10 +645,12 @@ if [[ "$build_core" == true ]]; then echo "" - if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:${DEFAULT_IMAGE_TAG}"; then - images_built+=("argus-bind9:${DEFAULT_IMAGE_TAG}") - else - build_failed=true + if [[ "$need_bind_image" == true ]]; then + if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:${DEFAULT_IMAGE_TAG}"; then + images_built+=("argus-bind9:${DEFAULT_IMAGE_TAG}") + else + build_failed=true + fi fi fi @@ -678,19 +687,25 @@ if [[ "$build_metric" == true ]]; then echo "Building Metric module images..." metric_base_images=( - "ubuntu:22.04" "ubuntu/prometheus:3-24.04_stable" "grafana/grafana:11.1.0" ) + if [[ "$need_metric_ftp" == true ]]; then + metric_base_images+=("ubuntu:22.04") + fi + for base_image in "${metric_base_images[@]}"; do if ! pull_base_image "$base_image"; then build_failed=true fi done - metric_builds=( - "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:${DEFAULT_IMAGE_TAG}|src/metric/ftp/build" + metric_builds=() + if [[ "$need_metric_ftp" == true ]]; then + metric_builds+=("Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:${DEFAULT_IMAGE_TAG}|src/metric/ftp/build") + fi + metric_builds+=( "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:${DEFAULT_IMAGE_TAG}|src/metric/prometheus/build" "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:${DEFAULT_IMAGE_TAG}|src/metric/grafana/build" ) diff --git a/deployment_new/build/make_client_gpu_package.sh b/deployment_new/build/make_client_gpu_package.sh index fc9b480..25a239b 100755 --- a/deployment_new/build/make_client_gpu_package.sh +++ b/deployment_new/build/make_client_gpu_package.sh @@ -82,16 +82,13 @@ AGENT_USER= AGENT_INSTANCE= GPU_NODE_HOSTNAME= +# Overlay network (should match server包 overlay) +ARGUS_OVERLAY_NET=argus-sys-net + # From cluster-info.env (server package output) -BINDIP= -FTPIP= SWARM_MANAGER_ADDR= SWARM_JOIN_TOKEN_WORKER= SWARM_JOIN_TOKEN_MANAGER= - -# FTP defaults -FTP_USER=ftpuser -FTP_PASSWORD=NASPlab1234! EOF # 4) Docs from deployment_new templates diff --git a/deployment_new/build/make_server_package.sh b/deployment_new/build/make_server_package.sh index a29bffc..9d4cdd3 100755 --- a/deployment_new/build/make_server_package.sh +++ b/deployment_new/build/make_server_package.sh @@ -33,11 +33,9 @@ if [[ -z "$VERSION" ]]; then VERSION="$(today_version)"; fi require_cmd docker tar gzip awk sed IMAGES=( - argus-bind9 argus-master argus-elasticsearch argus-kibana - argus-metric-ftp argus-metric-prometheus argus-metric-grafana argus-alertmanager @@ -73,11 +71,9 @@ cat >"$ENV_EX" < 脚本做了什么: - 检查依赖与磁盘空间; - 自动从“端口 20000 起”分配所有服务端口,确保“系统未占用”且“彼此不冲突”; -- 写入 `compose/.env`(包含端口、镜像 tag、FTP 账号、overlay 名称等); +- 写入 `compose/.env`(包含端口、镜像 tag、overlay 名称与 UID/GID 等); - 将当前执行账户的 UID/GID 写入 `ARGUS_BUILD_UID/GID`(若主组名是 docker,会改用“与用户名同名的组”的 GID,避免拿到 docker 组 999); - 更新/追加 `cluster-info.env` 中的 `SWARM_MANAGER_ADDR`(不会覆盖其他键)。 @@ -70,7 +70,7 @@ export SWARM_MANAGER_ADDR=<本机管理IP> - `docker compose up -d` 启动服务; - 等待“六项就绪”: - Master `/readyz`=200、ES `/_cluster/health`=200、Prometheus TCP 可达、Grafana `/api/health`=200、Alertmanager `/api/v2/status`=200、Kibana `/api/status` level=available; -- 将各服务 overlay IP 写入 `private/argus/etc/<域名>`,Reload Bind9 与 Nginx; +- 校验 Docker DNS + overlay alias:在 `argus-web-proxy` 内通过 `getent hosts` 与 `curl` 检查 `master.argus.com`、`grafana.metric.argus.com` 等域名连通性; - 写出 `cluster-info.env`(含 `SWARM_JOIN_TOKEN_{WORKER,MANAGER}/SWARM_MANAGER_ADDR`;compose 架构下不再依赖 BINDIP/FTPIP); - 生成 `安装报告_YYYYMMDD-HHMMSS.md`(端口、健康检查摘要与提示)。 @@ -86,7 +86,7 @@ export SWARM_MANAGER_ADDR=<本机管理IP> ## 五、健康自检与常用操作 - 健康自检:`./scripts/selfcheck.sh` - 期望输出:`selfcheck OK -> logs/selfcheck.json` - - 文件 `logs/selfcheck.json` 中 `overlay_net/es/kibana/master_readyz/ftp_share_writable/prometheus/grafana/alertmanager/web_proxy_cors` 为 true。 + - 文件 `logs/selfcheck.json` 中 `overlay_net/es/kibana/master_readyz/prometheus/grafana/alertmanager/web_proxy_cors` 为 true。 - 状态:`./scripts/status.sh`(相当于 `docker compose ps`)。 - 诊断:`./scripts/diagnose.sh`(收集容器/HTTP/CORS/ES 细节,输出到 `logs/diagnose_*.log`)。 - 卸载:`./scripts/uninstall.sh`(Compose down)。 @@ -97,6 +97,6 @@ export SWARM_MANAGER_ADDR=<本机管理IP> - 对方在 Client 机器的包根放置该文件(或设置 `CLUSTER_INFO=/绝对路径`)即可。 ## 七、故障排查快览 -- Proxy 502 或 8080 连接复位:多因 Bind 域名未更新到 overlay IP;重跑 `install.sh`(会写入私有域名文件并 reload)或查看 `logs/diagnose_error.log`。 +- Proxy 502 或 8080 连接复位:通常是 overlay alias 未生效或 web-proxy 尚未解析到其它服务;重跑 `install.sh`(会重启栈并在容器内校验 DNS),或查看 `logs/diagnose_error.log`。 - Kibana 不 available:等待 1–2 分钟、查看 `argus-kibana-sys` 日志; - cluster-info.env 的 SWARM_MANAGER_ADDR 为空:重新 `export SWARM_MANAGER_ADDR=; ./scripts/config.sh` 或 `./scripts/install.sh`(会回读 `.env` 补写)。 diff --git a/deployment_new/templates/server/scripts/config.sh b/deployment_new/templates/server/scripts/config.sh index 8ff27dc..324070f 100644 --- a/deployment_new/templates/server/scripts/config.sh +++ b/deployment_new/templates/server/scripts/config.sh @@ -70,9 +70,6 @@ done info "已写入 compose/.env 的端口配置" # 覆盖/补充 Overlay 名称 grep -q '^ARGUS_OVERLAY_NET=' "$ENV_OUT" || echo 'ARGUS_OVERLAY_NET=argus-sys-net' >> "$ENV_OUT" -# FTP 默认 -grep -q '^FTP_USER=' "$ENV_OUT" || echo 'FTP_USER=ftpuser' >> "$ENV_OUT" -grep -q '^FTP_PASSWORD=' "$ENV_OUT" || echo 'FTP_PASSWORD=NASPlab1234!' >> "$ENV_OUT" # 以当前执行账户 UID/GID 写入(避免误选 docker 组) RUID=$(id -u) PRIMARY_GID=$(id -g) diff --git a/deployment_new/templates/server/scripts/diagnose.sh b/deployment_new/templates/server/scripts/diagnose.sh index 7eb77aa..954d4dd 100644 --- a/deployment_new/templates/server/scripts/diagnose.sh +++ b/deployment_new/templates/server/scripts/diagnose.sh @@ -40,11 +40,9 @@ svc() { fi } -svc bind argus-bind-sys svc master argus-master-sys svc es argus-es-sys svc kibana argus-kibana-sys -svc ftp argus-ftp svc prometheus argus-prometheus svc grafana argus-grafana svc alertmanager argus-alertmanager @@ -84,9 +82,6 @@ logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc \"cur logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health\" 2>/dev/null || echo 000)" logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status\" 2>/dev/null || echo 000)" -section FTP-SHARE -docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true - section SYSTEM logd "uname -a:"; uname -a >> "$DETAILS" logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true diff --git a/deployment_new/templates/server/scripts/selfcheck.sh b/deployment_new/templates/server/scripts/selfcheck.sh index 96d9ce5..5ca041e 100644 --- a/deployment_new/templates/server/scripts/selfcheck.sh +++ b/deployment_new/templates/server/scripts/selfcheck.sh @@ -40,11 +40,6 @@ fi log "checking Master" [[ $(code_for "http://localhost:${MASTER_PORT:-32300}/readyz") == 200 ]] || ok=0 -log "checking FTP" -if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then - docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share' >/dev/null 2>&1 || ok=0 -else ok=0; fi - log "checking Prometheus" wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0 @@ -69,7 +64,6 @@ cat > "$tmp" <