diff --git a/src/sys/swarm_tests/README.md b/src/sys/swarm_tests/README.md index abbec3e..55f1eb2 100644 --- a/src/sys/swarm_tests/README.md +++ b/src/sys/swarm_tests/README.md @@ -4,7 +4,7 @@ ## 先决条件 - Docker Engine 已启用 Swarm(脚本会自动 `swarm init` 单机模式)。 -- 已构建并加载以下镜像:`argus-bind9:latest`、`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。 +- 已构建并加载以下镜像:`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。 - 本地 `UID/GID` 建议通过 `configs/build_user.local.conf` 指定,脚本会读取: - `UID=1000`\n`GID=1000`(示例)。 @@ -24,7 +24,7 @@ cp .env.example .env bash scripts/00_bootstrap.sh bash scripts/01_server_up.sh -bash scripts/02_wait_ready.sh # 输出 BINDIP/FTPIP 到 .env.nodes +bash scripts/02_wait_ready.sh # 写 MASTER_ENDPOINT/AGENT_* 到 .env.nodes bash scripts/03_nodes_up.sh bash scripts/04_metric_verify.sh ``` @@ -38,7 +38,7 @@ bash scripts/99_down.sh ## 说明与注意事项 - `00_bootstrap.sh`:先加载 `scripts/common/build_user.sh`,打印并写入 `.env` 中的 `ARGUS_BUILD_UID/GID`,再准备 `private-server/` 与 `private-nodes/` 目录,并 `chown` 到对应 UID/GID。 - `01_server_up.sh`:启动服务端 compose。可用 `SWARM_FIX_PERMS=1` 打开“容器内 chmod + supervisor 重启”的兜底逻辑,默认关闭。 -- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后解析 overlay IP,写入 `.env.nodes` 的 `BINDIP/FTPIP`,供节点 compose 使用。 +- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后写入 `.env.nodes` 的 `MASTER_ENDPOINT/AGENT_*`,供节点 compose 使用(DNS 由 Docker 自带服务负责,不再依赖 BINDIP/FTPIP)。 - `03_nodes_up.sh`:启动单节点容器(bundle 版)。容器内 `node-bootstrap.sh` 优先本地安装,成功后执行健康检查并等待 `/private/argus/agent//node.json` 出现。 - `04_metric_verify.sh`:在本套件内执行详细校验(不再直接调用 tests 脚本): - Grafana `/api/health`(database=ok) diff --git a/src/sys/swarm_tests/docker-compose.gpu-node.yml b/src/sys/swarm_tests/docker-compose.gpu-node.yml index e6dd051..0076538 100644 --- a/src/sys/swarm_tests/docker-compose.gpu-node.yml +++ b/src/sys/swarm_tests/docker-compose.gpu-node.yml @@ -16,10 +16,6 @@ services: - TZ=Asia/Shanghai - DEBIAN_FRONTEND=noninteractive - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000} - - FTPIP=${FTPIP} - - BINDIP=${BINDIP} - - FTP_USER=${FTP_USER:-ftpuser} - - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} - AGENT_ENV=${AGENT_ENV:-dev2} @@ -28,9 +24,10 @@ services: - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=compute,utility - GPU_MODE=gpu - dns: - - ${BINDIP} - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - ${AGENT_INSTANCE}.node.argus.com volumes: - ./private-gpu-nodes/argus/agent:/private/argus/agent command: ["sleep", "infinity"] diff --git a/src/sys/swarm_tests/docker-compose.nodes.yml b/src/sys/swarm_tests/docker-compose.nodes.yml index 6c42cc2..7baee4c 100644 --- a/src/sys/swarm_tests/docker-compose.nodes.yml +++ b/src/sys/swarm_tests/docker-compose.nodes.yml @@ -16,19 +16,16 @@ services: - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000} - ES_HOST=es.log.argus.com - ES_PORT=9200 - - FTPIP=${FTPIP} - - BINDIP=${BINDIP} - - FTP_USER=${FTP_USER:-ftpuser} - - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} - AGENT_ENV=${AGENT_ENV:-dev2} - AGENT_USER=${AGENT_USER:-yuyr} - AGENT_INSTANCE=${AGENT_INSTANCE:-node001sX} - CLIENT_VERSION=${CLIENT_VERSION:-} - dns: - - ${BINDIP} - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - ${AGENT_INSTANCE}.node.argus.com volumes: - ./private-nodes/argus/agent:/private/argus/agent command: ["sleep", "infinity"] diff --git a/src/sys/swarm_tests/docker-compose.server.yml b/src/sys/swarm_tests/docker-compose.server.yml index a05b070..ccf9cca 100644 --- a/src/sys/swarm_tests/docker-compose.server.yml +++ b/src/sys/swarm_tests/docker-compose.server.yml @@ -5,18 +5,10 @@ networks: external: true services: - bind: - image: ${BIND_IMAGE_TAG:-argus-bind9:latest} - container_name: argus-bind-sys - networks: [argus-sys-net] - volumes: - - ./private-server:/private - restart: unless-stopped - master: image: ${MASTER_IMAGE_TAG:-argus-master:latest} container_name: argus-master-sys - depends_on: [bind] + depends_on: [] environment: - OFFLINE_THRESHOLD_SECONDS=6 - ONLINE_THRESHOLD_SECONDS=2 @@ -29,7 +21,10 @@ services: - ./private-server/argus/master:/private/argus/master - ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus - ./private-server/argus/etc:/private/argus/etc - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - master.argus.com restart: unless-stopped es: @@ -47,7 +42,10 @@ services: ports: - "${ES_HTTP_PORT:-9200}:9200" restart: unless-stopped - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - es.log.argus.com kibana: image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest} @@ -63,27 +61,10 @@ services: ports: - "${KIBANA_PORT:-5601}:5601" restart: unless-stopped - networks: [argus-sys-net] - - ftp: - image: ${FTP_IMAGE_TAG:-argus-metric-ftp:latest} - container_name: argus-ftp - restart: unless-stopped - environment: - - TZ=Asia/Shanghai - - FTP_BASE_PATH=/private/argus/ftp - - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} - - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} - - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} - - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} - ports: - - "${FTP_PORT:-21}:21" - - "${FTP_DATA_PORT:-20}:20" - - "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110" - volumes: - - ./private-server/argus/metric/ftp:/private/argus/ftp - - ./private-server/argus/etc:/private/argus/etc - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - kibana.log.argus.com prometheus: image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:latest} @@ -99,7 +80,10 @@ services: volumes: - ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus - ./private-server/argus/etc:/private/argus/etc - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - prom.metric.argus.com grafana: image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:latest} @@ -122,7 +106,10 @@ services: - ./private-server/argus/metric/grafana:/private/argus/metric/grafana - ./private-server/argus/etc:/private/argus/etc depends_on: [prometheus] - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - grafana.metric.argus.com alertmanager: image: ${ALERT_IMAGE_TAG:-argus-alertmanager:latest} @@ -133,7 +120,10 @@ services: volumes: - ./private-server/argus/etc:/private/argus/etc - ./private-server/argus/alert/alertmanager:/private/argus/alert/alertmanager - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - alertmanager.alert.argus.com ports: - "${ALERTMANAGER_PORT:-9093}:9093" restart: unless-stopped @@ -151,19 +141,25 @@ services: - EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083} volumes: - ./private-server/argus/etc:/private/argus/etc - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - web.argus.com restart: unless-stopped web-proxy: image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:latest} container_name: argus-web-proxy - depends_on: [bind, master, grafana, prometheus, kibana, alertmanager] + depends_on: [master, grafana, prometheus, kibana, alertmanager] environment: - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} volumes: - ./private-server/argus/etc:/private/argus/etc - networks: [argus-sys-net] + networks: + argus-sys-net: + aliases: + - proxy.argus.com ports: - "${WEB_PROXY_PORT_8080:-8080}:8080" - "${WEB_PROXY_PORT_8081:-8081}:8081" diff --git a/src/sys/swarm_tests/scripts/00_bootstrap.sh b/src/sys/swarm_tests/scripts/00_bootstrap.sh index 27c4462..0d37975 100755 --- a/src/sys/swarm_tests/scripts/00_bootstrap.sh +++ b/src/sys/swarm_tests/scripts/00_bootstrap.sh @@ -42,7 +42,6 @@ echo "[BOOT] preparing private directories (server/nodes)" # Server-side dirs (align with sys/tests 01_bootstrap.sh) mkdir -p \ "$ROOT/private-server/argus/etc" \ - "$ROOT/private-server/argus/bind" \ "$ROOT/private-server/argus/master" \ "$ROOT/private-server/argus/metric/prometheus" \ "$ROOT/private-server/argus/metric/prometheus/data" \ @@ -72,11 +71,9 @@ chown -R "$uid":"$gid" \ "$ROOT/private-server/argus/metric/grafana" \ "$ROOT/private-server/argus/metric/prometheus" \ "$ROOT/private-server/argus/alert" \ - "$ROOT/private-server/argus/metric/ftp" \ "$ROOT/private-server/argus/agent" \ "$ROOT/private-server/argus/etc" 2>/dev/null || true -# group-writable for etc/alert as in sys/tests chmod -R g+w "$ROOT/private-server/argus/alert" "$ROOT/private-server/argus/etc" 2>/dev/null || true # ensure .env carries the resolved UID/GID for compose env interpolation @@ -91,11 +88,4 @@ else echo "ARGUS_BUILD_GID=${gid}" >> "$ENV_FILE" fi -# distribute update-dns.sh -BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh" -BIND_UPDATE_DEST="$ROOT/private-server/argus/etc/update-dns.sh" -if [[ -f "$BIND_UPDATE_SRC" ]]; then - cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" && chmod +x "$BIND_UPDATE_DEST" || true -fi - echo "[BOOT] done" diff --git a/src/sys/swarm_tests/scripts/02_wait_ready.sh b/src/sys/swarm_tests/scripts/02_wait_ready.sh index 7ab0685..3906f28 100755 --- a/src/sys/swarm_tests/scripts/02_wait_ready.sh +++ b/src/sys/swarm_tests/scripts/02_wait_ready.sh @@ -36,49 +36,12 @@ done if [[ $ok -lt 4 ]]; then echo "[ERROR] services not ready" >&2; exit 1; fi -echo "[READY] resolving overlay IPs" -BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys) -FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp) -echo "BINDIP=$BINDIP FTPIP=$FTPIP" - ENV_NODES="$ROOT/.env.nodes" cat > "$ENV_NODES" < actual overlay IPs and reload bind/nginx (best-effort) -echo "[READY] fixing domain records to overlay IPs" -ETC_DIR="$ROOT/private-server/argus/etc"; mkdir -p "$ETC_DIR" -declare -A MAP -MAP[web-frontend]=web.argus.com -MAP[argus-grafana]=grafana.metric.argus.com -MAP[argus-prometheus]=prom.metric.argus.com -MAP[argus-kibana-sys]=kibana.log.argus.com -MAP[argus-alertmanager]=alertmanager.alert.argus.com -MAP[argus-master-sys]=master.argus.com -changed=0 -for cname in "${!MAP[@]}"; do - domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain" - ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$cname" 2>/dev/null || true) - [[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; } - cur=$(cat "$fpath" 2>/dev/null || echo "") - if [[ "$cur" != "$ip" ]]; then - echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-})"; changed=1 - else - echo "[DNS-FIX][OK] $domain already $ip" - fi -done -if [[ $changed -eq 1 ]]; then - docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || true - sleep 1 -fi -docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true +echo "[READY] wrote $ENV_NODES (MASTER_ENDPOINT/AGENT_* only)" diff --git a/src/sys/swarm_tests/scripts/04_metric_verify.sh b/src/sys/swarm_tests/scripts/04_metric_verify.sh index 2bc4ac6..3b01cc7 100755 --- a/src/sys/swarm_tests/scripts/04_metric_verify.sh +++ b/src/sys/swarm_tests/scripts/04_metric_verify.sh @@ -81,8 +81,8 @@ fi docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || fail "datasource not pointing to $PROM_DOMAIN" ok "datasource points to domain" -# ---- DNS resolution inside grafana ---- -info "bind resolution inside grafana" +# ---- DNS resolution inside grafana (via Docker DNS + FQDN alias) ---- +info "FQDN resolution inside grafana (Docker DNS)" tries=0 until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do tries=$((tries+1)); (( tries > 24 )) && fail "grafana cannot resolve prom.metric.argus.com" diff --git a/src/sys/swarm_tests/scripts/05a_net_warmup.sh b/src/sys/swarm_tests/scripts/05a_net_warmup.sh index 06754b7..46bb509 100755 --- a/src/sys/swarm_tests/scripts/05a_net_warmup.sh +++ b/src/sys/swarm_tests/scripts/05a_net_warmup.sh @@ -21,7 +21,6 @@ else docker run -d --rm \ --name "$WARMUP_NAME" \ --network "$NET_NAME" \ - ${BINDIP:+--dns "$BINDIP"} \ "$WARMUP_IMAGE" sleep "$WARMUP_SECONDS" rc=$? set -e @@ -43,4 +42,3 @@ done echo "[WARN] network still not inspectable locally after 60s, but warmup container is running. Compose may still pass; proceed to run GPU compose and retry if needed." >&2 exit 0 - diff --git a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json index 3ca7fca..4911196 100644 --- a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json +++ b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json @@ -1 +1 @@ -{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-14T16:20:36.702023128+08:00","lastScrapeDuration":0.001054193,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-14T16:20:34.338081675+08:00","lastScrapeDuration":0.019183536,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file +{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.20:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.20","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.20","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.20:9400/metrics","globalUrl":"http://10.0.1.20:9400/metrics","lastError":"","lastScrape":"2025-11-18T15:02:15.071897295+08:00","lastScrapeDuration":0.001115439,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.20:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.20","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.20","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.20:9100/metrics","globalUrl":"http://10.0.1.20:9100/metrics","lastError":"","lastScrape":"2025-11-18T15:02:12.57609087+08:00","lastScrapeDuration":0.020143969,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file