[#49] 移除 bind/ftp,改用 Docker 自带域名解析,使容器重启后 IP 变化时仍可正常访问;swarm test 已通过
This commit is contained in:
parent
5b617f62a8
commit
1d38304936
@ -4,7 +4,7 @@
|
||||
|
||||
## 先决条件
|
||||
- Docker Engine 已启用 Swarm(若尚未启用,脚本会自动以单机模式执行 `swarm init`)。
|
||||
- 已构建并加载以下镜像:`argus-bind9:latest`、`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。
|
||||
- 已构建并加载以下镜像:`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。
|
||||
- 本地 `UID/GID` 建议通过 `configs/build_user.local.conf` 指定,脚本会读取:
|
||||
- `UID=1000` 与 `GID=1000`(各占一行,示例)。
|
||||
|
||||
@ -24,7 +24,7 @@ cp .env.example .env
|
||||
|
||||
bash scripts/00_bootstrap.sh
|
||||
bash scripts/01_server_up.sh
|
||||
bash scripts/02_wait_ready.sh # 输出 BINDIP/FTPIP 到 .env.nodes
|
||||
bash scripts/02_wait_ready.sh # 写 MASTER_ENDPOINT/AGENT_* 到 .env.nodes
|
||||
bash scripts/03_nodes_up.sh
|
||||
bash scripts/04_metric_verify.sh
|
||||
```
|
||||
@ -38,7 +38,7 @@ bash scripts/99_down.sh
|
||||
## 说明与注意事项
|
||||
- `00_bootstrap.sh`:先加载 `scripts/common/build_user.sh`,打印并写入 `.env` 中的 `ARGUS_BUILD_UID/GID`,再准备 `private-server/` 与 `private-nodes/` 目录,并 `chown` 到对应 UID/GID。
|
||||
- `01_server_up.sh`:启动服务端 compose。可用 `SWARM_FIX_PERMS=1` 打开“容器内 chmod + supervisor 重启”的兜底逻辑,默认关闭。
|
||||
- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后解析 overlay IP,写入 `.env.nodes` 的 `BINDIP/FTPIP`,供节点 compose 使用。
|
||||
- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后写入 `.env.nodes` 的 `MASTER_ENDPOINT/AGENT_*`,供节点 compose 使用(DNS 由 Docker 自带服务负责,不再依赖 BINDIP/FTPIP)。
|
||||
- `03_nodes_up.sh`:启动单节点容器(bundle 版)。容器内 `node-bootstrap.sh` 优先本地安装,成功后执行健康检查并等待 `/private/argus/agent/<hostname>/node.json` 出现。
|
||||
- `04_metric_verify.sh`:在本套件内执行详细校验(不再直接调用 tests 脚本):
|
||||
- Grafana `/api/health`(database=ok)
|
||||
|
||||
@ -16,10 +16,6 @@ services:
|
||||
- TZ=Asia/Shanghai
|
||||
- DEBIAN_FRONTEND=noninteractive
|
||||
- MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
|
||||
- FTPIP=${FTPIP}
|
||||
- BINDIP=${BINDIP}
|
||||
- FTP_USER=${FTP_USER:-ftpuser}
|
||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- AGENT_ENV=${AGENT_ENV:-dev2}
|
||||
@ -28,9 +24,10 @@ services:
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
- GPU_MODE=gpu
|
||||
dns:
|
||||
- ${BINDIP}
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- ${AGENT_INSTANCE}.node.argus.com
|
||||
volumes:
|
||||
- ./private-gpu-nodes/argus/agent:/private/argus/agent
|
||||
command: ["sleep", "infinity"]
|
||||
|
||||
@ -16,19 +16,16 @@ services:
|
||||
- MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
|
||||
- ES_HOST=es.log.argus.com
|
||||
- ES_PORT=9200
|
||||
- FTPIP=${FTPIP}
|
||||
- BINDIP=${BINDIP}
|
||||
- FTP_USER=${FTP_USER:-ftpuser}
|
||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- AGENT_ENV=${AGENT_ENV:-dev2}
|
||||
- AGENT_USER=${AGENT_USER:-yuyr}
|
||||
- AGENT_INSTANCE=${AGENT_INSTANCE:-node001sX}
|
||||
- CLIENT_VERSION=${CLIENT_VERSION:-}
|
||||
dns:
|
||||
- ${BINDIP}
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- ${AGENT_INSTANCE}.node.argus.com
|
||||
volumes:
|
||||
- ./private-nodes/argus/agent:/private/argus/agent
|
||||
command: ["sleep", "infinity"]
|
||||
|
||||
@ -5,18 +5,10 @@ networks:
|
||||
external: true
|
||||
|
||||
services:
|
||||
bind:
|
||||
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
|
||||
container_name: argus-bind-sys
|
||||
networks: [argus-sys-net]
|
||||
volumes:
|
||||
- ./private-server:/private
|
||||
restart: unless-stopped
|
||||
|
||||
master:
|
||||
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
|
||||
container_name: argus-master-sys
|
||||
depends_on: [bind]
|
||||
depends_on: []
|
||||
environment:
|
||||
- OFFLINE_THRESHOLD_SECONDS=6
|
||||
- ONLINE_THRESHOLD_SECONDS=2
|
||||
@ -29,7 +21,10 @@ services:
|
||||
- ./private-server/argus/master:/private/argus/master
|
||||
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- master.argus.com
|
||||
restart: unless-stopped
|
||||
|
||||
es:
|
||||
@ -47,7 +42,10 @@ services:
|
||||
ports:
|
||||
- "${ES_HTTP_PORT:-9200}:9200"
|
||||
restart: unless-stopped
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- es.log.argus.com
|
||||
|
||||
kibana:
|
||||
image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest}
|
||||
@ -63,27 +61,10 @@ services:
|
||||
ports:
|
||||
- "${KIBANA_PORT:-5601}:5601"
|
||||
restart: unless-stopped
|
||||
networks: [argus-sys-net]
|
||||
|
||||
ftp:
|
||||
image: ${FTP_IMAGE_TAG:-argus-metric-ftp:latest}
|
||||
container_name: argus-ftp
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- FTP_BASE_PATH=/private/argus/ftp
|
||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||
- DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "${FTP_PORT:-21}:21"
|
||||
- "${FTP_DATA_PORT:-20}:20"
|
||||
- "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
|
||||
volumes:
|
||||
- ./private-server/argus/metric/ftp:/private/argus/ftp
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- kibana.log.argus.com
|
||||
|
||||
prometheus:
|
||||
image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:latest}
|
||||
@ -99,7 +80,10 @@ services:
|
||||
volumes:
|
||||
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- prom.metric.argus.com
|
||||
|
||||
grafana:
|
||||
image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:latest}
|
||||
@ -122,7 +106,10 @@ services:
|
||||
- ./private-server/argus/metric/grafana:/private/argus/metric/grafana
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
depends_on: [prometheus]
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- grafana.metric.argus.com
|
||||
|
||||
alertmanager:
|
||||
image: ${ALERT_IMAGE_TAG:-argus-alertmanager:latest}
|
||||
@ -133,7 +120,10 @@ services:
|
||||
volumes:
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
- ./private-server/argus/alert/alertmanager:/private/argus/alert/alertmanager
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- alertmanager.alert.argus.com
|
||||
ports:
|
||||
- "${ALERTMANAGER_PORT:-9093}:9093"
|
||||
restart: unless-stopped
|
||||
@ -151,19 +141,25 @@ services:
|
||||
- EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
|
||||
volumes:
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- web.argus.com
|
||||
restart: unless-stopped
|
||||
|
||||
web-proxy:
|
||||
image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:latest}
|
||||
container_name: argus-web-proxy
|
||||
depends_on: [bind, master, grafana, prometheus, kibana, alertmanager]
|
||||
depends_on: [master, grafana, prometheus, kibana, alertmanager]
|
||||
environment:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
networks:
|
||||
argus-sys-net:
|
||||
aliases:
|
||||
- proxy.argus.com
|
||||
ports:
|
||||
- "${WEB_PROXY_PORT_8080:-8080}:8080"
|
||||
- "${WEB_PROXY_PORT_8081:-8081}:8081"
|
||||
|
||||
@ -42,7 +42,6 @@ echo "[BOOT] preparing private directories (server/nodes)"
|
||||
# Server-side dirs (align with sys/tests 01_bootstrap.sh)
|
||||
mkdir -p \
|
||||
"$ROOT/private-server/argus/etc" \
|
||||
"$ROOT/private-server/argus/bind" \
|
||||
"$ROOT/private-server/argus/master" \
|
||||
"$ROOT/private-server/argus/metric/prometheus" \
|
||||
"$ROOT/private-server/argus/metric/prometheus/data" \
|
||||
@ -72,11 +71,9 @@ chown -R "$uid":"$gid" \
|
||||
"$ROOT/private-server/argus/metric/grafana" \
|
||||
"$ROOT/private-server/argus/metric/prometheus" \
|
||||
"$ROOT/private-server/argus/alert" \
|
||||
"$ROOT/private-server/argus/metric/ftp" \
|
||||
"$ROOT/private-server/argus/agent" \
|
||||
"$ROOT/private-server/argus/etc" 2>/dev/null || true
|
||||
|
||||
# group-writable for etc/alert as in sys/tests
|
||||
chmod -R g+w "$ROOT/private-server/argus/alert" "$ROOT/private-server/argus/etc" 2>/dev/null || true
|
||||
|
||||
# ensure .env carries the resolved UID/GID for compose env interpolation
|
||||
@ -91,11 +88,4 @@ else
|
||||
echo "ARGUS_BUILD_GID=${gid}" >> "$ENV_FILE"
|
||||
fi
|
||||
|
||||
# distribute update-dns.sh
|
||||
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
|
||||
BIND_UPDATE_DEST="$ROOT/private-server/argus/etc/update-dns.sh"
|
||||
if [[ -f "$BIND_UPDATE_SRC" ]]; then
|
||||
cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" && chmod +x "$BIND_UPDATE_DEST" || true
|
||||
fi
|
||||
|
||||
echo "[BOOT] done"
|
||||
|
||||
@ -36,49 +36,12 @@ done
|
||||
|
||||
if [[ $ok -lt 4 ]]; then echo "[ERROR] services not ready" >&2; exit 1; fi
|
||||
|
||||
echo "[READY] resolving overlay IPs"
|
||||
BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)
|
||||
FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)
|
||||
echo "BINDIP=$BINDIP FTPIP=$FTPIP"
|
||||
|
||||
ENV_NODES="$ROOT/.env.nodes"
|
||||
cat > "$ENV_NODES" <<EOF
|
||||
BINDIP=$BINDIP
|
||||
FTPIP=$FTPIP
|
||||
MASTER_ENDPOINT=http://master.argus.com:3000
|
||||
FTP_USER=ftpuser
|
||||
FTP_PASSWORD=ZGClab1234!
|
||||
AGENT_ENV=dev2
|
||||
AGENT_USER=yuyr
|
||||
AGENT_INSTANCE=node001sX
|
||||
EOF
|
||||
|
||||
echo "[READY] wrote $ENV_NODES"
|
||||
|
||||
# Inline: fix domain records -> actual overlay IPs and reload bind/nginx (best-effort)
|
||||
echo "[READY] fixing domain records to overlay IPs"
|
||||
ETC_DIR="$ROOT/private-server/argus/etc"; mkdir -p "$ETC_DIR"
|
||||
declare -A MAP
|
||||
MAP[web-frontend]=web.argus.com
|
||||
MAP[argus-grafana]=grafana.metric.argus.com
|
||||
MAP[argus-prometheus]=prom.metric.argus.com
|
||||
MAP[argus-kibana-sys]=kibana.log.argus.com
|
||||
MAP[argus-alertmanager]=alertmanager.alert.argus.com
|
||||
MAP[argus-master-sys]=master.argus.com
|
||||
changed=0
|
||||
for cname in "${!MAP[@]}"; do
|
||||
domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain"
|
||||
ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$cname" 2>/dev/null || true)
|
||||
[[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; }
|
||||
cur=$(cat "$fpath" 2>/dev/null || echo "")
|
||||
if [[ "$cur" != "$ip" ]]; then
|
||||
echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-<empty>})"; changed=1
|
||||
else
|
||||
echo "[DNS-FIX][OK] $domain already $ip"
|
||||
fi
|
||||
done
|
||||
if [[ $changed -eq 1 ]]; then
|
||||
docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || true
|
||||
sleep 1
|
||||
fi
|
||||
docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true
|
||||
echo "[READY] wrote $ENV_NODES (MASTER_ENDPOINT/AGENT_* only)"
|
||||
|
||||
@ -81,8 +81,8 @@ fi
|
||||
docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || fail "datasource not pointing to $PROM_DOMAIN"
|
||||
ok "datasource points to domain"
|
||||
|
||||
# ---- DNS resolution inside grafana ----
|
||||
info "bind resolution inside grafana"
|
||||
# ---- DNS resolution inside grafana (via Docker DNS + FQDN alias) ----
|
||||
info "FQDN resolution inside grafana (Docker DNS)"
|
||||
tries=0
|
||||
until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do
|
||||
tries=$((tries+1)); (( tries > 24 )) && fail "grafana cannot resolve prom.metric.argus.com"
|
||||
|
||||
@ -21,7 +21,6 @@ else
|
||||
docker run -d --rm \
|
||||
--name "$WARMUP_NAME" \
|
||||
--network "$NET_NAME" \
|
||||
${BINDIP:+--dns "$BINDIP"} \
|
||||
"$WARMUP_IMAGE" sleep "$WARMUP_SECONDS"
|
||||
rc=$?
|
||||
set -e
|
||||
@ -43,4 +42,3 @@ done
|
||||
|
||||
echo "[WARN] network still not inspectable locally after 60s, but warmup container is running. Compose may still pass; proceed to run GPU compose and retry if needed." >&2
|
||||
exit 0
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-14T16:20:36.702023128+08:00","lastScrapeDuration":0.001054193,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-14T16:20:34.338081675+08:00","lastScrapeDuration":0.019183536,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
||||
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.20:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.20","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.20","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.20:9400/metrics","globalUrl":"http://10.0.1.20:9400/metrics","lastError":"","lastScrape":"2025-11-18T15:02:15.071897295+08:00","lastScrapeDuration":0.001115439,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.20:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.20","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.20","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.20:9100/metrics","globalUrl":"http://10.0.1.20:9100/metrics","lastError":"","lastScrape":"2025-11-18T15:02:12.57609087+08:00","lastScrapeDuration":0.020143969,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
||||
Loading…
x
Reference in New Issue
Block a user