完成H20服务器部署及重启测试 #51

Merged
yuyr merged 27 commits from dev_1.1.0_yuyr_nobind into dev_1.0.0 2025-11-25 15:54:30 +08:00
9 changed files with 49 additions and 108 deletions
Showing only changes of commit 1d38304936 - Show all commits

View File

@ -4,7 +4,7 @@
## 先决条件
- Docker Engine 已启用 Swarm(脚本会自动 `swarm init` 单机模式)。
- 已构建并加载以下镜像:`argus-bind9:latest`、`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。
- 已构建并加载以下镜像:`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。
- 本地 `UID/GID` 建议通过 `configs/build_user.local.conf` 指定,脚本会读取:
- `UID=1000`\n`GID=1000`(示例)。
@ -24,7 +24,7 @@ cp .env.example .env
bash scripts/00_bootstrap.sh
bash scripts/01_server_up.sh
bash scripts/02_wait_ready.sh # 输出 BINDIP/FTPIP 到 .env.nodes
bash scripts/02_wait_ready.sh # 写 MASTER_ENDPOINT/AGENT_* 到 .env.nodes
bash scripts/03_nodes_up.sh
bash scripts/04_metric_verify.sh
```
@ -38,7 +38,7 @@ bash scripts/99_down.sh
## 说明与注意事项
- `00_bootstrap.sh`:先加载 `scripts/common/build_user.sh`,打印并写入 `.env` 中的 `ARGUS_BUILD_UID/GID`,再准备 `private-server/`、`private-nodes/` 目录,并 `chown` 到对应 UID/GID。
- `01_server_up.sh`:启动服务端 compose。可用 `SWARM_FIX_PERMS=1` 打开“容器内 chmod + supervisor 重启”的兜底逻辑,默认关闭。
- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后解析 overlay IP,写入 `.env.nodes` 的 `BINDIP/FTPIP`,供节点 compose 使用
- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后写入 `.env.nodes` 的 `MASTER_ENDPOINT/AGENT_*`,供节点 compose 使用;DNS 由 Docker 自带服务负责,不再依赖 BINDIP/FTPIP
- `03_nodes_up.sh`:启动单节点容器(bundle 版)。容器内 `node-bootstrap.sh` 优先本地安装,成功后执行健康检查并等待 `/private/argus/agent/<hostname>/node.json` 出现。
- `04_metric_verify.sh`:在本套件内执行详细校验(不再直接调用 tests 脚本):
- Grafana `/api/health`database=ok

View File

@ -16,10 +16,6 @@ services:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
- FTPIP=${FTPIP}
- BINDIP=${BINDIP}
- FTP_USER=${FTP_USER:-ftpuser}
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- AGENT_ENV=${AGENT_ENV:-dev2}
@ -28,9 +24,10 @@ services:
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
- GPU_MODE=gpu
dns:
- ${BINDIP}
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- ${AGENT_INSTANCE}.node.argus.com
volumes:
- ./private-gpu-nodes/argus/agent:/private/argus/agent
command: ["sleep", "infinity"]

View File

@ -16,19 +16,16 @@ services:
- MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
- ES_HOST=es.log.argus.com
- ES_PORT=9200
- FTPIP=${FTPIP}
- BINDIP=${BINDIP}
- FTP_USER=${FTP_USER:-ftpuser}
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- AGENT_ENV=${AGENT_ENV:-dev2}
- AGENT_USER=${AGENT_USER:-yuyr}
- AGENT_INSTANCE=${AGENT_INSTANCE:-node001sX}
- CLIENT_VERSION=${CLIENT_VERSION:-}
dns:
- ${BINDIP}
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- ${AGENT_INSTANCE}.node.argus.com
volumes:
- ./private-nodes/argus/agent:/private/argus/agent
command: ["sleep", "infinity"]

View File

@ -5,18 +5,10 @@ networks:
external: true
services:
bind:
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
container_name: argus-bind-sys
networks: [argus-sys-net]
volumes:
- ./private-server:/private
restart: unless-stopped
master:
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
container_name: argus-master-sys
depends_on: [bind]
depends_on: []
environment:
- OFFLINE_THRESHOLD_SECONDS=6
- ONLINE_THRESHOLD_SECONDS=2
@ -29,7 +21,10 @@ services:
- ./private-server/argus/master:/private/argus/master
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- master.argus.com
restart: unless-stopped
es:
@ -47,7 +42,10 @@ services:
ports:
- "${ES_HTTP_PORT:-9200}:9200"
restart: unless-stopped
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- es.log.argus.com
kibana:
image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest}
@ -63,27 +61,10 @@ services:
ports:
- "${KIBANA_PORT:-5601}:5601"
restart: unless-stopped
networks: [argus-sys-net]
ftp:
image: ${FTP_IMAGE_TAG:-argus-metric-ftp:latest}
container_name: argus-ftp
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- FTP_BASE_PATH=/private/argus/ftp
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${FTP_PORT:-21}:21"
- "${FTP_DATA_PORT:-20}:20"
- "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
volumes:
- ./private-server/argus/metric/ftp:/private/argus/ftp
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- kibana.log.argus.com
prometheus:
image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:latest}
@ -99,7 +80,10 @@ services:
volumes:
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- prom.metric.argus.com
grafana:
image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:latest}
@ -122,7 +106,10 @@ services:
- ./private-server/argus/metric/grafana:/private/argus/metric/grafana
- ./private-server/argus/etc:/private/argus/etc
depends_on: [prometheus]
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- grafana.metric.argus.com
alertmanager:
image: ${ALERT_IMAGE_TAG:-argus-alertmanager:latest}
@ -133,7 +120,10 @@ services:
volumes:
- ./private-server/argus/etc:/private/argus/etc
- ./private-server/argus/alert/alertmanager:/private/argus/alert/alertmanager
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- alertmanager.alert.argus.com
ports:
- "${ALERTMANAGER_PORT:-9093}:9093"
restart: unless-stopped
@ -151,19 +141,25 @@ services:
- EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
volumes:
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- web.argus.com
restart: unless-stopped
web-proxy:
image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:latest}
container_name: argus-web-proxy
depends_on: [bind, master, grafana, prometheus, kibana, alertmanager]
depends_on: [master, grafana, prometheus, kibana, alertmanager]
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
networks:
argus-sys-net:
aliases:
- proxy.argus.com
ports:
- "${WEB_PROXY_PORT_8080:-8080}:8080"
- "${WEB_PROXY_PORT_8081:-8081}:8081"

View File

@ -42,7 +42,6 @@ echo "[BOOT] preparing private directories (server/nodes)"
# Server-side dirs (align with sys/tests 01_bootstrap.sh)
mkdir -p \
"$ROOT/private-server/argus/etc" \
"$ROOT/private-server/argus/bind" \
"$ROOT/private-server/argus/master" \
"$ROOT/private-server/argus/metric/prometheus" \
"$ROOT/private-server/argus/metric/prometheus/data" \
@ -72,11 +71,9 @@ chown -R "$uid":"$gid" \
"$ROOT/private-server/argus/metric/grafana" \
"$ROOT/private-server/argus/metric/prometheus" \
"$ROOT/private-server/argus/alert" \
"$ROOT/private-server/argus/metric/ftp" \
"$ROOT/private-server/argus/agent" \
"$ROOT/private-server/argus/etc" 2>/dev/null || true
# group-writable for etc/alert as in sys/tests
chmod -R g+w "$ROOT/private-server/argus/alert" "$ROOT/private-server/argus/etc" 2>/dev/null || true
# ensure .env carries the resolved UID/GID for compose env interpolation
@ -91,11 +88,4 @@ else
echo "ARGUS_BUILD_GID=${gid}" >> "$ENV_FILE"
fi
# distribute update-dns.sh
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
BIND_UPDATE_DEST="$ROOT/private-server/argus/etc/update-dns.sh"
if [[ -f "$BIND_UPDATE_SRC" ]]; then
cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" && chmod +x "$BIND_UPDATE_DEST" || true
fi
echo "[BOOT] done"

View File

@ -36,49 +36,12 @@ done
if [[ $ok -lt 4 ]]; then echo "[ERROR] services not ready" >&2; exit 1; fi
echo "[READY] resolving overlay IPs"
BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)
FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)
echo "BINDIP=$BINDIP FTPIP=$FTPIP"
ENV_NODES="$ROOT/.env.nodes"
cat > "$ENV_NODES" <<EOF
BINDIP=$BINDIP
FTPIP=$FTPIP
MASTER_ENDPOINT=http://master.argus.com:3000
FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234!
AGENT_ENV=dev2
AGENT_USER=yuyr
AGENT_INSTANCE=node001sX
EOF
echo "[READY] wrote $ENV_NODES"
# Inline: fix domain records -> actual overlay IPs and reload bind/nginx (best-effort)
echo "[READY] fixing domain records to overlay IPs"
ETC_DIR="$ROOT/private-server/argus/etc"; mkdir -p "$ETC_DIR"
declare -A MAP
MAP[web-frontend]=web.argus.com
MAP[argus-grafana]=grafana.metric.argus.com
MAP[argus-prometheus]=prom.metric.argus.com
MAP[argus-kibana-sys]=kibana.log.argus.com
MAP[argus-alertmanager]=alertmanager.alert.argus.com
MAP[argus-master-sys]=master.argus.com
changed=0
for cname in "${!MAP[@]}"; do
domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain"
ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$cname" 2>/dev/null || true)
[[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; }
cur=$(cat "$fpath" 2>/dev/null || echo "")
if [[ "$cur" != "$ip" ]]; then
echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-<empty>})"; changed=1
else
echo "[DNS-FIX][OK] $domain already $ip"
fi
done
if [[ $changed -eq 1 ]]; then
docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || true
sleep 1
fi
docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true
echo "[READY] wrote $ENV_NODES (MASTER_ENDPOINT/AGENT_* only)"

View File

@ -81,8 +81,8 @@ fi
docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || fail "datasource not pointing to $PROM_DOMAIN"
ok "datasource points to domain"
# ---- DNS resolution inside grafana ----
info "bind resolution inside grafana"
# ---- DNS resolution inside grafana (via Docker DNS + FQDN alias) ----
info "FQDN resolution inside grafana (Docker DNS)"
tries=0
until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do
tries=$((tries+1)); (( tries > 24 )) && fail "grafana cannot resolve prom.metric.argus.com"

View File

@ -21,7 +21,6 @@ else
docker run -d --rm \
--name "$WARMUP_NAME" \
--network "$NET_NAME" \
${BINDIP:+--dns "$BINDIP"} \
"$WARMUP_IMAGE" sleep "$WARMUP_SECONDS"
rc=$?
set -e
@ -43,4 +42,3 @@ done
echo "[WARN] network still not inspectable locally after 60s, but warmup container is running. Compose may still pass; proceed to run GPU compose and retry if needed." >&2
exit 0

View File

@ -1 +1 @@
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-14T16:20:36.702023128+08:00","lastScrapeDuration":0.001054193,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-14T16:20:34.338081675+08:00","lastScrapeDuration":0.019183536,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.20:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.20","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.20","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.20:9400/metrics","globalUrl":"http://10.0.1.20:9400/metrics","lastError":"","lastScrape":"2025-11-18T15:02:15.071897295+08:00","lastScrapeDuration":0.001115439,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.20:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.20","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.20","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.20:9100/metrics","globalUrl":"http://10.0.1.20:9100/metrics","lastError":"","lastScrape":"2025-11-18T15:02:12.57609087+08:00","lastScrapeDuration":0.020143969,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}