[#49] Improve the swarm test to support automatic reboot and verify

yuyr 2025-11-20 15:21:18 +08:00
parent d4e0dc1511
commit 2caf0fa214
6 changed files with 203 additions and 5 deletions

View File

@@ -10,6 +10,7 @@ PROM_PORT="${PROMETHEUS_PORT:-9090}"
GRAF_PORT="${GRAFANA_PORT:-3000}"
GRAF_URL="http://127.0.0.1:${GRAF_PORT}"
PROM_DOMAIN="prom.metric.argus.com:${PROM_PORT}"
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
err() { echo "[ERR] $*" >&2; }
ok() { echo "[OK] $*"; }
@@ -151,8 +152,23 @@ send_logs() {
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
} }
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
ensure_fluentbit "$NODE_CONT"
# ensure fluent-bit process is really up before sending logs,
# to avoid dropping lines when tail starts after we write test logs
FLUENT_WAIT_RETRIES="${FLUENT_WAIT_RETRIES:-120}"
FLUENT_WAIT_SLEEP="${FLUENT_WAIT_SLEEP:-2}"
fluent_ok=0
for i in $(seq 1 "$FLUENT_WAIT_RETRIES"); do
if docker exec "$NODE_CONT" pgrep -x fluent-bit >/dev/null 2>&1; then
fluent_ok=1
break
fi
echo "[..] waiting fluent-bit process up in node ($i/$FLUENT_WAIT_RETRIES)"
sleep "$FLUENT_WAIT_SLEEP"
done
if [[ "$fluent_ok" -ne 1 ]]; then
fail "fluent-bit not running in node after waiting $((FLUENT_WAIT_RETRIES * FLUENT_WAIT_SLEEP))s"
fi
send_logs "$NODE_CONT" "swarm-node"
info "waiting for ES to ingest..."
@@ -181,3 +197,72 @@ if ! curl -fs "http://127.0.0.1:${KIBANA_PORT}/api/status" >/dev/null 2>&1; then
fi
ok "log pipeline verified"
# ---- Node status and health (node.json + metric-*) ----
info "Node status and health (node.json + metric components)"
NODE_HEALTH_RETRIES="${NODE_HEALTH_RETRIES:-5}"
NODE_HEALTH_SLEEP="${NODE_HEALTH_SLEEP:-5}"
if ! command -v jq >/dev/null 2>&1; then
fail "node health: jq not available on host; cannot parse node.json"
fi
node_health_ok=0
for attempt in $(seq 1 "$NODE_HEALTH_RETRIES"); do
tmp_node_json="$(mktemp)"
if ! docker exec "$NODE_CONT" sh -lc '
set -e
host="$(hostname)"
f="/private/argus/agent/${host}/node.json"
if [ ! -s "$f" ]; then
echo "[ERR] node.json missing or empty: $f" >&2
exit 1
fi
cat "$f"
' > "$tmp_node_json" 2>/dev/null; then
rm -f "$tmp_node_json"
info "node health: node.json not ready (attempt $attempt/$NODE_HEALTH_RETRIES)"
else
node_name="$(jq -r '.name // ""' "$tmp_node_json")"
node_status="$(jq -r '.status // ""' "$tmp_node_json")"
node_type="$(jq -r '.type // ""' "$tmp_node_json")"
if [[ -z "$node_name" || -z "$node_status" || -z "$node_type" ]]; then
info "node health: missing required fields in node.json (attempt $attempt/$NODE_HEALTH_RETRIES)"
elif [[ "$node_status" != "online" || "$node_type" != "agent" ]]; then
info "node health: status/type not ready yet (status=$node_status type=$node_type name=$node_name attempt $attempt/$NODE_HEALTH_RETRIES)"
else
all_ok=1
for comp in metric-argus-agent metric-node-exporter metric-dcgm-exporter metric-fluent-bit; do
cstatus="$(jq -r --arg c "$comp" '.health[$c].status // ""' "$tmp_node_json")"
cerror="$(jq -r --arg c "$comp" '.health[$c].error // ""' "$tmp_node_json")"
if [[ "$cstatus" != "healthy" ]]; then
info "node health: $comp status=$cstatus (attempt $attempt/$NODE_HEALTH_RETRIES)"
all_ok=0
break
fi
if [[ -n "$cerror" && "$cerror" != "null" ]]; then
info "node health: $comp error=$cerror (attempt $attempt/$NODE_HEALTH_RETRIES)"
all_ok=0
break
fi
done
if [[ "$all_ok" -eq 1 ]]; then
node_health_ok=1
rm -f "$tmp_node_json"
break
fi
fi
rm -f "$tmp_node_json"
fi
if [[ "$attempt" -lt "$NODE_HEALTH_RETRIES" ]]; then
sleep "$NODE_HEALTH_SLEEP"
fi
done
if [[ "$node_health_ok" -ne 1 ]]; then
fail "node health: node.json or metric components not healthy after ${NODE_HEALTH_RETRIES} attempts"
fi
ok "node status online and metric components healthy"

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
ENV_NODES_FILE="$ROOT/.env.nodes"; set -a; source "$ENV_NODES_FILE"; set +a
PROJECT="${NODES_PROJECT:-argus-swarm-nodes}"
COMPOSE_FILE="$ROOT/docker-compose.nodes.yml"
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
echo "[RESTART] restarting node compose project: $PROJECT"
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" restart
echo "[RESTART] waiting node container up: $NODE_CONT"
for i in {1..30}; do
state=$(docker ps --format '{{.Names}} {{.Status}}' | awk -v c="$NODE_CONT" '$1==c{print $2}' || true)
if [[ "$state" == Up* ]]; then
echo "[RESTART] node container is up"
break
fi
echo "[..] waiting node container up ($i/30)"
sleep 2
done
NODE_HEALTH_WAIT="${NODE_HEALTH_WAIT:-300}"
attempts=$(( NODE_HEALTH_WAIT / 30 ))
(( attempts < 1 )) && attempts=1
echo "[RESTART] waiting node health to recover (timeout=${NODE_HEALTH_WAIT}s)"
ok_flag=0
for i in $(seq 1 "$attempts"); do
if bash "$SCRIPT_DIR/04_metric_verify.sh"; then
echo "[RESTART] node restart verify passed on attempt $i/$attempts"
ok_flag=1
break
fi
echo "[..] 04_metric_verify failed after node restart; retrying ($i/$attempts)"
sleep 30
done
if [[ "$ok_flag" -ne 1 ]]; then
echo "[ERR] node restart: 04_metric_verify did not pass within ${NODE_HEALTH_WAIT}s" >&2
exit 1
fi
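The script can also be run on its own when debugging a node that recovers slowly; a minimal invocation sketch, assuming the current directory is the swarm_tests scripts directory and using the NODE_HEALTH_WAIT override read above:
# Give the node up to 10 minutes to recover before declaring failure (value illustrative).
NODE_HEALTH_WAIT=600 bash ./04_restart_node_and_verify.sh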

View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
COMPOSE_FILE="$ROOT/docker-compose.server.yml"
echo "[RESTART] restarting server compose project: $PROJECT"
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" restart
echo "[RESTART] waiting server ready after restart"
bash "$SCRIPT_DIR/02_wait_ready.sh"
echo "[RESTART] running 04_metric_verify after server restart"
bash "$SCRIPT_DIR/04_metric_verify.sh"
echo "[RESTART] server restart + verify passed"

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
echo "[E2E] starting full swarm_tests E2E (cleanup -> 00-04 -> restart server/node -> keep env)"
if [[ "${E2E_SKIP_CLEAN:-0}" != "1" ]]; then
echo "[E2E] cleaning previous environment via 99_down.sh"
bash "$SCRIPT_DIR/99_down.sh" || true
else
echo "[E2E] skipping cleanup (E2E_SKIP_CLEAN=1)"
fi
echo "[E2E] running 00_bootstrap"
bash "$SCRIPT_DIR/00_bootstrap.sh"
echo "[E2E] running 01_server_up"
bash "$SCRIPT_DIR/01_server_up.sh"
echo "[E2E] running 02_wait_ready"
bash "$SCRIPT_DIR/02_wait_ready.sh"
echo "[E2E] running 03_nodes_up"
bash "$SCRIPT_DIR/03_nodes_up.sh"
echo "[E2E] baseline 04_metric_verify"
bash "$SCRIPT_DIR/04_metric_verify.sh"
if [[ "${E2E_SKIP_SERVER_RESTART:-0}" != "1" ]]; then
echo "[E2E] server restart + verify"
bash "$SCRIPT_DIR/04_restart_server_and_verify.sh"
else
echo "[E2E] skipping server restart (E2E_SKIP_SERVER_RESTART=1)"
fi
if [[ "${E2E_SKIP_NODE_RESTART:-0}" != "1" ]]; then
echo "[E2E] node restart + verify"
bash "$SCRIPT_DIR/04_restart_node_and_verify.sh"
else
echo "[E2E] skipping node restart (E2E_SKIP_NODE_RESTART=1)"
fi
echo "[E2E] done; environment kept for inspection"

View File

@@ -14,9 +14,6 @@ docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true
echo "[DOWN] removing warmup container (if any)" echo "[DOWN] removing warmup container (if any)"
docker rm -f argus-net-warmup >/dev/null 2>&1 || true docker rm -f argus-net-warmup >/dev/null 2>&1 || true
echo "[DOWN] removing overlay network"
docker network rm argus-sys-net >/dev/null 2>&1 || true
echo "[DOWN] cleanup temp files" echo "[DOWN] cleanup temp files"
rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true

View File

@@ -1 +1 @@
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.12:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.12","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.12","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.12:9400/metrics","globalUrl":"http://10.0.1.12:9400/metrics","lastError":"","lastScrape":"2025-11-19T17:22:07.119337307+08:00","lastScrapeDuration":0.001359079,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.12:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.12","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.12","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.12:9100/metrics","globalUrl":"http://10.0.1.12:9100/metrics","lastError":"","lastScrape":"2025-11-19T17:22:13.427955955+08:00","lastScrapeDuration":0.020847396,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} {"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.86:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.86","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.86","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.86:9400/metrics","globalUrl":"http://10.0.1.86:9400/metrics","lastError":"","lastScrape":"2025-11-20T14:45:34.652147179+08:00","lastScrapeDuration":0.002046883,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.86:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.86","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.86","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.86:9100/metrics","globalUrl":"http://10.0.1.86:9100/metrics","lastError":"","lastScrape":"2025-11-20T14:45:33.675131411+08:00","lastScrapeDuration":0.023311933,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}