[#49] 优化swarm test支持自动reboot和verify
This commit is contained in:
parent
d4e0dc1511
commit
2caf0fa214
@ -10,6 +10,7 @@ PROM_PORT="${PROMETHEUS_PORT:-9090}"
|
||||
GRAF_PORT="${GRAFANA_PORT:-3000}"
|
||||
GRAF_URL="http://127.0.0.1:${GRAF_PORT}"
|
||||
PROM_DOMAIN="prom.metric.argus.com:${PROM_PORT}"
|
||||
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
|
||||
|
||||
# err: write a diagnostic message to stderr with the [ERR] prefix.
err() { printf '%s\n' "[ERR] $*" >&2; }

# ok: report a passed check on stdout with the [OK] prefix.
ok() { printf '%s\n' "[OK] $*"; }
|
||||
@ -151,8 +152,23 @@ send_logs() {
|
||||
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
||||
}
|
||||
|
||||
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
ensure_fluentbit "$NODE_CONT"

# Block until the fluent-bit process is actually running inside the node
# container before we emit test log lines; if tail starts after the lines
# are written, they would be silently dropped.
FLUENT_WAIT_RETRIES="${FLUENT_WAIT_RETRIES:-120}"
FLUENT_WAIT_SLEEP="${FLUENT_WAIT_SLEEP:-2}"
fluent_ok=0
for (( i = 1; i <= FLUENT_WAIT_RETRIES; i++ )); do
  if docker exec "$NODE_CONT" pgrep -x fluent-bit >/dev/null 2>&1; then
    fluent_ok=1
    break
  fi
  echo "[..] waiting fluent-bit process up in node ($i/$FLUENT_WAIT_RETRIES)"
  sleep "$FLUENT_WAIT_SLEEP"
done
if (( fluent_ok != 1 )); then
  fail "fluent-bit not running in node after waiting $((FLUENT_WAIT_RETRIES * FLUENT_WAIT_SLEEP))s"
fi
send_logs "$NODE_CONT" "swarm-node"
|
||||
|
||||
info "waiting for ES to ingest..."
|
||||
@ -181,3 +197,72 @@ if ! curl -fs "http://127.0.0.1:${KIBANA_PORT}/api/status" >/dev/null 2>&1; then
|
||||
fi
|
||||
|
||||
ok "log pipeline verified"
|
||||
|
||||
# ---- Node status and health (node.json + metric-*) ----
# Poll node.json inside the node container until the agent reports
# status=online/type=agent and every metric component is healthy, or the
# retry budget is exhausted.
info "Node status and health (node.json + metric components)"

NODE_HEALTH_RETRIES="${NODE_HEALTH_RETRIES:-5}"
NODE_HEALTH_SLEEP="${NODE_HEALTH_SLEEP:-5}"

if ! command -v jq >/dev/null 2>&1; then
  fail "node health: jq not available on host; cannot parse node.json"
fi

# Allocate the scratch file once and reuse it across attempts ('>' truncates
# it each time). Allocating a fresh mktemp per attempt required removing it
# on several distinct paths and leaked the file whenever an attempt aborted
# the script early.
tmp_node_json="$(mktemp)"
node_health_ok=0
for attempt in $(seq 1 "$NODE_HEALTH_RETRIES"); do
  if ! docker exec "$NODE_CONT" sh -lc '
    set -e
    host="$(hostname)"
    f="/private/argus/agent/${host}/node.json"
    if [ ! -s "$f" ]; then
      echo "[ERR] node.json missing or empty: $f" >&2
      exit 1
    fi
    cat "$f"
  ' > "$tmp_node_json" 2>/dev/null; then
    info "node health: node.json not ready (attempt $attempt/$NODE_HEALTH_RETRIES)"
  else
    node_name="$(jq -r '.name // ""' "$tmp_node_json")"
    node_status="$(jq -r '.status // ""' "$tmp_node_json")"
    node_type="$(jq -r '.type // ""' "$tmp_node_json")"

    if [[ -z "$node_name" || -z "$node_status" || -z "$node_type" ]]; then
      info "node health: missing required fields in node.json (attempt $attempt/$NODE_HEALTH_RETRIES)"
    elif [[ "$node_status" != "online" || "$node_type" != "agent" ]]; then
      info "node health: status/type not ready yet (status=$node_status type=$node_type name=$node_name attempt $attempt/$NODE_HEALTH_RETRIES)"
    else
      # Node-level fields look good; now require every metric component to
      # report healthy with no error before declaring success.
      all_ok=1
      for comp in metric-argus-agent metric-node-exporter metric-dcgm-exporter metric-fluent-bit; do
        cstatus="$(jq -r --arg c "$comp" '.health[$c].status // ""' "$tmp_node_json")"
        cerror="$(jq -r --arg c "$comp" '.health[$c].error // ""' "$tmp_node_json")"
        if [[ "$cstatus" != "healthy" ]]; then
          info "node health: $comp status=$cstatus (attempt $attempt/$NODE_HEALTH_RETRIES)"
          all_ok=0
          break
        fi
        if [[ -n "$cerror" && "$cerror" != "null" ]]; then
          info "node health: $comp error=$cerror (attempt $attempt/$NODE_HEALTH_RETRIES)"
          all_ok=0
          break
        fi
      done
      if [[ "$all_ok" -eq 1 ]]; then
        node_health_ok=1
        break
      fi
    fi
  fi
  # Only sleep when another attempt remains.
  if [[ "$attempt" -lt "$NODE_HEALTH_RETRIES" ]]; then
    sleep "$NODE_HEALTH_SLEEP"
  fi
done
rm -f "$tmp_node_json"

if [[ "$node_health_ok" -ne 1 ]]; then
  fail "node health: node.json or metric components not healthy after ${NODE_HEALTH_RETRIES} attempts"
fi

ok "node status online and metric components healthy"
|
||||
|
||||
48
src/sys/swarm_tests/scripts/04_restart_node_and_verify.sh
Executable file
48
src/sys/swarm_tests/scripts/04_restart_node_and_verify.sh
Executable file
@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
# Restart the swarm-test node compose project, wait for the node container
# to come back up, then re-run 04_metric_verify.sh until it passes or the
# timeout budget is exhausted.
#
# Env:
#   NODES_PROJECT     compose project name    (default: argus-swarm-nodes)
#   SWARM_NODE_CNAME  node container name     (default: argus-metric-test-node-swarm)
#   NODE_HEALTH_WAIT  verify timeout, seconds (default: 300)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
ENV_NODES_FILE="$ROOT/.env.nodes"; set -a; source "$ENV_NODES_FILE"; set +a

PROJECT="${NODES_PROJECT:-argus-swarm-nodes}"
COMPOSE_FILE="$ROOT/docker-compose.nodes.yml"
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"

echo "[RESTART] restarting node compose project: $PROJECT"
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" restart

echo "[RESTART] waiting node container up: $NODE_CONT"
up_ok=0
for i in {1..30}; do
  state=$(docker ps --format '{{.Names}} {{.Status}}' | awk -v c="$NODE_CONT" '$1==c{print $2}' || true)
  if [[ "$state" == Up* ]]; then
    echo "[RESTART] node container is up"
    up_ok=1
    break
  fi
  echo "[..] waiting node container up ($i/30)"
  sleep 2
done
# Fail fast if the container never reached Up; verifying against a dead
# container would only burn the whole NODE_HEALTH_WAIT budget.
if [[ "$up_ok" -ne 1 ]]; then
  echo "[ERR] node restart: container $NODE_CONT did not reach Up state within 60s" >&2
  exit 1
fi

NODE_HEALTH_WAIT="${NODE_HEALTH_WAIT:-300}"
VERIFY_RETRY_SLEEP=30  # seconds between verify attempts; also sizes attempt count
attempts=$(( NODE_HEALTH_WAIT / VERIFY_RETRY_SLEEP ))
(( attempts < 1 )) && attempts=1

echo "[RESTART] waiting node health to recover (timeout=${NODE_HEALTH_WAIT}s)"
ok_flag=0
for i in $(seq 1 "$attempts"); do
  if bash "$SCRIPT_DIR/04_metric_verify.sh"; then
    echo "[RESTART] node restart verify passed on attempt $i/$attempts"
    ok_flag=1
    break
  fi
  echo "[..] 04_metric_verify failed after node restart; retrying ($i/$attempts)"
  # Skip the sleep after the final attempt; there is nothing left to wait for.
  if (( i < attempts )); then
    sleep "$VERIFY_RETRY_SLEEP"
  fi
done

if [[ "$ok_flag" -ne 1 ]]; then
  echo "[ERR] node restart: 04_metric_verify did not pass within ${NODE_HEALTH_WAIT}s" >&2
  exit 1
fi
|
||||
|
||||
22
src/sys/swarm_tests/scripts/04_restart_server_and_verify.sh
Executable file
22
src/sys/swarm_tests/scripts/04_restart_server_and_verify.sh
Executable file
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bash
# Restart the swarm-test server compose project, wait until the server is
# ready again, then re-run the metric verification script.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Progress line with this script's log prefix.
log() { echo "[RESTART] $*"; }

# Export every variable defined in the server env file for child scripts.
ENV_FILE="$ROOT/.env"
set -a
source "$ENV_FILE"
set +a

PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
COMPOSE_FILE="$ROOT/docker-compose.server.yml"

log "restarting server compose project: $PROJECT"
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" restart

log "waiting server ready after restart"
bash "$SCRIPT_DIR/02_wait_ready.sh"

log "running 04_metric_verify after server restart"
bash "$SCRIPT_DIR/04_metric_verify.sh"

log "server restart + verify passed"
|
||||
|
||||
46
src/sys/swarm_tests/scripts/10_e2e_swarm_restart_verify.sh
Executable file
46
src/sys/swarm_tests/scripts/10_e2e_swarm_restart_verify.sh
Executable file
@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env bash
# End-to-end swarm_tests driver: optional cleanup, bring up server and nodes,
# baseline metric verification, then restart server and node with re-verify.
# The environment is intentionally left running for inspection afterwards.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# step <message> <script>: log the E2E-prefixed message, then run the
# sibling script; any failure aborts the whole run via set -e.
step() {
  echo "[E2E] $1"
  bash "$SCRIPT_DIR/$2"
}

echo "[E2E] starting full swarm_tests E2E (cleanup -> 00-04 -> restart server/node -> keep env)"

if [[ "${E2E_SKIP_CLEAN:-0}" != "1" ]]; then
  echo "[E2E] cleaning previous environment via 99_down.sh"
  bash "$SCRIPT_DIR/99_down.sh" || true
else
  echo "[E2E] skipping cleanup (E2E_SKIP_CLEAN=1)"
fi

step "running 00_bootstrap" "00_bootstrap.sh"
step "running 01_server_up" "01_server_up.sh"
step "running 02_wait_ready" "02_wait_ready.sh"
step "running 03_nodes_up" "03_nodes_up.sh"
step "baseline 04_metric_verify" "04_metric_verify.sh"

if [[ "${E2E_SKIP_SERVER_RESTART:-0}" != "1" ]]; then
  step "server restart + verify" "04_restart_server_and_verify.sh"
else
  echo "[E2E] skipping server restart (E2E_SKIP_SERVER_RESTART=1)"
fi

if [[ "${E2E_SKIP_NODE_RESTART:-0}" != "1" ]]; then
  step "node restart + verify" "04_restart_node_and_verify.sh"
else
  echo "[E2E] skipping node restart (E2E_SKIP_NODE_RESTART=1)"
fi

echo "[E2E] done; environment kept for inspection"
|
||||
|
||||
@ -14,9 +14,6 @@ docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compo
|
||||
echo "[DOWN] removing warmup container (if any)"
|
||||
docker rm -f argus-net-warmup >/dev/null 2>&1 || true
|
||||
|
||||
echo "[DOWN] removing overlay network"
|
||||
docker network rm argus-sys-net >/dev/null 2>&1 || true
|
||||
|
||||
echo "[DOWN] cleanup temp files"
|
||||
rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.12:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.12","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.12","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.12:9400/metrics","globalUrl":"http://10.0.1.12:9400/metrics","lastError":"","lastScrape":"2025-11-19T17:22:07.119337307+08:00","lastScrapeDuration":0.001359079,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.12:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.12","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.12","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.12:9100/metrics","globalUrl":"http://10.0.1.12:9100/metrics","lastError":"","lastScrape":"2025-11-19T17:22:13.427955955+08:00","lastScrapeDuration":0.020847396,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
||||
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.86:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.86","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.86","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.86:9400/metrics","globalUrl":"http://10.0.1.86:9400/metrics","lastError":"","lastScrape":"2025-11-20T14:45:34.652147179+08:00","lastScrapeDuration":0.002046883,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.86:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.86","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.86","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.86:9100/metrics","globalUrl":"http://10.0.1.86:9100/metrics","lastError":"","lastScrape":"2025-11-20T14:45:33.675131411+08:00","lastScrapeDuration":0.023311933,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
||||
Loading…
x
Reference in New Issue
Block a user