diff --git a/src/sys/swarm_tests/scripts/04_metric_verify.sh b/src/sys/swarm_tests/scripts/04_metric_verify.sh index 3b01cc7..fd92c04 100755 --- a/src/sys/swarm_tests/scripts/04_metric_verify.sh +++ b/src/sys/swarm_tests/scripts/04_metric_verify.sh @@ -10,6 +10,7 @@ PROM_PORT="${PROMETHEUS_PORT:-9090}" GRAF_PORT="${GRAFANA_PORT:-3000}" GRAF_URL="http://127.0.0.1:${GRAF_PORT}" PROM_DOMAIN="prom.metric.argus.com:${PROM_PORT}" +NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}" err() { echo "[ERR] $*" >&2; } ok() { echo "[OK] $*"; } @@ -151,8 +152,23 @@ send_logs() { docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" } -NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}" ensure_fluentbit "$NODE_CONT" +# ensure fluent-bit process is really up before sending logs, +# to avoid dropping lines when tail starts after we write test logs +FLUENT_WAIT_RETRIES="${FLUENT_WAIT_RETRIES:-120}" +FLUENT_WAIT_SLEEP="${FLUENT_WAIT_SLEEP:-2}" +fluent_ok=0 +for i in $(seq 1 "$FLUENT_WAIT_RETRIES"); do + if docker exec "$NODE_CONT" pgrep -x fluent-bit >/dev/null 2>&1; then + fluent_ok=1 + break + fi + echo "[..] waiting fluent-bit process up in node ($i/$FLUENT_WAIT_RETRIES)" + sleep "$FLUENT_WAIT_SLEEP" +done +if [[ "$fluent_ok" -ne 1 ]]; then + fail "fluent-bit not running in node after waiting $((FLUENT_WAIT_RETRIES * FLUENT_WAIT_SLEEP))s" +fi send_logs "$NODE_CONT" "swarm-node" info "waiting for ES to ingest..." @@ -181,3 +197,72 @@ if ! curl -fs "http://127.0.0.1:${KIBANA_PORT}/api/status" >/dev/null 2>&1; then fi ok "log pipeline verified" + +# ---- Node status and health (node.json + metric-*) ---- +info "Node status and health (node.json + metric components)" + +NODE_HEALTH_RETRIES="${NODE_HEALTH_RETRIES:-5}" +NODE_HEALTH_SLEEP="${NODE_HEALTH_SLEEP:-5}" + +if ! command -v jq >/dev/null 2>&1; then + fail "node health: jq not available on host; cannot parse node.json" +fi + +node_health_ok=0 +for attempt in $(seq 1 "$NODE_HEALTH_RETRIES"); do + tmp_node_json="$(mktemp)" + if ! docker exec "$NODE_CONT" sh -lc ' + set -e + host="$(hostname)" + f="/private/argus/agent/${host}/node.json" + if [ ! -s "$f" ]; then + echo "[ERR] node.json missing or empty: $f" >&2 + exit 1 + fi + cat "$f" + ' > "$tmp_node_json" 2>/dev/null; then + rm -f "$tmp_node_json" + info "node health: node.json not ready (attempt $attempt/$NODE_HEALTH_RETRIES)" + else + node_name="$(jq -r '.name // ""' "$tmp_node_json")" + node_status="$(jq -r '.status // ""' "$tmp_node_json")" + node_type="$(jq -r '.type // ""' "$tmp_node_json")" + + if [[ -z "$node_name" || -z "$node_status" || -z "$node_type" ]]; then + info "node health: missing required fields in node.json (attempt $attempt/$NODE_HEALTH_RETRIES)" + elif [[ "$node_status" != "online" || "$node_type" != "agent" ]]; then + info "node health: status/type not ready yet (status=$node_status type=$node_type name=$node_name attempt $attempt/$NODE_HEALTH_RETRIES)" + else + all_ok=1 + for comp in metric-argus-agent metric-node-exporter metric-dcgm-exporter metric-fluent-bit; do + cstatus="$(jq -r --arg c "$comp" '.health[$c].status // ""' "$tmp_node_json")" + cerror="$(jq -r --arg c "$comp" '.health[$c].error // ""' "$tmp_node_json")" + if [[ "$cstatus" != "healthy" ]]; then + info "node health: $comp status=$cstatus (attempt $attempt/$NODE_HEALTH_RETRIES)" + all_ok=0 + break + fi + if [[ -n "$cerror" && "$cerror" != "null" ]]; then + info "node health: $comp error=$cerror (attempt $attempt/$NODE_HEALTH_RETRIES)" + all_ok=0 + break + fi + done + if [[ "$all_ok" -eq 1 ]]; then + node_health_ok=1 + rm -f "$tmp_node_json" + break + fi + fi + rm -f "$tmp_node_json" + fi + if [[ "$attempt" -lt "$NODE_HEALTH_RETRIES" ]]; then + sleep "$NODE_HEALTH_SLEEP" + fi +done + +if [[ "$node_health_ok" -ne 1 ]]; then + fail "node health: node.json or metric components not healthy after ${NODE_HEALTH_RETRIES} attempts" +fi + +ok "node status online and metric components healthy" diff --git a/src/sys/swarm_tests/scripts/04_restart_node_and_verify.sh b/src/sys/swarm_tests/scripts/04_restart_node_and_verify.sh new file mode 100755 index 0000000..38699f0 --- /dev/null +++ b/src/sys/swarm_tests/scripts/04_restart_node_and_verify.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a +ENV_NODES_FILE="$ROOT/.env.nodes"; set -a; source "$ENV_NODES_FILE"; set +a + +PROJECT="${NODES_PROJECT:-argus-swarm-nodes}" +COMPOSE_FILE="$ROOT/docker-compose.nodes.yml" +NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}" + +echo "[RESTART] restarting node compose project: $PROJECT" +docker compose -p "$PROJECT" -f "$COMPOSE_FILE" restart + +echo "[RESTART] waiting node container up: $NODE_CONT" +for i in {1..30}; do + state=$(docker ps --format '{{.Names}} {{.Status}}' | awk -v c="$NODE_CONT" '$1==c{print $2}' || true) + if [[ "$state" == Up* ]]; then + echo "[RESTART] node container is up" + break + fi + echo "[..] waiting node container up ($i/30)" + sleep 2 +done + +NODE_HEALTH_WAIT="${NODE_HEALTH_WAIT:-300}" +attempts=$(( NODE_HEALTH_WAIT / 30 )) +(( attempts < 1 )) && attempts=1 + +echo "[RESTART] waiting node health to recover (timeout=${NODE_HEALTH_WAIT}s)" +ok_flag=0 +for i in $(seq 1 "$attempts"); do + if bash "$SCRIPT_DIR/04_metric_verify.sh"; then + echo "[RESTART] node restart verify passed on attempt $i/$attempts" + ok_flag=1 + break + fi + echo "[..] 04_metric_verify failed after node restart; retrying ($i/$attempts)" + sleep 30 +done + +if [[ "$ok_flag" -ne 1 ]]; then + echo "[ERR] node restart: 04_metric_verify did not pass within ${NODE_HEALTH_WAIT}s" >&2 + exit 1 +fi + diff --git a/src/sys/swarm_tests/scripts/04_restart_server_and_verify.sh b/src/sys/swarm_tests/scripts/04_restart_server_and_verify.sh new file mode 100755 index 0000000..597ebbd --- /dev/null +++ b/src/sys/swarm_tests/scripts/04_restart_server_and_verify.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a + +PROJECT="${SERVER_PROJECT:-argus-swarm-server}" +COMPOSE_FILE="$ROOT/docker-compose.server.yml" + +echo "[RESTART] restarting server compose project: $PROJECT" +docker compose -p "$PROJECT" -f "$COMPOSE_FILE" restart + +echo "[RESTART] waiting server ready after restart" +bash "$SCRIPT_DIR/02_wait_ready.sh" + +echo "[RESTART] running 04_metric_verify after server restart" +bash "$SCRIPT_DIR/04_metric_verify.sh" + +echo "[RESTART] server restart + verify passed" + diff --git a/src/sys/swarm_tests/scripts/10_e2e_swarm_restart_verify.sh b/src/sys/swarm_tests/scripts/10_e2e_swarm_restart_verify.sh new file mode 100755 index 0000000..46d18ec --- /dev/null +++ b/src/sys/swarm_tests/scripts/10_e2e_swarm_restart_verify.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +echo "[E2E] starting full swarm_tests E2E (cleanup -> 00-04 -> restart server/node -> keep env)" + +if [[ "${E2E_SKIP_CLEAN:-0}" != "1" ]]; then + echo "[E2E] cleaning previous environment via 99_down.sh" + bash "$SCRIPT_DIR/99_down.sh" || true +else + echo "[E2E] skipping cleanup (E2E_SKIP_CLEAN=1)" +fi + +echo "[E2E] running 00_bootstrap" +bash "$SCRIPT_DIR/00_bootstrap.sh" + +echo "[E2E] running 01_server_up" +bash "$SCRIPT_DIR/01_server_up.sh" + +echo "[E2E] running 02_wait_ready" +bash "$SCRIPT_DIR/02_wait_ready.sh" + +echo "[E2E] running 03_nodes_up" +bash "$SCRIPT_DIR/03_nodes_up.sh" + +echo "[E2E] baseline 04_metric_verify" +bash "$SCRIPT_DIR/04_metric_verify.sh" + +if [[ "${E2E_SKIP_SERVER_RESTART:-0}" != "1" ]]; then + echo "[E2E] server restart + verify" + bash "$SCRIPT_DIR/04_restart_server_and_verify.sh" +else + echo "[E2E] skipping server restart (E2E_SKIP_SERVER_RESTART=1)" +fi + +if [[ "${E2E_SKIP_NODE_RESTART:-0}" != "1" ]]; then + echo "[E2E] node restart + verify" + bash "$SCRIPT_DIR/04_restart_node_and_verify.sh" +else + echo "[E2E] skipping node restart (E2E_SKIP_NODE_RESTART=1)" +fi + +echo "[E2E] done; environment kept for inspection" + diff --git a/src/sys/swarm_tests/scripts/99_down.sh b/src/sys/swarm_tests/scripts/99_down.sh index 28e96e2..60f760d 100755 --- a/src/sys/swarm_tests/scripts/99_down.sh +++ b/src/sys/swarm_tests/scripts/99_down.sh @@ -14,9 +14,6 @@ docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compo echo "[DOWN] removing warmup container (if any)" docker rm -f argus-net-warmup >/dev/null 2>&1 || true -echo "[DOWN] removing overlay network" -docker network rm argus-sys-net >/dev/null 2>&1 || true - echo "[DOWN] cleanup temp files" rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true diff --git a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json index 79b5937..b176d28 100644 --- a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json +++ b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json @@ -1 +1 @@ -{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.12:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.12","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.12","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.12:9400/metrics","globalUrl":"http://10.0.1.12:9400/metrics","lastError":"","lastScrape":"2025-11-19T17:22:07.119337307+08:00","lastScrapeDuration":0.001359079,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.12:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.12","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.12","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.12:9100/metrics","globalUrl":"http://10.0.1.12:9100/metrics","lastError":"","lastScrape":"2025-11-19T17:22:13.427955955+08:00","lastScrapeDuration":0.020847396,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file +{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.86:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.86","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.86","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.86:9400/metrics","globalUrl":"http://10.0.1.86:9400/metrics","lastError":"","lastScrape":"2025-11-20T14:45:34.652147179+08:00","lastScrapeDuration":0.002046883,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.86:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.86","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.86","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.86:9100/metrics","globalUrl":"http://10.0.1.86:9100/metrics","lastError":"","lastScrape":"2025-11-20T14:45:33.675131411+08:00","lastScrapeDuration":0.023311933,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file