From e423985d71aadec2f19b82d04693c7645a5f28e1 Mon Sep 17 00:00:00 2001 From: yuyr Date: Fri, 10 Oct 2025 16:36:00 +0800 Subject: [PATCH] =?UTF-8?q?[#13]=20=E5=A2=9E=E5=8A=A0=E7=B3=BB=E7=BB=9F?= =?UTF-8?q?=E9=9B=86=E6=88=90=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/log/tests/scripts/04_query_es.sh | 39 ++++- src/sys/tests/docker-compose.yml | 139 ++++++++++++++++++ src/sys/tests/scripts/00_e2e_test.sh | 26 ++++ src/sys/tests/scripts/01_bootstrap.sh | 77 ++++++++++ src/sys/tests/scripts/02_up.sh | 22 +++ src/sys/tests/scripts/03_wait_ready.sh | 75 ++++++++++ .../tests/scripts/04_verify_dns_routing.sh | 54 +++++++ src/sys/tests/scripts/05_agent_register.sh | 87 +++++++++++ .../scripts/06_write_health_and_assert.sh | 67 +++++++++ .../tests/scripts/07_logs_send_and_assert.sh | 63 ++++++++ .../scripts/08_restart_agent_reregister.sh | 95 ++++++++++++ src/sys/tests/scripts/09_down.sh | 38 +++++ src/sys/tests/scripts/node_entrypoint.sh | 57 +++++++ 13 files changed, 837 insertions(+), 2 deletions(-) create mode 100644 src/sys/tests/docker-compose.yml create mode 100755 src/sys/tests/scripts/00_e2e_test.sh create mode 100755 src/sys/tests/scripts/01_bootstrap.sh create mode 100755 src/sys/tests/scripts/02_up.sh create mode 100755 src/sys/tests/scripts/03_wait_ready.sh create mode 100755 src/sys/tests/scripts/04_verify_dns_routing.sh create mode 100755 src/sys/tests/scripts/05_agent_register.sh create mode 100755 src/sys/tests/scripts/06_write_health_and_assert.sh create mode 100755 src/sys/tests/scripts/07_logs_send_and_assert.sh create mode 100755 src/sys/tests/scripts/08_restart_agent_reregister.sh create mode 100755 src/sys/tests/scripts/09_down.sh create mode 100755 src/sys/tests/scripts/node_entrypoint.sh diff --git a/src/log/tests/scripts/04_query_es.sh b/src/log/tests/scripts/04_query_es.sh index 2cf427e..73c8bb7 100755 --- a/src/log/tests/scripts/04_query_es.sh +++ b/src/log/tests/scripts/04_query_es.sh @@ -1,7 +1,42 @@ #!/usr/bin/env bash set -euo pipefail + +# ES endpoint and wait strategy ES="${ES:-http://localhost:9200}" +es_wait_attempts="${ES_WAIT_ATTEMPTS:-60}" # total attempts to wait for ES +es_wait_interval="${ES_WAIT_INTERVAL:-2}" # seconds between attempts + echo "[i] 查询 ES 端点:$ES" + +wait_for_es() { + local attempt=1 + while (( attempt <= es_wait_attempts )); do + # 等待集群达到至少 yellow 状态;请求失败则重试 + if curl -fsS "$ES/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then + echo "[ok] Elasticsearch 已就绪 (attempt=${attempt}/${es_wait_attempts})" + return 0 + fi + echo "[..] 等待 Elasticsearch 可用中 (${attempt}/${es_wait_attempts})" + sleep "${es_wait_interval}" + (( attempt++ )) + done + echo "[err] Elasticsearch 在 ${es_wait_attempts} 次尝试后仍不可用" + return 1 +} + +safe_count() { + # 对缺失索引返回 0,避免 404 触发失败 + local pattern="$1" + local json + json=$(curl -fsS "$ES/${pattern}/_count?ignore_unavailable=true&allow_no_indices=true" 2>/dev/null || echo '{}') + echo "$json" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}' +} + +wait_for_es + +# 列出相关索引(可能为空,允许) curl -fsS "$ES/_cat/indices?v" | egrep 'train-|infer-|logstash' || true -printf "train-* 计数:"; curl -fsS "$ES/train-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo -printf "infer-* 计数:"; curl -fsS "$ES/infer-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo + +# 打印计数,缺失索引按 0 处理 +printf "train-* 计数:"; safe_count "train-*"; echo +printf "infer-* 计数:"; safe_count "infer-*"; echo diff --git a/src/sys/tests/docker-compose.yml b/src/sys/tests/docker-compose.yml new file mode 100644 index 0000000..0f688f7 --- /dev/null +++ b/src/sys/tests/docker-compose.yml @@ -0,0 +1,139 @@ +version: "3.8" + +networks: + default: + name: argus-sys-net + driver: bridge + ipam: + driver: default + config: + - subnet: 172.29.0.0/16 + +services: + bind: + image: ${BIND_IMAGE_TAG:-argus-bind9:latest} + container_name: argus-bind-sys + networks: + default: + ipv4_address: 172.29.0.2 + volumes: + - ./private:/private + restart: unless-stopped + + master: + image: ${MASTER_IMAGE_TAG:-argus-master:latest} + container_name: argus-master-sys + depends_on: + - bind + environment: + - OFFLINE_THRESHOLD_SECONDS=6 + - ONLINE_THRESHOLD_SECONDS=2 + - SCHEDULER_INTERVAL_SECONDS=1 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "32300:3000" + volumes: + - ./private/argus/master:/private/argus/master + - ./private/argus/metric/prometheus:/private/argus/metric/prometheus + - ./private/argus/etc:/private/argus/etc + networks: + default: + ipv4_address: 172.29.0.10 + restart: unless-stopped + + es: + image: argus-elasticsearch:latest + container_name: argus-es-sys + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private/argus/log/elasticsearch:/private/argus/log/elasticsearch + - ./private/argus/etc:/private/argus/etc + ports: + - "9200:9200" + restart: unless-stopped + + kibana: + image: argus-kibana:latest + container_name: argus-kibana-sys + environment: + - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private/argus/log/kibana:/private/argus/log/kibana + - ./private/argus/etc:/private/argus/etc + depends_on: + - es + ports: + - "5601:5601" + restart: unless-stopped + + node-a: + image: ubuntu:22.04 + container_name: argus-node-a + hostname: dev-yyrshare-nbnyx10-cp2f-pod-0 + depends_on: + - master + - bind + - es + environment: + - MASTER_ENDPOINT=http://master.argus.com:3000 + - REPORT_INTERVAL_SECONDS=2 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - ES_HOST=es + - ES_PORT=9200 + - CLUSTER=local + - RACK=dev + volumes: + - ./private-nodea/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0:/private/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0 + - ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro + - ./scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro + - ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro + - ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro + - ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro + entrypoint: + - /usr/local/bin/node-entrypoint.sh + dns: + - 172.29.0.2 + ports: + - "2020:2020" + restart: unless-stopped + + node-b: + image: ubuntu:22.04 + container_name: argus-node-b + hostname: dev-yyrshare-uuuu10-ep2f-pod-0 + depends_on: + - master + - bind + - es + environment: + - MASTER_ENDPOINT=http://master.argus.com:3000 + - REPORT_INTERVAL_SECONDS=2 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - ES_HOST=es + - ES_PORT=9200 + - CLUSTER=local + - RACK=dev + volumes: + - ./private-node2/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0:/private/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0 + - ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro + - ./scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro + - ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro + - ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro + - ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro + entrypoint: + - /usr/local/bin/node-entrypoint.sh + dns: + - 172.29.0.2 + ports: + - "2021:2020" + restart: unless-stopped diff --git a/src/sys/tests/scripts/00_e2e_test.sh b/src/sys/tests/scripts/00_e2e_test.sh new file mode 100755 index 0000000..2079c4f --- /dev/null +++ b/src/sys/tests/scripts/00_e2e_test.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +SCRIPTS=( + "01_bootstrap.sh" + "02_up.sh" + "03_wait_ready.sh" + "04_verify_dns_routing.sh" + "05_agent_register.sh" + "06_write_health_and_assert.sh" + "07_logs_send_and_assert.sh" + "08_restart_agent_reregister.sh" + "09_down.sh" +) + +for script in "${SCRIPTS[@]}"; do + echo "[SYS-E2E] Running $script" + "$SCRIPT_DIR/$script" + echo "[SYS-E2E] $script completed" + echo +done + +echo "[SYS-E2E] All tests completed" + diff --git a/src/sys/tests/scripts/01_bootstrap.sh b/src/sys/tests/scripts/01_bootstrap.sh new file mode 100755 index 0000000..61168f5 --- /dev/null +++ b/src/sys/tests/scripts/01_bootstrap.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)" + +PRIVATE_CORE="$TEST_ROOT/private" +PRIVATE_NODEA="$TEST_ROOT/private-nodea" +PRIVATE_NODEB="$TEST_ROOT/private-node2" +TMP_DIR="$TEST_ROOT/tmp" + +source "$REPO_ROOT/scripts/common/build_user.sh" +load_build_user + +ensure_image() { + local image="$1" + if ! docker image inspect "$image" >/dev/null 2>&1; then + echo "[ERROR] Missing image: $image. Please run ./build/build_images.sh" >&2 + exit 1 + fi +} + +echo "[INFO] Preparing directories..." +mkdir -p \ + "$PRIVATE_CORE/argus/etc" \ + "$PRIVATE_CORE/argus/bind" \ + "$PRIVATE_CORE/argus/master" \ + "$PRIVATE_CORE/argus/metric/prometheus" \ + "$PRIVATE_CORE/argus/log/elasticsearch" \ + "$PRIVATE_CORE/argus/log/kibana" \ + "$PRIVATE_NODEA/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0/health" \ + "$PRIVATE_NODEB/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0/health" \ + "$TMP_DIR" + +# Align ownership for supervisor-managed services (ES/Kibana expect UID/GID inside container) +echo "[INFO] Fixing ownership for core private directories..." +chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \ + "$PRIVATE_CORE/argus/log/elasticsearch" \ + "$PRIVATE_CORE/argus/log/kibana" \ + "$PRIVATE_CORE/argus/etc" 2>/dev/null || true + +echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)" +BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh" +BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh" +if [[ -f "$BIND_UPDATE_SRC" ]]; then + cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" + chmod +x "$BIND_UPDATE_DEST" +else + echo "[WARN] bind update-dns.sh not found at $BIND_UPDATE_SRC" +fi + +echo "[INFO] Ensuring images present..." +ensure_image "argus-elasticsearch:latest" +ensure_image "argus-kibana:latest" +ensure_image "argus-bind9:latest" +ensure_image "argus-master:latest" + +echo "[INFO] Building agent binary..." +pushd "$REPO_ROOT/src/agent" >/dev/null +./scripts/build_binary.sh +popd >/dev/null + +AGENT_BIN="$REPO_ROOT/src/agent/dist/argus-agent" +if [[ ! -x "$AGENT_BIN" ]]; then + echo "[ERROR] Agent binary not found at $AGENT_BIN" >&2 + exit 1 +fi +echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path" + +echo "[INFO] Writing .env with UID/GID" +cat > "$TEST_ROOT/.env" </dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +echo "[INFO] Bringing up system stack..." +pushd "$TEST_ROOT" >/dev/null +compose -p argus-sys down --remove-orphans || true +compose -p argus-sys up -d +popd >/dev/null + +echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021" + diff --git a/src/sys/tests/scripts/03_wait_ready.sh b/src/sys/tests/scripts/03_wait_ready.sh new file mode 100755 index 0000000..4887181 --- /dev/null +++ b/src/sys/tests/scripts/03_wait_ready.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +service_id() { + compose -p argus-sys ps -q "$1" +} + +wait_http() { + local url="$1"; local attempts="${2:-120}"; local i=1 + while (( i <= attempts )); do + if curl -fsS "$url" >/dev/null 2>&1; then return 0; fi + echo "[..] waiting $url ($i/$attempts)"; sleep 5; ((i++)) + done + echo "[ERR] Timeout waiting for $url" >&2; return 1 +} + +echo "[INFO] Waiting for ES/Kibana/Master/Fluent Bit/Bind..." + +# ES (>= yellow) +attempt=1; max=120 +while (( attempt <= max )); do + if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then + break + fi + echo "[..] waiting ES ($attempt/$max)"; sleep 5; ((attempt++)) +done +[[ $attempt -le $max ]] || { echo "[ERR] ES not ready" >&2; exit 1; } + +# Kibana: must be HTTP 200 and overall.level=available +echo "[INFO] Waiting for Kibana to be available (HTTP 200)..." +kb_attempt=1; kb_max=180 +while (( kb_attempt <= kb_max )); do + body=$(curl -sS "http://localhost:5601/api/status" 2>/dev/null || true) + code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5601/api/status" || echo 000) + if [[ "$code" == "200" ]]; then + if echo "$body" | grep -q '"level":"available"'; then + echo "[OK] Kibana available (HTTP 200)" + break + fi + fi + echo "[..] waiting kibana 200 ($kb_attempt/$kb_max), last_code=$code" + sleep 5 + ((kb_attempt++)) +done +if (( kb_attempt > kb_max )); then + echo "[ERR] Kibana did not reach HTTP 200 available in time" >&2; exit 1 +fi + +# Master +wait_http "http://localhost:32300/readyz" 120 + +# Fluent Bit (host metrics on host ports) +wait_http "http://localhost:2020/api/v2/metrics" 120 +wait_http "http://localhost:2021/api/v2/metrics" 120 + +# Bind config check +BIND_ID="$(service_id bind)" +if [[ -n "$BIND_ID" ]]; then + docker exec "$BIND_ID" named-checkconf >/dev/null +else + echo "[WARN] bind container id not found" +fi + +echo "[OK] All services are ready" diff --git a/src/sys/tests/scripts/04_verify_dns_routing.sh b/src/sys/tests/scripts/04_verify_dns_routing.sh new file mode 100755 index 0000000..635c4fe --- /dev/null +++ b/src/sys/tests/scripts/04_verify_dns_routing.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +service_id() { + compose -p argus-sys ps -q "$1" +} + +echo "[INFO] Verifying DNS routing via bind..." + +# Check master IP file exists in shared private +MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com" +if [[ ! -f "$MASTER_FILE" ]]; then + echo "[ERR] master.argus.com file missing at $MASTER_FILE" >&2 + exit 1 +fi +MASTER_IP_HOST="$(cat "$MASTER_FILE" | tr -d '\r\n' || true)" +echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}" + +# dig inside bind container +BIN_ID="$(service_id bind)" +if [[ -n "$BIN_ID" ]]; then + DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)" + echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP" + if [[ -z "$DIG_IP" ]]; then + echo "[ERR] bind did not resolve master.argus.com" >&2; exit 1 + fi +else + echo "[WARN] bind container not found; skip dig" +fi + +for node in node-a node-b; do + CID="$(service_id "$node")" + echo "[INFO] Checking resolution inside $node..." + if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then + echo "[ERR] $node cannot resolve master.argus.com" >&2 + exit 1 + fi + RES="$(docker exec "$CID" getent hosts master.argus.com | awk '{print $1}' | head -n1)" + echo "[OK] $node resolved master.argus.com -> $RES" +done + +echo "[OK] DNS routing verified" + diff --git a/src/sys/tests/scripts/05_agent_register.sh b/src/sys/tests/scripts/05_agent_register.sh new file mode 100755 index 0000000..7f42f07 --- /dev/null +++ b/src/sys/tests/scripts/05_agent_register.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="$TEST_ROOT/tmp" + +API_BASE="http://localhost:32300/api/v1/master" + +HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0" +HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0" + +mkdir -p "$TMP_DIR" + +echo "[INFO] Waiting for agent nodes to register..." + +extract_node() { + local name="$1"; local output="$2"; local json_file="$3" + python3 - "$name" "$output" "$json_file" <<'PY' +import json, sys, pathlib +name = sys.argv[1] +out = pathlib.Path(sys.argv[2]) +json_file = sys.argv[3] +with open(json_file, 'r') as fh: + data = json.load(fh) +node = next((n for n in data if n.get("name") == name), None) +if node: + out.write_text(node["id"]) # save id + print(node["id"]) # also print for shell capture +PY +} + +ID_A=""; ID_B="" +for _ in {1..60}; do + sleep 2 + resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true) + if [[ -z "$resp" ]]; then + continue + fi + # only try to parse when it's a JSON array + if ! echo "$resp" | head -c1 | grep -q '\['; then + continue + fi + echo "$resp" > "$TMP_DIR/nodes_list.json" + ID_A=$(extract_node "$HOST_A" "$TMP_DIR/node_id_a" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + ID_B=$(extract_node "$HOST_B" "$TMP_DIR/node_id_b" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + if [[ -s "$TMP_DIR/node_id_a" && -s "$TMP_DIR/node_id_b" ]]; then + break + fi +done + +if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then + echo "[ERR] Agents did not register in time" >&2 + exit 1 +fi + +node_detail() { + local id="$1"; local out="$2" + curl -fsS "$API_BASE/nodes/$id" -o "$out" +} + +node_detail "$(cat "$TMP_DIR/node_id_a")" "$TMP_DIR/detail_a.json" +node_detail "$(cat "$TMP_DIR/node_id_b")" "$TMP_DIR/detail_b.json" + +python3 - "$TMP_DIR/detail_a.json" "$TMP_DIR/initial_ip_a" <<'PY' +import json, sys, pathlib +node=json.load(open(sys.argv[1])) +ip=node.get("meta_data",{}).get("ip") +assert ip, "missing ip" +pathlib.Path(sys.argv[2]).write_text(ip) +PY + +python3 - "$TMP_DIR/detail_b.json" "$TMP_DIR/initial_ip_b" <<'PY' +import json, sys, pathlib +node=json.load(open(sys.argv[1])) +ip=node.get("meta_data",{}).get("ip") +assert ip, "missing ip" +pathlib.Path(sys.argv[2]).write_text(ip) +PY + +NODE_JSON_A="$TEST_ROOT/private-nodea/argus/agent/$HOST_A/node.json" +NODE_JSON_B="$TEST_ROOT/private-node2/argus/agent/$HOST_B/node.json" + +[[ -f "$NODE_JSON_A" ]] || { echo "[ERR] node.json missing for $HOST_A" >&2; exit 1; } +[[ -f "$NODE_JSON_B" ]] || { echo "[ERR] node.json missing for $HOST_B" >&2; exit 1; } + +echo "[OK] Agents registered: $(cat "$TMP_DIR/node_id_a") , $(cat "$TMP_DIR/node_id_b")" diff --git a/src/sys/tests/scripts/06_write_health_and_assert.sh b/src/sys/tests/scripts/06_write_health_and_assert.sh new file mode 100755 index 0000000..e0d66f4 --- /dev/null +++ b/src/sys/tests/scripts/06_write_health_and_assert.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="$TEST_ROOT/tmp" + +API_BASE="http://localhost:32300/api/v1/master" + +HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0" +HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0" + +HEALTH_A="$TEST_ROOT/private-nodea/argus/agent/$HOST_A/health" +HEALTH_B="$TEST_ROOT/private-node2/argus/agent/$HOST_B/health" + +write_health() { + local dir="$1"; mkdir -p "$dir" + cat > "$dir/log-fluentbit.json" < "$dir/metric-node-exporter.json" </dev/null || true) + [[ -z "$resp" ]] && continue + echo "$resp" > "$TMP_DIR/node_${id}_detail.json" + if python3 - "$TMP_DIR/node_${id}_detail.json" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +h=node.get("health",{}) +sys.exit(0 if ("log-fluentbit" in h and "metric-node-exporter" in h) else 1) +PY + then return 0; fi + done + return 1 +} + +check_health "$ID_A" || { echo "[ERR] health keys not reported for node A" >&2; exit 1; } +check_health "$ID_B" || { echo "[ERR] health keys not reported for node B" >&2; exit 1; } + +NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json" +if [[ ! -f "$NODES_JSON" ]]; then + echo "[ERR] nodes.json missing at $NODES_JSON" >&2; exit 1 +fi + +python3 - "$NODES_JSON" <<'PY' +import json,sys +with open(sys.argv[1]) as h: + nodes=json.load(h) +assert isinstance(nodes,list) +assert len(nodes) == 2, f"expected 2 nodes online, got {len(nodes)}" +PY + +echo "[OK] Health reported and nodes.json has 2 online nodes" diff --git a/src/sys/tests/scripts/07_logs_send_and_assert.sh b/src/sys/tests/scripts/07_logs_send_and_assert.sh new file mode 100755 index 0000000..0363ebf --- /dev/null +++ b/src/sys/tests/scripts/07_logs_send_and_assert.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..." + +get_count() { + local idx="$1" + curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}' +} + +train0=$(get_count "train-*") +infer0=$(get_count "infer-*") +base=$((train0 + infer0)) +echo "[INFO] initial counts: train=${train0} infer=${infer0} total=${base}" + +send_logs() { + local cname="$1"; local hosttag="$2" + docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer' + docker exec "$cname" sh -lc "ts=\ +\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" + docker exec "$cname" sh -lc "ts=\ +\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" + docker exec "$cname" sh -lc "ts=\ +\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" +} + +# Determine container names +node_a=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-a$|argus-sys-node-a-1' | head -n1) +node_b=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-b$|argus-sys-node-b-1' | head -n1) + +send_logs "$node_a" "host01" +send_logs "$node_b" "host02" + +echo "[INFO] Waiting for ES to ingest..." +sleep 10 + +train1=$(get_count "train-*") +infer1=$(get_count "infer-*") +final=$((train1 + infer1)) +echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}" + +if (( final <= base )); then + echo "[ERR] ES total did not increase (${base} -> ${final})" >&2 + exit 1 +fi + +if (( final < 4 )); then + echo "[ERR] ES total below expected threshold: ${final} < 4" >&2 + exit 1 +fi + +# Health endpoints +es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4) +if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then + echo "[ERR] ES health not green/yellow: $es_health" >&2 + exit 1 +fi + +if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then + echo "[WARN] Kibana status endpoint not available" +fi + +echo "[OK] ES counts increased and services healthy" diff --git a/src/sys/tests/scripts/08_restart_agent_reregister.sh b/src/sys/tests/scripts/08_restart_agent_reregister.sh new file mode 100755 index 0000000..21918d1 --- /dev/null +++ b/src/sys/tests/scripts/08_restart_agent_reregister.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="$TEST_ROOT/tmp" +REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)" + +API_BASE="http://localhost:32300/api/v1/master" + +ID_B="$(cat "$TMP_DIR/node_id_b")" +IP0_B="$(cat "$TMP_DIR/initial_ip_b")" + +detail_before="$TMP_DIR/node_b_before.json" +curl -fsS "$API_BASE/nodes/$ID_B" -o "$detail_before" +LAST0=$(python3 - "$detail_before" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +print(node.get("last_updated","")) +PY +) +IP_BEFORE=$(python3 - "$detail_before" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +print(node.get("meta_data",{}).get("ip","")) +PY +) + +if [[ "$IP_BEFORE" != "$IP0_B" ]]; then + echo "[ERR] Expected initial IP $IP0_B for node-b, got $IP_BEFORE" >&2 + exit 1 +fi + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +echo "[INFO] Recreating node-b with static IP 172.29.0.200..." +pushd "$TEST_ROOT" >/dev/null +compose -p argus-sys rm -sf node-b || true +popd >/dev/null + +docker rm -f argus-node-b >/dev/null 2>&1 || true + +AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")" + +docker run -d \ + --name argus-node-b \ + --hostname dev-yyrshare-uuuu10-ep2f-pod-0 \ + --network argus-sys-net \ + --ip 172.29.0.200 \ + --dns 172.29.0.2 \ + -e MASTER_ENDPOINT=http://master.argus.com:3000 \ + -e REPORT_INTERVAL_SECONDS=2 \ + -e ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} \ + -e ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} \ + -e ES_HOST=es \ + -e ES_PORT=9200 \ + -p 2021:2020 \ + -v "$TEST_ROOT/private-node2/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0:/private/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0" \ + -v "$AGENT_BIN_PATH:/usr/local/bin/argus-agent:ro" \ + -v "$SCRIPT_DIR/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro" \ + --entrypoint /usr/local/bin/node-entrypoint.sh \ + ubuntu:22.04 >/dev/null + +echo "[INFO] Waiting for node-b to re-register with new IP..." +for _ in {1..40}; do + sleep 3 + if curl -fsS "$API_BASE/nodes/$ID_B" -o "$TMP_DIR/node_b_after.json"; then + if python3 - "$TMP_DIR/node_b_after.json" "$LAST0" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +last0=sys.argv[2] +ip=node.get("meta_data",{}).get("ip") +lu=node.get("last_updated") +assert ip=="172.29.0.200" +assert lu and lu!=last0 +PY + then + echo "[OK] node-b re-registered with new IP 172.29.0.200" + exit 0 + fi + fi +done + +echo "[ERR] node-b did not update to IP 172.29.0.200 in time" >&2 +exit 1 + diff --git a/src/sys/tests/scripts/09_down.sh b/src/sys/tests/scripts/09_down.sh new file mode 100755 index 0000000..754ae2d --- /dev/null +++ b/src/sys/tests/scripts/09_down.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +docker rm -f argus-node-b >/dev/null 2>&1 || true + +pushd "$TEST_ROOT" >/dev/null +compose -p argus-sys down --remove-orphans || true +popd >/dev/null + +echo "[INFO] Cleaning private directories..." +if [[ -d "$TEST_ROOT/private" ]]; then + docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true + rm -rf "$TEST_ROOT/private" +fi +if [[ -d "$TEST_ROOT/private-nodea" ]]; then + docker run --rm -v "$TEST_ROOT/private-nodea:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true + rm -rf "$TEST_ROOT/private-nodea" +fi +if [[ -d "$TEST_ROOT/private-node2" ]]; then + docker run --rm -v "$TEST_ROOT/private-node2:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true + rm -rf "$TEST_ROOT/private-node2" +fi + +rm -rf "$TEST_ROOT/tmp" "$TEST_ROOT/.env" || true + +echo "[OK] Cleaned up system E2E" + diff --git a/src/sys/tests/scripts/node_entrypoint.sh b/src/sys/tests/scripts/node_entrypoint.sh new file mode 100755 index 0000000..e1ed888 --- /dev/null +++ b/src/sys/tests/scripts/node_entrypoint.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -euo pipefail + +LOG_PREFIX="[NODE]" +RUNTIME_USER="argusagent" +RUNTIME_GROUP="argusagent" +AGENT_UID="${ARGUS_BUILD_UID:-2133}" +AGENT_GID="${ARGUS_BUILD_GID:-2015}" +HOSTNAME_VAL="${HOSTNAME:-unknown}" + +log() { echo "${LOG_PREFIX} $*"; } + +# Prepare runtime user +if ! getent group "$AGENT_GID" >/dev/null 2>&1; then + groupadd -g "$AGENT_GID" "$RUNTIME_GROUP" || true +else + RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)" +fi +if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then + useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER" || true +else + RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)" +fi +log "runtime user: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)" + +# Ensure agent data dirs exist (host volumes mounted) +AGENT_DIR="/private/argus/agent/${HOSTNAME_VAL}" +HEALTH_DIR="${AGENT_DIR}/health" +mkdir -p "$HEALTH_DIR" +chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DIR" 2>/dev/null || true + +# Stage Fluent Bit assets into /private to reuse existing startup script +mkdir -p /private +if [[ -f /assets/start-fluent-bit.sh ]]; then + cp /assets/start-fluent-bit.sh /private/start-fluent-bit.sh + chmod +x /private/start-fluent-bit.sh +fi +if [[ -d /assets/fluent-bit/etc ]]; then + rm -rf /private/etc && mkdir -p /private + cp -r /assets/fluent-bit/etc /private/ +fi +if [[ -d /assets/fluent-bit/packages ]]; then + cp -r /assets/fluent-bit/packages /private/ +fi + +# Start Fluent Bit in background (will block, so run via bash -lc &) +if [[ -x /private/start-fluent-bit.sh ]]; then + log "starting fluent-bit" + bash -lc '/private/start-fluent-bit.sh' & +else + log "missing /private/start-fluent-bit.sh; fluent-bit will not start" +fi + +# Start agent in foreground as runtime user +log "starting argus-agent" +exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER" +