Compare commits

...

10 Commits

38 changed files with 1094 additions and 51 deletions

View File

@ -5,3 +5,10 @@
Project documentation (Tencent Docs): GPU Cluster Operations & Maintenance System
https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk
## Build account configuration
The UID/GID of the image build and runtime account can be configured via `configs/build_user.conf`; see `doc/build-user-config.md` for details.
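A minimal sketch of the expected key/value format (values are illustrative; `doc/build-user-config.md` is the authoritative reference):

```bash
# configs/build_user.conf — illustrative values, adjust to your host user
UID=1000
GID=1000
```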
## Note on local port conflicts
To run the BIND module end-to-end tests when port 53 on the host is already in use, set the environment variable `HOST_DNS_PORT` (default 1053) to choose the externally mapped port, e.g. `HOST_DNS_PORT=12053 ./scripts/00_e2e_test.sh`.

View File

@ -21,6 +21,7 @@ EOF
}
use_intranet=false
build_master=true
build_master_offline=false
while [[ $# -gt 0 ]]; do
@ -29,7 +30,12 @@ while [[ $# -gt 0 ]]; do
use_intranet=true
shift
;;
--master)
build_master=true
shift
;;
--master-offline)
build_master=true
build_master_offline=true
shift
;;
@ -142,11 +148,23 @@ fi
echo ""
if [[ "$build_master_offline" == true ]]; then
echo "🏗️ Building master offline image"
if [[ "$build_master" == true ]]; then
echo ""
echo "🔄 Building Master image..."
pushd "$master_root" >/dev/null
if ./scripts/build_images.sh --offline --tag argus-master:offline; then
images_built+=("argus-master:offline")
master_args=("--tag" "argus-master:latest")
if [[ "$use_intranet" == true ]]; then
master_args+=("--intranet")
fi
if [[ "$build_master_offline" == true ]]; then
master_args+=("--offline")
fi
if ./scripts/build_images.sh "${master_args[@]}"; then
if [[ "$build_master_offline" == true ]]; then
images_built+=("argus-master:offline")
else
images_built+=("argus-master:latest")
fi
else
build_failed=true
fi

View File

@ -1,3 +1,5 @@
build/
*.egg-info/
__pycache__/
.env

View File

@ -1,19 +1,19 @@
services:
bind:
image: ${BIND_IMAGE_TAG:-argus-bind9:e2e}
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
container_name: argus-bind-agent-e2e
volumes:
- ./private:/private
restart: unless-stopped
networks:
default:
ipv4_address: 172.28.0.2
environment:
- "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
- "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
restart: always
master:
image: argus-master:dev
image: argus-master:latest
container_name: argus-master-agent-e2e
depends_on:
- bind
@ -32,9 +32,10 @@ services:
networks:
default:
ipv4_address: 172.28.0.10
restart: always
agent:
image: ubuntu:24.04
image: ubuntu:22.04
container_name: argus-agent-e2e
hostname: dev-e2euser-e2einst-pod-0
depends_on:
@ -57,6 +58,7 @@ services:
networks:
default:
ipv4_address: 172.28.0.20
restart: always
networks:
default:

View File

@ -15,9 +15,17 @@ AGENT_HEALTH_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME/health"
MASTER_PRIVATE_DIR="$PRIVATE_ROOT/argus/master"
METRIC_PRIVATE_DIR="$PRIVATE_ROOT/argus/metric/prometheus"
DNS_DIR="$PRIVATE_ROOT/argus/etc"
BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:e2e}"
BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:latest}"
BIND_ROOT="$(cd "$MASTER_ROOT/../bind" && pwd)"
ensure_image() {
local image="$1"
if ! docker image inspect "$image" >/dev/null 2>&1; then
echo "[ERROR] Docker image '$image' 未找到,请先运行统一构建脚本 (例如 ./build/build_images.sh) 生成所需镜像" >&2
exit 1
fi
}
mkdir -p "$AGENT_CONFIG_DIR"
mkdir -p "$AGENT_HEALTH_DIR"
mkdir -p "$MASTER_PRIVATE_DIR"
@ -35,9 +43,8 @@ else
echo "[WARN] bind update script missing at $BIND_ROOT/build/update-dns.sh"
fi
pushd "$MASTER_ROOT" >/dev/null
./scripts/build_images.sh --tag argus-master:dev
popd >/dev/null
ensure_image "argus-master:latest"
ensure_image "$BIND_IMAGE_TAG"
AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"
@ -50,11 +57,6 @@ if [[ ! -x "$AGENT_BINARY" ]]; then
exit 1
fi
# Build the test-specific bind9 image to make sure the resolution service is available
pushd "$REPO_ROOT" >/dev/null
docker build -f src/bind/build/Dockerfile -t "$BIND_IMAGE_TAG" .
popd >/dev/null
echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag"

View File

@ -28,7 +28,7 @@ if [[ ! -x "$AGENT_BINARY" ]]; then
exit 1
fi
BIND_IMAGE_TAG_VALUE="argus-bind9:e2e"
BIND_IMAGE_TAG_VALUE="argus-bind9:latest"
if [[ -f "$TMP_ROOT/bind_image_tag" ]]; then
BIND_IMAGE_TAG_VALUE="$(cat "$TMP_ROOT/bind_image_tag")"
fi

View File

@ -10,6 +10,7 @@ AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
ENV_FILE="$TEST_ROOT/.env"
# The restart scenario also needs the same entrypoint script so the DNS registration logic stays consistent
if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
@ -28,6 +29,21 @@ if [[ ! -x "$AGENT_BINARY" ]]; then
exit 1
fi
if [[ -f "$ENV_FILE" ]]; then
set -a
# shellcheck disable=SC1090
source "$ENV_FILE"
set +a
else
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
# shellcheck disable=SC1090
source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user
fi
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"
@ -80,8 +96,10 @@ if ! docker run -d \
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
-e REPORT_INTERVAL_SECONDS=2 \
-e ARGUS_BUILD_UID="$AGENT_UID" \
-e ARGUS_BUILD_GID="$AGENT_GID" \
--entrypoint /usr/local/bin/agent-entrypoint.sh \
ubuntu:24.04 >/dev/null; then
ubuntu:22.04 >/dev/null; then
echo "[ERROR] Failed to start agent container with custom IP" >&2
exit 1
fi

View File

@ -3,8 +3,8 @@ services:
image: argus-bind9:latest
container_name: argus-bind9-test
ports:
- "53:53/tcp"
- "53:53/udp"
- "${HOST_DNS_PORT:-1053}:53/tcp"
- "${HOST_DNS_PORT:-1053}:53/udp"
volumes:
- ./private:/private
restart: unless-stopped
@ -13,4 +13,4 @@ services:
networks:
bind-test-network:
driver: bridge
driver: bridge

View File

@ -7,6 +7,9 @@
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
export HOST_DNS_PORT
echo "=========================================="
echo "BIND9 DNS Server End-to-End Test Suite"
@ -112,4 +115,4 @@ else
echo " - Review BIND9 configuration files"
echo " - Check system resources and port availability"
exit 1
fi
fi

View File

@ -7,13 +7,17 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
export HOST_DNS_PORT
cd "$TEST_DIR"
echo "Starting BIND9 test container..."
# Ensure private directory exists with proper permissions
mkdir -p private
mkdir -p private/argus/bind
mkdir -p private/argus/etc
chmod 777 private
# Start the container
@ -35,4 +39,4 @@ fi
echo ""
echo "BIND9 test environment is ready!"
echo "DNS server listening on localhost:53"
echo "DNS server listening on localhost:${HOST_DNS_PORT}"

View File

@ -5,7 +5,10 @@
set -e
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
echo "Testing DNS resolution with dig..."
echo "Using DNS server localhost:${HOST_DNS_PORT}"
# Function to test DNS query
test_dns_query() {
@ -19,7 +22,7 @@ test_dns_query() {
echo "Expected IP: $expected_ip"
# Perform dig query
result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
if [ "$result" = "QUERY_FAILED" ]; then
echo "✗ DNS query failed"
@ -69,4 +72,4 @@ if [ $failed_tests -eq 0 ]; then
else
echo "$failed_tests test(s) failed"
exit 1
fi
fi

View File

@ -6,10 +6,13 @@
set -e
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
echo "=== DNS Auto-Sync Functionality Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"
# Check if container is running
if ! docker compose ps | grep -q "Up"; then
@ -36,7 +39,7 @@ test_dns_query() {
# Wait a moment for DNS cache
sleep 2
result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
if [ "$result" = "$expected_ip" ]; then
echo "$result"
@ -90,7 +93,7 @@ echo ""
echo "Step 2: Testing initial DNS configuration..."
# Get current IP for web.argus.com (may have been changed by previous tests)
current_web_ip=$(dig @localhost web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
current_web_ip=$(dig @localhost -p "$HOST_DNS_PORT" web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
echo "Current web.argus.com IP: $current_web_ip"
# Test that DNS is working (regardless of specific IP)
@ -185,7 +188,7 @@ docker compose exec bind9 bash -c 'echo "this is not an IP address" > /private/a
wait_for_sync
# Verify invalid record was not added (should fail to resolve)
result=$(dig @localhost invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
result=$(dig @localhost -p "$HOST_DNS_PORT" invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
if [ "$result" = "NO_RESULT" ] || [ -z "$result" ]; then
echo "✓ Invalid IP correctly ignored"
else

View File

@ -5,10 +5,13 @@
set -e
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
echo "=== DNS Configuration Reload Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"
# Check if container is running
if ! docker compose ps | grep -q "Up"; then
@ -32,7 +35,7 @@ test_dns_query() {
echo "Testing: $description"
echo "Query: $hostname.argus.com -> Expected: $expected_ip"
result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
if [ "$result" = "$expected_ip" ]; then
echo "$result"
@ -109,4 +112,4 @@ fi
echo ""
echo "✓ DNS configuration reload test completed successfully!"
echo "✓ IP address changed from 12.4.5.6 to 192.168.1.100"
echo "✓ Configuration persisted and reloaded correctly"
echo "✓ Configuration persisted and reloaded correctly"

View File

@ -5,10 +5,13 @@
set -e
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
echo "=== Configuration Persistence Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"
# Check if dig is available
if ! command -v dig &> /dev/null; then
@ -25,7 +28,7 @@ test_dns_query() {
echo "Testing: $description"
echo "Query: $hostname.argus.com -> Expected: $expected_ip"
result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
if [ "$result" = "$expected_ip" ]; then
echo "$result"
@ -112,4 +115,4 @@ echo ""
echo "✓ Configuration persistence test completed successfully!"
echo "✓ Modified IP (192.168.1.100) persisted after container restart"
echo "✓ Configuration files properly linked to persistent storage"
echo "✓ DNS resolution working correctly with persisted configuration"
echo "✓ DNS resolution working correctly with persisted configuration"

View File

@ -7,6 +7,9 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
export HOST_DNS_PORT
# Parse command line arguments
FULL_CLEANUP=true

View File

@ -17,6 +17,7 @@ services:
interval: 10s
timeout: 5s
retries: 30
restart: always
kibana:
build:
@ -73,13 +74,11 @@ services:
interval: 15s
timeout: 10s
retries: 30
restart: always
bind9:
image: argus-bind9:latest
ports:
- "53:53/tcp"
- "53:53/udp"
volumes:
- ./private/argus:/private/argus/
restart: unless-stopped
restart: always

View File

@ -15,9 +15,9 @@ mkdir -p "$root/private/argus/etc/"
# Set data directory permissions (the ES and Kibana containers both use UID 1000)
echo "[INFO] Setting permissions for data directories..."
sudo chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/elasticsearch" 2>/dev/null || true
sudo chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/kibana" 2>/dev/null || true
sudo chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/elasticsearch" 2>/dev/null || true
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/kibana" 2>/dev/null || true
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true
echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"

View File

@ -1,7 +1,42 @@
#!/usr/bin/env bash
set -euo pipefail
# ES endpoint and wait strategy
ES="${ES:-http://localhost:9200}"
es_wait_attempts="${ES_WAIT_ATTEMPTS:-60}" # total attempts to wait for ES
es_wait_interval="${ES_WAIT_INTERVAL:-2}" # seconds between attempts
echo "[i] 查询 ES 端点:$ES"
wait_for_es() {
local attempt=1
while (( attempt <= es_wait_attempts )); do
# Wait for the cluster to reach at least yellow status; retry if the request fails
if curl -fsS "$ES/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then
echo "[ok] Elasticsearch is ready (attempt=${attempt}/${es_wait_attempts})"
return 0
fi
echo "[..] Waiting for Elasticsearch to become available (${attempt}/${es_wait_attempts})"
sleep "${es_wait_interval}"
(( attempt++ ))
done
echo "[err] Elasticsearch still unavailable after ${es_wait_attempts} attempts"
return 1
}
safe_count() {
# Return 0 for missing indices so a 404 does not cause a failure
local pattern="$1"
local json
json=$(curl -fsS "$ES/${pattern}/_count?ignore_unavailable=true&allow_no_indices=true" 2>/dev/null || echo '{}')
echo "$json" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
}
wait_for_es
# List the relevant indices (may be empty, which is allowed)
curl -fsS "$ES/_cat/indices?v" | egrep 'train-|infer-|logstash' || true
printf "train-* 计数:"; curl -fsS "$ES/train-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
printf "infer-* 计数:"; curl -fsS "$ES/infer-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
# 打印计数,缺失索引按 0 处理
printf "train-* 计数:"; safe_count "train-*"; echo
printf "infer-* 计数:"; safe_count "infer-*"; echo

View File

@ -11,7 +11,7 @@ Argus Master is a node management service built on Flask + SQLite, responsible for:
```bash
cd src/master
./scripts/build_images.sh # produces the argus-master:dev image
./scripts/build_images.sh # produces the argus-master:latest image
```
For an offline build, first run the preparation script in an environment with network access:
@ -25,7 +25,7 @@ cd src/master
```bash
cd src/master
./scripts/build_images.sh --offline --tag argus-master:dev
./scripts/build_images.sh --offline --tag argus-master:latest
```
If `python:3.11-slim` is not available on the intranet, `docker save` it on an external network in advance and `docker load` it from offline media.

View File

@ -8,14 +8,14 @@ Usage: $0 [--intranet] [--offline] [--tag <image_tag>]
Options:
--intranet 使用 Use the specified PyPI mirror (defaults to the Tsinghua mirror).
--offline Fully offline build, relying on the dependency packages in the offline_wheels/ directory.
--tag <image_tag> Custom image tag, defaults to argus-master:dev
--tag <image_tag> Custom image tag, defaults to argus-master:latest
USAGE
}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
MODULE_ROOT="$PROJECT_ROOT/src/master"
IMAGE_TAG="${IMAGE_TAG:-argus-master:dev}"
IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
DOCKERFILE="src/master/Dockerfile"
BUILD_ARGS=()
OFFLINE_MODE=0

View File

@ -8,7 +8,7 @@ usage() {
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DEFAULT_OUTPUT="$PROJECT_ROOT/images/argus-master-dev.tar"
IMAGE_TAG="${IMAGE_TAG:-argus-master:dev}"
IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
OUTPUT_PATH="$DEFAULT_OUTPUT"
while [[ "$#" -gt 0 ]]; do

View File

@ -1,6 +1,6 @@
services:
master:
image: ${MASTER_IMAGE_TAG:-argus-master:dev}
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
container_name: argus-master-e2e
environment:
- OFFLINE_THRESHOLD_SECONDS=6

View File

@ -17,7 +17,7 @@ SCRIPTS=(
for script in "${SCRIPTS[@]}"; do
echo "[TEST] Running $script"
MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:dev}" "$SCRIPT_DIR/$script"
MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:latest}" "$SCRIPT_DIR/$script"
echo "[TEST] $script completed"
echo
done

View File

@ -44,7 +44,7 @@ fi
pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans || true
MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:dev}" compose up -d
MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:latest}" compose up -d
popd >/dev/null
echo "[INFO] Master container is up on http://localhost:31300"

2
src/sys/README.md Normal file
View File

@ -0,0 +1,2 @@

138
src/sys/tests/README.md Normal file
View File

@ -0,0 +1,138 @@
# ARGUS system-level end-to-end tests (Sys E2E)
This directory contains the system-level end-to-end tests that merge the log and agent verification tracks. They depend on bind/master/es/kibana plus two "log nodes" (each node container runs both Fluent Bit and argus-agent).
---
## 1. How to run
- Prerequisites
  - Images already built: `argus-elasticsearch:latest`, `argus-kibana:latest`, `argus-bind9:latest`, `argus-master:latest`
  - They can be built from the repository root: `./build/build_images.sh [--intranet]`
  - The host has Docker and Docker Compose available.
- UID/GID configuration (controls file ownership inside containers and write permissions on mounted volumes)
  - Defaults: `UID=2133`, `GID=2015`
  - Option A (recommended): create `configs/build_user.local.conf` at the repository root:
    UID=<your host user UID>
    GID=<your host user GID>
    For example:
    UID=1000
    GID=1000
  - Option B: override via environment variables (highest priority):
    export ARGUS_BUILD_UID=1000
    export ARGUS_BUILD_GID=1000
  - Note: `scripts/common/build_user.sh` reads `configs/build_user.local.conf` → `configs/build_user.conf` → environment variables in that order; the final values are used as image build arguments and by the test scripts, and `01_bootstrap.sh` runs `chown` on `src/sys/tests/private/argus/*` so ownership matches the runtime user inside the containers (see the sketch at the end of this section).
- One-shot run
  - `cd src/sys/tests`
  - `./scripts/00_e2e_test.sh`
- Step-by-step run (recommended for troubleshooting)
  - `./scripts/01_bootstrap.sh` creates directories / copies `update-dns.sh` / builds the agent binary / writes `.env`
  - `./scripts/02_up.sh` starts the Compose stack (project name `argus-sys`)
  - `./scripts/03_wait_ready.sh` waits for ES/Kibana/Master/Fluent Bit/Bind to become ready (Kibana must return 200 with overall.level=available)
  - `./scripts/04_verify_dns_routing.sh` verifies bind resolution and in-node domain resolution
  - `./scripts/05_agent_register.sh` fetches both nodes' `node_id` and initial IPs and checks the local `node.json`
  - `./scripts/06_write_health_and_assert.sh` writes health files and asserts that `nodes.json` contains only the 2 online nodes
  - `./scripts/07_logs_send_and_assert.sh` writes logs on both nodes and asserts that the ES `train-*`/`infer-*` counts grow
  - `./scripts/08_restart_agent_reregister.sh` recreates `node-b` with the fixed IP `172.29.0.200` and verifies the node keeps the same ID while its IP/timestamp are updated
  - `./scripts/09_down.sh` tears down containers and networks and cleans `private*/` and `tmp/`
- Resetting the environment
  - If any stage fails, run `./scripts/09_down.sh` and rerun from `01→…`
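A minimal sketch of Option B above, overriding the UID/GID and running the suite (values are illustrative):

```bash
cd src/sys/tests
export ARGUS_BUILD_UID=1000   # illustrative; use your host UID
export ARGUS_BUILD_GID=1000   # illustrative; use your host GID
./scripts/00_e2e_test.sh      # one-shot run
# or step by step, e.g.:
./scripts/01_bootstrap.sh && ./scripts/02_up.sh && ./scripts/03_wait_ready.sh
```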
---
## 2. Test deployment architecture (docker-compose)
- Network
  - Custom bridge `argus-sys-net`, subnet `172.29.0.0/16`
  - Fixed addresses: bind=`172.29.0.2`, master=`172.29.0.10`
- Services and ports (spot-check commands follow at the end of this section)
  - `bind` (`argus-bind9:latest`): listens on 53/tcp+udp and keeps `*.argus.com` records in sync
  - `master` (`argus-master:latest`): exposed as `32300→3000`; API at `http://localhost:32300`
  - `es` (`argus-elasticsearch:latest`): `9200→9200`; single node, security disabled
  - `kibana` (`argus-kibana:latest`): `5601→5601`; reaches ES via `ELASTICSEARCH_HOSTS=http://es:9200`
  - `node-a` (`ubuntu:22.04`): runs Fluent Bit + argus-agent together; `hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`; `2020→2020`
  - `node-b` (`ubuntu:22.04`): runs Fluent Bit + argus-agent together; `hostname=dev-yyrshare-uuuu10-ep2f-pod-0`; `2021→2020`
- Volumes and directories
  - Core services (bind/master/es/kibana) share the host `./private`, mounted into containers as `/private`
  - The two nodes use separate data volumes and never mix with the core services:
    - node-a: `./private-nodea/argus/agent/<HOST> → /private/argus/agent/<HOST>`
    - node-b: `./private-nodeb/argus/agent/<HOST> → /private/argus/agent/<HOST>`
  - The node containers' Fluent Bit/agent assets are mounted read-only at `/assets` and `/usr/local/bin/argus-agent`
- DNS configuration
  - Node containers point at bind via the compose setting `dns: [172.29.0.2]`; they do not mount `/etc/resolv.conf` and do not depend on `update-dns.sh`
  - master/es/kibana still share `./private`; on startup, master writes `/private/argus/etc/master.argus.com` for bind to sync the A record
- Node entrypoint
  - `scripts/node_entrypoint.sh`
    - Copies `/assets/fluent-bit/*` into the container's `/private` and starts Fluent Bit in the background (listening on 2020)
    - Starts `argus-agent` in the foreground as the runtime user (mapped UID/GID)
  - Node environment variables: `MASTER_ENDPOINT=http://master.argus.com:3000`, `REPORT_INTERVAL_SECONDS=2`, `ES_HOST=es`, `ES_PORT=9200`, `CLUSTER=local`, `RACK=dev`
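A few manual spot checks against the ports listed above (a sketch; the container name and host ports assume the compose defaults in this directory):

```bash
curl -fsS http://localhost:32300/readyz                    # master readiness
curl -fsS 'http://localhost:9200/_cluster/health?pretty'   # ES health (expect green/yellow)
curl -fsS http://localhost:5601/api/status >/dev/null && echo "kibana ok"
docker exec argus-bind-sys dig +short master.argus.com A   # resolution via bind
```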
---
## 3. Scripts and verification goals
- `01_bootstrap.sh`
  - Purpose: prepare the directory layout, fix ownership of the ES/Kibana data directories, distribute `update-dns.sh` (used by core services only), build the agent binary, and write `.env`
  - Troubleshooting: if ES cannot write its data, rerun this step to make sure the directories are owned by the configured UID/GID
- `02_up.sh`
  - Purpose: start the full stack under the project name `argus-sys`; old stacks/networks are cleaned up automatically
- `03_wait_ready.sh`
  - Purpose: wait until the key ports/health endpoints are available
  - Criteria:
    - ES `/_cluster/health?wait_for_status=yellow` succeeds
    - Kibana `GET /api/status` returns 200 with `overall.level=available`
    - Master `/readyz` succeeds
    - The Fluent Bit metrics endpoints `:2020/:2021` are reachable
    - bind passes `named-checkconf`
- `04_verify_dns_routing.sh`
  - Purpose: verify the resolution chain from bind → node containers
  - Criteria:
    - `private/argus/etc/master.argus.com` exists and contains the master IP
    - `getent hosts master.argus.com` inside node-a/node-b resolves to the master IP
- `05_agent_register.sh`
  - Purpose: confirm both nodes register with master and persist `node.json` (a query sketch follows this list)
  - Output: `tmp/node_id_a|b`, `tmp/initial_ip_a|b`, `tmp/detail_*.json`
- `06_write_health_and_assert.sh`
  - Purpose: simulate node health reporting and verify it is visible on the master side; `nodes.json` keeps online nodes only
  - Action: write `log-fluentbit.json` and `metric-node-exporter.json` into both nodes' health directories
- `07_logs_send_and_assert.sh`
  - Purpose: inject both log types into ES via Fluent Bit; counts must grow over the baseline and reach the threshold (≥4)
  - Also checks that ES health is `green|yellow`
- `08_restart_agent_reregister.sh`
  - Purpose: verify that after a node restart and IP change the node keeps the same `id` while `meta_data.ip` and `last_updated` are updated
  - Action: recreate node-b with the fixed IP `172.29.0.200`, then poll and verify
- `09_down.sh`
  - Purpose: destroy the stack and clean the environment; if needed, use a temporary container to fix ownership before removing the `private*` directories
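As referenced in the `05_agent_register.sh` item above, a sketch of querying the master API by hand (endpoint per the compose mapping; `jq` is assumed to be installed, and `<node_id>` is a placeholder):

```bash
# list registered nodes with their ids and names
curl -fsS http://localhost:32300/api/v1/master/nodes | jq '.[] | {id, name}'
# inspect one node's detail, e.g. reported IP and health keys
curl -fsS http://localhost:32300/api/v1/master/nodes/<node_id> | jq '{id, meta_data, health}'
```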
---
### Common issues and troubleshooting
- Kibana stays 503 for a long time: initialization takes longer on slow machines; the scripts wait up to ~15 minutes. Confirm ES is ready first.
- Fluent Bit metrics not ready: check the node container logs and whether the `CLUSTER/RACK` environment variables are set; confirm the entrypoint script has copied the assets into `/private`.
- ES fails to start: usually a host directory permission problem; rerun `01_bootstrap.sh`, or manually run `chown -R <UID:GID> src/sys/tests/private/argus/log/*`.
---
For stricter assertions (for example, verifying that Kibana loads specific plugins or validating ES document fields), additional queries and checks can be added to `07_*.sh`.

View File

@ -0,0 +1,139 @@
version: "3.8"
networks:
default:
name: argus-sys-net
driver: bridge
ipam:
driver: default
config:
- subnet: 172.29.0.0/16
services:
bind:
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
container_name: argus-bind-sys
networks:
default:
ipv4_address: 172.29.0.2
volumes:
- ./private:/private
restart: unless-stopped
master:
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
container_name: argus-master-sys
depends_on:
- bind
environment:
- OFFLINE_THRESHOLD_SECONDS=6
- ONLINE_THRESHOLD_SECONDS=2
- SCHEDULER_INTERVAL_SECONDS=1
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "32300:3000"
volumes:
- ./private/argus/master:/private/argus/master
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private/argus/etc:/private/argus/etc
networks:
default:
ipv4_address: 172.29.0.10
restart: unless-stopped
es:
image: argus-elasticsearch:latest
container_name: argus-es-sys
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- ES_JAVA_OPTS=-Xms512m -Xmx512m
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private/argus/log/elasticsearch:/private/argus/log/elasticsearch
- ./private/argus/etc:/private/argus/etc
ports:
- "9200:9200"
restart: unless-stopped
kibana:
image: argus-kibana:latest
container_name: argus-kibana-sys
environment:
- ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private/argus/log/kibana:/private/argus/log/kibana
- ./private/argus/etc:/private/argus/etc
depends_on:
- es
ports:
- "5601:5601"
restart: unless-stopped
node-a:
image: ubuntu:22.04
container_name: argus-node-a
hostname: dev-yyrshare-nbnyx10-cp2f-pod-0
depends_on:
- master
- bind
- es
environment:
- MASTER_ENDPOINT=http://master.argus.com:3000
- REPORT_INTERVAL_SECONDS=2
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- ES_HOST=es
- ES_PORT=9200
- CLUSTER=local
- RACK=dev
volumes:
- ./private-nodea/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0:/private/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0
- ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
- ./scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
- ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
- ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
- ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
entrypoint:
- /usr/local/bin/node-entrypoint.sh
dns:
- 172.29.0.2
ports:
- "2020:2020"
restart: unless-stopped
node-b:
image: ubuntu:22.04
container_name: argus-node-b
hostname: dev-yyrshare-uuuu10-ep2f-pod-0
depends_on:
- master
- bind
- es
environment:
- MASTER_ENDPOINT=http://master.argus.com:3000
- REPORT_INTERVAL_SECONDS=2
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- ES_HOST=es
- ES_PORT=9200
- CLUSTER=local
- RACK=dev
volumes:
- ./private-nodeb/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0:/private/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0
- ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
- ./scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
- ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
- ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
- ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
entrypoint:
- /usr/local/bin/node-entrypoint.sh
dns:
- 172.29.0.2
ports:
- "2021:2020"
restart: unless-stopped

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS=(
"01_bootstrap.sh"
"02_up.sh"
"03_wait_ready.sh"
"04_verify_dns_routing.sh"
"05_agent_register.sh"
"06_write_health_and_assert.sh"
"07_logs_send_and_assert.sh"
"08_restart_agent_reregister.sh"
"09_down.sh"
)
for script in "${SCRIPTS[@]}"; do
echo "[SYS-E2E] Running $script"
"$SCRIPT_DIR/$script"
echo "[SYS-E2E] $script completed"
echo
done
echo "[SYS-E2E] All tests completed"

View File

@ -0,0 +1,77 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
PRIVATE_CORE="$TEST_ROOT/private"
PRIVATE_NODEA="$TEST_ROOT/private-nodea"
PRIVATE_NODEB="$TEST_ROOT/private-nodeb"
TMP_DIR="$TEST_ROOT/tmp"
source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user
ensure_image() {
local image="$1"
if ! docker image inspect "$image" >/dev/null 2>&1; then
echo "[ERROR] Missing image: $image. Please run ./build/build_images.sh" >&2
exit 1
fi
}
echo "[INFO] Preparing directories..."
mkdir -p \
"$PRIVATE_CORE/argus/etc" \
"$PRIVATE_CORE/argus/bind" \
"$PRIVATE_CORE/argus/master" \
"$PRIVATE_CORE/argus/metric/prometheus" \
"$PRIVATE_CORE/argus/log/elasticsearch" \
"$PRIVATE_CORE/argus/log/kibana" \
"$PRIVATE_NODEA/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0/health" \
"$PRIVATE_NODEB/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0/health" \
"$TMP_DIR"
# Align ownership for supervisor-managed services (ES/Kibana expect UID/GID inside container)
echo "[INFO] Fixing ownership for core private directories..."
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \
"$PRIVATE_CORE/argus/log/elasticsearch" \
"$PRIVATE_CORE/argus/log/kibana" \
"$PRIVATE_CORE/argus/etc" 2>/dev/null || true
echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)"
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh"
if [[ -f "$BIND_UPDATE_SRC" ]]; then
cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST"
chmod +x "$BIND_UPDATE_DEST"
else
echo "[WARN] bind update-dns.sh not found at $BIND_UPDATE_SRC"
fi
echo "[INFO] Ensuring images present..."
ensure_image "argus-elasticsearch:latest"
ensure_image "argus-kibana:latest"
ensure_image "argus-bind9:latest"
ensure_image "argus-master:latest"
echo "[INFO] Building agent binary..."
pushd "$REPO_ROOT/src/agent" >/dev/null
./scripts/build_binary.sh
popd >/dev/null
AGENT_BIN="$REPO_ROOT/src/agent/dist/argus-agent"
if [[ ! -x "$AGENT_BIN" ]]; then
echo "[ERROR] Agent binary not found at $AGENT_BIN" >&2
exit 1
fi
echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path"
echo "[INFO] Writing .env with UID/GID"
cat > "$TEST_ROOT/.env" <<EOF
ARGUS_BUILD_UID=$ARGUS_BUILD_UID
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
EOF
echo "[OK] Bootstrap completed"

22
src/sys/tests/scripts/02_up.sh Executable file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"
else
docker-compose "$@"
fi
}
echo "[INFO] Bringing up system stack..."
pushd "$TEST_ROOT" >/dev/null
compose -p argus-sys down --remove-orphans || true
compose -p argus-sys up -d
popd >/dev/null
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021"

View File

@ -0,0 +1,75 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"
else
docker-compose "$@"
fi
}
service_id() {
compose -p argus-sys ps -q "$1"
}
wait_http() {
local url="$1"; local attempts="${2:-120}"; local i=1
while (( i <= attempts )); do
if curl -fsS "$url" >/dev/null 2>&1; then return 0; fi
echo "[..] waiting $url ($i/$attempts)"; sleep 5; ((i++))
done
echo "[ERR] Timeout waiting for $url" >&2; return 1
}
echo "[INFO] Waiting for ES/Kibana/Master/Fluent Bit/Bind..."
# ES (>= yellow)
attempt=1; max=120
while (( attempt <= max )); do
if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then
break
fi
echo "[..] waiting ES ($attempt/$max)"; sleep 5; ((attempt++))
done
[[ $attempt -le $max ]] || { echo "[ERR] ES not ready" >&2; exit 1; }
# Kibana: must be HTTP 200 and overall.level=available
echo "[INFO] Waiting for Kibana to be available (HTTP 200)..."
kb_attempt=1; kb_max=180
while (( kb_attempt <= kb_max )); do
body=$(curl -sS "http://localhost:5601/api/status" 2>/dev/null || true)
code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5601/api/status" || echo 000)
if [[ "$code" == "200" ]]; then
if echo "$body" | grep -q '"level":"available"'; then
echo "[OK] Kibana available (HTTP 200)"
break
fi
fi
echo "[..] waiting kibana 200 ($kb_attempt/$kb_max), last_code=$code"
sleep 5
((kb_attempt++))
done
if (( kb_attempt > kb_max )); then
echo "[ERR] Kibana did not reach HTTP 200 available in time" >&2; exit 1
fi
# Master
wait_http "http://localhost:32300/readyz" 120
# Fluent Bit (host metrics on host ports)
wait_http "http://localhost:2020/api/v2/metrics" 120
wait_http "http://localhost:2021/api/v2/metrics" 120
# Bind config check
BIND_ID="$(service_id bind)"
if [[ -n "$BIND_ID" ]]; then
docker exec "$BIND_ID" named-checkconf >/dev/null
else
echo "[WARN] bind container id not found"
fi
echo "[OK] All services are ready"

View File

@ -0,0 +1,54 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"
else
docker-compose "$@"
fi
}
service_id() {
compose -p argus-sys ps -q "$1"
}
echo "[INFO] Verifying DNS routing via bind..."
# Check master IP file exists in shared private
MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com"
if [[ ! -f "$MASTER_FILE" ]]; then
echo "[ERR] master.argus.com file missing at $MASTER_FILE" >&2
exit 1
fi
MASTER_IP_HOST="$(cat "$MASTER_FILE" | tr -d '\r\n' || true)"
echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}"
# dig inside bind container
BIN_ID="$(service_id bind)"
if [[ -n "$BIN_ID" ]]; then
DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)"
echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP"
if [[ -z "$DIG_IP" ]]; then
echo "[ERR] bind did not resolve master.argus.com" >&2; exit 1
fi
else
echo "[WARN] bind container not found; skip dig"
fi
for node in node-a node-b; do
CID="$(service_id "$node")"
echo "[INFO] Checking resolution inside $node..."
if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then
echo "[ERR] $node cannot resolve master.argus.com" >&2
exit 1
fi
RES="$(docker exec "$CID" getent hosts master.argus.com | awk '{print $1}' | head -n1)"
echo "[OK] $node resolved master.argus.com -> $RES"
done
echo "[OK] DNS routing verified"

View File

@ -0,0 +1,87 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0"
HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0"
mkdir -p "$TMP_DIR"
echo "[INFO] Waiting for agent nodes to register..."
extract_node() {
local name="$1"; local output="$2"; local json_file="$3"
python3 - "$name" "$output" "$json_file" <<'PY'
import json, sys, pathlib
name = sys.argv[1]
out = pathlib.Path(sys.argv[2])
json_file = sys.argv[3]
with open(json_file, 'r') as fh:
data = json.load(fh)
node = next((n for n in data if n.get("name") == name), None)
if node:
out.write_text(node["id"]) # save id
print(node["id"]) # also print for shell capture
PY
}
ID_A=""; ID_B=""
for _ in {1..60}; do
sleep 2
resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true)
if [[ -z "$resp" ]]; then
continue
fi
# only try to parse when it's a JSON array
if ! echo "$resp" | head -c1 | grep -q '\['; then
continue
fi
echo "$resp" > "$TMP_DIR/nodes_list.json"
ID_A=$(extract_node "$HOST_A" "$TMP_DIR/node_id_a" "$TMP_DIR/nodes_list.json" 2>/dev/null || true)
ID_B=$(extract_node "$HOST_B" "$TMP_DIR/node_id_b" "$TMP_DIR/nodes_list.json" 2>/dev/null || true)
if [[ -s "$TMP_DIR/node_id_a" && -s "$TMP_DIR/node_id_b" ]]; then
break
fi
done
if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then
echo "[ERR] Agents did not register in time" >&2
exit 1
fi
node_detail() {
local id="$1"; local out="$2"
curl -fsS "$API_BASE/nodes/$id" -o "$out"
}
node_detail "$(cat "$TMP_DIR/node_id_a")" "$TMP_DIR/detail_a.json"
node_detail "$(cat "$TMP_DIR/node_id_b")" "$TMP_DIR/detail_b.json"
python3 - "$TMP_DIR/detail_a.json" "$TMP_DIR/initial_ip_a" <<'PY'
import json, sys, pathlib
node=json.load(open(sys.argv[1]))
ip=node.get("meta_data",{}).get("ip")
assert ip, "missing ip"
pathlib.Path(sys.argv[2]).write_text(ip)
PY
python3 - "$TMP_DIR/detail_b.json" "$TMP_DIR/initial_ip_b" <<'PY'
import json, sys, pathlib
node=json.load(open(sys.argv[1]))
ip=node.get("meta_data",{}).get("ip")
assert ip, "missing ip"
pathlib.Path(sys.argv[2]).write_text(ip)
PY
NODE_JSON_A="$TEST_ROOT/private-nodea/argus/agent/$HOST_A/node.json"
NODE_JSON_B="$TEST_ROOT/private-nodeb/argus/agent/$HOST_B/node.json"
[[ -f "$NODE_JSON_A" ]] || { echo "[ERR] node.json missing for $HOST_A" >&2; exit 1; }
[[ -f "$NODE_JSON_B" ]] || { echo "[ERR] node.json missing for $HOST_B" >&2; exit 1; }
echo "[OK] Agents registered: $(cat "$TMP_DIR/node_id_a") , $(cat "$TMP_DIR/node_id_b")"

View File

@ -0,0 +1,67 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0"
HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0"
HEALTH_A="$TEST_ROOT/private-nodea/argus/agent/$HOST_A/health"
HEALTH_B="$TEST_ROOT/private-nodeb/argus/agent/$HOST_B/health"
write_health() {
local dir="$1"; mkdir -p "$dir"
cat > "$dir/log-fluentbit.json" <<JSON
{ "status": "healthy", "timestamp": "2024-10-05T12:05:00Z" }
JSON
cat > "$dir/metric-node-exporter.json" <<JSON
{ "status": "healthy", "timestamp": "2024-10-05T12:05:00Z" }
JSON
}
echo "[INFO] Writing health files for both nodes..."
write_health "$HEALTH_A"
write_health "$HEALTH_B"
ID_A="$(cat "$TMP_DIR/node_id_a")"
ID_B="$(cat "$TMP_DIR/node_id_b")"
check_health() {
local id="$1"; local tries=40
for _ in $(seq 1 $tries); do
sleep 2
resp=$(curl -fsS "$API_BASE/nodes/$id" 2>/dev/null || true)
[[ -z "$resp" ]] && continue
echo "$resp" > "$TMP_DIR/node_${id}_detail.json"
if python3 - "$TMP_DIR/node_${id}_detail.json" <<'PY'
import json,sys
node=json.load(open(sys.argv[1]))
h=node.get("health",{})
sys.exit(0 if ("log-fluentbit" in h and "metric-node-exporter" in h) else 1)
PY
then return 0; fi
done
return 1
}
check_health "$ID_A" || { echo "[ERR] health keys not reported for node A" >&2; exit 1; }
check_health "$ID_B" || { echo "[ERR] health keys not reported for node B" >&2; exit 1; }
NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
if [[ ! -f "$NODES_JSON" ]]; then
echo "[ERR] nodes.json missing at $NODES_JSON" >&2; exit 1
fi
python3 - "$NODES_JSON" <<'PY'
import json,sys
with open(sys.argv[1]) as h:
nodes=json.load(h)
assert isinstance(nodes,list)
assert len(nodes) == 2, f"expected 2 nodes online, got {len(nodes)}"
PY
echo "[OK] Health reported and nodes.json has 2 online nodes"

View File

@ -0,0 +1,63 @@
#!/usr/bin/env bash
set -euo pipefail
echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..."
get_count() {
local idx="$1"
curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
}
train0=$(get_count "train-*")
infer0=$(get_count "infer-*")
base=$((train0 + infer0))
echo "[INFO] initial counts: train=${train0} infer=${infer0} total=${base}"
send_logs() {
local cname="$1"; local hosttag="$2"
docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
docker exec "$cname" sh -lc "ts=\
\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
docker exec "$cname" sh -lc "ts=\
\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
docker exec "$cname" sh -lc "ts=\
\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
}
# Determine container names
node_a=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-a$|argus-sys-node-a-1' | head -n1)
node_b=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-b$|argus-sys-node-b-1' | head -n1)
send_logs "$node_a" "host01"
send_logs "$node_b" "host02"
echo "[INFO] Waiting for ES to ingest..."
sleep 10
train1=$(get_count "train-*")
infer1=$(get_count "infer-*")
final=$((train1 + infer1))
echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}"
if (( final <= base )); then
echo "[ERR] ES total did not increase (${base} -> ${final})" >&2
exit 1
fi
if (( final < 4 )); then
echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
exit 1
fi
# Health endpoints
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then
echo "[ERR] ES health not green/yellow: $es_health" >&2
exit 1
fi
if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
echo "[WARN] Kibana status endpoint not available"
fi
echo "[OK] ES counts increased and services healthy"

View File

@ -0,0 +1,94 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp"
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
API_BASE="http://localhost:32300/api/v1/master"
ID_B="$(cat "$TMP_DIR/node_id_b")"
IP0_B="$(cat "$TMP_DIR/initial_ip_b")"
detail_before="$TMP_DIR/node_b_before.json"
curl -fsS "$API_BASE/nodes/$ID_B" -o "$detail_before"
LAST0=$(python3 - "$detail_before" <<'PY'
import json,sys
node=json.load(open(sys.argv[1]))
print(node.get("last_updated",""))
PY
)
IP_BEFORE=$(python3 - "$detail_before" <<'PY'
import json,sys
node=json.load(open(sys.argv[1]))
print(node.get("meta_data",{}).get("ip",""))
PY
)
if [[ "$IP_BEFORE" != "$IP0_B" ]]; then
echo "[ERR] Expected initial IP $IP0_B for node-b, got $IP_BEFORE" >&2
exit 1
fi
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"
else
docker-compose "$@"
fi
}
echo "[INFO] Recreating node-b with static IP 172.29.0.200..."
pushd "$TEST_ROOT" >/dev/null
compose -p argus-sys rm -sf node-b || true
popd >/dev/null
docker rm -f argus-node-b >/dev/null 2>&1 || true
AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")"
docker run -d \
--name argus-node-b \
--hostname dev-yyrshare-uuuu10-ep2f-pod-0 \
--network argus-sys-net \
--ip 172.29.0.200 \
--dns 172.29.0.2 \
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
-e REPORT_INTERVAL_SECONDS=2 \
-e ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} \
-e ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} \
-e ES_HOST=es \
-e ES_PORT=9200 \
-p 2021:2020 \
-v "$TEST_ROOT/private-nodeb/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0:/private/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0" \
-v "$AGENT_BIN_PATH:/usr/local/bin/argus-agent:ro" \
-v "$SCRIPT_DIR/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro" \
-v "$REPO_ROOT/src/log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro" \
-v "$REPO_ROOT/src/log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro" \
-v "$REPO_ROOT/src/log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro" \
--entrypoint /usr/local/bin/node-entrypoint.sh \
ubuntu:22.04 >/dev/null
echo "[INFO] Waiting for node-b to re-register with new IP..."
for _ in {1..40}; do
sleep 3
if curl -fsS "$API_BASE/nodes/$ID_B" -o "$TMP_DIR/node_b_after.json"; then
if python3 - "$TMP_DIR/node_b_after.json" "$LAST0" <<'PY'
import json,sys
node=json.load(open(sys.argv[1]))
last0=sys.argv[2]
ip=node.get("meta_data",{}).get("ip")
lu=node.get("last_updated")
assert ip=="172.29.0.200"
assert lu and lu!=last0
PY
then
echo "[OK] node-b re-registered with new IP 172.29.0.200"
exit 0
fi
fi
done
echo "[ERR] node-b did not update to IP 172.29.0.200 in time" >&2
exit 1

View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"
else
docker-compose "$@"
fi
}
docker rm -f argus-node-b >/dev/null 2>&1 || true
pushd "$TEST_ROOT" >/dev/null
compose -p argus-sys down --remove-orphans || true
popd >/dev/null
echo "[INFO] Cleaning private directories..."
if [[ -d "$TEST_ROOT/private" ]]; then
docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
rm -rf "$TEST_ROOT/private"
fi
if [[ -d "$TEST_ROOT/private-nodea" ]]; then
docker run --rm -v "$TEST_ROOT/private-nodea:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
rm -rf "$TEST_ROOT/private-nodea"
fi
if [[ -d "$TEST_ROOT/private-nodeb" ]]; then
docker run --rm -v "$TEST_ROOT/private-nodeb:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
rm -rf "$TEST_ROOT/private-nodeb"
fi
rm -rf "$TEST_ROOT/tmp" "$TEST_ROOT/.env" || true
echo "[OK] Cleaned up system E2E"

View File

@ -0,0 +1,57 @@
#!/usr/bin/env bash
set -euo pipefail
LOG_PREFIX="[NODE]"
RUNTIME_USER="argusagent"
RUNTIME_GROUP="argusagent"
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
HOSTNAME_VAL="${HOSTNAME:-unknown}"
log() { echo "${LOG_PREFIX} $*"; }
# Prepare runtime user
if ! getent group "$AGENT_GID" >/dev/null 2>&1; then
groupadd -g "$AGENT_GID" "$RUNTIME_GROUP" || true
else
RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)"
fi
if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then
useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER" || true
else
RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)"
fi
log "runtime user: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)"
# Ensure agent data dirs exist (host volumes mounted)
AGENT_DIR="/private/argus/agent/${HOSTNAME_VAL}"
HEALTH_DIR="${AGENT_DIR}/health"
mkdir -p "$HEALTH_DIR"
chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DIR" 2>/dev/null || true
# Stage Fluent Bit assets into /private to reuse existing startup script
mkdir -p /private
if [[ -f /assets/start-fluent-bit.sh ]]; then
cp /assets/start-fluent-bit.sh /private/start-fluent-bit.sh
chmod +x /private/start-fluent-bit.sh
fi
if [[ -d /assets/fluent-bit/etc ]]; then
rm -rf /private/etc && mkdir -p /private
cp -r /assets/fluent-bit/etc /private/
fi
if [[ -d /assets/fluent-bit/packages ]]; then
cp -r /assets/fluent-bit/packages /private/
fi
# Start Fluent Bit in background (will block, so run via bash -lc &)
if [[ -x /private/start-fluent-bit.sh ]]; then
log "starting fluent-bit"
bash -lc '/private/start-fluent-bit.sh' &
else
log "missing /private/start-fluent-bit.sh; fluent-bit will not start"
fi
# Start agent in foreground as runtime user
log "starting argus-agent"
exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"