From b0d451cbe737c5e47d53366f1a8aa79f4c16dcd9 Mon Sep 17 00:00:00 2001
From: "sundapeng.sdp"
Date: Tue, 21 Oct 2025 09:31:29 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20metric=20e2e=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E6=B5=81=E7=A8=8B=E8=9E=8D=E5=90=88=E5=88=B0=20sys/tests=20?=
 =?UTF-8?q?=E6=AD=A5=E9=AA=A4=E4=B8=AD(test-gpu-node/check-service-install?=
 =?UTF-8?q?ed);?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

refs #29
---
 src/sys/tests/scripts/02_up.sh                |  18 +++-
 .../tests/scripts/07_test_gpu_node_install.sh |  19 +++-
 .../scripts/08_check_services_installed.sh    | 101 +++++++++++++++++++
 ...ssert.sh => 09_write_health_and_assert.sh} |   0
 ...d_assert.sh => 10_logs_send_and_assert.sh} |   0
 ...ster.sh => 11_restart_agent_reregister.sh} |   0
 .../tests/scripts/{09_down.sh => 12_down.sh}  |   0
 7 files changed, 128 insertions(+), 10 deletions(-)
 create mode 100755 src/sys/tests/scripts/08_check_services_installed.sh
 rename src/sys/tests/scripts/{06_write_health_and_assert.sh => 09_write_health_and_assert.sh} (100%)
 rename src/sys/tests/scripts/{07_logs_send_and_assert.sh => 10_logs_send_and_assert.sh} (100%)
 rename src/sys/tests/scripts/{08_restart_agent_reregister.sh => 11_restart_agent_reregister.sh} (100%)
 rename src/sys/tests/scripts/{09_down.sh => 12_down.sh} (100%)

diff --git a/src/sys/tests/scripts/02_up.sh b/src/sys/tests/scripts/02_up.sh
index a65de73..30df6b9 100755
--- a/src/sys/tests/scripts/02_up.sh
+++ b/src/sys/tests/scripts/02_up.sh
@@ -3,6 +3,7 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
 
 compose() {
   if docker compose version >/dev/null 2>&1; then
@@ -14,12 +15,19 @@ compose() {
 
 echo "[INFO] Bringing up system stack..."
 
-# 加载环境变量
-if [ -f "$TEST_ROOT/.env" ]; then
-  source "$TEST_ROOT/.env"
-  echo "[INFO] 已加载环境变量,GPU_AVAILABLE=$GPU_AVAILABLE"
+# Detect the GPU environment
+echo "[INFO] 检测GPU环境..."
+GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
+if [ -f "$GPU_CHECK_SCRIPT" ]; then
+  if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then
+    echo "[INFO] GPU环境可用,将启动GPU测试节点"
+    GPU_AVAILABLE=true
+  else
+    echo "[INFO] GPU环境不可用,将跳过GPU测试节点"
+    GPU_AVAILABLE=false
+  fi
 else
-  echo "[WARN] 未找到.env文件,默认GPU不可用"
+  echo "[WARN] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT,跳过GPU检测"
   GPU_AVAILABLE=false
 fi
 
diff --git a/src/sys/tests/scripts/07_test_gpu_node_install.sh b/src/sys/tests/scripts/07_test_gpu_node_install.sh
index e8fa4e3..4979dc6 100755
--- a/src/sys/tests/scripts/07_test_gpu_node_install.sh
+++ b/src/sys/tests/scripts/07_test_gpu_node_install.sh
@@ -2,7 +2,8 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-COMMON_DIR="$SCRIPT_DIR/common"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
 
 FTP_SERVER="${FTP_SERVER:-172.29.0.40}"
 FTP_USER="${FTP_USER:-ftpuser}"
@@ -13,11 +14,19 @@ FTP_HOST="${FTP_SERVER}"
 echo "[04] 检测GPU环境..."
 
 # 检测GPU环境
-if bash "$COMMON_DIR/check-gpu.sh"; then
-    echo "[04] GPU环境可用,继续执行GPU节点安装"
-    GPU_AVAILABLE=true
+# REPO_ROOT is the repository root, so the checker lives under src/metric (same path as in 02_up.sh).
+GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
+if [ -f "$GPU_CHECK_SCRIPT" ]; then
+    if bash "$GPU_CHECK_SCRIPT"; then
+        echo "[04] GPU环境可用,继续执行GPU节点安装"
+        GPU_AVAILABLE=true
+    else
+        echo "[04] GPU环境不可用,跳过GPU节点安装"
+        GPU_AVAILABLE=false
+        exit 0
+    fi
 else
-    echo "[04] GPU环境不可用,跳过GPU节点安装"
+    echo "[04] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT,跳过GPU节点安装"
     GPU_AVAILABLE=false
     exit 0
 fi
diff --git a/src/sys/tests/scripts/08_check_services_installed.sh b/src/sys/tests/scripts/08_check_services_installed.sh
new file mode 100755
index 0000000..5a33a05
--- /dev/null
+++ b/src/sys/tests/scripts/08_check_services_installed.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# Verify the install inside the argus-metric-test-node container: look for
+# listeners on the monitoring ports (9100/9400/2020) and probe each over HTTP.
+set -e
+
+echo "[04] 验证安装结果 - 检查监控端口..."
+echo "=========================================="
+
+# All checks run through docker exec, so the container must already be running.
+if ! docker ps --format '{{.Names}}' | grep -q '^argus-metric-test-node$'; then
+    echo "错误: 容器 argus-metric-test-node 未运行"
+    exit 1
+fi
+
+ERRORS=0
+
+# ==================== Check listening ports ====================
+echo ""
+echo "[1] 检查监听端口..."
+echo "----------------------------------------"
+# Capture the exit status with "||" so that "set -e" does not abort the script
+# on a failed check before the output and the debugging hints are printed.
+PORT_CHECK_RC=0
+CHECK_RESULT=$(docker exec argus-metric-test-node bash -c '
+if command -v netstat >/dev/null 2>&1; then
+    echo "使用 netstat 检查端口:"
+    if netstat -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
+        echo "✓ 找到监控端口"
+        exit 0
+    else
+        echo "✗ 未找到监控端口 (9100/9400/2020)"
+        exit 1
+    fi
+elif command -v ss >/dev/null 2>&1; then
+    echo "使用 ss 检查端口:"
+    if ss -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
+        echo "✓ 找到监控端口"
+        exit 0
+    else
+        echo "✗ 未找到监控端口 (9100/9400/2020)"
+        exit 1
+    fi
+elif command -v lsof >/dev/null 2>&1; then
+    echo "使用 lsof 检查端口:"
+    if lsof -i :9100 -i :9400 -i :2020 2>/dev/null | grep LISTEN; then
+        echo "✓ 找到监控端口"
+        exit 0
+    else
+        echo "✗ 未找到监控端口 (9100/9400/2020)"
+        exit 1
+    fi
+else
+    echo "? 没有可用的端口检查工具 (netstat/ss/lsof),跳过此检查"
+    exit 0
+fi
+') || PORT_CHECK_RC=$?
+echo "$CHECK_RESULT"
+# Only an explicit failure (non-zero exit) counts as an error; a missing tool exits 0.
+if [ "$PORT_CHECK_RC" -ne 0 ]; then
+    ERRORS=$((ERRORS + 1))
+fi
+
+# ==================== Test port connectivity ====================
+echo ""
+echo "[2] 测试端口连通性..."
+echo "----------------------------------------"
+docker exec argus-metric-test-node bash -c '
+if command -v curl >/dev/null 2>&1; then
+    FAILED=0
+    for port in 9100 9400 2020; do
+        echo -n "端口 $port: "
+        if curl -s --connect-timeout 2 "http://localhost:$port/metrics" > /dev/null 2>&1; then
+            echo "✓ 可访问 (/metrics)"
+        elif curl -s --connect-timeout 2 "http://localhost:$port/" > /dev/null 2>&1; then
+            echo "✓ 可访问 (根路径)"
+        else
+            echo "✗ 不可访问"
+            FAILED=$((FAILED + 1))
+        fi
+    done
+    exit $FAILED
+else
+    echo "? curl 不可用,跳过连通性测试"
+    exit 0
+fi
+' || ERRORS=$((ERRORS + 1))
+
+echo ""
+echo "=========================================="
+if [ "$ERRORS" -eq 0 ]; then
+    echo "✓ [04] 验证完成 - 所有端口检查通过"
+else
+    echo "✗ [04] 验证失败 - 发现 $ERRORS 个问题"
+    echo ""
+    echo "调试建议:"
+    echo " 1. 进入容器检查: docker exec -it argus-metric-test-node bash"
+    echo " 2. 查看进程: docker exec argus-metric-test-node ps aux"
+    echo " 3. 查看日志: docker exec argus-metric-test-node cat /tmp/argus_install.log"
+    exit 1
+fi
+echo "=========================================="
diff --git a/src/sys/tests/scripts/06_write_health_and_assert.sh b/src/sys/tests/scripts/09_write_health_and_assert.sh
similarity index 100%
rename from src/sys/tests/scripts/06_write_health_and_assert.sh
rename to src/sys/tests/scripts/09_write_health_and_assert.sh
diff --git a/src/sys/tests/scripts/07_logs_send_and_assert.sh b/src/sys/tests/scripts/10_logs_send_and_assert.sh
similarity index 100%
rename from src/sys/tests/scripts/07_logs_send_and_assert.sh
rename to src/sys/tests/scripts/10_logs_send_and_assert.sh
diff --git a/src/sys/tests/scripts/08_restart_agent_reregister.sh b/src/sys/tests/scripts/11_restart_agent_reregister.sh
similarity index 100%
rename from src/sys/tests/scripts/08_restart_agent_reregister.sh
rename to src/sys/tests/scripts/11_restart_agent_reregister.sh
diff --git a/src/sys/tests/scripts/09_down.sh b/src/sys/tests/scripts/12_down.sh
similarity index 100%
rename from src/sys/tests/scripts/09_down.sh
rename to src/sys/tests/scripts/12_down.sh