argus/src/log/tests/scripts/e2e_test.sh

#!/usr/bin/env bash
set -euo pipefail

echo "======================================="
echo "ARGUS Log System End-to-End Test"
echo "======================================="
echo ""

# 记录测试开始时间
test_start_time=$(date +%s)

# 函数：获取ES中的日志计数
get_log_count() {
    local train_count=$(curl -s "http://localhost:9200/train-*/_count" 2>/dev/null | grep -o '"count":[0-9]*' | cut -d':' -f2 || echo "0")
    local infer_count=$(curl -s "http://localhost:9200/infer-*/_count" 2>/dev/null | grep -o '"count":[0-9]*' | cut -d':' -f2 || echo "0")
    echo "$((train_count + infer_count))"
}

# 函数：等待服务就绪
wait_for_services() {
    echo "[INFO] Waiting for all services to be ready..."
    local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
    local attempt=1

    while [ $attempt -le $max_attempts ]; do
        if curl -fs http://localhost:9200/_cluster/health >/dev/null 2>&1 && \
           curl -fs http://localhost:5601/api/status >/dev/null 2>&1 && \
           curl -fs http://localhost:2020/api/v2/metrics >/dev/null 2>&1 && \
           curl -fs http://localhost:2021/api/v2/metrics >/dev/null 2>&1; then
            echo "[OK] All services are ready!"
            return 0
        fi
        echo "    Waiting for services... ($attempt/$max_attempts)"
        sleep 5
        ((attempt++))
    done

    echo "[ERROR] Services not ready after $max_attempts attempts"
    return 1
}

# 函数：显示测试步骤
show_step() {
    echo ""
    echo "🔄 Step $1: $2"
    echo "----------------------------------------"
}

# 函数：验证步骤结果
verify_step() {
    if [ $? -eq 0 ]; then
        echo "✅ $1 - SUCCESS"
    else
        echo "❌ $1 - FAILED"
        exit 1
    fi
}

# 开始端到端测试
show_step "1" "Bootstrap - Initialize environment"
./scripts/01_bootstrap.sh
verify_step "Bootstrap"

show_step "2" "Startup - Start all services"
./scripts/02_up.sh
verify_step "Service startup"

# 等待服务完全就绪
wait_for_services || exit 1

# 记录发送测试数据前的日志计数
initial_count=$(get_log_count)
echo "[INFO] Initial log count: $initial_count"

show_step "3a" "Send test data - Host01"
./scripts/03_send_test_host01.sh
verify_step "Test data sending (host01)"

show_step "3b" "Send test data - Host02"
./scripts/03_send_test_host02.sh
verify_step "Test data sending (host02)"

# 等待数据被处理
echo "[INFO] Waiting for data to be processed..."
sleep 10

show_step "4" "Verify data - Query Elasticsearch"
./scripts/04_query_es.sh
verify_step "Data verification"

# 记录发送测试数据后的日志计数
final_count=$(get_log_count)
echo "[INFO] Final log count: $final_count"

# 验证日志数量是否增加
if [ "$final_count" -gt "$initial_count" ]; then
    added_logs=$((final_count - initial_count))
    echo "✅ Log count verification - SUCCESS: Added $added_logs logs (from $initial_count to $final_count)"
else
    echo "❌ Log count verification - FAILED: Expected count to increase, but got $initial_count -> $final_count"
    exit 1
fi

# 验证预期的最小日志数量（每个主机应该发送一些日志）
expected_min_logs=4  # 至少应该有几条日志
if [ "$final_count" -ge "$expected_min_logs" ]; then
    echo "✅ Minimum log threshold - SUCCESS: $final_count logs (>= $expected_min_logs expected)"
else
    echo "❌ Minimum log threshold - FAILED: Only $final_count logs (>= $expected_min_logs expected)"
    exit 1
fi

# 检查服务健康状态
show_step "Health" "Check service health"
echo "[INFO] Checking service health..."

# 检查 Elasticsearch 健康状态
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
if [ "$es_health" = "green" ] || [ "$es_health" = "yellow" ]; then
    echo "✅ Elasticsearch health: $es_health"
else
    echo "❌ Elasticsearch health: $es_health"
fi

# 检查 Kibana 状态
if curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
    kb_status="available"
    echo "✅ Kibana status: $kb_status"
else
    kb_status="unavailable"
    echo "⚠️  Kibana status: $kb_status"
fi

# 检查 Fluent-Bit 指标
fb_host01_uptime=$(curl -s "http://localhost:2020/api/v2/metrics" | grep "fluentbit_uptime" | head -1 | grep -o "[0-9]\+$" || echo "0")
fb_host02_uptime=$(curl -s "http://localhost:2021/api/v2/metrics" | grep "fluentbit_uptime" | head -1 | grep -o "[0-9]\+$" || echo "0")

if [ "$fb_host01_uptime" -gt 0 ] && [ "$fb_host02_uptime" -gt 0 ]; then
    echo "✅ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
else
    echo "⚠️  Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
fi

verify_step "Service health check"

show_step "5" "Cleanup - Stop all services"
./scripts/05_down.sh
verify_step "Service cleanup"

# 计算总测试时间
test_end_time=$(date +%s)
total_time=$((test_end_time - test_start_time))

echo ""
echo "======================================="
echo "🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!"
echo "======================================="
echo "📊 Test Summary:"
echo "   • Initial logs: $initial_count"
echo "   • Final logs: $final_count"
echo "   • Added logs: $added_logs"
echo "   • Total time: ${total_time}s"
echo "   • ES health: $es_health"
echo "   • Kibana status: $kb_status"
echo "   • DNS resolv: ✅ Passed (ES domain verified)"
echo "   • All services started and stopped successfully"
echo ""
echo "✅ The ARGUS log system is working correctly!"
echo ""