argus/src/sys/tests/scripts/15_alert_verify.sh

104 lines
3.2 KiB
Bash
Executable File

#!/bin/bash
# Post-deployment check: confirm that Prometheus can deliver alerts to
# Alertmanager.
#
# Optional environment (also read from "$TEST_ROOT/.env" when that file exists):
#   PROMETHEUS_PORT    - Prometheus port on localhost (default 9090)
#   ALERTMANAGER_PORT  - Alertmanager port on localhost (default 9093)
set -o errexit -o nounset -o pipefail

echo "[INFO] Verifying Prometheus ↔ Alertmanager communication..."

#=============================
# Locate the test tree
#=============================
# SCRIPT_DIR is the directory holding this script; TEST_ROOT is its parent.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PRIVATE_CORE="$TEST_ROOT/private"
TMP_DIR="$TEST_ROOT/tmp"
mkdir -p "$TMP_DIR"

#=============================
# Load environment variables
#=============================
# allexport is switched on only while sourcing, so every assignment made by
# the .env file is exported to child processes.
if [ -f "$TEST_ROOT/.env" ]; then
  set -o allexport
  . "$TEST_ROOT/.env"
  set +o allexport
fi

#=============================
# Basic configuration
#=============================
PROM_URL="http://localhost:${PROMETHEUS_PORT:-9090}"
ALERT_URL="http://localhost:${ALERTMANAGER_PORT:-9093}"
RULE_DIR="$PRIVATE_CORE/argus/metric/prometheus/rules"
TMP_RULE="$TMP_DIR/test_rule.yml"
#=============================
# Helper functions
#=============================
# ANSI colour sequences. They are stored with a literal backslash and are only
# turned into escape characters when a log helper prints them.
GREEN="\033[32m"
RED="\033[31m"
YELLOW="\033[33m"
RESET="\033[0m"

# log_info MSG: print MSG on stdout behind a yellow "[INFO]" tag.
# Backslash escapes inside MSG are interpreted.
log_info() {
  printf '%b\n' "${YELLOW}[INFO]${RESET} $1"
}

# log_success MSG: print MSG on stdout behind a green "[OK]" tag.
log_success() {
  printf '%b\n' "${GREEN}[OK]${RESET} $1"
}

# log_warn MSG: print MSG on stdout behind a yellow "[WARN]" tag.
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${RESET} $1"
}

# log_error MSG: print MSG on stdout behind a red "[ERROR]" tag.
log_error() {
  printf '%b\n' "${RED}[ERROR]${RESET} $1"
}

# fail_exit MSG: report MSG through log_error, then exit with status 1.
fail_exit() {
  log_error "$1"
  exit 1
}
#=============================
# Step 1: Check Alertmanager accessibility
#=============================
# Probe Alertmanager's v2 status endpoint. Any curl failure (connection error
# or an HTTP error status, because of --fail) aborts the script; curl's own
# output is discarded in both cases.
log_info "Checking Alertmanager status..."
if ! curl --silent --show-error --fail "${ALERT_URL}/api/v2/status" >/dev/null 2>&1; then
  fail_exit "Alertmanager is not reachable. Please check container or port mapping."
fi
log_success "Alertmanager is reachable at ${ALERT_URL}"
#=============================
# Step 2: Create and load a temporary test alert rule
#=============================
# Ask Prometheus to re-read its configuration and rule files.
# --fail makes curl exit non-zero when the server answers with an HTTP error
# status (for example when the reload endpoint is disabled or the reload is
# rejected); without it such a reply would be reported as a successful reload.
# Returns: curl's exit status.
reload_prometheus() {
  curl -sSf -X POST "${PROM_URL}/-/reload" >/dev/null
}

# EXIT handler used while the test rule is installed. It removes both copies of
# the rule and reloads Prometheus so that a failed verification does not leave
# DeployVerifyAlert firing. Nothing here is allowed to abort the handler; the
# script's original exit status is kept because the handler never calls exit.
cleanup_on_abort() {
  log_warn "Verification aborted; removing temporary alert rule..."
  rm -f "${RULE_DIR}/test_rule.yml" "${TMP_RULE}" || true
  reload_prometheus \
    || log_warn "Prometheus reload after cleanup failed. Please check manually."
}
trap cleanup_on_abort EXIT

log_info "Creating temporary alert rule at ${TMP_RULE}..."
# Quoted delimiter: the rule text is literal, nothing in it is expanded.
# vector(1) always returns a sample, so the alert fires as soon as the rule is
# evaluated.
cat <<'EOF' > "${TMP_RULE}"
groups:
  - name: deploy-verify-group
    rules:
      - alert: DeployVerifyAlert
        expr: vector(1)
        labels:
          severity: warning
        annotations:
          summary: "Deployment verification alert"
EOF
mkdir -p "${RULE_DIR}"
cp "${TMP_RULE}" "${RULE_DIR}/test_rule.yml"

log_info "Reloading Prometheus to apply the test rule..."
if reload_prometheus; then
  log_success "Prometheus successfully reloaded rules"
else
  fail_exit "Failed to reload Prometheus. Check API accessibility."
fi

#=============================
# Step 3: Verify alert received by Alertmanager
#=============================
log_info "Waiting for alert propagation (~30 seconds)..."
sleep 30
# Fetch the whole response before searching it. Piping curl straight into
# `grep -q` under `set -o pipefail` can fail on a match: grep exits at the
# first hit, curl then gets a write error, and the pipeline reports failure.
if active_alerts="$(curl -sSf "${ALERT_URL}/api/v2/alerts")" \
  && grep -q "DeployVerifyAlert" <<<"${active_alerts}"; then
  log_success "Prometheus → Alertmanager alert path verified successfully"
else
  fail_exit "DeployVerifyAlert not found in Alertmanager. Check configuration or network."
fi

#=============================
# Step 4: Cleanup test rule
#=============================
log_info "Cleaning up temporary alert rule..."
rm -f "${RULE_DIR}/test_rule.yml" "${TMP_RULE}"
if reload_prometheus; then
  log_success "Prometheus successfully reloaded after cleanup"
else
  log_warn "Prometheus reload after cleanup failed. Please check manually."
fi
# Regular cleanup is done; the abort handler must not run on the normal exit.
trap - EXIT

log_success "Alertmanager verification completed successfully. Communication with Prometheus is healthy."