#!/bin/bash
# verify_alertmanager.sh
# Verify the communication between Prometheus and Alertmanager after deployment.
#
# What it does:
#   1. Checks that Alertmanager answers on its status endpoint.
#   2. Installs a temporary always-firing rule (DeployVerifyAlert) into the
#      Prometheus rules directory and asks Prometheus to reload.
#   3. Polls Alertmanager until the alert shows up (or a timeout elapses).
#   4. Removes the temporary rule and reloads Prometheus again.
#
# Environment (optionally provided through "$TEST_ROOT/.env"):
#   PROMETHEUS_PORT    Prometheus port on localhost   (default: 9090)
#   ALERTMANAGER_PORT  Alertmanager port on localhost (default: 9093)
#
# Exit status: 0 when the alert path is verified, 1 otherwise.
set -euo pipefail

echo "[INFO] Verifying Prometheus ↔ Alertmanager communication..."

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp"
mkdir -p "$TMP_DIR"
PRIVATE_CORE="$TEST_ROOT/private"

#=============================
# Load environment variables
#=============================
if [[ -f "$TEST_ROOT/.env" ]]; then
  # set -a exports every variable assigned while the file is sourced.
  set -a
  # shellcheck disable=SC1091 # .env is provided by the deployment, not the repo
  source "$TEST_ROOT/.env"
  set +a
fi

#=============================
# Basic configuration
#=============================
PROM_URL="http://localhost:${PROMETHEUS_PORT:-9090}"
ALERT_URL="http://localhost:${ALERTMANAGER_PORT:-9093}"
RULE_DIR="$PRIVATE_CORE/argus/metric/prometheus/rules"
TMP_RULE="$TMP_DIR/test_rule.yml"

# Upper bound (seconds) for alert propagation and the gap between polls.
readonly ALERT_WAIT_TIMEOUT=30
readonly ALERT_POLL_INTERVAL=5

# Set to 1 once the test rule may exist on disk; drives cleanup_test_rule.
RULE_INSTALLED=0

#=============================
# Helper functions
#=============================
GREEN=$'\033[32m'; RED=$'\033[31m'; YELLOW=$'\033[33m'; RESET=$'\033[0m'

# Informational and success messages go to stdout.
log_info()    { printf '%s[INFO]%s %s\n' "$YELLOW" "$RESET" "$1"; }
log_success() { printf '%s[OK]%s %s\n' "$GREEN" "$RESET" "$1"; }
# Warnings and errors are diagnostics and go to stderr.
log_warn()    { printf '%s[WARN]%s %s\n' "$YELLOW" "$RESET" "$1" >&2; }
log_error()   { printf '%s[ERROR]%s %s\n' "$RED" "$RESET" "$1" >&2; }
fail_exit()   { log_error "$1"; exit 1; }

#######################################
# Ask Prometheus to reload its configuration and rule files.
# -f makes curl return non-zero on an HTTP error response, so a rejected
# reload is not mistaken for a successful one.
# Globals:  PROM_URL (read)
# Returns:  0 if the reload request succeeded, non-zero otherwise
#######################################
reload_prometheus() {
  curl -sSf -X POST "${PROM_URL}/-/reload" >/dev/null
}

#######################################
# Remove the temporary rule and reload Prometheus.
# Idempotent: does nothing unless the rule was installed and not yet removed.
# Registered on EXIT so the always-firing test rule is not left behind when
# verification fails part-way through.
# Globals:  RULE_INSTALLED (read/written), RULE_DIR, TMP_RULE (read)
#######################################
cleanup_test_rule() {
  (( RULE_INSTALLED )) || return 0
  RULE_INSTALLED=0

  log_info "Cleaning up temporary alert rule..."
  rm -f -- "${RULE_DIR}/test_rule.yml" "${TMP_RULE}" \
    || log_warn "Could not remove the temporary alert rule. Please check manually."

  if reload_prometheus; then
    log_success "Prometheus successfully reloaded after cleanup"
  else
    log_warn "Prometheus reload after cleanup failed. Please check manually."
  fi
}
trap cleanup_test_rule EXIT

#######################################
# Check whether Alertmanager currently lists DeployVerifyAlert.
# The response is captured first instead of piping curl into `grep -q`:
# with pipefail, grep exiting on the first match can make curl fail with a
# write error and turn a real match into a failed pipeline.
# Globals:  ALERT_URL (read)
# Returns:  0 if the alert name appears in the response, non-zero otherwise
#######################################
alert_is_present() {
  local alerts
  alerts="$(curl -sf "${ALERT_URL}/api/v2/alerts")" || return 1
  [[ "$alerts" == *"DeployVerifyAlert"* ]]
}

#######################################
# Poll Alertmanager until the test alert appears or the timeout elapses.
# Globals:  ALERT_WAIT_TIMEOUT, ALERT_POLL_INTERVAL (read)
# Returns:  0 once the alert is seen, 1 on timeout
#######################################
wait_for_alert() {
  local waited=0
  while (( waited < ALERT_WAIT_TIMEOUT )); do
    sleep "$ALERT_POLL_INTERVAL"
    waited=$(( waited + ALERT_POLL_INTERVAL ))
    if alert_is_present; then
      return 0
    fi
  done
  return 1
}

#=============================
# Step 1: Check Alertmanager accessibility
#=============================
log_info "Checking Alertmanager status..."
if curl -sSf "${ALERT_URL}/api/v2/status" >/dev/null 2>&1; then
  log_success "Alertmanager is reachable at ${ALERT_URL}"
else
  fail_exit "Alertmanager is not reachable. Please check container or port mapping."
fi

#=============================
# Step 2: Create and load a temporary test alert rule
#=============================
log_info "Creating temporary alert rule at ${TMP_RULE}..."
# From here on files may exist on disk, so arm the EXIT cleanup first.
RULE_INSTALLED=1
# Quoted delimiter: the YAML is written literally, with no shell expansion.
cat > "${TMP_RULE}" <<'EOF'
groups:
  - name: deploy-verify-group
    rules:
      - alert: DeployVerifyAlert
        expr: vector(1)
        labels:
          severity: warning
        annotations:
          summary: "Deployment verification alert"
EOF

mkdir -p "${RULE_DIR}"
cp -- "${TMP_RULE}" "${RULE_DIR}/test_rule.yml"

log_info "Reloading Prometheus to apply the test rule..."
if reload_prometheus; then
  log_success "Prometheus successfully reloaded rules"
else
  fail_exit "Failed to reload Prometheus. Check API accessibility."
fi

#=============================
# Step 3: Verify alert received by Alertmanager
#=============================
log_info "Waiting for alert propagation (~30 seconds)..."
if wait_for_alert; then
  log_success "Prometheus → Alertmanager alert path verified successfully"
else
  fail_exit "DeployVerifyAlert not found in Alertmanager. Check configuration or network."
fi

#=============================
# Step 4: Cleanup test rule
#=============================
# Explicit call on the success path; the EXIT trap then becomes a no-op.
cleanup_test_rule

log_success "Alertmanager verification completed successfully. Communication with Prometheus is healthy."