#!/usr/bin/env bash
#
# Verify that Prometheus has discovered and is scraping the metric test node.
#
# Steps:
#   1. nodes.json exists and contains the expected hostname.
#   2. The file_sd targets file exists and lists "<node ip>:9100".
#   3. The PromQL query up{job="node",ip="<node ip>"} becomes > 0.
#   4. count(up{job="node"}==1) is at least 1.
#
# Environment (optionally provided through "$TEST_ROOT/.env"):
#   PROMETHEUS_PORT        Prometheus HTTP port on localhost (default 9090).
#   METRIC_TEST_HOSTNAME   hostname to look for in nodes.json
#                          (default test-metric-node-001).
#
# Uses: python3, jq, curl, and docker (docker only for the optional refresh).
# Intermediate results are written to "$TEST_ROOT/tmp/metric-verify".
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp/metric-verify"
mkdir -p "$TMP_DIR"

# Load port variables; `set -a` exports everything the .env file defines.
if [[ -f "$TEST_ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$TEST_ROOT/.env"
  set +a
fi

PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1"
# Deliberately not named HOSTNAME: Bash sets that variable itself and it is
# frequently exported, so overriding it would leak into child processes.
target_hostname="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"

readonly PROM_CONTAINER="argus-prometheus"
readonly REFRESH_CMD='python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
readonly POLL_INTERVAL=5         # seconds between polls
readonly FILE_SD_MAX_RETRIES=36  # 36 * 5s = 180s
readonly UP_MAX_ATTEMPTS=60      # 60 * 5s = 300s

nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json"

# Print an error message to stderr and abort the script.
die() {
  printf '[ERR] %s\n' "$*" >&2
  exit 1
}

# Succeed when a container named exactly $PROM_CONTAINER is running.
# The listing is captured first so that `grep -q` exiting early cannot turn
# into a SIGPIPE failure of `docker ps` under `pipefail`.
prometheus_container_running() {
  local names
  names=$(docker ps --format '{{.Names}}') || return 1
  grep -qxF -- "$PROM_CONTAINER" <<<"$names"
}

# Best-effort: ask the Prometheus container to regenerate its file_sd targets.
# This is only an optional speed-up, so a failure warns instead of aborting.
# Arguments: $1 - optional; when non-empty, announce the refresh on stdout.
refresh_targets() {
  local announce="${1:-}"
  prometheus_container_running || return 0
  if [[ -n "$announce" ]]; then
    echo "[..] triggering update_targets inside $PROM_CONTAINER"
  fi
  docker exec "$PROM_CONTAINER" bash -lc "$REFRESH_CMD" \
    || echo "[WARN] docker exec $PROM_CONTAINER failed; continuing" >&2
}

# Succeed when the file_sd targets file lists exactly the target "$1".
# A jq failure (e.g. the file is missing or half-written) counts as "absent".
file_sd_has_target() {
  local listed
  listed=$(jq -r '.[].targets[]' "$targets_json" 2>/dev/null) || return 1
  grep -qxF -- "$1" <<<"$listed"
}

# Run an instant PromQL query ("$1") and write the JSON response to stdout.
query_prometheus() {
  curl -fsS --max-time 5 --get "$PROM_BASE/query" \
    --data-urlencode "query=$1"
}

echo "[VERIFY:PROM] nodes.json present and contains hostname=$target_hostname"
[[ -f "$nodes_json" ]] || die "$nodes_json missing"
python3 - "$nodes_json" "$target_hostname" <<'PY'
import json,sys
arr=json.load(open(sys.argv[1]))
host=sys.argv[2]
assert any((i.get('hostname')==host) for i in arr), f"{host} not found in nodes.json"
PY
echo "[OK] nodes.json contains target"

echo "[VERIFY:PROM] file_sd targets exist for nodes.json entries"
[[ -f "$targets_json" ]] || die "$targets_json missing"
python3 - "$nodes_json" "$targets_json" "$target_hostname" \
  >"$TMP_DIR/prom_targets_ip_inst.txt" <<'PY'
import json,sys
nodes=json.load(open(sys.argv[1]))
# Loaded only to make sure the file_sd file is valid JSON at this point;
# the presence of the expected target is checked afterwards by the shell.
json.load(open(sys.argv[2]))
host=sys.argv[3]
# choose node matching hostname; fallback to first metric user node; otherwise first
sel = None
for n in nodes:
    if n.get('hostname') == host:
        sel = n
        break
if not sel:
    for n in nodes:
        if n.get('user_id') == 'metric':
            sel = n
            break
if not sel and nodes:
    sel = nodes[0]
if not sel:
    raise SystemExit('nodes.json empty or no suitable node found')
ip = sel['ip']
inst = f"{ip}:9100"
print(ip)
print(inst)
PY
IP_FIRST=$(sed -n '1p' "$TMP_DIR/prom_targets_ip_inst.txt")
INSTANCE=$(sed -n '2p' "$TMP_DIR/prom_targets_ip_inst.txt")
[[ -n "$IP_FIRST" && -n "$INSTANCE" ]] || die "could not determine node ip from $nodes_json"
echo "[INFO] expecting instance in file_sd: $INSTANCE"

# Try to actively refresh the targets inside the Prometheus container
# (optional speed-up).
refresh_targets announce

# Give Prometheus one initial scrape cycle.
sleep 10

# If the target has not been generated yet, retry (at most 180s) and trigger
# a refresh every third attempt.
retry=0
until file_sd_has_target "${IP_FIRST}:9100"; do
  if (( retry >= FILE_SD_MAX_RETRIES )); then
    echo "[ERR] ${IP_FIRST}:9100 not present in file_sd after timeout" >&2
    echo "[HINT] current targets file content:" >&2
    sed -n '1,200p' "$targets_json" >&2 || true
    exit 1
  fi
  if (( retry % 3 == 0 )); then
    refresh_targets
  fi
  echo "[..] waiting file_sd refresh ($retry/$FILE_SD_MAX_RETRIES)"
  sleep "$POLL_INTERVAL"
  # Not ((retry++)): that returns status 1 when retry is 0 and would abort
  # the script under `set -e`.
  retry=$((retry + 1))
done

# Use the PromQL `up` metric as the health criterion, which avoids flapping of
# the status shown on the targets page.
up_query="up{job=\"node\",ip=\"$IP_FIRST\"}"
echo "[VERIFY:PROM] $up_query > 0"
attempt=0
until (( attempt >= UP_MAX_ATTEMPTS )); do
  # A failed request leaves an empty/invalid file, which the check below
  # treats as "not up yet".
  query_prometheus "$up_query" >"$TMP_DIR/prom_up_inst_active.json" || true
  if python3 - "$TMP_DIR/prom_up_inst_active.json" <<'PY'
import json,sys
try:
    j=json.load(open(sys.argv[1]))
except Exception:
    raise SystemExit(1)
res=j.get('data',{}).get('result',[])
if res:
    try:
        val=float(res[0]['value'][1])
        if val>0:
            raise SystemExit(0)
    except Exception:
        pass
raise SystemExit(1)
PY
  then
    echo "[OK] up > 0 (control-plane scrape works)"
    break
  fi
  if (( attempt % 6 == 0 )); then
    refresh_targets
  fi
  echo "[..] waiting $up_query > 0 ($attempt/$UP_MAX_ATTEMPTS)"
  sleep "$POLL_INTERVAL"
  # See the note on the retry counter above: ((attempt++)) would trip set -e.
  attempt=$((attempt + 1))
done
if (( attempt >= UP_MAX_ATTEMPTS )); then
  die "$up_query did not become > 0"
fi

echo "[VERIFY:PROM] instant up query > 0"
query_prometheus "$up_query" >"$TMP_DIR/prom_up_inst.json"
python3 - "$TMP_DIR/prom_up_inst.json" <<'PY'
import json,sys
j=json.load(open(sys.argv[1]))
res=j.get('data',{}).get('result',[])
assert res, 'empty result for up{job="node",ip=...}'
val=float(res[0]['value'][1])
assert val>0, f"up value not > 0: {val}"
PY
echo "[OK] up > 0"

echo "[VERIFY:PROM] count(up{job=\"node\"}==1) >= 1"
query_prometheus "count(up{job=\"node\"}==1)" >"$TMP_DIR/prom_up_count.json"
python3 - "$TMP_DIR/prom_up_count.json" <<'PY'
import json,sys
j=json.load(open(sys.argv[1]))
res=j.get('data',{}).get('result',[])
assert res, 'empty result for count(up{job="node"}==1)'
val=float(res[0]['value'][1])
assert val>=1, f"count < 1: {val}"
PY
echo "[OK] up count satisfied"

echo "[DONE] prometheus verify"