argus/src/sys/tests/scripts/13_metric_verify_prometheus.sh

148 lines
5.5 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
#
# Verifies the Prometheus side of the metric pipeline: the contents of
# nodes.json, the file_sd targets file, and the `up` series returned by the
# Prometheus query API.
#
# Optional environment (may also be provided by "$TEST_ROOT/.env"):
#   PROMETHEUS_PORT       port used to build the API base URL (default: 9090)
#   METRIC_TEST_HOSTNAME  hostname looked up in nodes.json
#                         (default: test-metric-node-001)
set -o errexit -o nounset -o pipefail

# Directory containing this script, and the test root one level above it.
script_path="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd "$(dirname "$script_path")" && pwd)"
TEST_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Scratch directory for intermediate results written by the checks.
TMP_DIR="${TEST_ROOT}/tmp/metric-verify"
mkdir -p "$TMP_DIR"

# Load port variables; allexport exports everything the .env file defines.
env_file="${TEST_ROOT}/.env"
if [[ -f "$env_file" ]]; then
  set -o allexport
  # shellcheck source=/dev/null
  source "$env_file"
  set +o allexport
fi

PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1"
# NOTE: HOSTNAME is also a variable bash sets on its own; the name is kept
# because the rest of the script reads it.
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"

# Files inspected by the checks.
prom_state_dir="${TEST_ROOT}/private/argus/metric/prometheus"
nodes_json="${prom_state_dir}/nodes.json"
targets_json="${prom_state_dir}/targets/node_exporter.json"
# Check 1: nodes.json must exist and contain an entry for the expected hostname.
echo "[VERIFY:PROM] nodes.json present and contains hostname=$HOSTNAME"
if [[ ! -f "$nodes_json" ]]; then
  echo "[ERR] $nodes_json missing" >&2
  exit 1
fi
python3 - "$nodes_json" "$HOSTNAME" <<'PY'
import json
import sys

nodes_path, wanted = sys.argv[1], sys.argv[2]
with open(nodes_path) as fh:
    entries = json.load(fh)

# Scan until the first entry whose "hostname" field equals the wanted name.
found = False
for entry in entries:
    if entry.get('hostname') == wanted:
        found = True
        break
assert found, f"{wanted} not found in nodes.json"
PY
echo "[OK] nodes.json contains target"
# Check 2: the file_sd targets file must exist; pick the node whose exporter
# instance is expected in it and record its IP and instance address.
echo "[VERIFY:PROM] file_sd targets exist for nodes.json entries"
if [[ ! -f "$targets_json" ]]; then
  echo "[ERR] $targets_json missing" >&2
  exit 1
fi
selection_file="$TMP_DIR/prom_targets_ip_inst.txt"
python3 - "$nodes_json" "$targets_json" "$HOSTNAME" >"$selection_file" <<'PY'
import json
import sys

nodes_path, file_sd_path, wanted_host = sys.argv[1:4]
with open(nodes_path) as fh:
    nodes = json.load(fh)
with open(file_sd_path) as fh:
    file_sd = json.load(fh)

# Flatten every "targets" list of the file_sd document. The resulting set is
# not read again in this snippet; building it still requires the file to be
# well-formed JSON with the expected structure.
targets = {t for group in file_sd for t in group.get('targets', [])}


def first_match(key, value):
    """Return the first node whose `key` field equals `value`, else None."""
    for node in nodes:
        if node.get(key) == value:
            return node
    return None


# Preference order: node matching the hostname, then the first node owned by
# the 'metric' user, then simply the first node.
sel = first_match('hostname', wanted_host)
if not sel:
    sel = first_match('user_id', 'metric')
if not sel and nodes:
    sel = nodes[0]
if not sel:
    raise SystemExit('nodes.json empty or no suitable node found')

# Line 1: bare IP; line 2: exporter instance on port 9100.
ip = sel['ip']
print(ip)
print(f"{ip}:9100")
PY
IP_FIRST=$(awk 'NR == 1' "$selection_file")
INSTANCE=$(awk 'NR == 2' "$selection_file")
echo "[INFO] expecting instance in file_sd: $INSTANCE"
# Try to proactively refresh the targets inside the Prometheus container
# (optional speed-up). The trailing `|| true` inside the command keeps a
# failing update from failing the `docker exec` call.
refresh_cmd='python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
  echo "[..] triggering update_targets inside argus-prometheus"
  docker exec argus-prometheus bash -lc "$refresh_cmd"
fi
# Give Prometheus one initial scrape cycle.
sleep 10

# If the target has not been generated yet, retry for at most 180s
# (36 attempts x 5s), re-triggering the refresh on every third attempt.
expected_target="${IP_FIRST}:9100"
retry=0
# grep -F -x compares whole lines literally, so the dots of the IP address are
# not treated as regex wildcards.
until jq -r '.[].targets[]' "$targets_json" 2>/dev/null \
  | grep -Fxq -- "$expected_target"; do
  if (( retry >= 36 )); then
    echo "[ERR] ${IP_FIRST}:9100 not present in file_sd after timeout" >&2
    echo "[HINT] current targets file content:" >&2
    sed -n '1,200p' "$targets_json" >&2 || true
    exit 1
  fi
  if (( retry % 3 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
    docker exec argus-prometheus bash -lc \
      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
  fi
  echo "[..] waiting file_sd refresh ($retry/36)"
  sleep 5
  # Plain assignment instead of ((retry++)): the post-increment evaluates to 0
  # on the first pass, which gives (( )) exit status 1 and would make errexit
  # (set -e) abort the script after the very first wait.
  retry=$((retry + 1))
done
# Use the PromQL `up` metric as the health criterion instead of the targets
# page, whose state can flap.
echo "[VERIFY:PROM] up{job=\"node\",ip=\"$IP_FIRST\"} > 0"
attempt=0
# Poll up to 60 times, 5s apart, until the first returned sample is > 0.
until (( attempt >= 60 )); do
  # A failed request is tolerated here: the redirect leaves an empty file,
  # which the Python check below reports as "not ready".
  curl -fsS --max-time 5 --get "$PROM_BASE/query" \
    --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" \
    > "$TMP_DIR/prom_up_inst_active.json" || true
  if python3 - "$TMP_DIR/prom_up_inst_active.json" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        payload = json.load(fh)
except Exception:
    # Unreadable, empty or non-JSON response: not ready yet.
    raise SystemExit(1)

series = payload.get('data', {}).get('result', [])
if series:
    try:
        # SystemExit is not an Exception subclass, so the success exit below
        # is not swallowed by the except clause.
        if float(series[0]['value'][1]) > 0:
            raise SystemExit(0)
    except Exception:
        # Unexpected sample shape: treat it as "not up yet".
        pass
raise SystemExit(1)
PY
  then
    echo "[OK] up > 0 (control-plane scrape works)"
    break
  fi
  # Re-trigger the targets refresh on every sixth attempt.
  if (( attempt % 6 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
    docker exec argus-prometheus bash -lc \
      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
  fi
  echo "[..] waiting up{job=\"node\",ip=\"$IP_FIRST\"} > 0 ($attempt/60)"
  sleep 5
  # Plain assignment instead of ((attempt++)): the post-increment evaluates to
  # 0 on the first pass, which gives (( )) exit status 1 and would make errexit
  # (set -e) abort the script after the first unsuccessful poll.
  attempt=$((attempt + 1))
done
# The loop only reaches 60 when no poll succeeded (success leaves via break).
if (( attempt >= 60 )); then
  echo "[ERR] up{job=\"node\",ip=\"$IP_FIRST\"} did not become > 0" >&2
  exit 1
fi
# Final instant query: the `up` sample for the selected IP must be > 0.
# Unlike the polling step, any failure here stops the script.
echo "[VERIFY:PROM] instant up query > 0"
up_query="up{job=\"node\",ip=\"$IP_FIRST\"}"
up_result_file="$TMP_DIR/prom_up_inst.json"
curl --fail --silent --show-error --max-time 5 --get "$PROM_BASE/query" \
  --data-urlencode "query=${up_query}" > "$up_result_file"
python3 - "$up_result_file" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    payload = json.load(fh)
series = payload.get('data', {}).get('result', [])
# NOTE(review): the message below mentions instance=..., while the query sent
# by the shell filters on the ip label.
assert series, 'empty result for up{job="node",instance=...}'
sample = float(series[0]['value'][1])
assert sample > 0, f"up value not > 0: {sample}"
PY
echo "[OK] up > 0"
# At least one series of job "node" must currently report up == 1.
echo "[VERIFY:PROM] count(up{job=\"node\"}==1) >= 1"
count_query="count(up{job=\"node\"}==1)"
count_result_file="$TMP_DIR/prom_up_count.json"
curl --fail --silent --show-error --max-time 5 --get "$PROM_BASE/query" \
  --data-urlencode "query=${count_query}" > "$count_result_file"
python3 - "$count_result_file" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    payload = json.load(fh)
series = payload.get('data', {}).get('result', [])
assert series, 'empty result for count(up{job="node"}==1)'
healthy = float(series[0]['value'][1])
assert healthy >= 1, f"count < 1: {healthy}"
PY
echo "[OK] up count satisfied"
echo "[DONE] prometheus verify"