148 lines
5.5 KiB
Bash
Executable File
148 lines
5.5 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Verifies the Prometheus side of the metric test setup: the test node is
# listed in nodes.json, its <ip>:9100 target shows up in the file_sd target
# file, and the `up` series for it is > 0.
#
# Optional environment:
#   PROMETHEUS_PORT        port of the Prometheus HTTP API on localhost
#                          (default: 9090)
#   METRIC_TEST_HOSTNAME   hostname expected in nodes.json
#                          (default: test-metric-node-001)
set -euo pipefail

# Locate the test root (parent of this script's directory) and a scratch
# directory for intermediate query results.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp/metric-verify"
mkdir -p "$TMP_DIR"

# Load port variables; `set -a` exports everything the .env file assigns.
env_file="$TEST_ROOT/.env"
if [[ -f "$env_file" ]]; then
  set -a
  # shellcheck disable=SC1090
  . "$env_file"
  set +a
fi

prometheus_port="${PROMETHEUS_PORT:-9090}"
PROM_BASE="http://localhost:${prometheus_port}/api/v1"
# NOTE: this overrides bash's own HOSTNAME variable for the rest of the script.
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"

# Files written by the metric stack that the checks below inspect.
prom_data_dir="$TEST_ROOT/private/argus/metric/prometheus"
nodes_json="$prom_data_dir/nodes.json"
targets_json="$prom_data_dir/targets/node_exporter.json"

# Check: nodes.json exists and one of its entries has the expected hostname.
echo "[VERIFY:PROM] nodes.json present and contains hostname=$HOSTNAME"
if [[ ! -f "$nodes_json" ]]; then
  echo "[ERR] $nodes_json missing" >&2
  exit 1
fi
python3 - "$nodes_json" "$HOSTNAME" <<'PY'
import json
import sys

nodes_path, wanted = sys.argv[1], sys.argv[2]
entries = json.load(open(nodes_path))

# Stop at the first entry whose hostname matches.
found = False
for entry in entries:
    if entry.get('hostname') == wanted:
        found = True
        break
assert found, f"{wanted} not found in nodes.json"
PY
echo "[OK] nodes.json contains target"

# Check: the file_sd target file exists; then pick the node whose exporter
# target (<ip>:9100) the later checks will look for.
echo "[VERIFY:PROM] file_sd targets exist for nodes.json entries"
if [[ ! -f "$targets_json" ]]; then
  echo "[ERR] $targets_json missing" >&2
  exit 1
fi
# The helper prints two lines: the node IP, then "<ip>:9100".
python3 - "$nodes_json" "$targets_json" "$HOSTNAME" \
  >"$TMP_DIR/prom_targets_ip_inst.txt" <<'PY'
import json
import sys

nodes = json.load(open(sys.argv[1]))
file_sd = json.load(open(sys.argv[2]))
host = sys.argv[3]

# Collected but not consulted below; building it still fails loudly when the
# target file is not a list of objects.
targets = {t for item in file_sd for t in item.get('targets', [])}

# Preference order: node matching the hostname, else the first node owned by
# the 'metric' user, else simply the first node.
chosen = (
    next((n for n in nodes if n.get('hostname') == host), None)
    or next((n for n in nodes if n.get('user_id') == 'metric'), None)
    or (nodes[0] if nodes else None)
)
if not chosen:
    raise SystemExit('nodes.json empty or no suitable node found')

ip = chosen['ip']
print(ip, f"{ip}:9100", sep="\n")
PY
mapfile -t target_lines <"$TMP_DIR/prom_targets_ip_inst.txt"
IP_FIRST="${target_lines[0]:-}"
INSTANCE="${target_lines[1]:-}"
echo "[INFO] expecting instance in file_sd: $INSTANCE"

# Optional speed-up: if the argus-prometheus container is running, ask it to
# regenerate its file_sd targets right away.
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
  echo "[..] triggering update_targets inside argus-prometheus"
  # Best effort only. The inner `|| true` hides failures of the Python script;
  # the outer `||` keeps a failing `docker exec` itself (container gone, no
  # bash in the image, ...) from aborting the script under `set -e`.
  docker exec argus-prometheus bash -lc \
    'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true' \
    || echo "[WARN] could not run update_targets inside argus-prometheus" >&2
fi

# Give Prometheus one initial scrape cycle.
sleep 10

# If the target has not been written yet, poll for up to 180 s (36 x 5 s),
# re-triggering the refresh on every third attempt.
retry=0
# grep -Fx compares the whole line literally, so the dots in the IP are not
# regex wildcards. Feeding jq through process substitution lets grep alone
# decide the outcome, so pipefail cannot turn a match into a failure when jq
# exits non-zero or is cut short by grep -q.
until grep -Fxq -- "${IP_FIRST}:9100" \
  < <(jq -r '.[].targets[]' "$targets_json" 2>/dev/null); do
  if (( retry >= 36 )); then
    echo "[ERR] ${IP_FIRST}:9100 not present in file_sd after timeout" >&2
    echo "[HINT] current targets file content:" >&2
    sed -n '1,200p' "$targets_json" >&2 || true
    exit 1
  fi
  if (( retry % 3 == 0 )) \
    && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
    # Best effort: a failing docker exec must not abort the wait loop.
    docker exec argus-prometheus bash -lc \
      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true' \
      || echo "[WARN] could not run update_targets inside argus-prometheus" >&2
  fi
  echo "[..] waiting file_sd refresh ($retry/36)"
  sleep 5
  # Not `((retry++))`: that returns status 1 when retry is 0 and would make
  # `set -e` terminate the script after the very first wait.
  retry=$((retry + 1))
done

# Health is judged by the PromQL `up` series rather than the targets page,
# whose state can flap. Poll up to 60 times, 5 s apart.
up_query="up{job=\"node\",ip=\"$IP_FIRST\"}"
echo "[VERIFY:PROM] $up_query > 0"
attempt=0
until (( attempt >= 60 )); do
  # A failed request just leaves an empty/partial file, which the parser
  # below treats as "not up yet".
  curl -fsS --max-time 5 --get "$PROM_BASE/query" \
    --data-urlencode "query=$up_query" \
    >"$TMP_DIR/prom_up_inst_active.json" || true
  # Exit status 0 only when the first returned sample has a value > 0.
  if python3 - "$TMP_DIR/prom_up_inst_active.json" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        payload = json.load(fh)
    series = payload.get('data', {}).get('result', [])
    value = float(series[0]['value'][1])
except Exception:
    # Unreadable response, empty result or malformed sample: not up yet.
    sys.exit(1)
sys.exit(0 if value > 0 else 1)
PY
  then
    echo "[OK] up > 0 (control-plane scrape works)"
    break
  fi
  if (( attempt % 6 == 0 )) \
    && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
    # Best effort: a failing docker exec must not abort the wait loop.
    docker exec argus-prometheus bash -lc \
      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true' \
      || echo "[WARN] could not run update_targets inside argus-prometheus" >&2
  fi
  echo "[..] waiting $up_query > 0 ($attempt/60)"
  sleep 5
  # Not `((attempt++))`: that returns status 1 when attempt is 0 and would
  # make `set -e` terminate the script after the first failed probe.
  attempt=$((attempt + 1))
done
# The loop only reaches 60 when no probe succeeded (success breaks earlier).
if (( attempt >= 60 )); then
  echo "[ERR] $up_query did not become > 0" >&2
  exit 1
fi

# Check: a single instant query for the node's `up` series returns a value
# > 0. Unlike the polling loop, a failed request here aborts the script.
echo "[VERIFY:PROM] instant up query > 0"
curl -fsS --max-time 5 --get "$PROM_BASE/query" \
  --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" \
  >"$TMP_DIR/prom_up_inst.json"
python3 - "$TMP_DIR/prom_up_inst.json" <<'PY'
import json,sys
j=json.load(open(sys.argv[1]))
res=j.get('data',{}).get('result',[])
# The query filters on the `ip` label, so the message names that label.
assert res, 'empty result for up{job="node",ip=...}'
val=float(res[0]['value'][1])
assert val>0, f"up value not > 0: {val}"
PY
echo "[OK] up > 0"

# Check: at least one node-job target reports up == 1.
count_query='count(up{job="node"}==1)'
echo "[VERIFY:PROM] $count_query >= 1"
curl -fsS --max-time 5 --get "$PROM_BASE/query" \
  --data-urlencode "query=$count_query" \
  >"$TMP_DIR/prom_up_count.json"
python3 - "$TMP_DIR/prom_up_count.json" <<'PY'
import json
import sys

body = json.load(open(sys.argv[1]))
rows = body.get('data', {}).get('result', [])
assert rows, 'empty result for count(up{job="node"}==1)'
val = float(rows[0]['value'][1])
assert val >= 1, f"count < 1: {val}"
PY
echo "[OK] up count satisfied"
echo "[DONE] prometheus verify"