71 lines
2.3 KiB
Bash
Executable File
71 lines
2.3 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Verifies that selected Grafana dashboard queries return data by sending
# PromQL through Grafana's datasource proxy.
#
# Environment:
#   METRIC_TEST_HOSTNAME  hostname label to query
#                         (default: test-metric-node-001)

# Abort on command failure, unset variables and failed pipeline stages.
set -euo pipefail

# Resolve paths relative to this script so the caller's cwd does not matter.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
TEST_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)

# Scratch directory that receives the raw JSON responses.
TMP_DIR="${TEST_ROOT}/tmp/metric-verify"
mkdir -p "${TMP_DIR}"

# Base URL of the Grafana instance under test.
GRAF="http://localhost:3000"

# Hostname label used in the per-node queries. Note: this assignment replaces
# the value Bash itself stores in HOSTNAME for the remainder of the script.
HOSTNAME=${METRIC_TEST_HOSTNAME:-test-metric-node-001}
# --- Step 1: look up the UID of the Prometheus datasource --------------------
echo "[VERIFY:GRAF-PANELS] resolve Prometheus datasource UID via Grafana"

# Save the datasource listing so it can be inspected after a failed run.
DS_JSON="${TMP_DIR}/graf_ds.json"
curl -fsS --max-time 10 "${GRAF}/api/datasources" >"${DS_JSON}"

# Print the uid of the first datasource whose type is "prometheus";
# print nothing when the listing contains no such entry.
DS_UID="$(
  python3 - "${DS_JSON}" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    datasources = json.load(fh)

# Lazy search: stop at the first Prometheus entry.
first_prom = next(
    (ds for ds in datasources if ds.get('type') == 'prometheus'),
    None,
)
if first_prom is not None:
    print(first_prom.get('uid', ''))
PY
)"

# An empty UID means there is nothing to proxy queries through.
if [[ -z "${DS_UID}" ]]; then
  echo "[ERR] no prometheus datasource found in grafana" >&2
  exit 1
fi
echo "[OK] Prometheus DS UID=$DS_UID"
#######################################
# Send a PromQL query to the Prometheus /api/v1/query endpoint through
# Grafana's datasource proxy and store the raw response body in a file.
# Globals:
#   GRAF    (read) Grafana base URL
#   DS_UID  (read) UID of the datasource to proxy through
# Arguments:
#   $1 - PromQL expression
#   $2 - path of the file that receives the response body
# Returns:
#   curl's exit status (-f turns HTTP error responses into failures;
#   the request is limited to 10 seconds).
#######################################
proxy_query() {
  local promql="$1"
  local dest="$2"
  local endpoint="$GRAF/api/datasources/proxy/uid/$DS_UID/api/v1/query"

  # --get moves the url-encoded form data into the query string.
  curl -fsS --max-time 10 --get --data-urlencode "query=$promql" \
    "$endpoint" >"$dest"
}
#######################################
# Validate a saved Prometheus query response: the status must be "success",
# the result list must be non-empty, and the timestamp of the first sample
# must be younger than the allowed age.
# Arguments:
#   $1 - path to the JSON response file
#   $2 - maximum allowed sample age in seconds (default: 180)
# Outputs:
#   stdout: timestamp of the first sample, truncated to whole seconds
#   stderr: the reason, when validation fails
# Returns:
#   0 when every check passes, non-zero otherwise.
#######################################
assert_vector_recent_nonempty() {
  local json="$1"
  local max_age_sec="${2:-180}"

  # The checks are explicit `if ... raise SystemExit` statements rather than
  # `assert`: Python drops assert statements when optimization is enabled
  # (python -O / PYTHONOPTIMIZE), which would let a stale sample pass.
  # NOTE(review): for instant queries Prometheus may stamp results with the
  # query evaluation time rather than the scrape time — confirm that this
  # age check measures what is intended.
  python3 - "$json" "$max_age_sec" <<'PY'
import json
import sys
import time

with open(sys.argv[1]) as fh:
    doc = json.load(fh)

if doc.get('status') != 'success':
    raise SystemExit('prom status != success')

res = doc.get('data', {}).get('result', [])
if not res:
    raise SystemExit('empty result')

# Each vector sample is [timestamp, value]; only the first one is inspected.
ts = float(res[0]['value'][0])
if not (time.time() - ts < float(sys.argv[2])):
    raise SystemExit(f'timestamp too old: {ts}')

print(int(ts))
PY
}
# --- Panel check: "Node and GPU Metrics" / System Load ------------------------
echo "[VERIFY:GRAF-PANELS] Dashboard: Node and GPU Metrics — System Load"

# node_load1 restricted to the host under test.
printf -v Q_NODE_LOAD 'node_load1{hostname="%s"}' "$HOSTNAME"
proxy_query "${Q_NODE_LOAD}" "${TMP_DIR}/graf_panel_node_load.json"

# Only the exit status matters here; the printed timestamp is discarded.
assert_vector_recent_nonempty "${TMP_DIR}/graf_panel_node_load.json" 300 \
  >/dev/null
echo "[OK] node_load1 has recent sample via Grafana proxy"
# --- Panel check: "Cluster Dashboard" / Node online count ---------------------
echo "[VERIFY:GRAF-PANELS] Dashboard: Cluster Dashboard — Node online count"

# Number of distinct hostnames that have at least one node target reporting up.
Q_NODE_ONLINE='count(count by(hostname) (up{job="node"} == 1))'
proxy_query "$Q_NODE_ONLINE" "$TMP_DIR/graf_panel_node_online.json"

# The response must be successful, non-empty, and report a count of at least 1.
# Explicit checks are used instead of `assert`, because Python removes assert
# statements when optimization is enabled (python -O / PYTHONOPTIMIZE) and the
# verification would then pass unconditionally.
python3 - "$TMP_DIR/graf_panel_node_online.json" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    doc = json.load(fh)

if doc.get('status') != 'success':
    raise SystemExit('prom status not success')

res = doc.get('data', {}).get('result', [])
if not res:
    raise SystemExit('no series for node online count')

# Sample layout is [timestamp, value]; the value arrives as a string.
val = float(res[0]['value'][1])
# Written as "not (val >= 1)" so that a NaN value is rejected as well.
if not (val >= 1):
    raise SystemExit(f'node online < 1: {val}')

print('OK', val)
PY
echo "[OK] cluster node online count >= 1 via Grafana proxy"

echo "[DONE] grafana panels verify"