argus/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh

71 lines
2.3 KiB
Bash
Executable File

#!/usr/bin/env bash
# Verifies, via Grafana's HTTP API, that selected dashboard queries return data.
#
# Environment:
#   METRIC_TEST_HOSTNAME  value used for the `hostname` label in queries
#                         (default: test-metric-node-001)
set -o errexit -o nounset -o pipefail

# Absolute directory containing this script, and its parent (the test root).
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
TEST_ROOT="$(
  cd "$SCRIPT_DIR/.." && pwd
)"

# Scratch directory where fetched JSON responses are stored.
TMP_DIR="$TEST_ROOT/tmp/metric-verify"
mkdir -p "$TMP_DIR"

# Base URL of the Grafana instance being checked.
GRAF="http://localhost:3000"

# NOTE: this replaces bash's own HOSTNAME variable for the rest of the script.
if [[ -n "${METRIC_TEST_HOSTNAME:-}" ]]; then
  HOSTNAME="$METRIC_TEST_HOSTNAME"
else
  HOSTNAME="test-metric-node-001"
fi
# Fetch Grafana's datasource list and pick the UID of the first datasource
# whose type is "prometheus"; the UID is needed to build proxy query URLs.
echo "[VERIFY:GRAF-PANELS] resolve Prometheus datasource UID via Grafana"
DS_JSON="$TMP_DIR/graf_ds.json"
curl -fsS --max-time 10 "$GRAF/api/datasources" >"$DS_JSON"
# The heredoc is quoted ('PY') so the shell does not expand anything inside
# it; the JSON path is passed as argv[1]. Nothing is printed when no
# datasource matches, which leaves DS_UID empty for the check below.
DS_UID=$(python3 - "$DS_JSON" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    datasources = json.load(fh)

for ds in datasources:
    if ds.get('type') == 'prometheus':
        print(ds.get('uid', ''))
        break
PY
)
if [[ -z "$DS_UID" ]]; then
  echo "[ERR] no prometheus datasource found in grafana" >&2
  exit 1
fi
echo "[OK] Prometheus DS UID=$DS_UID"
#######################################
# Run a query against the Prometheus HTTP API through Grafana's datasource
# proxy and save the raw response body.
# Globals:   GRAF (read), DS_UID (read)
# Arguments: $1 - query expression (URL-encoded by curl)
#            $2 - path of the file that receives the response
# Returns:   curl's exit status (non-zero on HTTP error or timeout)
#######################################
proxy_query() {
  local promql="$1"
  local dest="$2"
  local endpoint="$GRAF/api/datasources/proxy/uid/$DS_UID/api/v1/query"

  curl -fsS --max-time 10 --get "$endpoint" --data-urlencode "query=$promql" >"$dest"
}
#######################################
# Check that a saved Prometheus query response succeeded, contains at least
# one series, and that the first series' sample is recent enough.
# Arguments: $1 - path to the JSON response file
#            $2 - maximum allowed sample age in seconds (default: 180)
# Outputs:   the first sample's timestamp (integer seconds) on stdout;
#            the failure reason on stderr
# Returns:   0 when all checks pass, non-zero otherwise
#######################################
assert_vector_recent_nonempty() {
  local json="$1"
  local max_age_sec="${2:-180}"
  # Checks use explicit SystemExit instead of `assert`, so they still run
  # when Python is started with -O or PYTHONOPTIMIZE set.
  python3 - "$json" "$max_age_sec" <<'PY'
import json
import sys
import time

with open(sys.argv[1]) as fh:
    doc = json.load(fh)

if doc.get('status') != 'success':
    raise SystemExit('prom status != success')

res = doc.get('data', {}).get('result', [])
if not res:
    raise SystemExit('empty result')

# Only the first returned series is inspected; value is [timestamp, "value"].
ts = float(res[0]['value'][0])
if not (time.time() - ts < float(sys.argv[2])):
    raise SystemExit(f'timestamp too old: {ts}')

print(int(ts))
PY
}
# Panel check 1: node_load1 for the test host must have a recent sample
# (at most 300 seconds old) when queried through the Grafana proxy.
echo "[VERIFY:GRAF-PANELS] Dashboard: Node and GPU Metrics — System Load"
Q_NODE_LOAD="node_load1{hostname=\"$HOSTNAME\"}"
proxy_query "$Q_NODE_LOAD" "$TMP_DIR/graf_panel_node_load.json"
# Only the exit status matters here; the printed timestamp is discarded.
assert_vector_recent_nonempty "$TMP_DIR/graf_panel_node_load.json" 300 >/dev/null
echo "[OK] node_load1 has recent sample via Grafana proxy"

# Panel check 2: the number of distinct hostnames with up{job="node"} == 1
# must be at least 1.
echo "[VERIFY:GRAF-PANELS] Dashboard: Cluster Dashboard — Node online count"
Q_NODE_ONLINE='count(count by(hostname) (up{job="node"} == 1))'
proxy_query "$Q_NODE_ONLINE" "$TMP_DIR/graf_panel_node_online.json"
# Checks use explicit SystemExit instead of `assert`, so they still run when
# Python is started with -O or PYTHONOPTIMIZE set.
python3 - "$TMP_DIR/graf_panel_node_online.json" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    doc = json.load(fh)

if doc.get('status') != 'success':
    raise SystemExit('prom status not success')

res = doc.get('data', {}).get('result', [])
if not res:
    raise SystemExit('no series for node online count')

# value is [timestamp, "value"]; index 1 holds the sample value as a string.
val = float(res[0]['value'][1])
if not val >= 1:
    raise SystemExit(f'node online < 1: {val}')

print('OK', val)
PY
echo "[OK] cluster node online count >= 1 via Grafana proxy"
echo "[DONE] grafana panels verify"