argus/src/sys/tests/scripts/13_metric_verify_master.sh

106 lines
3.3 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Master-side verification for the metric tests: locate the test node in the
# master's /nodes listing, check its detail record, then wait for it to come
# online.
#
# Environment:
#   METRIC_TEST_HOSTNAME  node name to look for (default: test-metric-node-001)
set -e -u -o pipefail

# Resolve locations relative to this script so the caller's cwd is irrelevant.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Base URL of the master API queried by this script.
MASTER_BASE="http://localhost:32300/api/v1/master"

# Node name to search for; a non-empty METRIC_TEST_HOSTNAME overrides the
# default. NOTE: this reassigns bash's own HOSTNAME variable for this script.
HOSTNAME="test-metric-node-001"
if [[ -n "${METRIC_TEST_HOSTNAME:-}" ]]; then
  HOSTNAME="${METRIC_TEST_HOSTNAME}"
fi

# Scratch directory that receives the JSON responses fetched by this script.
TMP_DIR="${TEST_ROOT}/tmp/metric-verify"
mkdir -p "${TMP_DIR}"
# Fetch a URL with curl and write the response body to stdout.
# Arguments: $1 - URL to request
# Returns:   curl's exit status
curl_json() {
  # -f: fail on HTTP error responses; -s: no progress output; -S: still print
  # error messages; --max-time 5: give up after five seconds.
  local -a curl_opts=(-fsS --max-time 5)
  curl "${curl_opts[@]}" "$1"
}
echo "[VERIFY:MASTER] list nodes and locate target hostname=$HOSTNAME"
ALL_NODES_JSON="$TMP_DIR/master_nodes.json"

# Wait for the node to show up in the /nodes listing: 24 attempts, 5s apart
# (about 120s in total).
NODE_ID=""
for attempt in {1..24}; do
  # A failed fetch is not fatal: it leaves an empty or partial file, which the
  # parser below treats as "no nodes yet".
  curl_json "$MASTER_BASE/nodes" > "$ALL_NODES_JSON" || true

  # Prints the id of the first entry whose name matches, or nothing at all.
  # The trailing `|| NODE_ID=""` keeps an unexpected parser failure from
  # aborting the retry loop under `set -e`.
  NODE_ID=$(python3 - "$ALL_NODES_JSON" "$HOSTNAME" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        nodes = json.load(fh)
except Exception:
    nodes = []
# Anything other than a JSON array is treated as an empty listing.
if not isinstance(nodes, list):
    nodes = []

name = sys.argv[2]
for n in nodes:
    if isinstance(n, dict) and n.get('name') == name:
        node_id = n.get('id')
        # A null id must not be reported as the string "None".
        if node_id is not None:
            print(node_id)
        break
PY
  ) || NODE_ID=""

  if [[ -n "$NODE_ID" ]]; then
    break
  fi
  echo "[..] waiting node to appear in /nodes ($attempt/24)"
  sleep 5
done

# Give up with a dump of the last listing when the node never appeared.
if [[ -z "$NODE_ID" ]]; then
  # Braces make the end of the variable name explicit next to the multi-byte
  # character that follows it.
  echo "[ERR] master /nodes 中未找到 ${HOSTNAME}(等待超时)" >&2
  echo "[HINT] 当前 /nodes 列表如下:" >&2
  sed -n '1,160p' "$ALL_NODES_JSON" >&2 || true
  exit 1
fi
echo "[OK] node id=$NODE_ID"
echo "[VERIFY:MASTER] get node detail and assert fields"
DETAIL1_JSON="$TMP_DIR/master_node_${NODE_ID}_detail_1.json"
# Unlike the polling steps, a failed fetch here aborts the script (set -e).
curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL1_JSON"

# Basic field and health checks. The node is not required to be online yet:
# any of initialized/online/offline is accepted at this point.
# A failed assertion makes python3 exit non-zero, which aborts the script.
python3 - "$DETAIL1_JSON" "$HOSTNAME" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    j = json.load(fh)
host = sys.argv[2]
assert isinstance(j, dict), f"node detail is not a JSON object: {j!r}"

assert j.get('name') == host, f"name mismatch: {j.get('name')} != {host}"
status = j.get('status')
assert status in ('initialized', 'online', 'offline'), f"unexpected status: {status}"

# A JSON null is treated like a missing object, so it produces a readable
# assertion outcome instead of an AttributeError.
md = j.get('meta_data') or {}
assert isinstance(md, dict), f"meta_data is not an object: {md!r}"
# meta_data.hostname is optional; when absent, the node name stands in for it.
assert md.get('hostname', j.get('name')) == host, 'meta_data.hostname mismatch'
assert j.get('last_report'), 'last_report missing'

# Health entries are optional, but any that are present must be healthy.
h = j.get('health') or {}
assert isinstance(h, dict), f"health is not an object: {h!r}"
for key in ('metric-node-exporter', 'metric-fluent-bit', 'metric-argus-agent'):
    if key in h:
        entry = h[key]
        assert isinstance(entry, dict) and entry.get('status') == 'healthy', \
            f"{key} not healthy: {entry}"
print('OK')
PY
# Poll the node detail until its status turns "online": at most 18 attempts,
# 5s apart (about 90s). Failed fetches (transient 5xx / network errors) and
# unparsable responses are tolerated; each one simply uses up an attempt.
attempt=0
T_PRE=0
# Initialised up front so the check after the loop is safe under `set -u`
# even when every single fetch failed.
STATUS_CUR=""
T_CUR=0
DETAIL_CUR="$TMP_DIR/master_node_${NODE_ID}_detail_cur.json"
until [[ $attempt -ge 18 ]]; do
  sleep 5
  if ! curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL_CUR" 2>/dev/null; then
    echo "[..] retrying node detail fetch ($attempt/18)"
    # Deliberately not ((attempt++)): that command returns status 1 when the
    # old value is 0, which aborts the script under `set -e`.
    attempt=$((attempt + 1))
    continue
  fi

  # The parser prints "<epoch-seconds> <status>" on ONE line because `read`
  # consumes a single line. The timestamp comes first so an empty status
  # cannot shift the fields. Bad JSON yields "0.000000" and an empty status.
  parsed_line=$(python3 - "$DETAIL_CUR" <<'PY'
import datetime
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        j = json.load(fh)
except Exception:
    j = {}
if not isinstance(j, dict):
    j = {}

st = j.get('status')
if not isinstance(st, str):
    st = ''
ts = j.get('last_report')
if not isinstance(ts, str):
    ts = ''
# fromisoformat() on older Python versions does not accept a trailing "Z".
if ts.endswith('Z'):
    ts = ts[:-1] + '+00:00'
try:
    t = float(datetime.datetime.fromisoformat(ts).timestamp())
except Exception:
    t = 0.0
print(f"{t:.6f} {st}")
PY
  ) || parsed_line=""
  read -r T_CUR STATUS_CUR <<<"$parsed_line" || true
  T_CUR="${T_CUR:-0}"

  # Remember the newest last_report seen so far (floating point compare).
  # NOTE(review): the value is only tracked, never asserted; the success
  # message below is printed as soon as the status is online.
  if awk -v a="$T_PRE" -v b="$T_CUR" 'BEGIN { exit !(b + 0 > a + 0) }'; then
    T_PRE="$T_CUR"
  fi

  if [[ "$STATUS_CUR" == "online" ]]; then
    echo "[OK] status online and last_report progressed"
    break
  fi
  attempt=$((attempt + 1))
done

# Timing out is only a warning; the script still records the node id below.
if (( attempt >= 18 )) && [[ "$STATUS_CUR" != "online" ]]; then
  echo "[WARN] status did not reach online within timeout; continuing"
fi

# Store the node id in the scratch directory.
echo "$NODE_ID" > "$TMP_DIR/node_id_metric"
echo "[DONE] master verify"