106 lines
3.3 KiB
Bash
Executable File
106 lines
3.3 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Checks through the master HTTP API that the metric test node is listed,
# that its detail record looks sane, and polls for it to become online.
#
# Environment:
#   METRIC_TEST_HOSTNAME  node name to look for
#                         (default: test-metric-node-001)

set -euo pipefail

# Resolve everything relative to this script so the caller's working
# directory does not matter.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
TEST_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)

# Scratch directory that receives the JSON responses fetched by this script.
TMP_DIR="${TEST_ROOT}/tmp/metric-verify"
mkdir -p "${TMP_DIR}"

MASTER_BASE="http://localhost:32300/api/v1/master"
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"

# GET the URL given as $1 and write the response body to stdout.
# Fails on HTTP error statuses (curl --fail) and after 5 seconds.
curl_json() {
  local url="$1"
  curl --fail --silent --show-error --max-time 5 "${url}"
}

echo "[VERIFY:MASTER] list nodes and locate target hostname=$HOSTNAME"
ALL_NODES_JSON="$TMP_DIR/master_nodes.json"

# Wait for the node to appear in the /nodes list (24 tries x 5s, up to 120s).
# NODE_ID stays empty until an entry whose "name" equals $HOSTNAME is found.
NODE_ID=""
for attempt in {1..24}; do
  # A failed fetch leaves an empty/partial file behind; the parser below
  # treats that as "no nodes yet", so the curl error is deliberately ignored.
  curl_json "$MASTER_BASE/nodes" > "$ALL_NODES_JSON" || true

  # The parser must never crash on an unexpected payload: a non-zero exit
  # from this command substitution would abort the whole script under
  # `set -e` instead of letting the loop retry.
  NODE_ID=$(python3 - "$ALL_NODES_JSON" "$HOSTNAME" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        nodes = json.load(fh)
except Exception:
    nodes = []

# Anything other than a JSON array (e.g. an error object) means the node
# is not listed yet.
if not isinstance(nodes, list):
    nodes = []

wanted = sys.argv[2]
for node in nodes:
    # Skip entries that are not JSON objects rather than raising.
    if isinstance(node, dict) and node.get('name') == wanted:
        print(node.get('id', ''))
        break
PY
  )

  if [[ -n "$NODE_ID" ]]; then
    break
  fi
  echo "[..] waiting node to appear in /nodes ($attempt/24)"
  sleep 5
done

# Abort when the wait loop ended without finding the node. The first 160
# lines of the last /nodes response are dumped to stderr for diagnosis.
if [[ -z "$NODE_ID" ]]; then
  {
    echo "[ERR] master /nodes 中未找到 $HOSTNAME(等待超时)"
    echo "[HINT] 当前 /nodes 列表如下:"
    sed -n '1,160p' "$ALL_NODES_JSON" || true
  } >&2
  exit 1
fi
echo "[OK] node id=$NODE_ID"

echo "[VERIFY:MASTER] get node detail and assert fields"
DETAIL1_JSON="$TMP_DIR/master_node_${NODE_ID}_detail_1.json"
curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL1_JSON"

# Basic field and health checks (the node is not required to be online yet).
# Any failed assertion makes python3 exit non-zero, which stops the script
# under `set -e`.
python3 - "$DETAIL1_JSON" "$HOSTNAME" <<'PY'
import json
import sys

detail_path, expected_host = sys.argv[1], sys.argv[2]
with open(detail_path) as fh:
    detail = json.load(fh)

actual_name = detail.get('name')
assert actual_name == expected_host, f"name mismatch: {actual_name} != {expected_host}"

status = detail.get('status')
assert status in ('initialized', 'online', 'offline'), f"unexpected status: {status}"

# meta_data.hostname is optional; when absent the node name stands in for it.
meta = detail.get('meta_data', {})
assert meta.get('hostname', actual_name) == expected_host, 'meta_data.hostname mismatch'

assert detail.get('last_report'), 'last_report missing'

# Only health entries that are actually reported are checked.
health = detail.get('health', {})
for component in ('metric-node-exporter', 'metric-fluent-bit', 'metric-argus-agent'):
    if component not in health:
        continue
    entry = health[component]
    assert entry.get('status') == 'healthy', f"{component} not healthy: {entry}"

print('OK')
PY

# Print "<last_report as epoch seconds> <status>" on ONE line for the
# node-detail JSON file given as $1. The timestamp comes first so that an
# empty status cannot shift the fields when the caller splits the line.
# Unreadable or malformed input yields "0.0 " instead of a traceback.
_node_report_fields() {
  python3 - "$1" <<'PY'
import datetime
import json
import sys

status, stamp = '', 0.0
try:
    with open(sys.argv[1]) as fh:
        detail = json.load(fh)
    status = str(detail.get('status') or '')
    raw = str(detail.get('last_report') or '')
    # The original normalised a trailing 'Z' before fromisoformat(); the same is done here.
    if raw.endswith('Z'):
        raw = raw[:-1] + '+00:00'
    stamp = float(datetime.datetime.fromisoformat(raw).timestamp())
except Exception:
    # Keep whatever was parsed so far; a bad timestamp counts as 0.0.
    pass
print(stamp, status)
PY
}

# Poll until last_report advances and the node eventually turns online
# (18 tries x 5s, up to 90s), tolerating transient 5xx/network errors.
# T_PRE records the newest last_report seen; the loop itself ends on status
# alone. Not reaching online only produces a warning, not a failure.
attempt=0
T_PRE=0
T_CUR=0
STATUS_CUR=""   # initialised so the final check is safe under `set -u`
DETAIL_CUR="$TMP_DIR/master_node_${NODE_ID}_detail_cur.json"
until (( attempt >= 18 )); do
  sleep 5
  # curl errors are silenced on purpose: a failed fetch is just retried.
  if ! curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL_CUR" 2>/dev/null; then
    echo "[..] retrying node detail fetch ($attempt/18)"
    # Not ((attempt++)): that returns status 1 when attempt is 0 and would
    # terminate the script under `set -e`.
    attempt=$((attempt + 1))
    continue
  fi

  # `read` consumes a single line, so both fields must arrive on one line.
  # `|| true` keeps an empty result (python failure) from tripping `set -e`.
  read -r T_CUR STATUS_CUR < <(_node_report_fields "$DETAIL_CUR") || true
  T_CUR="${T_CUR:-0}"

  # Floating-point comparison is delegated to awk; "+ 0" forces numeric
  # comparison regardless of how the awk implementation types -v values.
  if awk -v a="$T_PRE" -v b="$T_CUR" 'BEGIN { exit !(b + 0 > a + 0) }'; then
    T_PRE="$T_CUR"
  fi

  if [[ "$STATUS_CUR" == "online" ]]; then
    echo "[OK] status online and last_report progressed"
    break
  fi
  attempt=$((attempt + 1))
done

if (( attempt >= 18 )) && [[ "$STATUS_CUR" != "online" ]]; then
  echo "[WARN] status did not reach online within timeout; continuing"
fi

# Record the resolved node id in the scratch directory, then report completion.
node_id_file="${TMP_DIR}/node_id_metric"
echo "$NODE_ID" > "${node_id_file}"
echo "[DONE] master verify"