#!/usr/bin/env bash
#
# Verify that a metric test node is registered with the master API.
#
# Steps:
#   1. Poll GET $MASTER_BASE/nodes until a node whose "name" equals the target
#      hostname shows up (24 attempts, 5 s apart); exit 1 if it never does.
#   2. Fetch the node detail once and assert its basic fields and health items.
#   3. Poll the node detail until "status" becomes "online" (18 attempts,
#      5 s apart). Failing to reach "online" only produces a warning.
#   4. Write the node id to $TMP_DIR/node_id_metric.
#
# Environment:
#   METRIC_TEST_HOSTNAME  node name to look for (default: test-metric-node-001)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp/metric-verify"
mkdir -p "$TMP_DIR"

readonly MASTER_BASE="http://localhost:32300/api/v1/master"
# Deliberately not called HOSTNAME: Bash maintains that variable itself, and
# overwriting it would leak the test value to anything that reads it later.
readonly TARGET_HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"

readonly POLL_INTERVAL=5        # seconds between polls
readonly LIST_MAX_ATTEMPTS=24   # 24 * 5 s = 120 s waiting for the node to appear
readonly DETAIL_MAX_ATTEMPTS=18 # 18 * 5 s =  90 s waiting for status "online"

# GET the URL in $1 and write the response body to stdout.
# Fails (non-zero) on HTTP errors (-f) and after 5 seconds (--max-time).
curl_json() { curl -fsS --max-time 5 "$1"; }

echo "[VERIFY:MASTER] list nodes and locate target hostname=$TARGET_HOSTNAME"
ALL_NODES_JSON="$TMP_DIR/master_nodes.json"

# Retry until the node shows up in the /nodes list (at most 120 s).
NODE_ID=""
for ((attempt = 1; attempt <= LIST_MAX_ATTEMPTS; attempt++)); do
  # A failed fetch is tolerated here; the parser below treats an unreadable
  # or unexpected payload as "no nodes yet".
  curl_json "$MASTER_BASE/nodes" > "$ALL_NODES_JSON" || true
  NODE_ID=$(python3 - "$ALL_NODES_JSON" "$TARGET_HOSTNAME" <<'PY'
import json, sys

try:
    with open(sys.argv[1]) as fh:
        nodes = json.load(fh)
except Exception:
    nodes = []
if not isinstance(nodes, list):
    nodes = []

wanted = sys.argv[2]
for node in nodes:
    if isinstance(node, dict) and node.get('name') == wanted:
        print(node.get('id', ''))
        break
PY
  )
  if [[ -n "$NODE_ID" ]]; then
    break
  fi
  echo "[..] waiting node to appear in /nodes ($attempt/$LIST_MAX_ATTEMPTS)"
  sleep "$POLL_INTERVAL"
done

if [[ -z "$NODE_ID" ]]; then
  echo "[ERR] master /nodes 中未找到 ${TARGET_HOSTNAME}(等待超时)" >&2
  echo "[HINT] 当前 /nodes 列表如下:" >&2
  sed -n '1,160p' "$ALL_NODES_JSON" >&2 || true
  exit 1
fi
echo "[OK] node id=$NODE_ID"

echo "[VERIFY:MASTER] get node detail and assert fields"
DETAIL1_JSON="$TMP_DIR/master_node_${NODE_ID}_detail_1.json"
curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL1_JSON"

# Check basic fields and health items (status is not required to be "online"
# yet). Any failed assertion makes python3 exit non-zero, which aborts the
# script through `set -e`.
python3 - "$DETAIL1_JSON" "$TARGET_HOSTNAME" <<'PY'
import json, sys

with open(sys.argv[1]) as fh:
    j = json.load(fh)
host = sys.argv[2]

assert j.get('name') == host, f"name mismatch: {j.get('name')} != {host}"
status = j.get('status')
assert status in ('initialized', 'online', 'offline'), f"unexpected status: {status}"
# `or {}` keeps an explicit JSON null from crashing with AttributeError.
md = j.get('meta_data') or {}
assert md.get('hostname', j.get('name')) == host, 'meta_data.hostname mismatch'
assert 'last_report' in j and j['last_report'], 'last_report missing'
h = j.get('health') or {}
# Health entries are optional, but any that is present must be healthy.
for key in ('metric-node-exporter', 'metric-fluent-bit', 'metric-argus-agent'):
    if key in h:
        assert h[key].get('status') == 'healthy', f"{key} not healthy: {h[key]}"
print('OK')
PY

# Poll until last_report advances and status turns "online" (at most 90 s),
# tolerating transient 5xx / network errors and unparseable responses.
attempt=0
T_PRE=0
T_CUR=0
# Initialised so the check after the loop is safe under `set -u` even when
# every single fetch failed.
STATUS_CUR=""
DETAIL_CUR="$TMP_DIR/master_node_${NODE_ID}_detail_cur.json"
until ((attempt >= DETAIL_MAX_ATTEMPTS)); do
  sleep "$POLL_INTERVAL"

  # curl's stderr is discarded on purpose: failures are expected while the
  # service settles and are reported through the retry message instead.
  if ! curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL_CUR" 2>/dev/null; then
    echo "[..] retrying node detail fetch ($attempt/$DETAIL_MAX_ATTEMPTS)"
    # Not ((attempt++)): that returns status 1 when attempt is 0, which would
    # abort the script under `set -e`.
    attempt=$((attempt + 1))
    continue
  fi

  # The parser prints ONE line, "<epoch-seconds> <status>". The timestamp
  # comes first because it is never empty, so an empty status cannot shift
  # the fields when `read` splits the line.
  if ! parsed=$(python3 - "$DETAIL_CUR" <<'PY'
import datetime, json, sys

try:
    with open(sys.argv[1]) as fh:
        j = json.load(fh)
except Exception as exc:
    sys.exit(f"unparseable node detail: {exc}")
if not isinstance(j, dict):
    sys.exit("unexpected node detail payload")

st = str(j.get('status') or '')
ts = str(j.get('last_report') or '')
# datetime.fromisoformat() may not accept a trailing "Z"; spell it as an offset.
if ts.endswith('Z'):
    ts = ts[:-1] + '+00:00'
try:
    t = float(datetime.datetime.fromisoformat(ts).timestamp())
except Exception:
    t = 0.0
print(f"{t} {st}")
PY
  ); then
    echo "[..] retrying node detail fetch ($attempt/$DETAIL_MAX_ATTEMPTS)"
    attempt=$((attempt + 1))
    continue
  fi
  read -r T_CUR STATUS_CUR <<< "$parsed"

  # Float comparison is delegated to awk; remember the newest last_report.
  if awk -v a="$T_PRE" -v b="$T_CUR" 'BEGIN{exit !(b>a)}'; then
    T_PRE="$T_CUR"
  fi

  if [[ "$STATUS_CUR" == "online" ]]; then
    echo "[OK] status online and last_report progressed"
    break
  fi
  attempt=$((attempt + 1))
done

if ((attempt >= DETAIL_MAX_ATTEMPTS)) && [[ "$STATUS_CUR" != "online" ]]; then
  echo "[WARN] status did not reach online within timeout; continuing"
fi

# Persist the node id as a file under $TMP_DIR.
echo "$NODE_ID" > "$TMP_DIR/node_id_metric"
echo "[DONE] master verify"