argus/src/master/tests/scripts/09_restart_persistence.sh

185 lines
5.3 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Restart-persistence test: shared strict-mode setup, filesystem layout and
# endpoint constants used by the rest of this script.
set -euo pipefail

# Directory holding this script, and its parent (the test root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Working directories located under the test root.
PRIVATE_ROOT="${TEST_ROOT}/private"
TMP_ROOT="${TEST_ROOT}/tmp"

# HTTP endpoints of the master service on localhost.
ROOT_BASE="http://localhost:31300"
API_BASE="${ROOT_BASE}/api/v1/master"

# SQLite database file checked by this test.
DB_PATH="${PRIVATE_ROOT}/argus/master/db.sqlite3"
#######################################
# Run a Compose command with whichever front-end is available.
# Uses the `docker compose` subcommand when `docker compose version`
# succeeds; otherwise falls back to the `docker-compose` executable.
# Arguments: forwarded unchanged to the selected command.
# Returns:   exit status of the command that was run.
#######################################
compose() {
  if ! docker compose version >/dev/null 2>&1; then
    docker-compose "$@"
    return
  fi
  docker compose "$@"
}
# Preconditions: both node-ID files and the database file must already exist
# as regular files; otherwise print an error to stderr and abort.
[[ -f "${TMP_ROOT}/node_id" ]] || {
  echo "[ERROR] 主节点 ID 缺失,请先执行前置用例" >&2
  exit 1
}

[[ -f "${TMP_ROOT}/second_node_id" ]] || {
  echo "[ERROR] 第二个节点 ID 缺失,请先执行多节点场景脚本" >&2
  exit 1
}

[[ -f "${DB_PATH}" ]] || {
  echo "[ERROR] 持久化数据库缺失: $DB_PATH" >&2
  exit 1
}

# Load the two node IDs from their files.
NODE_ID="$(<"${TMP_ROOT}/node_id")"
SECOND_NODE_ID="$(<"${TMP_ROOT}/second_node_id")"
# Before the restart, capture the node details, the Prometheus nodes.json file
# and the node statistics; these are the baseline for the later comparison.
first_before="$TMP_ROOT/${NODE_ID}_pre_restart.json"
second_before="$TMP_ROOT/${SECOND_NODE_ID}_pre_restart.json"
# --fail: exit non-zero on an HTTP error status so an error body is never
# stored as the baseline (the later comparison would otherwise pass vacuously
# when both sides are error documents).
curl -fsS "$API_BASE/nodes/$NODE_ID" -o "$first_before"
curl -fsS "$API_BASE/nodes/$SECOND_NODE_ID" -o "$second_before"

nodes_json_before="$TMP_ROOT/nodes_json_pre_restart.json"
cp "$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" "$nodes_json_before"

stats_before="$TMP_ROOT/stats_pre_restart.json"
curl -fsS "$API_BASE/nodes/statistics" -o "$stats_before"
# Restart the master container to exercise persistence across a service
# restart. The compose command is run from the test root.
pushd "$TEST_ROOT" >/dev/null
compose restart master
popd >/dev/null

# Wait for /readyz to return HTTP 200 again: up to 30 probes, one second apart.
status=""
for _ in {1..30}; do
  # `|| true`: a refused/failed connection is expected while the container is
  # still starting and must not abort the script under `set -e`.
  # --max-time bounds each probe so a stalled connection cannot hang the loop.
  status=$(curl -s --max-time 5 -o /dev/null -w '%{http_code}' "$ROOT_BASE/readyz" || true)
  if [[ "$status" == "200" ]]; then
    break
  fi
  sleep 1
done

if [[ "$status" != "200" ]]; then
  echo "[ERROR] master 容器重启后未恢复健康状态 (readyz=${status})" >&2
  exit 1
fi

# Short extra pause after readiness before querying the API again
# (NOTE(review): presumably lets the service finish loading state - confirm).
sleep 2
# Fetch both node detail documents again after the restart.
first_after="$TMP_ROOT/${NODE_ID}_post_restart.json"
second_after="$TMP_ROOT/${SECOND_NODE_ID}_post_restart.json"
# --fail: an HTTP error status (e.g. the node no longer exists after the
# restart) aborts the script here instead of saving the error body for
# comparison.
curl -fsS "$API_BASE/nodes/$NODE_ID" -o "$first_after"
curl -fsS "$API_BASE/nodes/$SECOND_NODE_ID" -o "$second_after"
#######################################
# Compare the key fields of a node document captured before and after the
# restart, to make sure nothing was lost.
# Arguments: $1 - path to the pre-restart JSON file
#            $2 - path to the post-restart JSON file
# Returns:   0 when every compared key is equal; non-zero (Python raises
#            AssertionError naming the first differing key) otherwise.
#######################################
assert_node_unchanged() {
  python3 - "$1" "$2" <<'PY'
import json
import sys

# Fields compared between the two snapshots.
COMPARED_KEYS = (
    "id",
    "name",
    "type",
    "version",
    "register_time",
    "meta_data",
    "config",
    "label",
    "health",
    "last_report",
    "agent_last_report",
)


def load(path):
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


before_path, after_path = sys.argv[1:3]
before = load(before_path)
after = load(after_path)

for key in COMPARED_KEYS:
    # .get(): a key absent from both documents compares equal (None == None).
    if before.get(key) != after.get(key):
        raise AssertionError(f"Key {key} changed after restart: {before.get(key)} -> {after.get(key)}")
PY
}

assert_node_unchanged "$first_before" "$first_after"
assert_node_unchanged "$second_before" "$second_after"
# Build a status report for the second node: the current UTC time (second
# precision, "Z" suffix) plus a healthy "log-fluentbit" entry.
payload=$(python3 - <<'PY'
import json
from datetime import datetime, timezone

now_utc = datetime.now(timezone.utc).replace(microsecond=0)
report = {
    "timestamp": now_utc.isoformat().replace("+00:00", "Z"),
    "health": {
        "log-fluentbit": {"status": "healthy"}
    }
}
print(json.dumps(report))
PY
)

# PUT the report; the response body and the HTTP status code are stored in
# separate files under TMP_ROOT.
status_body_file="$TMP_ROOT/restart_second_status.json"
status_code_file="$TMP_ROOT/restart_second_status_code"
curl -sS -X PUT \
  -H 'Content-Type: application/json' \
  -d "$payload" \
  -o "$status_body_file" \
  -w '%{http_code}' \
  "$API_BASE/nodes/$SECOND_NODE_ID/status" > "$status_code_file"

# Anything other than HTTP 200 is a failure: show the response body and abort.
put_code="$(<"$status_code_file")"
if [[ "$put_code" != "200" ]]; then
  echo "[ERROR] Failed to restore second node status post-restart" >&2
  cat "$status_body_file" >&2
  exit 1
fi

# Pause before the next snapshot is taken
# (NOTE(review): presumably gives the service time to rewrite nodes.json - confirm).
sleep 3
# Compare nodes.json and the node statistics from before and after the
# restart to verify that the persisted state is consistent.
nodes_json_after="$TMP_ROOT/nodes_json_post_restart.json"
cp "$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" "$nodes_json_after"

stats_after="$TMP_ROOT/stats_after_restart.json"
# --fail: an HTTP error status aborts here instead of saving an error body
# that would then be compared as if it were statistics.
curl -fsS "$API_BASE/nodes/statistics" -o "$stats_after"

#######################################
# Assert that two JSON files decode to equal values.
# Arguments: $1 - label used at the start of the failure message
#            $2 - path to the pre-restart JSON file
#            $3 - path to the post-restart JSON file
# Returns:   0 when equal; non-zero (Python raises AssertionError showing
#            both values) otherwise.
#######################################
assert_json_unchanged() {
  python3 - "$1" "$2" "$3" <<'PY'
import json
import sys

label, before_path, after_path = sys.argv[1:4]

with open(before_path, 'r', encoding='utf-8') as handle:
    before = json.load(handle)
with open(after_path, 'r', encoding='utf-8') as handle:
    after = json.load(handle)

if before != after:
    raise AssertionError(f"{label} changed after restart: {before} -> {after}")
PY
}

assert_json_unchanged "nodes.json" "$nodes_json_before" "$nodes_json_after"
assert_json_unchanged "Statistics" "$stats_before" "$stats_after"
# Final sanity check: the database file must exist and be non-empty.
[[ -s "${DB_PATH}" ]] || {
  echo "[ERROR] 数据库文件为空,疑似未持久化" >&2
  exit 1
}

echo "[INFO] Master 重启后持久化数据校验通过"