[#2] health目录统一在hostname下
This commit is contained in:
parent
d70d2c2305
commit
c5652c65c8
@ -38,7 +38,7 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推
|
|||||||
派生路径:
|
派生路径:
|
||||||
|
|
||||||
- 节点信息:`/private/argus/agent/<hostname>/node.json`
|
- 节点信息:`/private/argus/agent/<hostname>/node.json`
|
||||||
- 子模块健康目录:`/private/argus/agent/health/<hostname>/`
|
- 子模块健康目录:`/private/argus/agent/<hostname>/health/`
|
||||||
|
|
||||||
健康目录中的文件需遵循 `<模块前缀>-*.json` 命名(例如 `log-fluentbit.json`、`metric-node-exporter.json`),文件内容会原样并入上报的 `health` 字段。
|
健康目录中的文件需遵循 `<模块前缀>-*.json` 命名(例如 `log-fluentbit.json`、`metric-node-exporter.json`),文件内容会原样并入上报的 `health` 字段。
|
||||||
|
|
||||||
@ -64,4 +64,3 @@ cd src/agent/tests
|
|||||||
4. 清理 `tests/private/` 与临时容器网络。
|
4. 清理 `tests/private/` 与临时容器网络。
|
||||||
|
|
||||||
如需在真实环境部署,只需将 `dist/argus-agent` 连同健康目录挂载到目标主机,并按上表设置环境变量即可。
|
如需在真实环境部署,只需将 `dist/argus-agent` 连同健康目录挂载到目标主机,并按上表设置环境变量即可。
|
||||||
|
|
||||||
|
@ -52,7 +52,7 @@ def load_config() -> AgentConfig:
|
|||||||
|
|
||||||
hostname = _resolve_hostname()
|
hostname = _resolve_hostname()
|
||||||
node_file = f"/private/argus/agent/{hostname}/node.json"
|
node_file = f"/private/argus/agent/{hostname}/node.json"
|
||||||
health_dir = f"/private/argus/agent/health/{hostname}/"
|
health_dir = f"/private/argus/agent/{hostname}/health/"
|
||||||
|
|
||||||
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
|
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
|
||||||
if master_endpoint_env is None:
|
if master_endpoint_env is None:
|
||||||
|
714
src/agent/scripts/agent_deployment_verify.sh
Executable file
714
src/agent/scripts/agent_deployment_verify.sh
Executable file
@ -0,0 +1,714 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
LOG_PREFIX="[AGENT-VERIFY]"
|
||||||
|
MASTER_ENDPOINT_DEFAULT=""
|
||||||
|
AGENT_DATA_ROOT_DEFAULT="/private/argus/agent"
|
||||||
|
AGENT_ETC_ROOT_DEFAULT="/private/argus/etc"
|
||||||
|
REPORT_INTERVAL_DEFAULT="2"
|
||||||
|
|
||||||
|
ALLOW_CONFIG_TOUCH="false"
|
||||||
|
KEEP_TEST_HEALTH="false"
|
||||||
|
|
||||||
|
# Write an informational message to stdout, prefixed with LOG_PREFIX.
# Arguments: $@ - message words (joined with the first character of IFS).
log_info() {
  printf '%s\n' "${LOG_PREFIX} INFO $*"
}
|
||||||
|
|
||||||
|
# Write a warning message to stderr, prefixed with LOG_PREFIX.
# Arguments: $@ - message words (joined with the first character of IFS).
log_warn() {
  printf '%s\n' "${LOG_PREFIX} WARN $*" >&2
}
|
||||||
|
|
||||||
|
# Write an error message to stderr, prefixed with LOG_PREFIX.
# Arguments: $@ - message words (joined with the first character of IFS).
log_error() {
  printf '%s\n' "${LOG_PREFIX} ERROR $*" >&2
}
|
||||||
|
|
||||||
|
# Print the command-line help text (options and environment variables)
# to stdout. Callers redirect to stderr when reporting a usage error.
usage() {
  printf '%s\n' \
    'Usage: agent_deployment_verify.sh [options]' \
    '' \
    'Options:' \
    '--allow-config-touch Enable optional config PUT dry-run check.' \
    '--keep-test-health Keep the temporary verify health file after checks.' \
    '-h, --help Show this help message.' \
    '' \
    'Environment variables:' \
    'MASTER_ENDPOINT (required) Master API base endpoint, e.g. http://master:3000' \
    'AGENT_DATA_ROOT (default: /private/argus/agent)' \
    'AGENT_ETC_ROOT (default: /private/argus/etc)' \
    'VERIFY_HOSTNAME (default: output of hostname)' \
    'REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds'
}
|
||||||
|
|
||||||
|
# Consume command-line flags one at a time. Recognised flags flip the
# corresponding global switch; -h/--help prints usage and exits 0; anything
# else is reported on stderr together with the usage text and exits 2.
until [[ $# -eq 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --allow-config-touch)
      ALLOW_CONFIG_TOUCH="true"
      ;;
    --keep-test-health)
      KEEP_TEST_HEALTH="true"
      ;;
    *)
      log_error "Unknown option: $1"
      usage >&2
      exit 2
      ;;
  esac
  shift
done
|
||||||
|
|
||||||
|
MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}"
|
||||||
|
AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}"
|
||||||
|
AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}"
|
||||||
|
VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}"
|
||||||
|
REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}"
|
||||||
|
|
||||||
|
if [[ -z "$MASTER_ENDPOINT" ]]; then
|
||||||
|
log_error "MASTER_ENDPOINT is required"
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Validate REPORT_INTERVAL_SECONDS: it must be a positive decimal integer,
# otherwise fall back to REPORT_INTERVAL_DEFAULT with a warning.
# The 10# prefix forces base-10 so zero-padded input such as "08" is not
# rejected as invalid octal, either here or in later $(( )) arithmetic; the
# accepted value is stored back in canonical form without leading zeros.
if [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] && (( 10#$REPORT_INTERVAL_SECONDS > 0 )); then
  REPORT_INTERVAL_SECONDS=$((10#$REPORT_INTERVAL_SECONDS))
else
  log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
  REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
fi
|
||||||
|
|
||||||
|
normalize_endpoint() {
|
||||||
|
local endpoint="$1"
|
||||||
|
if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then
|
||||||
|
endpoint="http://$endpoint"
|
||||||
|
fi
|
||||||
|
endpoint="${endpoint%/}"
|
||||||
|
echo "$endpoint"
|
||||||
|
}
|
||||||
|
|
||||||
|
MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"
|
||||||
|
|
||||||
|
NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME"
|
||||||
|
NODE_JSON="$NODE_DIR/node.json"
|
||||||
|
HEALTH_DIR="$NODE_DIR/health"
|
||||||
|
DNS_CONF="$AGENT_ETC_ROOT/dns.conf"
|
||||||
|
UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh"
|
||||||
|
|
||||||
|
declare -a RESULTS_PASS=()
|
||||||
|
declare -a RESULTS_WARN=()
|
||||||
|
declare -a RESULTS_FAIL=()
|
||||||
|
|
||||||
|
# Record one check outcome and log it at the matching severity.
# Globals:   RESULTS_PASS / RESULTS_WARN / RESULTS_FAIL (appended to)
# Arguments: $1 - level: PASS, WARN or FAIL (any other value is ignored)
#            $2 - human-readable message
add_result() {
  local level="$1" message="$2"
  if [[ "$level" == "PASS" ]]; then
    RESULTS_PASS+=("$message")
    log_info "$message"
  elif [[ "$level" == "WARN" ]]; then
    RESULTS_WARN+=("$message")
    log_warn "$message"
  elif [[ "$level" == "FAIL" ]]; then
    RESULTS_FAIL+=("$message")
    log_error "$message"
  fi
}
|
||||||
|
|
||||||
|
HAS_JQ="0"
|
||||||
|
if command -v jq >/dev/null 2>&1; then
|
||||||
|
HAS_JQ="1"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then
|
||||||
|
log_error "Neither jq nor python3 is available for JSON processing"
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
CURL_OPTS=(--fail --show-error --silent --max-time 10)
|
||||||
|
|
||||||
|
# GET a URL with the shared CURL_OPTS and stream the body to stdout.
# Arguments: $1 - URL to fetch
# Returns:   0 on success, 1 when curl reports any failure.
curl_json() {
  local target="$1"
  curl "${CURL_OPTS[@]}" "$target" || return 1
}
|
||||||
|
|
||||||
|
# Extract a value from a JSON document and print it on stdout.
# Globals:   HAS_JQ (read) - "1" selects jq, anything else selects python3
# Arguments: $1 - JSON text
#            $2 - jq filter used when jq is available
#            $3 - Python expression over the name `data`, used otherwise
# Returns:   0 when a value was produced; 1 when the document cannot be
#            parsed, the expression fails, or the result is null/None
#            (jq -e additionally treats `false` as failure).
json_query() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi

  # The program is passed with -c so that stdin stays free for the JSON
  # document. Feeding the program through a heredoc would consume stdin and
  # leave json.load() with nothing to read.
  # NOTE(review): py_expr is eval()-ed; every caller in this script passes a
  # literal expression. Never pass externally supplied text here.
  python3 -c '
import json
import sys

expr = sys.argv[1]
try:
    data = json.load(sys.stdin)
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
if value is None:
    sys.exit(1)
if isinstance(value, (dict, list)):
    print(json.dumps(value))
else:
    print(value)
' "$py_expr" <<<"$json"
}
|
||||||
|
|
||||||
|
# Print the length of a JSON value (array/object/string) on stdout.
# Globals:   HAS_JQ (read) - "1" selects jq, anything else selects python3
# Arguments: $1 - JSON text
#            $2 - jq filter that yields the length (e.g. 'length')
#            $3 - Python expression over `data`; it may yield either the
#                 container to measure (e.g. 'data') or the length itself
#                 (e.g. 'len(data)') - both forms are accepted
# Returns:   0 on success, 1 on parse/evaluation failure.
json_length() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi

  # The program is passed with -c so that stdin carries the JSON document
  # (a heredoc program would consume stdin and starve json.load()).
  # NOTE(review): py_expr is eval()-ed; only pass literal expressions.
  python3 -c '
import json
import sys

expr = sys.argv[1]
try:
    data = json.load(sys.stdin)
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
# An expression such as len(data) already yields the count; calling len()
# on that integer would raise, so print it as-is. Booleans are excluded
# because bool is a subclass of int.
if isinstance(value, int) and not isinstance(value, bool):
    print(value)
    sys.exit(0)
try:
    print(len(value))
except Exception:
    sys.exit(1)
' "$py_expr" <<<"$json"
}
|
||||||
|
|
||||||
|
# Evaluate a boolean predicate against a JSON document.
# Globals:   HAS_JQ (read) - "1" selects jq, anything else selects python3
# Arguments: $1 - JSON text
#            $2 - jq filter whose result is tested by `jq -e`
#            $3 - Python expression over `data`, tested for truthiness
# Returns:   0 when the predicate holds; 1 when it does not, or when the
#            document/expression cannot be evaluated.
json_has_key() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then
      return 0
    fi
    return 1
  fi

  # The program is passed with -c so that stdin carries the JSON document
  # (a heredoc program would consume stdin and starve json.load()).
  # NOTE(review): py_expr is eval()-ed; only pass literal expressions.
  python3 -c '
import json
import sys

expr = sys.argv[1]
try:
    data = json.load(sys.stdin)
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
sys.exit(0 if value else 1)
' "$py_expr" <<<"$json"
}
|
||||||
|
|
||||||
|
# Convert an ISO-8601 timestamp to integer Unix epoch seconds on stdout.
# A trailing "Z" is rewritten to "+00:00" before parsing.
# Arguments: $1 - timestamp string
# Returns:   0 on success; 1 when the value is empty or cannot be parsed.
# Requires python3 (parsing is done with datetime.fromisoformat).
iso_to_epoch() {
  local stamp="$1"
  python3 - "$stamp" <<'PY'
import sys
from datetime import datetime

raw = sys.argv[1]
if not raw:
    sys.exit(1)
normalized = raw[:-1] + '+00:00' if raw.endswith('Z') else raw
try:
    parsed = datetime.fromisoformat(normalized)
except ValueError:
    sys.exit(1)
print(int(parsed.timestamp()))
PY
}
|
||||||
|
|
||||||
|
# Check that a file contains parseable JSON (read as UTF-8).
# Arguments: $1 - path of the file to check
# Returns:   0 when the file parses; non-zero (with a Python traceback on
#            stderr) when it is missing, unreadable or not valid JSON.
validate_json_file() {
  local target="$1"
  python3 - "$target" <<'PY'
import json
import sys

target = sys.argv[1]
with open(target, 'r', encoding='utf-8') as fh:
    json.load(fh)
PY
}
|
||||||
|
|
||||||
|
# Make sure a directory exists, creating it (and parents) when absent.
# A warning is logged only when the directory had to be created.
# Arguments: $1 - directory path
ensure_directory() {
  local target="$1"
  if [[ -d "$target" ]]; then
    return 0
  fi
  log_warn "Creating missing directory $target"
  mkdir -p "$target"
}
|
||||||
|
|
||||||
|
TEST_HEALTH_FILE=""
|
||||||
|
TEST_HEALTH_BACKUP=""
|
||||||
|
TEST_HEALTH_EXISTED="false"
|
||||||
|
|
||||||
|
# EXIT-trap handler for the temporary health file.
# Globals (read): TEST_HEALTH_FILE, TEST_HEALTH_EXISTED, TEST_HEALTH_BACKUP,
#                 KEEP_TEST_HEALTH
# Behaviour:
#   - no test file path recorded yet   -> nothing to do
#   - the file existed before the run  -> write the saved content back
#   - KEEP_TEST_HEALTH is "true"       -> leave whatever is on disk
#   - otherwise                        -> delete the file
cleanup() {
  if [[ -z "$TEST_HEALTH_FILE" ]]; then
    return 0
  fi
  if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
    printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
    return
  fi
  if [[ "$KEEP_TEST_HEALTH" != "true" ]]; then
    rm -f "$TEST_HEALTH_FILE"
  fi
}
|
||||||
|
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"
|
||||||
|
|
||||||
|
# 4.2 Master health checks
|
||||||
|
# Probe the master's /healthz and /readyz endpoints.
# curl appends "\n<http_code> <time_total>" after the body (-w), so the last
# line of the captured output is metadata and everything before it is the
# response payload.
# curl's stderr is captured in a mktemp file rather than a fixed /tmp name,
# so a pre-created file or symlink cannot be clobbered. The payload is
# extracted with sed '$d' because `head -n -1` is a GNU-only extension.
health_resp=""
healthz_err="$(mktemp)"
if ! health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>"$healthz_err"); then
  error_detail=$(cat "$healthz_err" || true)
  add_result FAIL "GET /healthz failed: $error_detail"
else
  http_meta=$(tail -n 1 <<<"$health_resp")
  payload=$(sed '$d' <<<"$health_resp")
  status_code=${http_meta%% *}
  elapsed=${http_meta##* }
  add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload"
fi
rm -f "$healthz_err"

readyz_err="$(mktemp)"
if ! readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>"$readyz_err"); then
  error_detail=$(cat "$readyz_err" || true)
  add_result FAIL "GET /readyz failed: $error_detail"
  readyz_payload=""
else
  readyz_meta=$(tail -n 1 <<<"$readyz_resp")
  readyz_payload=$(sed '$d' <<<"$readyz_resp")
  readyz_status=${readyz_meta%% *}
  readyz_elapsed=${readyz_meta##* }
  add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s"
fi
rm -f "$readyz_err"
|
||||||
|
|
||||||
|
# 4.3 Nodes list and detail
|
||||||
|
if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>/tmp/agent_verify_nodes.err); then
|
||||||
|
error_detail=$(cat /tmp/agent_verify_nodes.err || true)
|
||||||
|
add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
|
||||||
|
nodes_json=""
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_nodes.err
|
||||||
|
|
||||||
|
NODE_ENTRY=""
|
||||||
|
NODE_ID=""
|
||||||
|
NODE_IP=""
|
||||||
|
if [[ -n "$nodes_json" ]]; then
|
||||||
|
# Locate this host's entry in the master's node list.
# Globals:   HAS_JQ (read) - "1" selects jq, anything else selects python3
# Arguments: $1 - JSON array of node objects
#            $2 - node name to look for
# Outputs:   the matching node object as JSON on stdout
# Returns:   0 when a node was found, non-zero otherwise.
find_node_entry() {
  local nodes="$1" wanted="$2"
  if [[ "$HAS_JQ" == "1" ]]; then
    printf '%s' "$nodes" | jq -e --arg name "$wanted" '.[] | select(.name == $name)'
    return
  fi
  # The program is passed with -c so that stdin carries the node list; a
  # heredoc program would consume stdin and json.load() would see no input.
  python3 -c '
import json
import sys

hostname = sys.argv[1]
try:
    nodes = json.load(sys.stdin)
except ValueError:
    sys.exit(1)
for node in nodes:
    if isinstance(node, dict) and node.get("name") == hostname:
        print(json.dumps(node))
        sys.exit(0)
sys.exit(1)
' "$wanted" <<<"$nodes"
}

# An empty NODE_ENTRY means "not registered"; the caller reports that.
NODE_ENTRY=$(find_node_entry "$nodes_json" "$VERIFY_HOSTNAME") || NODE_ENTRY=""
|
||||||
|
|
||||||
|
if [[ -z "$NODE_ENTRY" ]]; then
|
||||||
|
add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
|
||||||
|
else
|
||||||
|
if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
|
||||||
|
add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
|
||||||
|
else
|
||||||
|
add_result FAIL "Failed to extract node id from master response"
|
||||||
|
fi
|
||||||
|
if NODE_IP=$(json_query "$NODE_ENTRY" '.meta_data.host_ip // empty' 'data.get("meta_data", {}).get("host_ip", "")'); then
|
||||||
|
if [[ -n "$NODE_IP" ]]; then
|
||||||
|
add_result PASS "Registered node host_ip=$NODE_IP"
|
||||||
|
else
|
||||||
|
add_result WARN "Node host_ip missing in master meta_data"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result WARN "Unable to read meta_data.host_ip"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "$NODE_ENTRY" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_node_detail.err); then
|
||||||
|
NODE_DETAIL_JSON="$NODE_DETAIL"
|
||||||
|
add_result PASS "Fetched node detail for $NODE_ID"
|
||||||
|
else
|
||||||
|
error_detail=$(cat /tmp/agent_verify_node_detail.err 2>/dev/null || true)
|
||||||
|
add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
|
||||||
|
NODE_DETAIL_JSON=""
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_node_detail.err
|
||||||
|
|
||||||
|
if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>/tmp/agent_verify_stats.err); then
|
||||||
|
if total_nodes=$(json_query "$stats_json" '.total_nodes' 'data["total_nodes"]'); then
|
||||||
|
if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
|
||||||
|
add_result PASS "Statistics total_nodes=$total_nodes"
|
||||||
|
else
|
||||||
|
add_result FAIL "Statistics total_nodes invalid: $total_nodes"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result FAIL "Unable to read total_nodes from statistics"
|
||||||
|
fi
|
||||||
|
if active_nodes=$(json_query "$stats_json" '.active_nodes' 'data["active_nodes"]'); then
|
||||||
|
if [[ "$active_nodes" =~ ^[0-9]+$ ]]; then
|
||||||
|
add_result PASS "Statistics active_nodes=$active_nodes"
|
||||||
|
else
|
||||||
|
add_result WARN "Statistics active_nodes not numeric: $active_nodes"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if inactive_nodes=$(json_query "$stats_json" '.inactive_nodes' 'data["inactive_nodes"]'); then
|
||||||
|
if [[ "$inactive_nodes" =~ ^[0-9]+$ ]]; then
|
||||||
|
add_result PASS "Statistics inactive_nodes=$inactive_nodes"
|
||||||
|
else
|
||||||
|
add_result WARN "Statistics inactive_nodes not numeric: $inactive_nodes"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$HAS_JQ" == "1" ]]; then
|
||||||
|
node_count=$(printf '%s' "$nodes_json" | jq 'length')
|
||||||
|
else
|
||||||
|
node_count=$(json_length "$nodes_json" 'length' 'len(data)')
|
||||||
|
fi
|
||||||
|
if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]]; then
|
||||||
|
if [[ "$total_nodes" -lt "$node_count" ]]; then
|
||||||
|
add_result WARN "Statistics total_nodes=$total_nodes less than nodes list count=$node_count"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
error_detail=$(cat /tmp/agent_verify_stats.err 2>/dev/null || true)
|
||||||
|
add_result FAIL "Failed to fetch node statistics: $error_detail"
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_stats.err
|
||||||
|
else
|
||||||
|
NODE_DETAIL_JSON=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 4.4 Agent persistence checks
|
||||||
|
if [[ -f "$NODE_JSON" ]]; then
|
||||||
|
node_file_content="$(cat "$NODE_JSON")"
|
||||||
|
if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
|
||||||
|
if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then
|
||||||
|
add_result PASS "node.json id matches master ($NODE_ID)"
|
||||||
|
else
|
||||||
|
add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result FAIL "Unable to extract id from node.json"
|
||||||
|
fi
|
||||||
|
if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
|
||||||
|
if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
|
||||||
|
add_result PASS "node.json name matches $VERIFY_HOSTNAME"
|
||||||
|
else
|
||||||
|
add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result FAIL "Unable to extract name from node.json"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then
|
||||||
|
if iso_to_epoch "$register_time" >/dev/null 2>&1; then
|
||||||
|
add_result PASS "node.json register_time valid ISO timestamp"
|
||||||
|
else
|
||||||
|
add_result WARN "node.json register_time invalid: $register_time"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result WARN "node.json missing register_time"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
|
||||||
|
if iso_to_epoch "$last_updated" >/dev/null 2>&1; then
|
||||||
|
add_result PASS "node.json last_updated valid ISO timestamp"
|
||||||
|
else
|
||||||
|
add_result WARN "node.json last_updated invalid: $last_updated"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result WARN "node.json missing last_updated"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result FAIL "node.json not found at $NODE_JSON"
|
||||||
|
node_file_content=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
ensure_directory "$HEALTH_DIR"
|
||||||
|
|
||||||
|
if [[ -d "$HEALTH_DIR" ]]; then
|
||||||
|
shopt -s nullglob
|
||||||
|
health_files=("$HEALTH_DIR"/*.json)
|
||||||
|
shopt -u nullglob
|
||||||
|
if [[ ${#health_files[@]} -eq 0 ]]; then
|
||||||
|
add_result WARN "Health directory $HEALTH_DIR is empty"
|
||||||
|
else
|
||||||
|
for hf in "${health_files[@]}"; do
|
||||||
|
base=$(basename "$hf")
|
||||||
|
if [[ "$base" != *-* ]]; then
|
||||||
|
add_result WARN "Health file $base does not follow <module>-*.json"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
if ! validate_json_file "$hf" >/dev/null 2>&1; then
|
||||||
|
add_result WARN "Health file $base is not valid JSON"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result WARN "Health directory $HEALTH_DIR missing"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -f "$DNS_CONF" ]]; then
|
||||||
|
nameservers=$(awk '/^nameserver/{print $2}' "$DNS_CONF" | xargs)
|
||||||
|
if [[ -z "$nameservers" ]]; then
|
||||||
|
add_result FAIL "dns.conf found but contains no nameserver entries"
|
||||||
|
else
|
||||||
|
add_result PASS "dns.conf nameservers: $nameservers"
|
||||||
|
if getent hosts master.argus.com >/dev/null 2>&1; then
|
||||||
|
resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs)
|
||||||
|
match_found="false"
|
||||||
|
for ns in $nameservers; do
|
||||||
|
if grep -qw "$ns" <<<"$resolved_ips"; then
|
||||||
|
match_found="true"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [[ "$match_found" == "true" ]]; then
|
||||||
|
add_result PASS "master.argus.com resolves via configured nameserver"
|
||||||
|
else
|
||||||
|
add_result WARN "master.argus.com resolved IPs ($resolved_ips) do not match dns.conf nameservers ($nameservers)"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result WARN "Failed to resolve master.argus.com"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result FAIL "dns.conf not found at $DNS_CONF"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -f "$UPDATE_SCRIPT" ]]; then
|
||||||
|
dns_mtime=$(stat -c %Y "$DNS_CONF" 2>/dev/null || echo 0)
|
||||||
|
upd_mtime=$(stat -c %Y "$UPDATE_SCRIPT" 2>/dev/null || echo 0)
|
||||||
|
if [[ "$dns_mtime" -gt 0 && "$upd_mtime" -gt 0 ]]; then
|
||||||
|
diff=$((dns_mtime - upd_mtime))
|
||||||
|
[[ $diff -lt 0 ]] && diff=$((-diff))
|
||||||
|
if [[ $diff -le 300 ]]; then
|
||||||
|
add_result PASS "dns.conf and update-dns.sh timestamps within 5 minutes"
|
||||||
|
else
|
||||||
|
add_result WARN "dns.conf and update-dns.sh timestamps differ by more than 5 minutes"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result WARN "update-dns.sh not found at $UPDATE_SCRIPT"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 4.5 Master-Node status consistency
|
||||||
|
sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))
|
||||||
|
|
||||||
|
if [[ -n "$NODE_DETAIL_JSON" ]]; then
|
||||||
|
detail_pre="$NODE_DETAIL_JSON"
|
||||||
|
else
|
||||||
|
detail_pre=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
|
||||||
|
if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_pre.err); then
|
||||||
|
add_result PASS "Fetched node detail pre-check"
|
||||||
|
else
|
||||||
|
error_detail=$(cat /tmp/agent_verify_detail_pre.err 2>/dev/null || true)
|
||||||
|
add_result FAIL "Unable to fetch node detail for status check: $error_detail"
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_detail_pre.err
|
||||||
|
fi
|
||||||
|
|
||||||
|
server_ts_pre=""
|
||||||
|
agent_ts_pre=""
|
||||||
|
server_ts_post=""
|
||||||
|
agent_ts_post=""
|
||||||
|
|
||||||
|
if [[ -n "$detail_pre" ]]; then
|
||||||
|
server_ts_pre=$(json_query "$detail_pre" '.last_report.server_timestamp' 'data.get("last_report", {}).get("server_timestamp")' || echo "")
|
||||||
|
agent_ts_pre=$(json_query "$detail_pre" '.last_report.agent_timestamp' 'data.get("last_report", {}).get("agent_timestamp")' || echo "")
|
||||||
|
log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"
|
||||||
|
|
||||||
|
sleep "$sleep_interval"
|
||||||
|
|
||||||
|
if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_post.err); then
|
||||||
|
server_ts_post=$(json_query "$detail_post" '.last_report.server_timestamp' 'data.get("last_report", {}).get("server_timestamp")' || echo "")
|
||||||
|
agent_ts_post=$(json_query "$detail_post" '.last_report.agent_timestamp' 'data.get("last_report", {}).get("agent_timestamp")' || echo "")
|
||||||
|
if [[ "$server_ts_post" != "$server_ts_pre" ]]; then
|
||||||
|
add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
|
||||||
|
else
|
||||||
|
add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
|
||||||
|
fi
|
||||||
|
if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
|
||||||
|
add_result PASS "last_report.agent_timestamp advanced"
|
||||||
|
else
|
||||||
|
add_result FAIL "last_report.agent_timestamp did not change"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "$node_file_content" ]]; then
|
||||||
|
if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
|
||||||
|
if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then
|
||||||
|
if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
|
||||||
|
diff=$((epoch_post - node_epoch))
|
||||||
|
[[ $diff -lt 0 ]] && diff=$((-diff))
|
||||||
|
tolerance=$((REPORT_INTERVAL_SECONDS * 2))
|
||||||
|
if [[ $diff -le $tolerance ]]; then
|
||||||
|
add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
|
||||||
|
else
|
||||||
|
add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
NODE_DETAIL_JSON="$detail_post"
|
||||||
|
else
|
||||||
|
error_detail=$(cat /tmp/agent_verify_detail_post.err 2>/dev/null || true)
|
||||||
|
add_result FAIL "Failed to fetch node detail post-check: $error_detail"
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_detail_post.err
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 4.6 Health simulation
|
||||||
|
TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json"
|
||||||
|
ensure_directory "$HEALTH_DIR"
|
||||||
|
|
||||||
|
if [[ -f "$TEST_HEALTH_FILE" ]]; then
|
||||||
|
TEST_HEALTH_EXISTED="true"
|
||||||
|
TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")"
|
||||||
|
else
|
||||||
|
TEST_HEALTH_EXISTED="false"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Write the synthetic health report to TEST_HEALTH_FILE as a single JSON
# line with status "ok" and the given message.
# Globals:   TEST_HEALTH_FILE (read) - destination path
# Arguments: $1 - message text, embedded verbatim (no JSON escaping)
create_health_file() {
  local note="$1"
  printf '{"status":"ok","message":"%s"}\n' "$note" > "$TEST_HEALTH_FILE"
}
|
||||||
|
|
||||||
|
# Check that the master's node detail carries the expected message under
# meta_data.health["verify-master"].
# Arguments: $1 - expected message
#            $2 - node detail JSON returned by the master
# Returns:   0 when the reported message equals the expected one; 1 when it
#            differs or cannot be extracted.
validate_health_in_master() {
  local expected_message="$1"
  local detail_json="$2"
  local reported
  reported=$(json_query "$detail_json" '.meta_data.health["verify-master"].message' 'data.get("meta_data", {}).get("health", {}).get("verify-master", {}).get("message")') || return 1
  if [[ "$reported" != "$expected_message" ]]; then
    return 1
  fi
  return 0
}
|
||||||
|
|
||||||
|
# Check that the master's node detail no longer lists a "verify-master"
# entry under meta_data.health.
# Arguments: $1 - node detail JSON returned by the master
# Returns:   0 when the key is absent (or cannot be evaluated); 1 when the
#            key is still present.
remove_health_from_master() {
  local detail_json="$1"
  ! json_has_key "$detail_json" '(.meta_data.health | has("verify-master"))' '"verify-master" in data.get("meta_data", {}).get("health", {})'
}
|
||||||
|
|
||||||
|
health_message_one="verify $(date +%s)"
|
||||||
|
create_health_file "$health_message_one"
|
||||||
|
add_result PASS "Created test health file $TEST_HEALTH_FILE"
|
||||||
|
|
||||||
|
sleep "$sleep_interval"
|
||||||
|
if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health1.err); then
|
||||||
|
if validate_health_in_master "$health_message_one" "$detail_health_one"; then
|
||||||
|
add_result PASS "Master reflects verify-master health message"
|
||||||
|
else
|
||||||
|
add_result FAIL "Master health payload does not match test message"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
error_detail=$(cat /tmp/agent_verify_health1.err 2>/dev/null || true)
|
||||||
|
add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
|
||||||
|
detail_health_one=""
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_health1.err
|
||||||
|
|
||||||
|
health_message_two="verify $(date +%s)-update"
|
||||||
|
create_health_file "$health_message_two"
|
||||||
|
sleep "$sleep_interval"
|
||||||
|
if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health2.err); then
|
||||||
|
if validate_health_in_master "$health_message_two" "$detail_health_two"; then
|
||||||
|
add_result PASS "Master health updated to new message"
|
||||||
|
else
|
||||||
|
add_result FAIL "Master health message did not update"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
error_detail=$(cat /tmp/agent_verify_health2.err 2>/dev/null || true)
|
||||||
|
add_result FAIL "Failed to fetch node detail after health update: $error_detail"
|
||||||
|
detail_health_two=""
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_health2.err
|
||||||
|
|
||||||
|
rm -f "$TEST_HEALTH_FILE"
|
||||||
|
sleep "$sleep_interval"
|
||||||
|
if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health3.err); then
|
||||||
|
if remove_health_from_master "$detail_health_three"; then
|
||||||
|
add_result PASS "Master health no longer lists verify-master after removal"
|
||||||
|
else
|
||||||
|
add_result FAIL "Master health still contains verify-master after file deletion"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
error_detail=$(cat /tmp/agent_verify_health3.err 2>/dev/null || true)
|
||||||
|
add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_health3.err
|
||||||
|
|
||||||
|
if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
|
||||||
|
printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Optional config touch
|
||||||
|
if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then
|
||||||
|
if [[ -n "$NODE_ID" ]]; then
|
||||||
|
payload='{"label": {"verify": "true"}}'
|
||||||
|
if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >/tmp/agent_verify_config.log 2>&1; then
|
||||||
|
add_result PASS "Config PUT dry-run succeeded"
|
||||||
|
else
|
||||||
|
add_result WARN "Config PUT dry-run failed: $(cat /tmp/agent_verify_config.log)"
|
||||||
|
fi
|
||||||
|
rm -f /tmp/agent_verify_config.log
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Result summary
|
||||||
|
echo
|
||||||
|
echo "==== Verification Summary ===="
|
||||||
|
# Print one summary line per recorded result, as "<LABEL>: <message>".
# Arguments: $1 - label (PASS / WARN / FAIL); $2.. - messages (may be none)
print_result_group() {
  local label="$1" entry
  shift
  for entry in "$@"; do
    printf '%s: %s\n' "$label" "$entry"
  done
}

# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A bare
# "${arr[@]}" on an empty array is an "unbound variable" error under
# `set -u` on bash older than 4.4, which would abort the summary whenever
# one severity has no entries.
print_result_group PASS ${RESULTS_PASS[@]+"${RESULTS_PASS[@]}"}
print_result_group WARN ${RESULTS_WARN[@]+"${RESULTS_WARN[@]}"}
print_result_group FAIL ${RESULTS_FAIL[@]+"${RESULTS_FAIL[@]}"}
|
||||||
|
|
||||||
|
if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
exit 0
|
@ -40,7 +40,7 @@ services:
|
|||||||
- REPORT_INTERVAL_SECONDS=2
|
- REPORT_INTERVAL_SECONDS=2
|
||||||
volumes:
|
volumes:
|
||||||
- ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
|
- ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
|
||||||
- ./private/argus/agent/health/dev-e2euser-e2einst-pod-0:/private/argus/agent/health/dev-e2euser-e2einst-pod-0
|
- ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
|
||||||
- ./private/argus/etc:/private/argus/etc
|
- ./private/argus/etc:/private/argus/etc
|
||||||
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro
|
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro
|
||||||
- ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
|
- ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
|
||||||
|
@ -3,7 +3,7 @@ set -euo pipefail
|
|||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||||
HEALTH_DIR="$TEST_ROOT/private/argus/agent/health/dev-e2euser-e2einst-pod-0"
|
HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health"
|
||||||
|
|
||||||
cat > "$HEALTH_DIR/log-fluentbit.json" <<JSON
|
cat > "$HEALTH_DIR/log-fluentbit.json" <<JSON
|
||||||
{
|
{
|
||||||
|
@ -65,7 +65,7 @@ popd >/dev/null
|
|||||||
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
|
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
|
||||||
|
|
||||||
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
|
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
|
||||||
HEALTH_DIR="$TEST_ROOT/private/argus/agent/health/$AGENT_HOSTNAME"
|
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
|
||||||
|
|
||||||
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
|
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
|
||||||
if ! docker run -d \
|
if ! docker run -d \
|
||||||
@ -74,7 +74,7 @@ if ! docker run -d \
|
|||||||
--network "$NETWORK_NAME" \
|
--network "$NETWORK_NAME" \
|
||||||
--ip "$NEW_AGENT_IP" \
|
--ip "$NEW_AGENT_IP" \
|
||||||
-v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
|
-v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
|
||||||
-v "$HEALTH_DIR:/private/argus/agent/health/$AGENT_HOSTNAME" \
|
-v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \
|
||||||
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
|
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
|
||||||
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
|
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
|
||||||
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
|
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user