diff --git a/src/agent/README.md b/src/agent/README.md index fa43f38..f89334d 100644 --- a/src/agent/README.md +++ b/src/agent/README.md @@ -38,7 +38,7 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推 派生路径: - 节点信息:`/private/argus/agent//node.json` -- 子模块健康目录:`/private/argus/agent/health//` +- 子模块健康目录:`/private/argus/agent//health/` 健康目录中的文件需遵循 `<模块前缀>-*.json` 命名(例如 `log-fluentbit.json`、`metric-node-exporter.json`),文件内容会原样并入上报的 `health` 字段。 @@ -64,4 +64,3 @@ cd src/agent/tests 4. 清理 `tests/private/` 与临时容器网络。 如需在真实环境部署,只需将 `dist/argus-agent` 连同健康目录挂载到目标主机,并按上表设置环境变量即可。 - diff --git a/src/agent/app/config.py b/src/agent/app/config.py index f5359f8..dae5d47 100644 --- a/src/agent/app/config.py +++ b/src/agent/app/config.py @@ -52,7 +52,7 @@ def load_config() -> AgentConfig: hostname = _resolve_hostname() node_file = f"/private/argus/agent/{hostname}/node.json" - health_dir = f"/private/argus/agent/health/{hostname}/" + health_dir = f"/private/argus/agent/{hostname}/health/" master_endpoint_env = os.environ.get("MASTER_ENDPOINT") if master_endpoint_env is None: diff --git a/src/agent/scripts/agent_deployment_verify.sh b/src/agent/scripts/agent_deployment_verify.sh new file mode 100755 index 0000000..86249a0 --- /dev/null +++ b/src/agent/scripts/agent_deployment_verify.sh @@ -0,0 +1,714 @@ +#!/usr/bin/env bash +set -euo pipefail + +LOG_PREFIX="[AGENT-VERIFY]" +MASTER_ENDPOINT_DEFAULT="" +AGENT_DATA_ROOT_DEFAULT="/private/argus/agent" +AGENT_ETC_ROOT_DEFAULT="/private/argus/etc" +REPORT_INTERVAL_DEFAULT="2" + +ALLOW_CONFIG_TOUCH="false" +KEEP_TEST_HEALTH="false" + +log_info() { + echo "${LOG_PREFIX} INFO $*" +} + +log_warn() { + echo "${LOG_PREFIX} WARN $*" >&2 +} + +log_error() { + echo "${LOG_PREFIX} ERROR $*" >&2 +} + +usage() { + cat <<'USAGE' +Usage: agent_deployment_verify.sh [options] + +Options: + --allow-config-touch Enable optional config PUT dry-run check. + --keep-test-health Keep the temporary verify health file after checks. + -h, --help Show this help message. + +Environment variables: + MASTER_ENDPOINT (required) Master API base endpoint, e.g. http://master:3000 + AGENT_DATA_ROOT (default: /private/argus/agent) + AGENT_ETC_ROOT (default: /private/argus/etc) + VERIFY_HOSTNAME (default: output of hostname) + REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds +USAGE +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --allow-config-touch) + ALLOW_CONFIG_TOUCH="true" + shift + ;; + --keep-test-health) + KEEP_TEST_HEALTH="true" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + log_error "Unknown option: $1" + usage >&2 + exit 2 + ;; + esac +done + +MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}" +AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}" +AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}" +VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}" +REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}" + +if [[ -z "$MASTER_ENDPOINT" ]]; then + log_error "MASTER_ENDPOINT is required" + exit 2 +fi + +if ! [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] || [[ "$REPORT_INTERVAL_SECONDS" -le 0 ]]; then + log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT" + REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT" +fi + +normalize_endpoint() { + local endpoint="$1" + if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then + endpoint="http://$endpoint" + fi + endpoint="${endpoint%/}" + echo "$endpoint" +} + +MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")" + +NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME" +NODE_JSON="$NODE_DIR/node.json" +HEALTH_DIR="$NODE_DIR/health" +DNS_CONF="$AGENT_ETC_ROOT/dns.conf" +UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh" + +declare -a RESULTS_PASS=() +declare -a RESULTS_WARN=() +declare -a RESULTS_FAIL=() + +add_result() { + local level="$1" message="$2" + case "$level" in + PASS) + RESULTS_PASS+=("$message") + log_info "$message" + ;; + WARN) + RESULTS_WARN+=("$message") + log_warn "$message" + ;; + FAIL) + RESULTS_FAIL+=("$message") + log_error "$message" + ;; + esac +} + +HAS_JQ="0" +if command -v jq >/dev/null 2>&1; then + HAS_JQ="1" +fi + +if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then + log_error "Neither jq nor python3 is available for JSON processing" + exit 2 +fi + +CURL_OPTS=(--fail --show-error --silent --max-time 10) + +curl_json() { + local url="$1" + if ! curl "${CURL_OPTS[@]}" "$url"; then + return 1 + fi +} + +json_query() { + local json="$1" jq_expr="$2" py_expr="$3" + if [[ "$HAS_JQ" == "1" ]]; then + if ! output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null); then + return 1 + fi + printf '%s' "$output" + return 0 + fi + + python3 - "$py_expr" <<'PY' +import json +import sys + +expr = sys.argv[1] +try: + data = json.load(sys.stdin) + value = eval(expr, {}, {"data": data}) +except Exception: + sys.exit(1) +if value is None: + sys.exit(1) +if isinstance(value, (dict, list)): + print(json.dumps(value)) +else: + print(value) +PY +} + +json_length() { + local json="$1" jq_expr="$2" py_expr="$3" + if [[ "$HAS_JQ" == "1" ]]; then + if ! output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null); then + return 1 + fi + printf '%s' "$output" + return 0 + fi + + python3 - "$py_expr" <<'PY' +import json +import sys + +expr = sys.argv[1] +try: + data = json.load(sys.stdin) + value = eval(expr, {}, {"data": data}) +except Exception: + sys.exit(1) +try: + print(len(value)) +except Exception: + sys.exit(1) +PY +} + +json_has_key() { + local json="$1" jq_expr="$2" py_expr="$3" + if [[ "$HAS_JQ" == "1" ]]; then + if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then + return 0 + fi + return 1 + fi + + python3 - "$py_expr" <<'PY' +import json +import sys + +expr = sys.argv[1] +try: + data = json.load(sys.stdin) + value = eval(expr, {}, {"data": data}) +except Exception: + sys.exit(1) +if value: + sys.exit(0) +sys.exit(1) +PY +} + +iso_to_epoch() { + local value="$1" + python3 - "$value" <<'PY' +import sys +from datetime import datetime + +value = sys.argv[1] +if value is None or value == "": + sys.exit(1) +if value.endswith('Z'): + value = value[:-1] + '+00:00' +try: + dt = datetime.fromisoformat(value) +except ValueError: + sys.exit(1) +print(int(dt.timestamp())) +PY +} + +validate_json_file() { + local path="$1" + python3 - "$path" <<'PY' +import json +import sys +path = sys.argv[1] +with open(path, 'r', encoding='utf-8') as handle: + json.load(handle) +PY +} + +ensure_directory() { + local dir="$1" + if [[ ! -d "$dir" ]]; then + log_warn "Creating missing directory $dir" + mkdir -p "$dir" + fi +} + +TEST_HEALTH_FILE="" +TEST_HEALTH_BACKUP="" +TEST_HEALTH_EXISTED="false" + +cleanup() { + if [[ -n "$TEST_HEALTH_FILE" ]]; then + if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then + printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE" + elif [[ "$KEEP_TEST_HEALTH" == "true" ]]; then + : + else + rm -f "$TEST_HEALTH_FILE" + fi + fi +} + +trap cleanup EXIT + +log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'" + +# 4.2 Master health checks +health_resp="" +if ! health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>/tmp/agent_verify_healthz.err); then + error_detail=$(cat /tmp/agent_verify_healthz.err || true) + add_result FAIL "GET /healthz failed: $error_detail" +else + http_meta=$(tail -n1 <<<"$health_resp") + payload=$(head -n -1 <<<"$health_resp" || true) + status_code=${http_meta%% *} + elapsed=${http_meta##* } + add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload" +fi +rm -f /tmp/agent_verify_healthz.err + +if ! readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>/tmp/agent_verify_readyz.err); then + error_detail=$(cat /tmp/agent_verify_readyz.err || true) + add_result FAIL "GET /readyz failed: $error_detail" + readyz_payload="" +else + readyz_meta=$(tail -n1 <<<"$readyz_resp") + readyz_payload=$(head -n -1 <<<"$readyz_resp" || true) + readyz_status=${readyz_meta%% *} + readyz_elapsed=${readyz_meta##* } + add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s" +fi +rm -f /tmp/agent_verify_readyz.err + +# 4.3 Nodes list and detail +if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>/tmp/agent_verify_nodes.err); then + error_detail=$(cat /tmp/agent_verify_nodes.err || true) + add_result FAIL "GET /api/v1/master/nodes failed: $error_detail" + nodes_json="" +fi +rm -f /tmp/agent_verify_nodes.err + +NODE_ENTRY="" +NODE_ID="" +NODE_IP="" +if [[ -n "$nodes_json" ]]; then + if [[ "$HAS_JQ" == "1" ]]; then + NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY="" + else + NODE_ENTRY=$(python3 - "$VERIFY_HOSTNAME" <<'PY' +import json +import sys + +hostname = sys.argv[1] +nodes = json.load(sys.stdin) +for node in nodes: + if node.get("name") == hostname: + import json as _json + print(_json.dumps(node)) + sys.exit(0) +sys.exit(1) +PY + ) || NODE_ENTRY="" + fi + + if [[ -z "$NODE_ENTRY" ]]; then + add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list" + else + if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then + add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'" + else + add_result FAIL "Failed to extract node id from master response" + fi + if NODE_IP=$(json_query "$NODE_ENTRY" '.meta_data.host_ip // empty' 'data.get("meta_data", {}).get("host_ip", "")'); then + if [[ -n "$NODE_IP" ]]; then + add_result PASS "Registered node host_ip=$NODE_IP" + else + add_result WARN "Node host_ip missing in master meta_data" + fi + else + add_result WARN "Unable to read meta_data.host_ip" + fi + fi + + if [[ -n "$NODE_ENTRY" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_node_detail.err); then + NODE_DETAIL_JSON="$NODE_DETAIL" + add_result PASS "Fetched node detail for $NODE_ID" + else + error_detail=$(cat /tmp/agent_verify_node_detail.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail" + NODE_DETAIL_JSON="" + fi + rm -f /tmp/agent_verify_node_detail.err + + if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>/tmp/agent_verify_stats.err); then + if total_nodes=$(json_query "$stats_json" '.total_nodes' 'data["total_nodes"]'); then + if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then + add_result PASS "Statistics total_nodes=$total_nodes" + else + add_result FAIL "Statistics total_nodes invalid: $total_nodes" + fi + else + add_result FAIL "Unable to read total_nodes from statistics" + fi + if active_nodes=$(json_query "$stats_json" '.active_nodes' 'data["active_nodes"]'); then + if [[ "$active_nodes" =~ ^[0-9]+$ ]]; then + add_result PASS "Statistics active_nodes=$active_nodes" + else + add_result WARN "Statistics active_nodes not numeric: $active_nodes" + fi + fi + if inactive_nodes=$(json_query "$stats_json" '.inactive_nodes' 'data["inactive_nodes"]'); then + if [[ "$inactive_nodes" =~ ^[0-9]+$ ]]; then + add_result PASS "Statistics inactive_nodes=$inactive_nodes" + else + add_result WARN "Statistics inactive_nodes not numeric: $inactive_nodes" + fi + fi + + if [[ "$HAS_JQ" == "1" ]]; then + node_count=$(printf '%s' "$nodes_json" | jq 'length') + else + node_count=$(json_length "$nodes_json" 'length' 'len(data)') + fi + if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]]; then + if [[ "$total_nodes" -lt "$node_count" ]]; then + add_result WARN "Statistics total_nodes=$total_nodes less than nodes list count=$node_count" + fi + fi + else + error_detail=$(cat /tmp/agent_verify_stats.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node statistics: $error_detail" + fi + rm -f /tmp/agent_verify_stats.err +else + NODE_DETAIL_JSON="" +fi + +# 4.4 Agent persistence checks +if [[ -f "$NODE_JSON" ]]; then + node_file_content="$(cat "$NODE_JSON")" + if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then + if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then + add_result PASS "node.json id matches master ($NODE_ID)" + else + add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'" + fi + else + add_result FAIL "Unable to extract id from node.json" + fi + if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then + if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then + add_result PASS "node.json name matches $VERIFY_HOSTNAME" + else + add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'" + fi + else + add_result FAIL "Unable to extract name from node.json" + fi + + if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then + if iso_to_epoch "$register_time" >/dev/null 2>&1; then + add_result PASS "node.json register_time valid ISO timestamp" + else + add_result WARN "node.json register_time invalid: $register_time" + fi + else + add_result WARN "node.json missing register_time" + fi + + if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then + if iso_to_epoch "$last_updated" >/dev/null 2>&1; then + add_result PASS "node.json last_updated valid ISO timestamp" + else + add_result WARN "node.json last_updated invalid: $last_updated" + fi + else + add_result WARN "node.json missing last_updated" + fi +else + add_result FAIL "node.json not found at $NODE_JSON" + node_file_content="" +fi + +ensure_directory "$HEALTH_DIR" + +if [[ -d "$HEALTH_DIR" ]]; then + shopt -s nullglob + health_files=("$HEALTH_DIR"/*.json) + shopt -u nullglob + if [[ ${#health_files[@]} -eq 0 ]]; then + add_result WARN "Health directory $HEALTH_DIR is empty" + else + for hf in "${health_files[@]}"; do + base=$(basename "$hf") + if [[ "$base" != *-* ]]; then + add_result WARN "Health file $base does not follow -*.json" + continue + fi + if ! validate_json_file "$hf" >/dev/null 2>&1; then + add_result WARN "Health file $base is not valid JSON" + fi + done + fi +else + add_result WARN "Health directory $HEALTH_DIR missing" +fi + +if [[ -f "$DNS_CONF" ]]; then + nameservers=$(awk '/^nameserver/{print $2}' "$DNS_CONF" | xargs) + if [[ -z "$nameservers" ]]; then + add_result FAIL "dns.conf found but contains no nameserver entries" + else + add_result PASS "dns.conf nameservers: $nameservers" + if getent hosts master.argus.com >/dev/null 2>&1; then + resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs) + match_found="false" + for ns in $nameservers; do + if grep -qw "$ns" <<<"$resolved_ips"; then + match_found="true" + fi + done + if [[ "$match_found" == "true" ]]; then + add_result PASS "master.argus.com resolves via configured nameserver" + else + add_result WARN "master.argus.com resolved IPs ($resolved_ips) do not match dns.conf nameservers ($nameservers)" + fi + else + add_result WARN "Failed to resolve master.argus.com" + fi + fi +else + add_result FAIL "dns.conf not found at $DNS_CONF" +fi + +if [[ -f "$UPDATE_SCRIPT" ]]; then + dns_mtime=$(stat -c %Y "$DNS_CONF" 2>/dev/null || echo 0) + upd_mtime=$(stat -c %Y "$UPDATE_SCRIPT" 2>/dev/null || echo 0) + if [[ "$dns_mtime" -gt 0 && "$upd_mtime" -gt 0 ]]; then + diff=$((dns_mtime - upd_mtime)) + [[ $diff -lt 0 ]] && diff=$((-diff)) + if [[ $diff -le 300 ]]; then + add_result PASS "dns.conf and update-dns.sh timestamps within 5 minutes" + else + add_result WARN "dns.conf and update-dns.sh timestamps differ by more than 5 minutes" + fi + fi +else + add_result WARN "update-dns.sh not found at $UPDATE_SCRIPT" +fi + +# 4.5 Master-Node status consistency +sleep_interval=$((REPORT_INTERVAL_SECONDS + 2)) + +if [[ -n "$NODE_DETAIL_JSON" ]]; then + detail_pre="$NODE_DETAIL_JSON" +else + detail_pre="" +fi + +if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then + if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_pre.err); then + add_result PASS "Fetched node detail pre-check" + else + error_detail=$(cat /tmp/agent_verify_detail_pre.err 2>/dev/null || true) + add_result FAIL "Unable to fetch node detail for status check: $error_detail" + fi + rm -f /tmp/agent_verify_detail_pre.err +fi + +server_ts_pre="" +agent_ts_pre="" +server_ts_post="" +agent_ts_post="" + +if [[ -n "$detail_pre" ]]; then + server_ts_pre=$(json_query "$detail_pre" '.last_report.server_timestamp' 'data.get("last_report", {}).get("server_timestamp")' || echo "") + agent_ts_pre=$(json_query "$detail_pre" '.last_report.agent_timestamp' 'data.get("last_report", {}).get("agent_timestamp")' || echo "") + log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'" + + sleep "$sleep_interval" + + if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_post.err); then + server_ts_post=$(json_query "$detail_post" '.last_report.server_timestamp' 'data.get("last_report", {}).get("server_timestamp")' || echo "") + agent_ts_post=$(json_query "$detail_post" '.last_report.agent_timestamp' 'data.get("last_report", {}).get("agent_timestamp")' || echo "") + if [[ "$server_ts_post" != "$server_ts_pre" ]]; then + add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)" + else + add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s" + fi + if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then + add_result PASS "last_report.agent_timestamp advanced" + else + add_result FAIL "last_report.agent_timestamp did not change" + fi + + if [[ -n "$node_file_content" ]]; then + if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then + if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then + if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then + diff=$((epoch_post - node_epoch)) + [[ $diff -lt 0 ]] && diff=$((-diff)) + tolerance=$((REPORT_INTERVAL_SECONDS * 2)) + if [[ $diff -le $tolerance ]]; then + add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)" + else + add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s" + fi + fi + fi + fi + fi + + NODE_DETAIL_JSON="$detail_post" + else + error_detail=$(cat /tmp/agent_verify_detail_post.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail post-check: $error_detail" + fi + rm -f /tmp/agent_verify_detail_post.err +fi + +# 4.6 Health simulation +TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json" +ensure_directory "$HEALTH_DIR" + +if [[ -f "$TEST_HEALTH_FILE" ]]; then + TEST_HEALTH_EXISTED="true" + TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")" +else + TEST_HEALTH_EXISTED="false" +fi + +create_health_file() { + local message="$1" + cat > "$TEST_HEALTH_FILE" </tmp/agent_verify_health1.err); then + if validate_health_in_master "$health_message_one" "$detail_health_one"; then + add_result PASS "Master reflects verify-master health message" + else + add_result FAIL "Master health payload does not match test message" + fi +else + error_detail=$(cat /tmp/agent_verify_health1.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail during health validation: $error_detail" + detail_health_one="" +fi +rm -f /tmp/agent_verify_health1.err + +health_message_two="verify $(date +%s)-update" +create_health_file "$health_message_two" +sleep "$sleep_interval" +if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health2.err); then + if validate_health_in_master "$health_message_two" "$detail_health_two"; then + add_result PASS "Master health updated to new message" + else + add_result FAIL "Master health message did not update" + fi +else + error_detail=$(cat /tmp/agent_verify_health2.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail after health update: $error_detail" + detail_health_two="" +fi +rm -f /tmp/agent_verify_health2.err + +rm -f "$TEST_HEALTH_FILE" +sleep "$sleep_interval" +if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health3.err); then + if remove_health_from_master "$detail_health_three"; then + add_result PASS "Master health no longer lists verify-master after removal" + else + add_result FAIL "Master health still contains verify-master after file deletion" + fi +else + error_detail=$(cat /tmp/agent_verify_health3.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail after health removal: $error_detail" +fi +rm -f /tmp/agent_verify_health3.err + +if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then + printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE" +fi + +# Optional config touch +if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then + if [[ -n "$NODE_ID" ]]; then + payload='{"label": {"verify": "true"}}' + if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >/tmp/agent_verify_config.log 2>&1; then + add_result PASS "Config PUT dry-run succeeded" + else + add_result WARN "Config PUT dry-run failed: $(cat /tmp/agent_verify_config.log)" + fi + rm -f /tmp/agent_verify_config.log + fi +else + add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)" +fi + +# Result summary +echo +echo "==== Verification Summary ====" +for entry in "${RESULTS_PASS[@]}"; do + printf 'PASS: %s\n' "$entry" +done +for entry in "${RESULTS_WARN[@]}"; do + printf 'WARN: %s\n' "$entry" +done +for entry in "${RESULTS_FAIL[@]}"; do + printf 'FAIL: %s\n' "$entry" +done + +if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then + exit 1 +fi + +exit 0 diff --git a/src/agent/tests/docker-compose.yml b/src/agent/tests/docker-compose.yml index 2cd4220..6696402 100644 --- a/src/agent/tests/docker-compose.yml +++ b/src/agent/tests/docker-compose.yml @@ -40,7 +40,7 @@ services: - REPORT_INTERVAL_SECONDS=2 volumes: - ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0 - - ./private/argus/agent/health/dev-e2euser-e2einst-pod-0:/private/argus/agent/health/dev-e2euser-e2einst-pod-0 + - ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health - ./private/argus/etc:/private/argus/etc - ../dist/argus-agent:/usr/local/bin/argus-agent:ro - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro diff --git a/src/agent/tests/scripts/04_write_health_files.sh b/src/agent/tests/scripts/04_write_health_files.sh index d5ec974..ba7128e 100755 --- a/src/agent/tests/scripts/04_write_health_files.sh +++ b/src/agent/tests/scripts/04_write_health_files.sh @@ -3,7 +3,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -HEALTH_DIR="$TEST_ROOT/private/argus/agent/health/dev-e2euser-e2einst-pod-0" +HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health" cat > "$HEALTH_DIR/log-fluentbit.json" </dev/null docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME" -HEALTH_DIR="$TEST_ROOT/private/argus/agent/health/$AGENT_HOSTNAME" +HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health" # 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态 if ! docker run -d \ @@ -74,7 +74,7 @@ if ! docker run -d \ --network "$NETWORK_NAME" \ --ip "$NEW_AGENT_IP" \ -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \ - -v "$HEALTH_DIR:/private/argus/agent/health/$AGENT_HOSTNAME" \ + -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \ -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \ -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \ -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \