[#2] health目录统一在hostname下

This commit is contained in:
yuyr 2025-09-26 10:04:24 +00:00
parent db0727cdc7
commit 493141ab77
6 changed files with 720 additions and 7 deletions

View File

@ -38,7 +38,7 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推
派生路径: 派生路径:
- 节点信息:`/private/argus/agent/<hostname>/node.json` - 节点信息:`/private/argus/agent/<hostname>/node.json`
- 子模块健康目录:`/private/argus/agent/health/<hostname>/` - 子模块健康目录:`/private/argus/agent/<hostname>/health/`
健康目录中的文件需遵循 `<模块前缀>-*.json` 命名(例如 `log-fluentbit.json``metric-node-exporter.json`),文件内容会原样并入上报的 `health` 字段。 健康目录中的文件需遵循 `<模块前缀>-*.json` 命名(例如 `log-fluentbit.json``metric-node-exporter.json`),文件内容会原样并入上报的 `health` 字段。
@ -64,4 +64,3 @@ cd src/agent/tests
4. 清理 `tests/private/` 与临时容器网络。 4. 清理 `tests/private/` 与临时容器网络。
如需在真实环境部署,只需将 `dist/argus-agent` 连同健康目录挂载到目标主机,并按上表设置环境变量即可。 如需在真实环境部署,只需将 `dist/argus-agent` 连同健康目录挂载到目标主机,并按上表设置环境变量即可。

View File

@ -52,7 +52,7 @@ def load_config() -> AgentConfig:
hostname = _resolve_hostname() hostname = _resolve_hostname()
node_file = f"/private/argus/agent/{hostname}/node.json" node_file = f"/private/argus/agent/{hostname}/node.json"
health_dir = f"/private/argus/agent/health/{hostname}/" health_dir = f"/private/argus/agent/{hostname}/health/"
master_endpoint_env = os.environ.get("MASTER_ENDPOINT") master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
if master_endpoint_env is None: if master_endpoint_env is None:

View File

@ -0,0 +1,714 @@
#!/usr/bin/env bash
# Agent deployment verification script.
#
# Queries the master API (/healthz, /readyz, /api/v1/master/nodes...), inspects
# the agent's on-disk state (node.json, health directory, dns.conf) and
# simulates a health file change, recording each check as PASS, WARN or FAIL.
#
# Exit status: 0 when no check failed, 1 when at least one check failed,
# 2 on usage errors or missing prerequisites.
#
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail
# Tag prepended to every log line.
LOG_PREFIX="[AGENT-VERIFY]"
# Defaults applied when the matching environment variable is unset or empty.
# MASTER_ENDPOINT has no usable default: an empty value is rejected later.
MASTER_ENDPOINT_DEFAULT=""
AGENT_DATA_ROOT_DEFAULT="/private/argus/agent"
AGENT_ETC_ROOT_DEFAULT="/private/argus/etc"
REPORT_INTERVAL_DEFAULT="2"
# Command-line switches; flipped to "true" by --allow-config-touch and
# --keep-test-health respectively.
ALLOW_CONFIG_TOUCH="false"
KEEP_TEST_HEALTH="false"
# Logging helpers. Each joins its arguments with spaces and prefixes the
# line with LOG_PREFIX and a severity tag.

# Informational message, written to stdout.
log_info() {
  printf '%s INFO %s\n' "${LOG_PREFIX}" "$*"
}

# Warning message, written to stderr.
log_warn() {
  printf '%s WARN %s\n' "${LOG_PREFIX}" "$*" >&2
}

# Error message, written to stderr.
log_error() {
  printf '%s ERROR %s\n' "${LOG_PREFIX}" "$*" >&2
}
# Print the command-line help text to stdout, one line per printf argument.
usage() {
  printf '%s\n' \
    'Usage: agent_deployment_verify.sh [options]' \
    'Options:' \
    '--allow-config-touch Enable optional config PUT dry-run check.' \
    '--keep-test-health Keep the temporary verify health file after checks.' \
    '-h, --help Show this help message.' \
    'Environment variables:' \
    'MASTER_ENDPOINT (required) Master API base endpoint, e.g. http://master:3000' \
    'AGENT_DATA_ROOT (default: /private/argus/agent)' \
    'AGENT_ETC_ROOT (default: /private/argus/etc)' \
    'VERIFY_HOSTNAME (default: output of hostname)' \
    'REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds'
}
# Consume command-line flags one at a time. --help prints usage and exits 0;
# any unrecognised argument prints usage to stderr and exits 2.
while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      usage
      exit 0
      ;;
    --allow-config-touch)
      ALLOW_CONFIG_TOUCH="true"
      ;;
    --keep-test-health)
      KEEP_TEST_HEALTH="true"
      ;;
    *)
      log_error "Unknown option: $1"
      usage >&2
      exit 2
      ;;
  esac
  # Only reached for recognised flags; the other arms exit above.
  shift
done
# Resolve runtime settings: an environment variable wins, otherwise the
# built-in default is used. VERIFY_HOSTNAME falls back to `hostname` output.
MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}"
AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}"
AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}"
VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}"
REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}"

# The master endpoint has no default; without it nothing can be verified.
if [[ -z "$MASTER_ENDPOINT" ]]; then
  log_error "MASTER_ENDPOINT is required"
  exit 2
fi

# Accept only positive decimal integers. The 10# prefix forces base 10 so
# zero-padded input such as "08" is not parsed as (invalid) octal, and the
# value is stored normalised so later arithmetic on it cannot fail either.
if [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] && (( 10#$REPORT_INTERVAL_SECONDS > 0 )); then
  REPORT_INTERVAL_SECONDS=$(( 10#$REPORT_INTERVAL_SECONDS ))
else
  log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
  REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
fi
#######################################
# Normalise a master endpoint into a base URL.
# Arguments: $1 - endpoint, with or without an http:// or https:// scheme
# Outputs:   the endpoint on stdout, prefixed with http:// when no scheme
#            was given and with every trailing slash removed, so that
#            "$base/path" never contains a doubled slash
#######################################
normalize_endpoint() {
  local endpoint="$1"
  if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then
    endpoint="http://$endpoint"
  fi
  # Strip all trailing slashes, not just the first one.
  while [[ "$endpoint" == */ ]]; do
    endpoint="${endpoint%/}"
  done
  printf '%s\n' "$endpoint"
}
# Base URL used for every master API request.
MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"

# Per-host agent state lives under <data root>/<hostname>/.
NODE_DIR="${AGENT_DATA_ROOT}/${VERIFY_HOSTNAME}"
NODE_JSON="${NODE_DIR}/node.json"
HEALTH_DIR="${NODE_DIR}/health"

# DNS configuration files under the etc root.
DNS_CONF="${AGENT_ETC_ROOT}/dns.conf"
UPDATE_SCRIPT="${AGENT_ETC_ROOT}/update-dns.sh"

# Result buckets filled by add_result and printed in the final summary.
declare -a RESULTS_PASS=() RESULTS_WARN=() RESULTS_FAIL=()
#######################################
# Record one check outcome and log it at the matching severity.
# Globals:   RESULTS_PASS, RESULTS_WARN, RESULTS_FAIL (appended to)
# Arguments: $1 - level: PASS, WARN or FAIL (anything else is ignored)
#            $2 - human-readable message
#######################################
add_result() {
  local level="$1" message="$2"
  if [[ "$level" == "PASS" ]]; then
    RESULTS_PASS+=("$message")
    log_info "$message"
  elif [[ "$level" == "WARN" ]]; then
    RESULTS_WARN+=("$message")
    log_warn "$message"
  elif [[ "$level" == "FAIL" ]]; then
    RESULTS_FAIL+=("$message")
    log_error "$message"
  fi
}
# Pick the JSON backend: HAS_JQ is "1" when jq is on PATH, otherwise "0".
if command -v jq >/dev/null 2>&1; then
  HAS_JQ="1"
else
  HAS_JQ="0"
fi

# Without jq the JSON helpers fall back to python3; one of the two is mandatory.
if [[ "$HAS_JQ" != "1" ]]; then
  command -v python3 >/dev/null 2>&1 || {
    log_error "Neither jq nor python3 is available for JSON processing"
    exit 2
  }
fi
# Options shared by every curl call: fail on HTTP errors, stay quiet except
# for error messages, and give up after 10 seconds.
CURL_OPTS=(--fail --show-error --silent --max-time 10)

#######################################
# GET a URL with the shared curl options.
# Arguments: $1 - URL to fetch
# Outputs:   response body on stdout (curl diagnostics go to stderr)
# Returns:   0 on success, 1 when curl fails
#######################################
curl_json() {
  local target_url="$1"
  curl "${CURL_OPTS[@]}" "$target_url" || return 1
}
#######################################
# Extract a value from a JSON document.
# Globals:   HAS_JQ (read) - "1" selects jq, anything else selects python3
# Arguments: $1 - JSON text
#            $2 - jq filter (used when jq is available)
#            $3 - Python expression over the variable `data` (fallback)
# Outputs:   the value on stdout; objects and arrays are printed as JSON
# Returns:   0 on success, 1 when parsing/evaluation fails or the value is
#            null/false
#######################################
json_query() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi
  # The program is passed with -c so that stdin stays free for the JSON
  # document. With `python3 - <<HEREDOC` the heredoc is the program and the
  # JSON never reaches json.load(), so the fallback could not work.
  # NOTE: eval() is tolerable only because every py_expr is a literal written
  # in this script; never pass externally supplied text here.
  python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
    value = eval(sys.argv[1], {}, {"data": data})
except Exception:
    sys.exit(1)
# Mirror jq -e, which reports null and false as failure.
if value is None or value is False:
    sys.exit(1)
if isinstance(value, (dict, list)):
    print(json.dumps(value))
else:
    print(value)
' "$py_expr" <<<"$json"
}
#######################################
# Print the length of a JSON value.
# Globals:   HAS_JQ (read) - "1" selects jq, anything else selects python3
# Arguments: $1 - JSON text
#            $2 - jq filter that yields the length (e.g. 'length')
#            $3 - Python expression over `data`; it may yield either the
#                 container to measure (e.g. 'data') or an already computed
#                 integer length (e.g. 'len(data)')
# Outputs:   the length on stdout
# Returns:   0 on success, 1 on any parsing/evaluation failure
#######################################
json_length() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi
  # -c keeps stdin free for the JSON document; a heredoc program would
  # consume stdin itself and leave json.load() with nothing to read.
  # NOTE: eval() is tolerable only because py_expr is a literal from this script.
  python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
    value = eval(sys.argv[1], {}, {"data": data})
    # Callers pass either the container or a ready-made length.
    if isinstance(value, int) and not isinstance(value, bool):
        print(value)
    else:
        print(len(value))
except Exception:
    sys.exit(1)
' "$py_expr" <<<"$json"
}
#######################################
# Test a boolean condition against a JSON document.
# Globals:   HAS_JQ (read) - "1" selects jq, anything else selects python3
# Arguments: $1 - JSON text
#            $2 - jq filter whose result is tested for truthiness
#            $3 - Python expression over `data` tested for truthiness
# Returns:   0 when the condition holds, 1 when it does not or when
#            parsing/evaluation fails
#######################################
json_has_key() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1 || return 1
    return 0
  fi
  # -c keeps stdin free for the JSON document; with a heredoc program the
  # document was never read and this check always reported "absent".
  # NOTE: eval() is tolerable only because py_expr is a literal from this script.
  python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
    value = eval(sys.argv[1], {}, {"data": data})
except Exception:
    sys.exit(1)
sys.exit(0 if value else 1)
' "$py_expr" <<<"$json"
}
#######################################
# Convert an ISO-8601 timestamp to epoch seconds using python3.
# Arguments: $1 - timestamp; a trailing "Z" is rewritten to "+00:00"
# Outputs:   integer epoch seconds on stdout
# Returns:   non-zero when the value is empty or cannot be parsed
#######################################
iso_to_epoch() {
  local stamp="$1"
  python3 - "$stamp" <<'PY'
import sys
from datetime import datetime

raw = sys.argv[1]
if not raw:
    sys.exit(1)
# fromisoformat is handed an explicit offset instead of the "Z" suffix.
normalized = raw[:-1] + '+00:00' if raw.endswith('Z') else raw
try:
    parsed = datetime.fromisoformat(normalized)
except ValueError:
    sys.exit(1)
print(int(parsed.timestamp()))
PY
}
#######################################
# Check that a file contains parseable JSON (read as UTF-8) using python3.
# Arguments: $1 - path of the file to check
# Returns:   0 when the file parses; non-zero (with a Python traceback on
#            stderr) when it is missing, unreadable or not valid JSON
#######################################
validate_json_file() {
  local json_path="$1"
  python3 - "$json_path" <<'PY'
import json
import sys

target = sys.argv[1]
with open(target, 'r', encoding='utf-8') as stream:
    json.load(stream)
PY
}
#######################################
# Make sure a directory exists, creating it (and parents) when missing.
# Arguments: $1 - directory path
# Outputs:   a warning via log_warn when the directory has to be created
#######################################
ensure_directory() {
  local target="$1"
  # Nothing to do when the directory is already there.
  [[ -d "$target" ]] && return 0
  log_warn "Creating missing directory $target"
  mkdir -p "$target"
}
# State of the temporary health file used by the health simulation. It stays
# empty until that stage starts, so an early exit has nothing to clean up.
TEST_HEALTH_FILE=""
TEST_HEALTH_BACKUP=""
TEST_HEALTH_EXISTED="false"

#######################################
# EXIT handler: put the test health file back the way it was found.
# Globals: TEST_HEALTH_FILE, TEST_HEALTH_EXISTED, TEST_HEALTH_BACKUP,
#          KEEP_TEST_HEALTH (all read)
# A pre-existing file is rewritten from the saved backup; a file created by
# this script is deleted unless KEEP_TEST_HEALTH is "true".
#######################################
cleanup() {
  [[ -n "$TEST_HEALTH_FILE" ]] || return 0
  if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
    printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
    return
  fi
  if [[ "$KEEP_TEST_HEALTH" != "true" ]]; then
    rm -f "$TEST_HEALTH_FILE"
  fi
}
trap cleanup EXIT
log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"

# 4.2 Master health checks

#######################################
# Probe one master endpoint and record the outcome.
# Globals:   CURL_OPTS, MASTER_BASE (read)
# Arguments: $1 - request path, e.g. /healthz
#            $2 - "true" to append the response body to the PASS message
# Outputs:   one PASS or FAIL result via add_result
#######################################
probe_master_endpoint() {
  local path="$1" include_payload="$2"
  local err_file response meta body status elapsed detail summary
  # curl diagnostics go to a private mktemp file rather than a fixed name in
  # /tmp, which could be pre-created by another user or shared between runs.
  if ! err_file=$(mktemp); then
    add_result FAIL "GET $path failed: unable to create temporary file"
    return 0
  fi
  # -w appends "<http_code> <time_total>" on its own final line.
  if response=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE$path" 2>"$err_file"); then
    meta=$(tail -n1 <<<"$response")
    body=$(head -n -1 <<<"$response" || true)
    status=${meta%% *}
    elapsed=${meta##* }
    summary="GET $path status=$status elapsed=${elapsed}s"
    if [[ "$include_payload" == "true" ]]; then
      summary+=" payload=$body"
    fi
    add_result PASS "$summary"
  else
    detail=$(cat "$err_file" 2>/dev/null || true)
    add_result FAIL "GET $path failed: $detail"
  fi
  rm -f -- "$err_file"
}

probe_master_endpoint "/healthz" "true"
probe_master_endpoint "/readyz" "false"
# 4.3 Nodes list and detail
#
# Looks up this host in the master's node list, then fetches its detail
# record and the cluster statistics. Leaves NODE_ENTRY, NODE_ID, NODE_IP and
# NODE_DETAIL_JSON set (possibly empty) for the later stages.
NODE_ENTRY=""
NODE_ID=""
NODE_IP=""
NODE_DETAIL_JSON=""
nodes_json=""
# Private scratch file for curl diagnostics, replacing fixed /tmp names.
nodes_err_file=$(mktemp)

if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>"$nodes_err_file"); then
  error_detail=$(cat "$nodes_err_file" 2>/dev/null || true)
  add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
  nodes_json=""
fi

if [[ -n "$nodes_json" ]]; then
  if [[ "$HAS_JQ" == "1" ]]; then
    NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY=""
  else
    # The program is given with -c so stdin can carry the node list; a
    # heredoc program would occupy stdin and the list would never be read.
    NODE_ENTRY=$(python3 -c '
import json
import sys

hostname = sys.argv[1]
try:
    nodes = json.load(sys.stdin)
except ValueError:
    sys.exit(1)
for node in nodes:
    if isinstance(node, dict) and node.get("name") == hostname:
        print(json.dumps(node))
        sys.exit(0)
sys.exit(1)
' "$VERIFY_HOSTNAME" <<<"$nodes_json") || NODE_ENTRY=""
  fi

  if [[ -z "$NODE_ENTRY" ]]; then
    add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
  else
    if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
      add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
    else
      add_result FAIL "Failed to extract node id from master response"
      NODE_ID=""
    fi
    if NODE_IP=$(json_query "$NODE_ENTRY" '.meta_data.host_ip // empty' 'data.get("meta_data", {}).get("host_ip", "")'); then
      if [[ -n "$NODE_IP" ]]; then
        add_result PASS "Registered node host_ip=$NODE_IP"
      else
        add_result WARN "Node host_ip missing in master meta_data"
      fi
    else
      add_result WARN "Unable to read meta_data.host_ip"
    fi
  fi

  # Only ask for the detail record when an id is known. With an empty id the
  # URL would be the list endpoint; the lookup failure is already a FAIL.
  if [[ -n "$NODE_ID" ]]; then
    if NODE_DETAIL_JSON=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$nodes_err_file"); then
      add_result PASS "Fetched node detail for $NODE_ID"
    else
      error_detail=$(cat "$nodes_err_file" 2>/dev/null || true)
      add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
      NODE_DETAIL_JSON=""
    fi
  else
    add_result WARN "Node detail fetch skipped: node id unknown"
  fi

  if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>"$nodes_err_file"); then
    if total_nodes=$(json_query "$stats_json" '.total_nodes' 'data["total_nodes"]'); then
      if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
        add_result PASS "Statistics total_nodes=$total_nodes"
      else
        add_result FAIL "Statistics total_nodes invalid: $total_nodes"
      fi
    else
      add_result FAIL "Unable to read total_nodes from statistics"
    fi
    if active_nodes=$(json_query "$stats_json" '.active_nodes' 'data["active_nodes"]'); then
      if [[ "$active_nodes" =~ ^[0-9]+$ ]]; then
        add_result PASS "Statistics active_nodes=$active_nodes"
      else
        add_result WARN "Statistics active_nodes not numeric: $active_nodes"
      fi
    fi
    if inactive_nodes=$(json_query "$stats_json" '.inactive_nodes' 'data["inactive_nodes"]'); then
      if [[ "$inactive_nodes" =~ ^[0-9]+$ ]]; then
        add_result PASS "Statistics inactive_nodes=$inactive_nodes"
      else
        add_result WARN "Statistics inactive_nodes not numeric: $inactive_nodes"
      fi
    fi
    # A failed count must not abort the script under `set -e`; the
    # comparison below is simply skipped when the count is unavailable.
    node_count=$(json_length "$nodes_json" 'length' 'data') || node_count=""
    if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]]; then
      if [[ "$total_nodes" -lt "$node_count" ]]; then
        add_result WARN "Statistics total_nodes=$total_nodes less than nodes list count=$node_count"
      fi
    fi
  else
    error_detail=$(cat "$nodes_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node statistics: $error_detail"
  fi
fi
rm -f -- "$nodes_err_file"
# 4.4 Agent persistence checks
#
# Compares the agent's node.json with what the master reported: the id must
# equal NODE_ID, the name must equal VERIFY_HOSTNAME, and both timestamps
# should parse as ISO-8601. node_file_content is reused by the status stage.
node_file_content=""
if [[ ! -f "$NODE_JSON" ]]; then
  add_result FAIL "node.json not found at $NODE_JSON"
else
  node_file_content="$(cat "$NODE_JSON")"

  if ! node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
    add_result FAIL "Unable to extract id from node.json"
  elif [[ -n "$NODE_ID" && "$node_id_local" == "$NODE_ID" ]]; then
    add_result PASS "node.json id matches master ($NODE_ID)"
  else
    add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
  fi

  if ! node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
    add_result FAIL "Unable to extract name from node.json"
  elif [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
    add_result PASS "node.json name matches $VERIFY_HOSTNAME"
  else
    add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
  fi

  # Both timestamp fields get the same treatment: missing or unparseable
  # values are warnings, not failures.
  for ts_field in register_time last_updated; do
    if ! ts_value=$(json_query "$node_file_content" ".${ts_field}" "data.get(\"${ts_field}\")"); then
      add_result WARN "node.json missing ${ts_field}"
    elif iso_to_epoch "$ts_value" >/dev/null 2>&1; then
      add_result PASS "node.json ${ts_field} valid ISO timestamp"
    else
      add_result WARN "node.json ${ts_field} invalid: $ts_value"
    fi
  done
fi
# Health directory checks: every *.json file should be named <module>-*.json
# and contain valid JSON. Problems here are warnings only.
ensure_directory "$HEALTH_DIR"
if [[ ! -d "$HEALTH_DIR" ]]; then
  add_result WARN "Health directory $HEALTH_DIR missing"
else
  # nullglob makes an unmatched pattern expand to an empty list.
  shopt -s nullglob
  health_files=("$HEALTH_DIR"/*.json)
  shopt -u nullglob
  if (( ${#health_files[@]} == 0 )); then
    add_result WARN "Health directory $HEALTH_DIR is empty"
  else
    for health_path in "${health_files[@]}"; do
      health_name="${health_path##*/}"
      if [[ "$health_name" == *-* ]]; then
        validate_json_file "$health_path" >/dev/null 2>&1 \
          || add_result WARN "Health file $health_name is not valid JSON"
      else
        # Badly named files are reported once and not parsed.
        add_result WARN "Health file $health_name does not follow <module>-*.json"
      fi
    done
  fi
fi
# dns.conf checks: the file must list at least one nameserver, and the
# addresses master.argus.com resolves to are compared with that list.
if [[ ! -f "$DNS_CONF" ]]; then
  add_result FAIL "dns.conf not found at $DNS_CONF"
else
  # xargs collapses the awk output into one space-separated line. An
  # unreadable file is reported as "no nameserver entries" instead of
  # aborting the script under `set -e`/pipefail.
  nameservers=$(awk '/^nameserver/{print $2}' "$DNS_CONF" | xargs) || nameservers=""
  if [[ -z "$nameservers" ]]; then
    add_result FAIL "dns.conf found but contains no nameserver entries"
  else
    add_result PASS "dns.conf nameservers: $nameservers"
    # Resolve once and reuse the output; a second lookup inside $(...) could
    # fail independently and abort the whole script.
    if resolved_raw=$(getent hosts master.argus.com 2>/dev/null); then
      resolved_ips=$(awk '{print $1}' <<<"$resolved_raw" | xargs) || resolved_ips=""
      match_found="false"
      # Intentional word splitting: both variables are space-separated lists.
      # Addresses are compared literally; used as a grep pattern, the dots
      # in an IP would act as regex wildcards.
      for ns in $nameservers; do
        for resolved_ip in $resolved_ips; do
          if [[ "$ns" == "$resolved_ip" ]]; then
            match_found="true"
            break 2
          fi
        done
      done
      if [[ "$match_found" == "true" ]]; then
        add_result PASS "master.argus.com resolves via configured nameserver"
      else
        add_result WARN "master.argus.com resolved IPs ($resolved_ips) do not match dns.conf nameservers ($nameservers)"
      fi
    else
      add_result WARN "Failed to resolve master.argus.com"
    fi
  fi
fi
# update-dns.sh check: its modification time should be within five minutes
# (300 s) of dns.conf's. Nothing is recorded when either mtime is unreadable.
if [[ ! -f "$UPDATE_SCRIPT" ]]; then
  add_result WARN "update-dns.sh not found at $UPDATE_SCRIPT"
else
  # stat failures fall back to 0, which disables the comparison below.
  dns_mtime=$(stat -c %Y "$DNS_CONF" 2>/dev/null || echo 0)
  upd_mtime=$(stat -c %Y "$UPDATE_SCRIPT" 2>/dev/null || echo 0)
  if (( dns_mtime > 0 && upd_mtime > 0 )); then
    # Absolute difference between the two modification times.
    mtime_gap=$(( dns_mtime > upd_mtime ? dns_mtime - upd_mtime : upd_mtime - dns_mtime ))
    if (( mtime_gap <= 300 )); then
      add_result PASS "dns.conf and update-dns.sh timestamps within 5 minutes"
    else
      add_result WARN "dns.conf and update-dns.sh timestamps differ by more than 5 minutes"
    fi
  fi
fi
# 4.5 Master-Node status consistency
#
# Reads last_report timestamps from the node detail, waits a little longer
# than one report interval, reads them again and expects both to have
# changed. Also compares the master's server timestamp with node.json's
# last_updated. sleep_interval is reused by the health simulation stage.
sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))
detail_pre="$NODE_DETAIL_JSON"
# Private scratch file for curl diagnostics, replacing fixed /tmp names.
status_err_file=$(mktemp)

if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
  if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$status_err_file"); then
    add_result PASS "Fetched node detail pre-check"
  else
    error_detail=$(cat "$status_err_file" 2>/dev/null || true)
    add_result FAIL "Unable to fetch node detail for status check: $error_detail"
    detail_pre=""
  fi
fi

server_ts_pre=""
agent_ts_pre=""
server_ts_post=""
agent_ts_post=""
if [[ -n "$detail_pre" ]]; then
  server_ts_pre=$(json_query "$detail_pre" '.last_report.server_timestamp' 'data.get("last_report", {}).get("server_timestamp")' || echo "")
  agent_ts_pre=$(json_query "$detail_pre" '.last_report.agent_timestamp' 'data.get("last_report", {}).get("agent_timestamp")' || echo "")
  log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"
  sleep "$sleep_interval"
  if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$status_err_file"); then
    server_ts_post=$(json_query "$detail_post" '.last_report.server_timestamp' 'data.get("last_report", {}).get("server_timestamp")' || echo "")
    agent_ts_post=$(json_query "$detail_post" '.last_report.agent_timestamp' 'data.get("last_report", {}).get("agent_timestamp")' || echo "")

    # A timestamp that cannot be read after the wait is a failure. Comparing
    # strings alone would count "value -> empty" as an advance.
    if [[ -z "$server_ts_post" ]]; then
      add_result FAIL "last_report.server_timestamp missing after ${sleep_interval}s"
    elif [[ "$server_ts_post" != "$server_ts_pre" ]]; then
      add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
    else
      add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
    fi
    if [[ -z "$agent_ts_post" ]]; then
      add_result FAIL "last_report.agent_timestamp missing after ${sleep_interval}s"
    elif [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
      add_result PASS "last_report.agent_timestamp advanced"
    else
      add_result FAIL "last_report.agent_timestamp did not change"
    fi

    # Compare the master's view with the agent's own record; the check is
    # skipped silently when either timestamp is missing or unparseable.
    if [[ -n "$node_file_content" ]]; then
      if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
        if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then
          if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
            diff=$((epoch_post - node_epoch))
            if (( diff < 0 )); then
              diff=$((-diff))
            fi
            tolerance=$((REPORT_INTERVAL_SECONDS * 2))
            if [[ $diff -le $tolerance ]]; then
              add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
            else
              add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
            fi
          fi
        fi
      fi
    fi
    NODE_DETAIL_JSON="$detail_post"
  else
    error_detail=$(cat "$status_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail post-check: $error_detail"
  fi
fi
rm -f -- "$status_err_file"
# 4.6 Health simulation
#
# Choose the test health file and remember any existing content so the
# cleanup handler can restore it. The "existed" flag is raised before the
# content is read, matching the order the cleanup handler relies on.
TEST_HEALTH_FILE="${HEALTH_DIR}/verify-master.json"
ensure_directory "$HEALTH_DIR"
TEST_HEALTH_EXISTED="false"
if [[ -f "$TEST_HEALTH_FILE" ]]; then
  TEST_HEALTH_EXISTED="true"
  TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")"
fi
#######################################
# Write the test health document, replacing any previous content.
# Globals:   TEST_HEALTH_FILE (read) - destination path
# Arguments: $1 - text stored in the "message" field; inserted verbatim,
#                 without JSON escaping
#######################################
create_health_file() {
  local message="$1"
  printf '{"status":"ok","message":"%s"}\n' "$message" > "$TEST_HEALTH_FILE"
}
#######################################
# Check that the master's node detail carries the expected test message
# under meta_data.health["verify-master"].message.
# Arguments: $1 - expected message
#            $2 - node detail JSON returned by the master
# Returns:   0 when the message matches, 1 when it differs or is absent
#######################################
validate_health_in_master() {
  local expected_message="$1"
  local detail_json="$2"
  local reported
  reported=$(json_query "$detail_json" '.meta_data.health["verify-master"].message' 'data.get("meta_data", {}).get("health", {}).get("verify-master", {}).get("message")') || return 1
  [[ "$reported" == "$expected_message" ]]
}
#######################################
# Check that the master no longer reports the "verify-master" health entry.
# Arguments: $1 - node detail JSON returned by the master
# Returns:   0 when the entry is absent, 1 when it is still present
#######################################
remove_health_from_master() {
  local detail_json="$1"
  json_has_key "$detail_json" '(.meta_data.health | has("verify-master"))' '"verify-master" in data.get("meta_data", {}).get("health", {})' && return 1
  return 0
}
# Health simulation flow: create the test file, update it, then delete it,
# waiting one reporting cycle after each step and checking that the master's
# node detail reflects the change.
if [[ -z "$NODE_ID" ]]; then
  # Without a node id the detail URL would be the list endpoint, so the
  # simulation could only produce misleading results after three sleeps.
  add_result FAIL "Health simulation skipped: node id unknown for '$VERIFY_HOSTNAME'"
else
  # Private scratch file for curl diagnostics, replacing fixed /tmp names.
  health_err_file=$(mktemp)

  # Step 1: a new health file must show up on the master.
  health_message_one="verify $(date +%s)"
  create_health_file "$health_message_one"
  add_result PASS "Created test health file $TEST_HEALTH_FILE"
  sleep "$sleep_interval"
  if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$health_err_file"); then
    if validate_health_in_master "$health_message_one" "$detail_health_one"; then
      add_result PASS "Master reflects verify-master health message"
    else
      add_result FAIL "Master health payload does not match test message"
    fi
  else
    error_detail=$(cat "$health_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
    detail_health_one=""
  fi

  # Step 2: rewriting the file must update the message on the master.
  health_message_two="verify $(date +%s)-update"
  create_health_file "$health_message_two"
  sleep "$sleep_interval"
  if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$health_err_file"); then
    if validate_health_in_master "$health_message_two" "$detail_health_two"; then
      add_result PASS "Master health updated to new message"
    else
      add_result FAIL "Master health message did not update"
    fi
  else
    error_detail=$(cat "$health_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail after health update: $error_detail"
    detail_health_two=""
  fi

  # Step 3: deleting the file must remove the entry from the master.
  rm -f "$TEST_HEALTH_FILE"
  sleep "$sleep_interval"
  if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$health_err_file"); then
    if remove_health_from_master "$detail_health_three"; then
      add_result PASS "Master health no longer lists verify-master after removal"
    else
      add_result FAIL "Master health still contains verify-master after file deletion"
    fi
  else
    error_detail=$(cat "$health_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
  fi
  rm -f -- "$health_err_file"

  # Put back whatever was in the file before the simulation overwrote it.
  if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
    printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
  fi
fi
# Optional config touch
#
# Only with --allow-config-touch: PUT a small label payload to the node's
# config endpoint. A failure here is a warning, never a FAIL.
if [[ "$ALLOW_CONFIG_TOUCH" != "true" ]]; then
  add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
elif [[ -z "$NODE_ID" ]]; then
  # Say why the requested check did not run instead of skipping silently.
  add_result WARN "Config PUT dry-run skipped: node id unknown"
else
  payload='{"label": {"verify": "true"}}'
  # curl output goes to a private mktemp file, not a fixed name in /tmp.
  config_log=$(mktemp)
  if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >"$config_log" 2>&1; then
    add_result PASS "Config PUT dry-run succeeded"
  else
    add_result WARN "Config PUT dry-run failed: $(cat "$config_log" 2>/dev/null || true)"
  fi
  rm -f -- "$config_log"
fi
# Result summary
#
# Prints every recorded result grouped by severity, then exits 1 when any
# check failed and 0 otherwise.
echo
echo "==== Verification Summary ===="
# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A plain
# "${arr[@]}" under `set -u` is an "unbound variable" error on bash < 4.4,
# which would abort a fully successful run (empty RESULTS_FAIL).
for entry in ${RESULTS_PASS[@]+"${RESULTS_PASS[@]}"}; do
  printf 'PASS: %s\n' "$entry"
done
for entry in ${RESULTS_WARN[@]+"${RESULTS_WARN[@]}"}; do
  printf 'WARN: %s\n' "$entry"
done
for entry in ${RESULTS_FAIL[@]+"${RESULTS_FAIL[@]}"}; do
  printf 'FAIL: %s\n' "$entry"
done
if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
  exit 1
fi
exit 0

View File

@ -40,7 +40,7 @@ services:
- REPORT_INTERVAL_SECONDS=2 - REPORT_INTERVAL_SECONDS=2
volumes: volumes:
- ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0 - ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
- ./private/argus/agent/health/dev-e2euser-e2einst-pod-0:/private/argus/agent/health/dev-e2euser-e2einst-pod-0 - ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
- ./private/argus/etc:/private/argus/etc - ./private/argus/etc:/private/argus/etc
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
- ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro

View File

@ -3,7 +3,7 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/health/dev-e2euser-e2einst-pod-0" HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health"
cat > "$HEALTH_DIR/log-fluentbit.json" <<JSON cat > "$HEALTH_DIR/log-fluentbit.json" <<JSON
{ {

View File

@ -65,7 +65,7 @@ popd >/dev/null
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME" AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/health/$AGENT_HOSTNAME" HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态 # 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
if ! docker run -d \ if ! docker run -d \
@ -74,7 +74,7 @@ if ! docker run -d \
--network "$NETWORK_NAME" \ --network "$NETWORK_NAME" \
--ip "$NEW_AGENT_IP" \ --ip "$NEW_AGENT_IP" \
-v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \ -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
-v "$HEALTH_DIR:/private/argus/agent/health/$AGENT_HOSTNAME" \ -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \ -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \ -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \ -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \