#!/usr/bin/env bash
#
# agent_deployment_verify.sh
#
# Verifies an agent deployment against its master:
#   4.2 master /healthz and /readyz respond
#   4.3 this host appears in the master node list, detail and statistics
#   4.4 local node.json / health directory / DNS resolution are sane
#   4.5 the master's last_report timestamps change between two polls
#   4.6 a temporary health file is reflected, updated and removed on the master
#   optional: config PUT check (--allow-config-touch)
#
# Exit status: 0 when no check FAILed, 1 when at least one FAILed,
# 2 on usage or prerequisite errors.
set -euo pipefail

LOG_PREFIX="[AGENT-VERIFY]"
MASTER_ENDPOINT_DEFAULT=""
AGENT_DATA_ROOT_DEFAULT="/private/argus/agent"
AGENT_ETC_ROOT_DEFAULT="/private/argus/etc"
REPORT_INTERVAL_DEFAULT="2"

ALLOW_CONFIG_TOUCH="false"
KEEP_TEST_HEALTH="false"

# --- logging -----------------------------------------------------------------

# INFO goes to stdout; WARN and ERROR go to stderr.
log_info() {
  echo "${LOG_PREFIX} INFO $*"
}

log_warn() {
  echo "${LOG_PREFIX} WARN $*" >&2
}

log_error() {
  echo "${LOG_PREFIX} ERROR $*" >&2
}

# Prints the help text to stdout.
usage() {
  cat <<'USAGE'
Usage: agent_deployment_verify.sh [options]

Options:
  --allow-config-touch   Enable optional config PUT dry-run check.
  --keep-test-health     Keep the temporary verify health file after checks.
  -h, --help             Show this help message.

Environment variables:
  MASTER_ENDPOINT         (required) Master API base endpoint, e.g. http://master:3000
  AGENT_DATA_ROOT         (default: /private/argus/agent)
  AGENT_ETC_ROOT          (default: /private/argus/etc)
  VERIFY_HOSTNAME         (default: output of hostname)
  REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds
USAGE
}

# --- argument and environment handling ----------------------------------------

while [[ $# -gt 0 ]]; do
  case "$1" in
    --allow-config-touch)
      ALLOW_CONFIG_TOUCH="true"
      shift
      ;;
    --keep-test-health)
      KEEP_TEST_HEALTH="true"
      shift
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      log_error "Unknown option: $1"
      usage >&2
      exit 2
      ;;
  esac
done

MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}"
AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}"
AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}"
VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}"
REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}"

if [[ -z "$MASTER_ENDPOINT" ]]; then
  log_error "MASTER_ENDPOINT is required"
  exit 2
fi

# Force base 10 so a value such as "08" is neither rejected nor later
# misread as an invalid octal literal in arithmetic expansion.
if [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] \
  && ((10#$REPORT_INTERVAL_SECONDS > 0)); then
  REPORT_INTERVAL_SECONDS=$((10#$REPORT_INTERVAL_SECONDS))
else
  log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
  REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
fi

# Adds an http:// scheme when none is given and strips one trailing slash.
normalize_endpoint() {
  local endpoint="$1"
  if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then
    endpoint="http://$endpoint"
  fi
  endpoint="${endpoint%/}"
  echo "$endpoint"
}

MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"
NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME"
NODE_JSON="$NODE_DIR/node.json"
HEALTH_DIR="$NODE_DIR/health"
# The two paths below are not referenced by any check in this script.
# shellcheck disable=SC2034
DNS_CONF="$AGENT_ETC_ROOT/dns.conf"
# shellcheck disable=SC2034
UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh"

# --- result bookkeeping -------------------------------------------------------

declare -a RESULTS_PASS=()
declare -a RESULTS_WARN=()
declare -a RESULTS_FAIL=()

# Records and logs one check result.
# Arguments: $1 - PASS | WARN | FAIL | INFO, $2 - message
# PASS/WARN/FAIL are stored for the final summary; INFO is only logged.
add_result() {
  local level="$1" message="$2"
  case "$level" in
    PASS)
      RESULTS_PASS+=("$message")
      log_info "$message"
      ;;
    WARN)
      RESULTS_WARN+=("$message")
      log_warn "$message"
      ;;
    FAIL)
      RESULTS_FAIL+=("$message")
      log_error "$message"
      ;;
    INFO)
      # Informational only: logged so it is not silently lost.
      log_info "$message"
      ;;
    *)
      log_warn "Unknown result level '$level': $message"
      ;;
  esac
}

# --- prerequisites ------------------------------------------------------------

HAS_JQ="0"
if command -v jq >/dev/null 2>&1; then
  HAS_JQ="1"
fi

if ! command -v curl >/dev/null 2>&1; then
  log_error "curl command not found; please install curl (e.g. apt-get install -y curl)"
  exit 2
fi

if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then
  log_error "Neither jq nor python3 is available for JSON processing"
  exit 2
fi

CURL_OPTS=(--fail --show-error --silent --max-time 10)

# GETs $1 and writes the body to stdout; returns 1 on any curl failure.
curl_json() {
  local url="$1"
  if ! curl "${CURL_OPTS[@]}" "$url"; then
    return 1
  fi
}

# Prints the content of error file $1, or nothing if it cannot be read.
read_err() {
  cat "$1" 2>/dev/null || true
}

# --- JSON helpers -------------------------------------------------------------

# Python fallback shared by json_query / json_length / json_has_key.
# Arguments: $1 - mode (query | length | has), $2 - JSON text,
#            $3 - Python expression evaluated with `data` bound to the document
# The JSON is piped on stdin and the program is passed with -c; feeding the
# program through stdin (python3 - <<HEREDOC) would leave nothing for
# json.load(sys.stdin) to read.
# NOTE: eval() only ever receives expressions hard-coded in this script.
json_py_eval() {
  local mode="$1" json="$2" py_expr="$3"
  printf '%s' "$json" | python3 -c '
import json
import sys

mode, expr = sys.argv[1], sys.argv[2]
try:
    data = json.load(sys.stdin)
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)

if mode == "query":
    if value is None:
        sys.exit(1)
    if isinstance(value, (dict, list)):
        print(json.dumps(value))
    else:
        print(value)
elif mode == "length":
    try:
        print(len(value))
    except Exception:
        sys.exit(1)
else:
    sys.exit(0 if value else 1)
' "$mode" "$py_expr"
}

# Extracts a value from JSON text.
# Arguments: $1 - JSON text, $2 - jq filter, $3 - equivalent Python expression
# Outputs:   the value on stdout
# Returns:   1 when the value is missing/null/false or the JSON is invalid
json_query() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    local output
    output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi
  json_py_eval query "$json" "$py_expr"
}

# Prints a length computed from JSON text (same argument layout as json_query).
json_length() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    local output
    output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi
  json_py_eval length "$json" "$py_expr"
}

# Returns 0 when the expression evaluates to a truthy value, 1 otherwise.
json_has_key() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then
      return 0
    fi
    return 1
  fi
  json_py_eval has "$json" "$py_expr"
}

# Converts a timestamp string to epoch seconds on stdout.
# Tries `date -d` first, then Python's datetime.fromisoformat.
# Returns 1 for an empty or unparsable value.
iso_to_epoch() {
  local value="$1"
  # `date -d ""` succeeds on GNU date, so reject empty input explicitly.
  if [[ -z "$value" ]]; then
    return 1
  fi
  if command -v date >/dev/null 2>&1; then
    date -d "$value" +%s 2>/dev/null && return 0
  fi
  if command -v python3 >/dev/null 2>&1; then
    # Input arrives via argv, so the heredoc can safely carry the program.
    python3 - "$value" <<'PY'
import sys
from datetime import datetime

value = sys.argv[1]
if value is None or value == "":
    sys.exit(1)
if value.endswith('Z'):
    value = value[:-1] + '+00:00'
try:
    dt = datetime.fromisoformat(value)
except ValueError:
    sys.exit(1)
print(int(dt.timestamp()))
PY
    return $?
  fi
  return 1
}

# Returns 0 when file $1 parses as JSON.
validate_json_file() {
  local path="$1"
  if [[ "$HAS_JQ" == "1" ]]; then
    jq empty "$path" >/dev/null 2>&1 && return 0
    return 1
  fi
  if command -v python3 >/dev/null 2>&1; then
    python3 - "$path" <<'PY'
import json
import sys

path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as handle:
    json.load(handle)
PY
    return $?
  fi
  return 0
}

# Creates directory $1 (with parents) when it does not exist.
ensure_directory() {
  local dir="$1"
  if [[ ! -d "$dir" ]]; then
    log_warn "Creating missing directory $dir"
    mkdir -p "$dir"
  fi
}

# --- cleanup ------------------------------------------------------------------

TEST_HEALTH_FILE=""
TEST_HEALTH_BACKUP=""
TEST_HEALTH_EXISTED="false"
TMP_DIR=""

# EXIT trap: restores a pre-existing test health file from the in-memory
# backup, otherwise removes the test file unless --keep-test-health was given,
# and always deletes the private temp directory.
cleanup() {
  if [[ -n "$TEST_HEALTH_FILE" ]]; then
    if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
      printf '%s' "$TEST_HEALTH_BACKUP" >"$TEST_HEALTH_FILE"
    elif [[ "$KEEP_TEST_HEALTH" == "true" ]]; then
      :
    else
      rm -f "$TEST_HEALTH_FILE"
    fi
  fi
  if [[ -n "$TMP_DIR" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}

trap cleanup EXIT

# Private scratch directory for curl stderr captures, instead of fixed
# /tmp/agent_verify_* names.
if ! TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/agent_verify.XXXXXX"); then
  log_error "Unable to create temporary directory"
  exit 2
fi

log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"

# --- 4.2 Master health checks -------------------------------------------------

# -w appends "<http_code> <time_total>" as a final line after the body.
health_resp=""
if health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' \
  "$MASTER_BASE/healthz" 2>"$TMP_DIR/healthz.err"); then
  http_meta=$(tail -n1 <<<"$health_resp")
  payload=$(head -n -1 <<<"$health_resp" || true)
  status_code=${http_meta%% *}
  elapsed=${http_meta##* }
  add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload"
else
  error_detail=$(read_err "$TMP_DIR/healthz.err")
  add_result FAIL "GET /healthz failed: $error_detail"
fi

if readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' \
  "$MASTER_BASE/readyz" 2>"$TMP_DIR/readyz.err"); then
  readyz_meta=$(tail -n1 <<<"$readyz_resp")
  # shellcheck disable=SC2034
  readyz_payload=$(head -n -1 <<<"$readyz_resp" || true)
  readyz_status=${readyz_meta%% *}
  readyz_elapsed=${readyz_meta##* }
  add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s"
else
  error_detail=$(read_err "$TMP_DIR/readyz.err")
  add_result FAIL "GET /readyz failed: $error_detail"
  # shellcheck disable=SC2034
  readyz_payload=""
fi

# --- 4.3 Nodes list and detail ------------------------------------------------

if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>"$TMP_DIR/nodes.err"); then
  error_detail=$(read_err "$TMP_DIR/nodes.err")
  add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
  nodes_json=""
fi

NODE_ENTRY=""
NODE_ID=""
NODE_IP=""
if [[ -n "$nodes_json" ]]; then
  # Locate the list entry whose name equals this host.
  if [[ "$HAS_JQ" == "1" ]]; then
    NODE_ENTRY=$(printf '%s' "$nodes_json" \
      | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY=""
  else
    # The node list must be piped in; python reads its program from -c.
    NODE_ENTRY=$(printf '%s' "$nodes_json" | python3 -c '
import json
import sys

hostname = sys.argv[1]
try:
    nodes = json.load(sys.stdin)
except Exception:
    sys.exit(1)
for node in nodes:
    if node.get("name") == hostname:
        print(json.dumps(node))
        sys.exit(0)
sys.exit(1)
' "$VERIFY_HOSTNAME") || NODE_ENTRY=""
  fi

  if [[ -z "$NODE_ENTRY" ]]; then
    add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
  else
    if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
      add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
    else
      add_result FAIL "Failed to extract node id from master response"
    fi
  fi

  if [[ -n "$NODE_ENTRY" ]] \
    && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$TMP_DIR/node_detail.err"); then
    NODE_DETAIL_JSON="$NODE_DETAIL"
    add_result PASS "Fetched node detail for $NODE_ID"
    if NODE_IP=$(json_query "$NODE_DETAIL_JSON" \
      '.meta_data.ip // .meta_data.host_ip // empty' \
      'data.get("meta_data", {}).get("ip") or data.get("meta_data", {}).get("host_ip") or ""'); then
      if [[ -n "$NODE_IP" ]]; then
        add_result PASS "Registered node IP=$NODE_IP"
      else
        add_result INFO "Node detail does not expose IP fields"
      fi
    fi
  else
    error_detail=$(read_err "$TMP_DIR/node_detail.err")
    add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
    NODE_DETAIL_JSON=""
  fi

  if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>"$TMP_DIR/stats.err"); then
    if total_nodes=$(json_query "$stats_json" '.total // .total_nodes' \
      'data.get("total") or data.get("total_nodes")'); then
      if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
        add_result PASS "Statistics total=$total_nodes"
      else
        add_result WARN "Statistics total field not numeric: $total_nodes"
      fi
    else
      add_result WARN "Unable to read total field from statistics"
    fi

    # Best effort: a parse failure here must not abort the run under set -e.
    active_nodes=""
    if [[ "$HAS_JQ" == "1" ]]; then
      active_nodes=$(printf '%s' "$stats_json" \
        | jq -e 'if .status_statistics then (.status_statistics[] | select(.status == "online") | .count) else empty end' 2>/dev/null \
        | head -n1 || true)
    elif command -v python3 >/dev/null 2>&1; then
      active_nodes=$(printf '%s' "$stats_json" \
        | python3 -c 'import json,sys; data=json.load(sys.stdin); print(next((row.get("count") for row in data.get("status_statistics", []) if row.get("status")=="online"), ""))' 2>/dev/null) \
        || active_nodes=""
    fi
    if [[ -n "$active_nodes" ]]; then
      add_result PASS "Online nodes reported by master: $active_nodes"
    fi

    # Same here: an unparsable node list leaves node_count empty.
    if [[ "$HAS_JQ" == "1" ]]; then
      node_count=$(printf '%s' "$nodes_json" | jq 'length') || node_count=""
    else
      node_count=$(json_length "$nodes_json" 'length' 'len(data)') || node_count=""
    fi
    if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]] \
      && [[ "$total_nodes" -lt "$node_count" ]]; then
      add_result WARN "Statistics total=$total_nodes less than nodes list count=$node_count"
    fi
  else
    error_detail=$(read_err "$TMP_DIR/stats.err")
    add_result FAIL "Failed to fetch node statistics: $error_detail"
  fi
else
  NODE_DETAIL_JSON=""
fi

# --- 4.4 Agent persistence checks ---------------------------------------------

if [[ -f "$NODE_JSON" ]]; then
  node_file_content="$(cat "$NODE_JSON")"

  if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
    if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then
      add_result PASS "node.json id matches master ($NODE_ID)"
    else
      add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
    fi
  else
    add_result FAIL "Unable to extract id from node.json"
  fi

  if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
    if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
      add_result PASS "node.json name matches $VERIFY_HOSTNAME"
    else
      add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
    fi
  else
    add_result FAIL "Unable to extract name from node.json"
  fi

  if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then
    if iso_to_epoch "$register_time" >/dev/null 2>&1; then
      add_result PASS "node.json register_time valid ISO timestamp"
    else
      add_result WARN "node.json register_time invalid: $register_time"
    fi
  else
    add_result WARN "node.json missing register_time"
  fi

  if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
    if iso_to_epoch "$last_updated" >/dev/null 2>&1; then
      add_result PASS "node.json last_updated valid ISO timestamp"
    else
      add_result WARN "node.json last_updated invalid: $last_updated"
    fi
  else
    add_result WARN "node.json missing last_updated"
  fi
else
  add_result FAIL "node.json not found at $NODE_JSON"
  node_file_content=""
fi

ensure_directory "$HEALTH_DIR"

if [[ -d "$HEALTH_DIR" ]]; then
  # nullglob so an empty directory yields an empty array, not a literal pattern.
  shopt -s nullglob
  health_files=("$HEALTH_DIR"/*.json)
  shopt -u nullglob
  if [[ ${#health_files[@]} -eq 0 ]]; then
    add_result WARN "Health directory $HEALTH_DIR is empty"
  else
    for hf in "${health_files[@]}"; do
      base=$(basename "$hf")
      # File names are expected to contain a '-' separator.
      if [[ "$base" != *-* ]]; then
        add_result WARN "Health file $base does not follow -*.json"
        continue
      fi
      if ! validate_json_file "$hf" >/dev/null 2>&1; then
        add_result WARN "Health file $base is not valid JSON"
      fi
    done
  fi
else
  add_result WARN "Health directory $HEALTH_DIR missing"
fi

if getent hosts master.argus.com >/dev/null 2>&1; then
  resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs)
  add_result PASS "master.argus.com resolves to $resolved_ips"
else
  add_result FAIL "Failed to resolve master.argus.com"
fi

# --- 4.5 Master-Node status consistency ---------------------------------------

# Wait slightly longer than one report interval between the two polls.
sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))

if [[ -n "$NODE_DETAIL_JSON" ]]; then
  detail_pre="$NODE_DETAIL_JSON"
else
  detail_pre=""
fi

if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
  if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$TMP_DIR/detail_pre.err"); then
    add_result PASS "Fetched node detail pre-check"
  else
    error_detail=$(read_err "$TMP_DIR/detail_pre.err")
    add_result FAIL "Unable to fetch node detail for status check: $error_detail"
  fi
fi

server_ts_pre=""
agent_ts_pre=""
server_ts_post=""
agent_ts_post=""

if [[ -n "$detail_pre" ]]; then
  server_ts_pre=$(json_query "$detail_pre" '.last_report' 'data.get("last_report")' || echo "")
  agent_ts_pre=$(json_query "$detail_pre" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
  log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"

  sleep "$sleep_interval"

  if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$TMP_DIR/detail_post.err"); then
    server_ts_post=$(json_query "$detail_post" '.last_report' 'data.get("last_report")' || echo "")
    agent_ts_post=$(json_query "$detail_post" '.agent_last_report' 'data.get("agent_last_report")' || echo "")

    if [[ "$server_ts_post" != "$server_ts_pre" ]]; then
      add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
    else
      add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
    fi

    if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
      add_result PASS "last_report.agent_timestamp advanced"
    else
      add_result FAIL "last_report.agent_timestamp did not change"
    fi

    # Compare the master's timestamp with node.json last_updated; silently
    # skipped when either value is missing or unparsable.
    if [[ -n "$node_file_content" ]]; then
      if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
        if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then
          if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
            diff=$((epoch_post - node_epoch))
            if [[ $diff -lt 0 ]]; then
              diff=$((-diff))
            fi
            tolerance=$((REPORT_INTERVAL_SECONDS * 2))
            if [[ $diff -le $tolerance ]]; then
              add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
            else
              add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
            fi
          fi
        fi
      fi
    fi

    NODE_DETAIL_JSON="$detail_post"
  else
    error_detail=$(read_err "$TMP_DIR/detail_post.err")
    add_result FAIL "Failed to fetch node detail post-check: $error_detail"
  fi
fi

# --- 4.6 Health simulation ----------------------------------------------------

TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json"
ensure_directory "$HEALTH_DIR"

# Keep the content of a pre-existing file so it can be restored afterwards.
if [[ -f "$TEST_HEALTH_FILE" ]]; then
  TEST_HEALTH_EXISTED="true"
  TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")"
else
  TEST_HEALTH_EXISTED="false"
fi

# NOTE(review): in the reviewed copy of this script the text from this heredoc
# up to the first health fetch was unreadable. The heredoc body, the two
# helper functions below and the "message one" step were reconstructed from
# their call sites and from the symmetric "message two" step. The health file
# layout and the `.health["verify-master"].message` path are assumptions -
# TODO confirm against the agent health-file format and the master node API.

# Writes the test health file carrying message $1.
create_health_file() {
  local message="$1"
  cat >"$TEST_HEALTH_FILE" <<JSON
{"status": "ok", "message": "$message"}
JSON
}

# Returns 0 when node detail $2 carries message $1 for verify-master.
validate_health_in_master() {
  local expected_message="$1" detail_json="$2" message
  message=$(json_query "$detail_json" \
    '.health["verify-master"].message' \
    'data.get("health", {}).get("verify-master", {}).get("message")') || return 1
  [[ "$message" == "$expected_message" ]]
}

# Returns 0 when node detail $1 no longer lists verify-master.
remove_health_from_master() {
  local detail_json="$1"
  if json_has_key "$detail_json" \
    '(.health // {}) | has("verify-master")' \
    '"verify-master" in (data.get("health") or {})'; then
    return 1
  fi
  return 0
}

health_message_one="verify $(date +%s)"
create_health_file "$health_message_one"
sleep "$sleep_interval"

if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$TMP_DIR/health1.err"); then
  if validate_health_in_master "$health_message_one" "$detail_health_one"; then
    add_result PASS "Master reflects verify-master health message"
  else
    add_result FAIL "Master health payload does not match test message"
  fi
else
  error_detail=$(read_err "$TMP_DIR/health1.err")
  add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
  detail_health_one=""
fi

health_message_two="verify $(date +%s)-update"
create_health_file "$health_message_two"
sleep "$sleep_interval"

if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$TMP_DIR/health2.err"); then
  if validate_health_in_master "$health_message_two" "$detail_health_two"; then
    add_result PASS "Master health updated to new message"
  else
    add_result FAIL "Master health message did not update"
  fi
else
  error_detail=$(read_err "$TMP_DIR/health2.err")
  add_result FAIL "Failed to fetch node detail after health update: $error_detail"
  detail_health_two=""
fi

# Remove the file and check that the master drops the entry.
rm -f "$TEST_HEALTH_FILE"
sleep "$sleep_interval"

if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$TMP_DIR/health3.err"); then
  if remove_health_from_master "$detail_health_three"; then
    add_result PASS "Master health no longer lists verify-master after removal"
  else
    add_result FAIL "Master health still contains verify-master after file deletion"
  fi
else
  error_detail=$(read_err "$TMP_DIR/health3.err")
  add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
fi

if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
  printf '%s' "$TEST_HEALTH_BACKUP" >"$TEST_HEALTH_FILE"
fi

# --- Optional config touch ----------------------------------------------------

if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then
  if [[ -n "$NODE_ID" ]]; then
    payload='{"label": {"verify": "true"}}'
    if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" \
      "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >"$TMP_DIR/config.log" 2>&1; then
      add_result PASS "Config PUT dry-run succeeded"
    else
      add_result WARN "Config PUT dry-run failed: $(read_err "$TMP_DIR/config.log")"
    fi
  fi
else
  add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
fi

# --- Result summary -----------------------------------------------------------

# Prints "<label>: <entry>" for every remaining argument.
print_results() {
  local label="$1" entry
  shift
  for entry in "$@"; do
    printf '%s: %s\n' "$label" "$entry"
  done
}

echo
echo "==== Verification Summary ===="
# ${arr[@]+...} keeps empty arrays from tripping `set -u` on bash < 4.4.
print_results PASS ${RESULTS_PASS[@]+"${RESULTS_PASS[@]}"}
print_results WARN ${RESULTS_WARN[@]+"${RESULTS_WARN[@]}"}
print_results FAIL ${RESULTS_FAIL[@]+"${RESULTS_FAIL[@]}"}

if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
  exit 1
fi
exit 0