#!/usr/bin/env bash
#
# E2E step: restart the two test agents on new static IPs and verify that
# each one re-registers with the master under its previously recorded node id.
#
# Reads from $TMP_ROOT: node_id and node_id_host_abc (this section), plus
# initial_ip and agent_binary_path further down in the script.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"

# Node id recorded for the primary agent. Checked explicitly, like the env
# agent's file below, so a missing file gives a clear error instead of a bare
# `cat` failure.
NODE_ID_FILE="$TMP_ROOT/node_id"
if [[ ! -f "$NODE_ID_FILE" ]]; then
  echo "[ERROR] Agent node id file missing at $NODE_ID_FILE" >&2
  exit 1
fi
NODE_ID="$(cat "$NODE_ID_FILE")"

# Node id recorded for the second ("env") agent.
ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc"
if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then
  echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2
  exit 1
fi
ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")"

# Container hostnames; they also name the per-agent data directories.
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"

# Docker network the restarted containers join, and the static IPs they are
# given on it.
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
NEW_ENV_AGENT_IP="172.28.0.210"

# Files bind-mounted into the restarted containers / sourced by this script.
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
ENV_FILE="$TEST_ROOT/.env"

# The restart scenario needs the same entrypoint script as the initial start
# so that the DNS registration logic stays consistent.
if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
  echo "[ERROR] agent entrypoint script missing at $ENTRYPOINT_SCRIPT" >&2
  exit 1
fi

# The verification script is bind-mounted into the containers later on.
if [[ ! -f "$VERIFY_SCRIPT" ]]; then
  echo "[ERROR] agent verification script missing at $VERIFY_SCRIPT" >&2
  exit 1
fi

# $TMP_ROOT/agent_binary_path holds the path of the agent binary to mount.
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
  echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
  exit 1
fi

AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
  echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
  exit 1
fi

# Resolve the UID/GID handed to the agent containers.
# Preferred source is the test-local .env file; otherwise fall back to the
# repository's shared build-user helper.
if [[ -f "$ENV_FILE" ]]; then
  # `set -a` exports every variable the env file assigns.
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
else
  REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
  # shellcheck disable=SC1090
  source "$REPO_ROOT/scripts/common/build_user.sh"
  # NOTE(review): load_build_user is defined in build_user.sh (not visible
  # here); presumably it sets ARGUS_BUILD_UID / ARGUS_BUILD_GID — confirm.
  load_build_user
fi

# Defaults apply when neither source provided a value.
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"

#######################################
# Run a Compose command with whichever Compose CLI is installed.
# Uses the `docker compose` plugin when `docker compose version` succeeds,
# otherwise falls back to the standalone `docker-compose` binary.
# Arguments: passed through unchanged to the selected CLI.
# Returns:   exit status of the selected CLI.
#######################################
compose() {
  if docker compose version >/dev/null 2>&1; then
    docker compose "$@"
  else
    docker-compose "$@"
  fi
}

#######################################
# Print one field of a node JSON document previously saved to disk.
# Arguments: $1 - path to the JSON file
#            $2 - "ip" (read from meta_data.ip) or a top-level key name
# Outputs:   the field value on stdout; empty line when the key is absent
# Returns:   non-zero when the file is unreadable, is not JSON, or (for "ip")
#            has no meta_data object
#######################################
read_node_field() {
  python3 - "$1" "$2" <<'PY'
import json, sys

with open(sys.argv[1]) as handle:
    node = json.load(handle)

if sys.argv[2] == "ip":
    print(node["meta_data"].get("ip", ""))
else:
    print(node.get(sys.argv[2], ""))
PY
}

# Snapshot the primary agent's record before the restart.
# -f makes curl fail on HTTP errors instead of saving the error body as the
# baseline.
before_file="$TMP_ROOT/before_restart.json"
curl -fsS "$API_BASE/nodes/$NODE_ID" -o "$before_file"
prev_last_updated=$(read_node_field "$before_file" last_updated)
prev_ip=$(read_node_field "$before_file" ip)

# The IP registered so far must be the one recorded in $TMP_ROOT/initial_ip;
# otherwise the later "IP changed" check would be meaningless.
initial_ip=$(cat "$TMP_ROOT/initial_ip")
if [[ "$prev_ip" != "$initial_ip" ]]; then
  echo "[ERROR] Expected initial IP $initial_ip, got $prev_ip" >&2
  exit 1
fi

# Same snapshot for the env agent.
env_before_file="$TMP_ROOT/env_before_restart.json"
curl -fsS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file"
env_prev_last_updated=$(read_node_field "$env_before_file" last_updated)
env_prev_ip=$(read_node_field "$env_before_file" ip)

# Remove the compose-managed agent services. `rm -s` stops the containers
# first if needed; `-f` skips the confirmation prompt.
pushd "$TEST_ROOT" >/dev/null
compose rm -sf agent
compose rm -sf agent_env
popd >/dev/null

# Best effort: also force-remove the containers by name. They may not exist,
# so errors and output are deliberately ignored.
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true

# Host-side data and health directories, bind-mounted into the restarted
# containers below.
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"

ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME"
ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health"

# Start the primary agent again outside compose, with a fixed new IP, so the
# network state at registration time is under this script's control.
# NOTE(review): the original comment said the container is first started in
# "sleep" mode, but the command below runs the entrypoint directly — confirm
# which is intended.
if ! docker run -d \
  --name argus-agent-e2e \
  --hostname "$AGENT_HOSTNAME" \
  --network "$NETWORK_NAME" \
  --ip "$NEW_AGENT_IP" \
  -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
  -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \
  -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
  -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
  -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
  -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
  -e REPORT_INTERVAL_SECONDS=2 \
  -e ARGUS_BUILD_UID="$AGENT_UID" \
  -e ARGUS_BUILD_GID="$AGENT_GID" \
  --entrypoint /usr/local/bin/agent-entrypoint.sh \
  ubuntu:22.04 >/dev/null; then
  echo "[ERROR] Failed to start agent container with custom IP" >&2
  exit 1
fi

# Poll the master until the primary agent's record shows the new IP and a
# changed last_updated value: up to 20 attempts, 3 seconds apart.
success=false
detail_file="$TMP_ROOT/post_restart.json"
for _ in {1..20}; do
  sleep 3
  # -f: treat HTTP errors as a failed attempt and retry.
  if ! curl -fsS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
    continue
  fi
  # The Python check exits 0 only when every condition holds; any malformed
  # or not-yet-updated payload exits 1 so the loop simply retries.
  if python3 - "$detail_file" "$prev_last_updated" "$NODE_ID" "$prev_ip" "$NEW_AGENT_IP" <<'PY'
import json, sys

detail_path, prev_last_updated, expected_id, old_ip, expected_ip = sys.argv[1:6]

try:
    with open(detail_path) as handle:
        node = json.load(handle)
except (OSError, ValueError):
    raise SystemExit(1)

if not isinstance(node, dict):
    raise SystemExit(1)

meta = node.get("meta_data") or {}
last_updated = node.get("last_updated")
current_ip = meta.get("ip")

# Same node id as before the restart.
if node.get("id") != expected_id:
    raise SystemExit(1)
# The master must report the new IP, and it must differ from the old one.
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
# The record must have been refreshed since the pre-restart snapshot.
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
PY
  then
    success=true
    break
  fi
done

if [[ "$success" != true ]]; then
  echo "[ERROR] Agent did not report expected new IP $NEW_AGENT_IP after restart" >&2
  exit 1
fi

echo "[INFO] Agent restart produced successful re-registration with IP change"

# ---- Restart env-driven agent without metadata environment variables ----

# The agent's data directory must already exist on the host; the checks after
# the restart expect metadata to be reused, and none is passed via -e below.
if [[ ! -d "$ENV_AGENT_DIR" ]]; then
  echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2
  exit 1
fi

# Create the health directory up front if it is not there yet.
if [[ ! -d "$ENV_HEALTH_DIR" ]]; then
  mkdir -p "$ENV_HEALTH_DIR"
fi

# Same invocation as for the primary agent, with the env agent's name,
# hostname, IP and directories.
if ! docker run -d \
  --name argus-agent-env-e2e \
  --hostname "$ENV_AGENT_HOSTNAME" \
  --network "$NETWORK_NAME" \
  --ip "$NEW_ENV_AGENT_IP" \
  -v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \
  -v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \
  -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
  -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
  -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
  -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
  -e REPORT_INTERVAL_SECONDS=2 \
  -e ARGUS_BUILD_UID="$AGENT_UID" \
  -e ARGUS_BUILD_GID="$AGENT_GID" \
  --entrypoint /usr/local/bin/agent-entrypoint.sh \
  ubuntu:22.04 >/dev/null; then
  echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2
  exit 1
fi

# Poll the master until the env agent's record shows the new IP, a changed
# last_updated value and the expected env/user/instance metadata: up to 20
# attempts, 3 seconds apart.
env_success=false
env_detail_file="$TMP_ROOT/env_post_restart.json"
for _ in {1..20}; do
  sleep 3
  # -f: treat HTTP errors as a failed attempt and retry.
  if ! curl -fsS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then
    continue
  fi
  # The Python check exits 0 only when every condition holds; any malformed
  # or not-yet-updated payload exits 1 so the loop simply retries.
  if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY'
import json, sys

detail_path, prev_last_updated, expected_id, old_ip, expected_ip = sys.argv[1:6]

try:
    with open(detail_path) as handle:
        node = json.load(handle)
except (OSError, ValueError):
    raise SystemExit(1)

if not isinstance(node, dict):
    raise SystemExit(1)

# Read meta_data once and use it for both the IP and the metadata checks.
meta = node.get("meta_data") or {}
last_updated = node.get("last_updated")
current_ip = meta.get("ip")

# Same node id as before the restart.
if node.get("id") != expected_id:
    raise SystemExit(1)
# The master must report the new IP, and it must differ from the old one.
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
# The record must have been refreshed since the pre-restart snapshot.
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
# Metadata must still carry these values although no metadata environment
# variables were passed to the restarted container.
if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3":
    raise SystemExit(1)
PY
  then
    env_success=true
    break
  fi
done

if [[ "$env_success" != true ]]; then
  echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2
  exit 1
fi

echo "[INFO] Env-driven agent restart succeeded with persisted metadata"