diff --git a/src/agent/README.md b/src/agent/README.md index 55877d2..df96bdb 100644 --- a/src/agent/README.md +++ b/src/agent/README.md @@ -41,8 +41,11 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推 主机名与元数据的解析优先级: 1. 若设置 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 且全部存在,则直接使用这些值。 -2. 否则按历史约定从主机名解析 `env-user-instance` 前缀。 -3. 如果两者都无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。 +2. 否则检查历史 `node.json`(注册成功后由 Master 返回的信息),若包含 `env` / `user` / `instance` 则沿用。 +3. 若以上均不可用,则按历史约定从主机名解析 `env-user-instance` 前缀。 +4. 如果仍无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。 + +> 提示:在首次部署时需确保环境变量或主机名能够提供完整信息。完成注册后,Agent 会把 Master 返回的元数据写入 `node.json`,后续重启无需再次提供环境变量就能保持一致性。 派生路径: diff --git a/src/agent/app/config.py b/src/agent/app/config.py index bf02cf5..057b92a 100644 --- a/src/agent/app/config.py +++ b/src/agent/app/config.py @@ -6,10 +6,14 @@ from dataclasses import dataclass from pathlib import Path from typing import Final +from .state import load_node_state from .version import VERSION +from .log import get_logger DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60 +LOGGER = get_logger("argus.agent.config") + @dataclass(frozen=True) class AgentConfig: @@ -50,7 +54,28 @@ def _resolve_hostname() -> str: return os.environ.get("AGENT_HOSTNAME") or socket.gethostname() -def _resolve_metadata_fields(hostname: str) -> tuple[str, str, str]: +def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None: + state = load_node_state(node_file) + if not state: + return None + + meta = state.get("meta_data") or {} + env = meta.get("env") or state.get("env") + user = meta.get("user") or state.get("user") + instance = meta.get("instance") or state.get("instance") + + if env and user and instance: + LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file}) + return env, user, instance + + LOGGER.warning( + "node.json missing metadata fields; ignoring", + extra={"node_file": node_file, "meta_data": meta}, + ) + return None + + +def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]: env = os.environ.get("AGENT_ENV") user = os.environ.get("AGENT_USER") instance = os.environ.get("AGENT_INSTANCE") @@ -59,23 +84,18 @@ def _resolve_metadata_fields(hostname: str) -> tuple[str, str, str]: return env, user, instance if any([env, user, instance]): - LOGGER = None - try: - from .log import get_logger + LOGGER.warning( + "Incomplete metadata environment variables; falling back to persisted metadata", + extra={ + "has_env": bool(env), + "has_user": bool(user), + "has_instance": bool(instance), + }, + ) - LOGGER = get_logger("argus.agent.config") - except Exception: # pragma: no cover - defensive - LOGGER = None - if LOGGER is not None: - LOGGER.warning( - "Incomplete metadata environment variables; falling back to hostname parsing", - extra={ - "has_env": bool(env), - "has_user": bool(user), - "has_instance": bool(instance), - }, - ) - env = user = instance = None + state_metadata = _load_metadata_from_state(node_file) + if state_metadata is not None: + return state_metadata from .collector import _parse_hostname # Local import to avoid circular dependency @@ -93,9 +113,9 @@ def load_config() -> AgentConfig: """从环境变量推导配置,移除了外部配置文件依赖。""" hostname = _resolve_hostname() - environment, user, instance = _resolve_metadata_fields(hostname) - node_file = f"/private/argus/agent/{hostname}/node.json" + environment, user, instance = _resolve_metadata_fields(hostname, node_file) + health_dir = f"/private/argus/agent/{hostname}/health/" master_endpoint_env = os.environ.get("MASTER_ENDPOINT") diff --git a/src/agent/dist/argus-agent b/src/agent/dist/argus-agent index d5703fe..1a335c4 100755 Binary files a/src/agent/dist/argus-agent and b/src/agent/dist/argus-agent differ diff --git a/src/agent/tests/scripts/07_restart_agent_and_reregister.sh b/src/agent/tests/scripts/07_restart_agent_and_reregister.sh index 9fa272e..4da99d3 100755 --- a/src/agent/tests/scripts/07_restart_agent_and_reregister.sh +++ b/src/agent/tests/scripts/07_restart_agent_and_reregister.sh @@ -6,9 +6,18 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" TMP_ROOT="$TEST_ROOT/tmp" API_BASE="http://localhost:32300/api/v1/master" NODE_ID="$(cat "$TMP_ROOT/node_id")" +ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc" +if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then + echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2 + exit 1 +fi + +ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")" AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0" +ENV_AGENT_HOSTNAME="host_abc" NETWORK_NAME="tests_default" NEW_AGENT_IP="172.28.0.200" +NEW_ENV_AGENT_IP="172.28.0.210" ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh" VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh" ENV_FILE="$TEST_ROOT/.env" @@ -80,15 +89,37 @@ if [[ "$prev_ip" != "$initial_ip" ]]; then exit 1 fi +env_before_file="$TMP_ROOT/env_before_restart.json" +curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file" +env_prev_last_updated=$(python3 - "$env_before_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +print(node.get("last_updated", "")) +PY +) +env_prev_ip=$(python3 - "$env_before_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +print(node["meta_data"].get("ip", "")) +PY +) + pushd "$TEST_ROOT" >/dev/null compose rm -sf agent +compose rm -sf agent_env popd >/dev/null docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true +docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME" HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health" +ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME" +ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health" + # 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态 if ! docker run -d \ --name argus-agent-e2e \ @@ -148,3 +179,76 @@ if [[ "$success" != true ]]; then fi echo "[INFO] Agent restart produced successful re-registration with IP change" + +# ---- Restart env-driven agent without metadata environment variables ---- + +if [[ ! -d "$ENV_AGENT_DIR" ]]; then + echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2 + exit 1 +fi + +if [[ ! -d "$ENV_HEALTH_DIR" ]]; then + mkdir -p "$ENV_HEALTH_DIR" +fi + +if ! docker run -d \ + --name argus-agent-env-e2e \ + --hostname "$ENV_AGENT_HOSTNAME" \ + --network "$NETWORK_NAME" \ + --ip "$NEW_ENV_AGENT_IP" \ + -v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \ + -v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \ + -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \ + -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \ + -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \ + -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \ + -e MASTER_ENDPOINT=http://master.argus.com:3000 \ + -e REPORT_INTERVAL_SECONDS=2 \ + -e ARGUS_BUILD_UID="$AGENT_UID" \ + -e ARGUS_BUILD_GID="$AGENT_GID" \ + --entrypoint /usr/local/bin/agent-entrypoint.sh \ + ubuntu:22.04 >/dev/null; then + echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2 + exit 1 +fi + +env_success=false +env_detail_file="$TMP_ROOT/env_post_restart.json" +for _ in {1..20}; do + sleep 3 + if ! curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then + continue + fi + if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +prev_last_updated = sys.argv[2] +expected_id = sys.argv[3] +old_ip = sys.argv[4] +expected_ip = sys.argv[5] +last_updated = node.get("last_updated") +current_ip = node["meta_data"].get("ip") +meta = node.get("meta_data", {}) +assert node["id"] == expected_id +if current_ip != expected_ip: + raise SystemExit(1) +if current_ip == old_ip: + raise SystemExit(1) +if not last_updated or last_updated == prev_last_updated: + raise SystemExit(1) +if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3": + raise SystemExit(1) +PY + then + env_success=true + break + fi +done + +if [[ "$env_success" != true ]]; then + echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2 + exit 1 +fi + +echo "[INFO] Env-driven agent restart succeeded with persisted metadata" diff --git a/src/agent/tests/test_config_metadata.py b/src/agent/tests/test_config_metadata.py index 390fd7d..2ddd45a 100644 --- a/src/agent/tests/test_config_metadata.py +++ b/src/agent/tests/test_config_metadata.py @@ -60,6 +60,24 @@ class LoadConfigMetadataTests(unittest.TestCase): self.assertEqual(config.instance, "abc") mock_mkdir.assert_called() + @patch("app.config._load_metadata_from_state", return_value=("prod", "ops", "node-1")) + @patch("app.config.Path.mkdir") + def test_metadata_from_node_state(self, mock_mkdir, mock_state): + with temp_env( + MASTER_ENDPOINT="http://master.local", + AGENT_HOSTNAME="host_abc", + AGENT_ENV=None, + AGENT_USER=None, + AGENT_INSTANCE=None, + ): + config = load_config() + + self.assertEqual(config.environment, "prod") + self.assertEqual(config.user, "ops") + self.assertEqual(config.instance, "node-1") + mock_state.assert_called_once() + mock_mkdir.assert_called() + @patch("app.config.Path.mkdir") def test_partial_environment_variables_fallback(self, mock_mkdir): with temp_env(