增加 sys/debug 部署测试;agent env/user/instance 元信息提取优化;sys/tests 优化 #26
@ -41,8 +41,11 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推
|
|||||||
主机名与元数据的解析优先级:
|
主机名与元数据的解析优先级:
|
||||||
|
|
||||||
1. 若设置 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 且全部存在,则直接使用这些值。
|
1. 若设置 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 且全部存在,则直接使用这些值。
|
||||||
2. 否则按历史约定从主机名解析 `env-user-instance` 前缀。
|
2. 否则检查历史 `node.json`(注册成功后由 Master 返回的信息),若包含 `env` / `user` / `instance` 则沿用。
|
||||||
3. 如果两者都无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。
|
3. 若以上均不可用,则按历史约定从主机名解析 `env-user-instance` 前缀。
|
||||||
|
4. 如果仍无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。
|
||||||
|
|
||||||
|
> 提示:在首次部署时需确保环境变量或主机名能够提供完整信息。完成注册后,Agent 会把 Master 返回的元数据写入 `node.json`,后续重启无需再次提供环境变量就能保持一致性。
|
||||||
|
|
||||||
派生路径:
|
派生路径:
|
||||||
|
|
||||||
|
@ -6,10 +6,14 @@ from dataclasses import dataclass
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Final
|
from typing import Final
|
||||||
|
|
||||||
|
from .state import load_node_state
|
||||||
from .version import VERSION
|
from .version import VERSION
|
||||||
|
from .log import get_logger
|
||||||
|
|
||||||
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
|
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
|
||||||
|
|
||||||
|
LOGGER = get_logger("argus.agent.config")
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class AgentConfig:
|
class AgentConfig:
|
||||||
@ -50,7 +54,28 @@ def _resolve_hostname() -> str:
|
|||||||
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
|
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
|
||||||
|
|
||||||
|
|
||||||
def _resolve_metadata_fields(hostname: str) -> tuple[str, str, str]:
|
def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None:
    """Recover ``(env, user, instance)`` from a previously persisted node state.

    Each field is looked up inside the nested ``meta_data`` mapping first and,
    failing that, at the top level of the loaded state.

    Args:
        node_file: Path of the ``node.json`` state file; passed unchanged to
            ``load_node_state``.

    Returns:
        The ``(env, user, instance)`` triple when all three values are present
        and truthy. ``None`` when no state could be loaded or any field is
        missing, so the caller can move on to its next metadata source.
    """
    state = load_node_state(node_file)
    if not state:
        return None

    # node.json is a file on disk and may be stale or hand-edited. A
    # ``meta_data`` value that is not a mapping (string, list, number, ...)
    # must be ignored rather than crash config loading with AttributeError.
    # NOTE(review): assumes the state is parsed JSON, i.e. mappings are dicts.
    meta = state.get("meta_data")
    if not isinstance(meta, dict):
        meta = {}

    env = meta.get("env") or state.get("env")
    user = meta.get("user") or state.get("user")
    instance = meta.get("instance") or state.get("instance")

    if env and user and instance:
        LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file})
        return env, user, instance

    LOGGER.warning(
        "node.json missing metadata fields; ignoring",
        extra={"node_file": node_file, "meta_data": meta},
    )
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]:
|
||||||
env = os.environ.get("AGENT_ENV")
|
env = os.environ.get("AGENT_ENV")
|
||||||
user = os.environ.get("AGENT_USER")
|
user = os.environ.get("AGENT_USER")
|
||||||
instance = os.environ.get("AGENT_INSTANCE")
|
instance = os.environ.get("AGENT_INSTANCE")
|
||||||
@ -59,23 +84,18 @@ def _resolve_metadata_fields(hostname: str) -> tuple[str, str, str]:
|
|||||||
return env, user, instance
|
return env, user, instance
|
||||||
|
|
||||||
if any([env, user, instance]):
|
if any([env, user, instance]):
|
||||||
LOGGER = None
|
LOGGER.warning(
|
||||||
try:
|
"Incomplete metadata environment variables; falling back to persisted metadata",
|
||||||
from .log import get_logger
|
extra={
|
||||||
|
"has_env": bool(env),
|
||||||
|
"has_user": bool(user),
|
||||||
|
"has_instance": bool(instance),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
LOGGER = get_logger("argus.agent.config")
|
state_metadata = _load_metadata_from_state(node_file)
|
||||||
except Exception: # pragma: no cover - defensive
|
if state_metadata is not None:
|
||||||
LOGGER = None
|
return state_metadata
|
||||||
if LOGGER is not None:
|
|
||||||
LOGGER.warning(
|
|
||||||
"Incomplete metadata environment variables; falling back to hostname parsing",
|
|
||||||
extra={
|
|
||||||
"has_env": bool(env),
|
|
||||||
"has_user": bool(user),
|
|
||||||
"has_instance": bool(instance),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
env = user = instance = None
|
|
||||||
|
|
||||||
from .collector import _parse_hostname # Local import to avoid circular dependency
|
from .collector import _parse_hostname # Local import to avoid circular dependency
|
||||||
|
|
||||||
@ -93,9 +113,9 @@ def load_config() -> AgentConfig:
|
|||||||
"""从环境变量推导配置,移除了外部配置文件依赖。"""
|
"""从环境变量推导配置,移除了外部配置文件依赖。"""
|
||||||
|
|
||||||
hostname = _resolve_hostname()
|
hostname = _resolve_hostname()
|
||||||
environment, user, instance = _resolve_metadata_fields(hostname)
|
|
||||||
|
|
||||||
node_file = f"/private/argus/agent/{hostname}/node.json"
|
node_file = f"/private/argus/agent/{hostname}/node.json"
|
||||||
|
environment, user, instance = _resolve_metadata_fields(hostname, node_file)
|
||||||
|
|
||||||
health_dir = f"/private/argus/agent/{hostname}/health/"
|
health_dir = f"/private/argus/agent/{hostname}/health/"
|
||||||
|
|
||||||
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
|
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
|
||||||
|
BIN
src/agent/dist/argus-agent
vendored
BIN
src/agent/dist/argus-agent
vendored
Binary file not shown.
@ -6,9 +6,18 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|||||||
TMP_ROOT="$TEST_ROOT/tmp"
|
TMP_ROOT="$TEST_ROOT/tmp"
|
||||||
API_BASE="http://localhost:32300/api/v1/master"
|
API_BASE="http://localhost:32300/api/v1/master"
|
||||||
NODE_ID="$(cat "$TMP_ROOT/node_id")"
|
NODE_ID="$(cat "$TMP_ROOT/node_id")"
|
||||||
|
ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc"
|
||||||
|
if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then
|
||||||
|
echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")"
|
||||||
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
|
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
|
||||||
|
ENV_AGENT_HOSTNAME="host_abc"
|
||||||
NETWORK_NAME="tests_default"
|
NETWORK_NAME="tests_default"
|
||||||
NEW_AGENT_IP="172.28.0.200"
|
NEW_AGENT_IP="172.28.0.200"
|
||||||
|
NEW_ENV_AGENT_IP="172.28.0.210"
|
||||||
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
|
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
|
||||||
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
|
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
|
||||||
ENV_FILE="$TEST_ROOT/.env"
|
ENV_FILE="$TEST_ROOT/.env"
|
||||||
@ -80,15 +89,37 @@ if [[ "$prev_ip" != "$initial_ip" ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
env_before_file="$TMP_ROOT/env_before_restart.json"
|
||||||
|
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file"
|
||||||
|
env_prev_last_updated=$(python3 - "$env_before_file" <<'PY'
|
||||||
|
import json, sys
|
||||||
|
with open(sys.argv[1]) as handle:
|
||||||
|
node = json.load(handle)
|
||||||
|
print(node.get("last_updated", ""))
|
||||||
|
PY
|
||||||
|
)
|
||||||
|
env_prev_ip=$(python3 - "$env_before_file" <<'PY'
|
||||||
|
import json, sys
|
||||||
|
with open(sys.argv[1]) as handle:
|
||||||
|
node = json.load(handle)
|
||||||
|
print(node["meta_data"].get("ip", ""))
|
||||||
|
PY
|
||||||
|
)
|
||||||
|
|
||||||
pushd "$TEST_ROOT" >/dev/null
|
pushd "$TEST_ROOT" >/dev/null
|
||||||
compose rm -sf agent
|
compose rm -sf agent
|
||||||
|
compose rm -sf agent_env
|
||||||
popd >/dev/null
|
popd >/dev/null
|
||||||
|
|
||||||
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
|
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
|
||||||
|
docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true
|
||||||
|
|
||||||
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
|
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
|
||||||
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
|
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
|
||||||
|
|
||||||
|
ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME"
|
||||||
|
ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health"
|
||||||
|
|
||||||
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
|
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
|
||||||
if ! docker run -d \
|
if ! docker run -d \
|
||||||
--name argus-agent-e2e \
|
--name argus-agent-e2e \
|
||||||
@ -148,3 +179,76 @@ if [[ "$success" != true ]]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[INFO] Agent restart produced successful re-registration with IP change"
|
echo "[INFO] Agent restart produced successful re-registration with IP change"
|
||||||
|
|
||||||
|
# ---- Restart env-driven agent without metadata environment variables ----
# The container below is started WITHOUT AGENT_ENV / AGENT_USER / AGENT_INSTANCE.
# The check at the end of this stage expects the master to still report
# env=prod, user=ml, instance=node-3 for the same node id, i.e. the agent must
# have reused the metadata persisted under $ENV_AGENT_DIR.

if [[ ! -d "$ENV_AGENT_DIR" ]]; then
  echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2
  exit 1
fi

# Make sure the health dir exists before it is bind-mounted (mkdir -p is a
# no-op when the directory is already there).
mkdir -p "$ENV_HEALTH_DIR"

if ! docker run -d \
  --name argus-agent-env-e2e \
  --hostname "$ENV_AGENT_HOSTNAME" \
  --network "$NETWORK_NAME" \
  --ip "$NEW_ENV_AGENT_IP" \
  -v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \
  -v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \
  -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
  -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
  -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
  -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
  -e REPORT_INTERVAL_SECONDS=2 \
  -e ARGUS_BUILD_UID="$AGENT_UID" \
  -e ARGUS_BUILD_GID="$AGENT_GID" \
  --entrypoint /usr/local/bin/agent-entrypoint.sh \
  ubuntu:22.04 >/dev/null; then
  echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2
  exit 1
fi

# Poll the master (20 attempts, 3 seconds apart) until the node record shows
# the new IP, a fresh last_updated timestamp and the original metadata.
env_success=false
env_detail_file="$TMP_ROOT/env_post_restart.json"
for _ in {1..20}; do
  sleep 3
  # -f makes curl fail on HTTP error statuses, so an error body is never
  # mistaken for a node record; the attempt is simply retried.
  if ! curl -fsS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then
    continue
  fi
  if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY'
import json
import sys

with open(sys.argv[1]) as handle:
    node = json.load(handle)

prev_last_updated, expected_id, old_ip, expected_ip = sys.argv[2:6]

# Tolerate a record without meta_data: it just counts as "not ready yet".
meta = node.get("meta_data") or {}
last_updated = node.get("last_updated")
current_ip = meta.get("ip")

# Any non-zero exit makes the surrounding shell loop retry.
if node.get("id") != expected_id:
    raise SystemExit(1)
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3":
    raise SystemExit(1)
PY
  then
    env_success=true
    break
  fi
done

if [[ "$env_success" != true ]]; then
  echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2
  # Show the last node record we managed to fetch to make the failure debuggable.
  if [[ -s "$env_detail_file" ]]; then
    echo "[ERROR] Last node detail response received:" >&2
    cat "$env_detail_file" >&2 || true
    echo >&2
  fi
  exit 1
fi

echo "[INFO] Env-driven agent restart succeeded with persisted metadata"
|
||||||
|
@ -60,6 +60,24 @@ class LoadConfigMetadataTests(unittest.TestCase):
|
|||||||
self.assertEqual(config.instance, "abc")
|
self.assertEqual(config.instance, "abc")
|
||||||
mock_mkdir.assert_called()
|
mock_mkdir.assert_called()
|
||||||
|
|
||||||
|
@patch("app.config._load_metadata_from_state", return_value=("prod", "ops", "node-1"))
@patch("app.config.Path.mkdir")
def test_metadata_from_node_state(self, mock_mkdir, mock_state):
    """Persisted node state supplies the metadata when no metadata env vars are set."""
    with temp_env(
        MASTER_ENDPOINT="http://master.local",
        AGENT_HOSTNAME="host_abc",
        AGENT_ENV=None,
        AGENT_USER=None,
        AGENT_INSTANCE=None,
    ):
        config = load_config()

    self.assertEqual(config.environment, "prod")
    self.assertEqual(config.user, "ops")
    self.assertEqual(config.instance, "node-1")
    # The state loader must be consulted exactly once, with the node.json
    # path that load_config derives from the resolved hostname.
    mock_state.assert_called_once_with("/private/argus/agent/host_abc/node.json")
    mock_mkdir.assert_called()
|
||||||
|
|
||||||
@patch("app.config.Path.mkdir")
|
@patch("app.config.Path.mkdir")
|
||||||
def test_partial_environment_variables_fallback(self, mock_mkdir):
|
def test_partial_environment_variables_fallback(self, mock_mkdir):
|
||||||
with temp_env(
|
with temp_env(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user