增加sys/debug 部署测试;agent dev/user/instance元信息提取优化;sys/tests 优化 #26

Merged
yuyr merged 9 commits from dev_1.0.0_yuyr_3 into dev_1.0.0 2025-10-16 17:16:07 +08:00
5 changed files with 166 additions and 21 deletions
Showing only changes of commit 68b8461ea0 - Show all commits

View File

@ -41,8 +41,11 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推
主机名与元数据的解析优先级:
1. 若设置 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 且全部存在,则直接使用这些值。
2. 否则按历史约定从主机名解析 `env-user-instance` 前缀。
3. 如果两者都无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。
2. 否则检查历史 `node.json`(注册成功后由 Master 返回的信息),若包含 `env` / `user` / `instance` 则沿用。
3. 若以上均不可用,则按历史约定从主机名解析 `env-user-instance` 前缀。
4. 如果仍无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。
> 提示:在首次部署时需确保环境变量或主机名能够提供完整信息。完成注册后,Agent 会把 Master 返回的元数据写入 `node.json`,后续重启无需再次提供环境变量就能保持一致性。
派生路径:

View File

@ -6,10 +6,14 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Final
from .state import load_node_state
from .version import VERSION
from .log import get_logger
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
LOGGER = get_logger("argus.agent.config")
@dataclass(frozen=True)
class AgentConfig:
@ -50,7 +54,28 @@ def _resolve_hostname() -> str:
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
def _resolve_metadata_fields(hostname: str) -> tuple[str, str, str]:
def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None:
    """Recover the ``(env, user, instance)`` triple from a persisted node state.

    Each field is looked up first inside the state's ``meta_data`` mapping and,
    when that yields nothing truthy, at the top level of the state itself.

    Args:
        node_file: Path handed to ``load_node_state`` to read the state from.

    Returns:
        ``(env, user, instance)`` when all three values are truthy; ``None``
        when the loaded state is falsy or any of the three fields is missing.
    """
    persisted = load_node_state(node_file)
    if not persisted:
        return None

    # A null/absent "meta_data" entry is treated as an empty mapping.
    nested = persisted.get("meta_data") or {}
    env, user, instance = (
        nested.get(key) or persisted.get(key)
        for key in ("env", "user", "instance")
    )

    if not (env and user and instance):
        # Partial metadata is never used; the caller moves on to its next source.
        LOGGER.warning(
            "node.json missing metadata fields; ignoring",
            extra={"node_file": node_file, "meta_data": nested},
        )
        return None

    LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file})
    return env, user, instance
def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]:
env = os.environ.get("AGENT_ENV")
user = os.environ.get("AGENT_USER")
instance = os.environ.get("AGENT_INSTANCE")
@ -59,23 +84,18 @@ def _resolve_metadata_fields(hostname: str) -> tuple[str, str, str]:
return env, user, instance
if any([env, user, instance]):
LOGGER = None
try:
from .log import get_logger
LOGGER.warning(
"Incomplete metadata environment variables; falling back to persisted metadata",
extra={
"has_env": bool(env),
"has_user": bool(user),
"has_instance": bool(instance),
},
)
LOGGER = get_logger("argus.agent.config")
except Exception: # pragma: no cover - defensive
LOGGER = None
if LOGGER is not None:
LOGGER.warning(
"Incomplete metadata environment variables; falling back to hostname parsing",
extra={
"has_env": bool(env),
"has_user": bool(user),
"has_instance": bool(instance),
},
)
env = user = instance = None
state_metadata = _load_metadata_from_state(node_file)
if state_metadata is not None:
return state_metadata
from .collector import _parse_hostname # Local import to avoid circular dependency
@ -93,9 +113,9 @@ def load_config() -> AgentConfig:
"""从环境变量推导配置,移除了外部配置文件依赖。"""
hostname = _resolve_hostname()
environment, user, instance = _resolve_metadata_fields(hostname)
node_file = f"/private/argus/agent/{hostname}/node.json"
environment, user, instance = _resolve_metadata_fields(hostname, node_file)
health_dir = f"/private/argus/agent/{hostname}/health/"
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")

Binary file not shown.

View File

@ -6,9 +6,18 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
# Node id of the second ("env-driven") agent, read from a file under $TMP_ROOT.
ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc"
# Abort early when the id file is absent: the checks below need this id.
if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then
echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2
exit 1
fi
ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
NEW_ENV_AGENT_IP="172.28.0.210"
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
ENV_FILE="$TEST_ROOT/.env"
@ -80,15 +89,37 @@ if [[ "$prev_ip" != "$initial_ip" ]]; then
exit 1
fi
# Snapshot the env-driven node as reported by the master API before the restart,
# so the post-restart state can be compared against it.
env_before_file="$TMP_ROOT/env_before_restart.json"
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file"
# Pre-restart "last_updated" value (empty string when the key is absent).
env_prev_last_updated=$(python3 - "$env_before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
print(node.get("last_updated", ""))
PY
)
# Pre-restart IP taken from meta_data.ip (empty string when the key is absent).
env_prev_ip=$(python3 - "$env_before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
print(node["meta_data"].get("ip", ""))
PY
)
pushd "$TEST_ROOT" >/dev/null
compose rm -sf agent
compose rm -sf agent_env
popd >/dev/null
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME"
ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health"
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
if ! docker run -d \
--name argus-agent-e2e \
@ -148,3 +179,76 @@ if [[ "$success" != true ]]; then
fi
echo "[INFO] Agent restart produced successful re-registration with IP change"
# ---- Restart env-driven agent without metadata environment variables ----
# The agent's data directory must already exist on the host; it is bind-mounted
# into the new container below.
if [[ ! -d "$ENV_AGENT_DIR" ]]; then
echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2
exit 1
fi
# The health directory is created on demand so its bind mount has a source.
if [[ ! -d "$ENV_HEALTH_DIR" ]]; then
mkdir -p "$ENV_HEALTH_DIR"
fi
# Start a replacement container on a new fixed IP. Note that no AGENT_ENV /
# AGENT_USER / AGENT_INSTANCE variables are passed: only MASTER_ENDPOINT,
# REPORT_INTERVAL_SECONDS and the build UID/GID are set.
if ! docker run -d \
--name argus-agent-env-e2e \
--hostname "$ENV_AGENT_HOSTNAME" \
--network "$NETWORK_NAME" \
--ip "$NEW_ENV_AGENT_IP" \
-v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \
-v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
-v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
-e REPORT_INTERVAL_SECONDS=2 \
-e ARGUS_BUILD_UID="$AGENT_UID" \
-e ARGUS_BUILD_GID="$AGENT_GID" \
--entrypoint /usr/local/bin/agent-entrypoint.sh \
ubuntu:22.04 >/dev/null; then
echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2
exit 1
fi
# Poll the master for the node record: up to 20 attempts, 3 seconds apart.
env_success=false
env_detail_file="$TMP_ROOT/env_post_restart.json"
for _ in {1..20}; do
sleep 3
# A failed fetch is not fatal; just try again on the next iteration.
if ! curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then
continue
fi
# The embedded check exits non-zero (causing another retry) unless ALL hold:
#   - the node id equals $ENV_NODE_ID
#   - meta_data.ip equals $NEW_ENV_AGENT_IP and differs from the pre-restart IP
#   - last_updated is non-empty and differs from the pre-restart value
#   - meta_data env/user/instance are "prod" / "ml" / "node-3"
if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
prev_last_updated = sys.argv[2]
expected_id = sys.argv[3]
old_ip = sys.argv[4]
expected_ip = sys.argv[5]
last_updated = node.get("last_updated")
current_ip = node["meta_data"].get("ip")
meta = node.get("meta_data", {})
assert node["id"] == expected_id
if current_ip != expected_ip:
raise SystemExit(1)
if current_ip == old_ip:
raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
raise SystemExit(1)
if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3":
raise SystemExit(1)
PY
then
env_success=true
break
fi
done
# Fail the test when no polling attempt satisfied every condition.
if [[ "$env_success" != true ]]; then
echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2
exit 1
fi
echo "[INFO] Env-driven agent restart succeeded with persisted metadata"

View File

@ -60,6 +60,24 @@ class LoadConfigMetadataTests(unittest.TestCase):
self.assertEqual(config.instance, "abc")
mock_mkdir.assert_called()
@patch("app.config._load_metadata_from_state", return_value=("prod", "ops", "node-1"))
@patch("app.config.Path.mkdir")
def test_metadata_from_node_state(self, mock_mkdir, mock_state):
    """Config metadata comes from the (patched) node-state loader.

    The three AGENT_* metadata variables are passed to ``temp_env`` as ``None``
    (presumably meaning "unset" -- confirm against ``temp_env``), so the values
    must be the ones returned by ``_load_metadata_from_state``.
    """
    env_overrides = {
        "MASTER_ENDPOINT": "http://master.local",
        "AGENT_HOSTNAME": "host_abc",
        "AGENT_ENV": None,
        "AGENT_USER": None,
        "AGENT_INSTANCE": None,
    }
    with temp_env(**env_overrides):
        resolved = load_config()

    self.assertEqual(resolved.environment, "prod")
    self.assertEqual(resolved.user, "ops")
    self.assertEqual(resolved.instance, "node-1")
    # The state loader must have been consulted exactly once.
    mock_state.assert_called_once()
    mock_mkdir.assert_called()
@patch("app.config.Path.mkdir")
def test_partial_environment_variables_fallback(self, mock_mkdir):
with temp_env(