[#19 ] alert和web增加系统集成测试

[#6 ] 修改打包镜像
[#5 ] 修改web页面；更新镜像打包
2025-10-13 16:48:05 +08:00 · 2025-10-13 10:13:36 +08:00 · 2025-10-13 10:13:35 +08:00 · 2025-10-13 10:13:35 +08:00 · 2025-10-13 10:13:35 +08:00 · 2025-10-13 10:13:35 +08:00
259 changed files with 23420 additions and 107 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+.idea/
--- a/README.md
+++ b/README.md
@ -5,3 +5,10 @@
 项目文档：【腾讯文档】GPU集群运维系统
 https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk

+## 构建账号配置
+
+镜像构建和运行账号的 UID/GID 可通过 `configs/build_user.conf` 配置，详细说明见 `doc/build-user-config.md`。
+
+## 本地端口占用提示
+
+如需运行 BIND 模块端到端测试且宿主机 53 端口已占用，可通过环境变量 `HOST_DNS_PORT`（默认 1053）指定对外映射端口，例如 `HOST_DNS_PORT=12053 ./scripts/00_e2e_test.sh`。
--- a/build/build_images.sh
+++ b/build/build_images.sh
@ -1,138 +1,205 @@
 #!/usr/bin/env bash
 set -euo pipefail

-# 帮助信息
 show_help() {
-    cat << EOF
+  # Print usage information to stdout.
+  # The here-document delimiter is intentionally unquoted so that $0 expands
+  # to the invoked script path in the Usage/Examples lines below.
+  cat <<EOF
 ARGUS Unified Build System - Image Build Tool

 Usage: $0 [OPTIONS]

 Options:
-  --intranet    Use intranet mirror for Ubuntu 22.04 packages
-  -h, --help    Show this help message
+  --intranet        Use intranet mirror for log/bind builds
+  --master          Build master image (already the default)
+  --master-offline  Build master offline image (requires src/master/offline_wheels.tar.gz)
+  -h, --help        Show this help message

 Examples:
-  $0                # Build with default sources
-  $0 --intranet     # Build with intranet mirror
-
+  $0                             # Build with default sources
+  $0 --intranet                  # Build with intranet mirror
+  $0 --master-offline            # Additionally build argus-master:offline
+  $0 --intranet --master-offline
 EOF
 }

-# 解析命令行参数
+# Defaults for the command-line flags parsed below.
 use_intranet=false
+build_master=true
+build_master_offline=false

+# Consume flags one at a time; an unrecognised argument prints the help text and exits 1.
 while [[ $# -gt 0 ]]; do
-    case $1 in
-        --intranet)
-            use_intranet=true
-            shift
-            ;;
-        -h|--help)
-            show_help
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: $1"
-            show_help
-            exit 1
-            ;;
-    esac
+  case $1 in
+    --intranet)
+      use_intranet=true
+      shift
+      ;;
+    --master)
+      # build_master already defaults to true, so this flag currently changes nothing.
+      build_master=true
+      shift
+      ;;
+    --master-offline)
+      # Offline mode also forces the master build on.
+      build_master=true
+      build_master_offline=true
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      show_help
+      exit 1
+      ;;
+  esac
 done

-# 获取项目根目录
+# Resolve the repository root (parent of this script's directory) and load the
+# shared UID/GID helper from it.
 root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+. "$root/scripts/common/build_user.sh"
+
+# Arguments passed to every `docker build` issued by build_image.
+declare -a build_args=()
+
+if [[ "$use_intranet" == true ]]; then
+  build_args+=("--build-arg" "USE_INTRANET=true")
+fi
+
 cd "$root"

+# load_build_user exports ARGUS_BUILD_UID / ARGUS_BUILD_GID; forward both as build args.
+load_build_user
+build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
+
+master_root="$root/src/master"
+master_offline_tar="$master_root/offline_wheels.tar.gz"
+master_offline_dir="$master_root/offline_wheels"
+
+# Offline mode: the wheel archive must already exist; it is unpacked into
+# $master_root and the result is checked for at least one *.whl file.
+if [[ "$build_master_offline" == true ]]; then
+  if [[ ! -f "$master_offline_tar" ]]; then
+    echo "❌ offline wheels tar not found: $master_offline_tar" >&2
+    echo "   请提前准备好 offline_wheels.tar.gz 后再执行 --master-offline" >&2
+    exit 1
+  fi
+  echo "📦 Preparing offline wheels for master (extracting $master_offline_tar)"
+  # Start from an empty directory so wheels from a previous extraction cannot linger.
+  rm -rf "$master_offline_dir"
+  mkdir -p "$master_offline_dir"
+  # NOTE(review): extraction targets $master_root, so the archive presumably
+  # contains a top-level offline_wheels/ directory — confirm against the packer.
+  tar -xzf "$master_offline_tar" -C "$master_root"
+  # -print -quit stops at the first match; an empty result means no wheel was found.
+  has_wheel=$(find "$master_offline_dir" -maxdepth 1 -type f -name '*.whl' -print -quit)
+  if [[ -z "$has_wheel" ]]; then
+    echo "❌ offline_wheels extraction failed或无 wheel: $master_offline_dir" >&2
+    exit 1
+  fi
+fi
+
 echo "======================================="
 echo "ARGUS Unified Build System"
 echo "======================================="

 if [[ "$use_intranet" == true ]]; then
-    echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)"
-    build_args="--build-arg USE_INTRANET=true"
+  echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)"
 else
-    echo "🌐 Mode: Public (Using default package sources)"
-    build_args=""
+  echo "🌐 Mode: Public (Using default package sources)"
 fi

+echo "👤 Build user UID:GID -> ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"
+
 echo "📁 Build context: $root"
 echo ""

-# 构建镜像的函数
 build_image() {
-    local image_name=$1
-    local dockerfile_path=$2
-    local tag=$3
+  # Build one Docker image from the repository root.
+  # Globals:   build_args (read) - arguments shared by every image build
+  # Arguments: $1 - display name, $2 - Dockerfile path, $3 - image tag,
+  #            $4.. - optional extra `docker build` arguments
+  # Returns:   0 when the build succeeds, 1 otherwise
+  local image_name=$1
+  local dockerfile_path=$2
+  local tag=$3
+  shift 3
+  local -a extra_args=("$@")

-    echo "🔄 Building $image_name image..."
-    echo "   Dockerfile: $dockerfile_path"
-    echo "   Tag: $tag"
+  echo "🔄 Building $image_name image..."
+  echo "   Dockerfile: $dockerfile_path"
+  echo "   Tag: $tag"

-    if docker build $build_args -f "$dockerfile_path" -t "$tag" .; then
-        echo "✅ $image_name image built successfully"
-        return 0
-    else
-        echo "❌ Failed to build $image_name image"
-        return 1
-    fi
+  # The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array.
+  # A plain "${arr[@]}" is reported as unbound under `set -u` by Bash < 4.4,
+  # and extra_args is empty for every call in this script.
+  if docker build \
+    ${build_args[@]+"${build_args[@]}"} \
+    ${extra_args[@]+"${extra_args[@]}"} \
+    -f "$dockerfile_path" -t "$tag" .; then
+    echo "✅ $image_name image built successfully"
+    return 0
+  else
+    echo "❌ Failed to build $image_name image"
+    return 1
+  fi
 }

-# 构建所有镜像
 images_built=()
 build_failed=false

-# 构建 Elasticsearch 镜像
 if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
-    images_built+=("argus-elasticsearch:latest")
+  images_built+=("argus-elasticsearch:latest")
 else
-    build_failed=true
+  build_failed=true
 fi

 echo ""

-# 构建 Kibana 镜像
 if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
-    images_built+=("argus-kibana:latest")
+  images_built+=("argus-kibana:latest")
 else
-    build_failed=true
+  build_failed=true
 fi

 echo ""

-# 构建 BIND9 镜像
 if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
-    images_built+=("argus-bind9:latest")
+  images_built+=("argus-bind9:latest")
 else
-    build_failed=true
+  build_failed=true
 fi

 echo ""
+
+# Build the master image by delegating to the module's own build script, run
+# from src/master so its relative paths resolve.
+if [[ "$build_master" == true ]]; then
+  echo ""
+  echo "🔄 Building Master image..."
+  pushd "$master_root" >/dev/null
+  master_args=("--tag" "argus-master:latest")
+  if [[ "$use_intranet" == true ]]; then
+    master_args+=("--intranet")
+  fi
+  if [[ "$build_master_offline" == true ]]; then
+    master_args+=("--offline")
+  fi
+  # Running the sub-script as an `if` condition keeps `set -e` from aborting
+  # here, so a failure is recorded and the summary below is still printed.
+  if ./scripts/build_images.sh "${master_args[@]}"; then
+    # NOTE(review): the sub-script is always given --tag argus-master:latest,
+    # yet offline mode records argus-master:offline. Presumably --offline makes
+    # the sub-script tag the image as :offline — confirm in src/master/scripts/build_images.sh.
+    if [[ "$build_master_offline" == true ]]; then
+      images_built+=("argus-master:offline")
+    else
+      images_built+=("argus-master:latest")
+    fi
+  else
+    build_failed=true
+  fi
+  popd >/dev/null
+fi
+
 echo "======================================="
 echo "📦 Build Summary"
 echo "======================================="

 if [[ ${#images_built[@]} -gt 0 ]]; then
-    echo "✅ Successfully built images:"
-    for image in "${images_built[@]}"; do
-        echo "   • $image"
-    done
+  echo "✅ Successfully built images:"
+  for image in "${images_built[@]}"; do
+    echo "   • $image"
+  done
 fi

 if [[ "$build_failed" == true ]]; then
-    echo ""
-    echo "❌ Some images failed to build. Please check the errors above."
-    exit 1
+  echo ""
+  echo "❌ Some images failed to build. Please check the errors above."
+  exit 1
 fi

 if [[ "$use_intranet" == true ]]; then
-    echo ""
-    echo "🌐 Built with intranet mirror configuration"
+  echo ""
+  echo "🌐 Built with intranet mirror configuration"
+fi
+
+if [[ "$build_master_offline" == true ]]; then
+  echo ""
+  echo "🧳 Master offline wheels 已解压到 $master_offline_dir"
 fi

 echo ""
 echo "🚀 Next steps:"
-echo "   cd src/log && ./scripts/save_images.sh     # Export log images"
-echo "   cd src/bind && ./scripts/save_images.sh    # Export bind images"
-echo "   cd src/log/tests && ./scripts/02_up.sh     # Start log services"
+echo "   ./build/save_images.sh --compress          # 导出镜像"
+echo "   cd src/master/tests && MASTER_IMAGE_TAG=argus-master:offline ./scripts/00_e2e_test.sh"
 echo ""
--- a/build/save_images.sh
+++ b/build/save_images.sh
@ -67,6 +67,7 @@ declare -A images=(
    ["argus-elasticsearch:latest"]="argus-elasticsearch-latest.tar"
    ["argus-kibana:latest"]="argus-kibana-latest.tar"
    ["argus-bind9:latest"]="argus-bind9-latest.tar"
+    ["argus-master:offline"]="argus-master-offline.tar"
 )

 # 函数：检查镜像是否存在
--- a/configs/.gitignore
+++ b/configs/.gitignore
@ -0,0 +1,2 @@
+# Local overrides for build user/group settings
+build_user.local.conf
--- a/configs/build_user.conf
+++ b/configs/build_user.conf
@ -0,0 +1,6 @@
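+# Default build-time UID/GID for Argus images.
+# To override, create configs/build_user.local.conf in the same format. When that file
+# exists it is read INSTEAD of this one (values are not merged), so set both UID and GID there.
+# Syntax: one KEY=VALUE per line; only UID and GID are recognised. Blank lines, surrounding
+# whitespace and anything after a '#' are ignored.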
+
+UID=2133
+GID=2015
--- a/doc/build-user-config.md
+++ b/doc/build-user-config.md
@ -0,0 +1,38 @@
+# Argus 镜像构建 UID/GID 配置说明
+
+通过统一配置文件可以为 Kibana、Elasticsearch、Bind、Master 等容器指定运行账号，解决跨机器部署时 UID/GID 不一致导致的权限问题。
+
+## 配置入口
+
+- 默认配置存放在 `configs/build_user.conf`，内容示例：
+
+  ```bash
+  UID=2133
+  GID=2015
+  ```
+
+- 如果需要本地覆盖，可在 `configs/` 下新建 `build_user.local.conf`，字段与默认文件一致。该文件已列入 `.gitignore`，不会被意外提交。注意：本地文件存在时会完全取代 `build_user.conf`（不会逐项合并），因此请在其中同时写明 `UID` 与 `GID`。
+- 亦可在执行脚本前通过环境变量 `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID` 强制指定值，优先级最高。
+
+## 作用范围
+
+- `build/build_images.sh` 在构建 log/bind/master 镜像时读取配置，并传递 `--build-arg ARGUS_BUILD_UID/GID`；控制台会输出当前使用的 UID/GID。
+- `src/master/scripts/build_images.sh` 同步使用配置，确保单独构建 master 镜像时行为一致。
+- 各镜像 Dockerfile 会根据传入的 UID/GID 调整容器内账号（如 `elasticsearch`、`kibana`、`bind`、`argus`），并以环境变量形式暴露运行时可见值。
+- Master 启动脚本会在执行 DNS 逻辑后，降权到配置的账号运行 `gunicorn`，确保写入 `/private/argus/**` 的文件具备正确属主。
+- Log 模块测试脚本 `01_bootstrap.sh` 会根据配置修正挂载目录属主，方便端到端测试在任意用户下运行。
+
+## 使用建议
+
+1. 初次克隆仓库后无需修改，默认 UID/GID 保持向后兼容。
+2. 如果在目标环境中使用新的账号（例如 `uid=4001,gid=4001`）：
+   - 编辑 `configs/build_user.local.conf` 填入新值；
+   - 使用新账号登录，并确保其加入宿主机的 `docker` 组；
+   - 重新执行 `build/build_images.sh` 或相关模块的构建脚本。
+3. 切换配置后建议重新运行目标模块的端到端脚本（如 `src/log/tests/scripts/01_bootstrap.sh`、`src/master/tests/scripts/00_e2e_test.sh`、`src/agent/tests/scripts/00_e2e_test.sh`），验证 `/private/argus` 下文件属主是否为期望账号。
+
+## 故障排查
+
+- **镜像构建报错 `groupmod: GID already in use`**：说明所选 GID 已存在于基础镜像，建议换用未占用的值，或在自定义基础镜像中先移除冲突。
+- **容器内运行时报写权限不足**：检查宿主机挂载目录是否已经由目标 UID/GID 创建；必要时重新执行模块的 `01_bootstrap.sh` 之类的准备脚本。
+- **仍看到旧 UID/GID**：确认脚本执行时未继承旧缓存，可运行 `ARGUS_BUILD_UID=... ARGUS_BUILD_GID=... ./build/build_images.sh` 强制覆盖。
--- a/scripts/common/build_user.sh
+++ b/scripts/common/build_user.sh
@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Shared helper to load Argus build user/group configuration.
+# Usage:
+#   source "${PROJECT_ROOT}/scripts/common/build_user.sh"
+#   load_build_user
+#   echo "$ARGUS_BUILD_UID:$ARGUS_BUILD_GID"
+
+ARGUS_BUILD_UID_DEFAULT=2133
+ARGUS_BUILD_GID_DEFAULT=2015
+
+shopt -s extglob
+
+_ARGUS_BUILD_USER_LOADED="${_ARGUS_BUILD_USER_LOADED:-0}"
+
+# Print the absolute directory that contains this helper script.
+_argus_build_user_script_dir() {
+  local here
+  here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  printf '%s\n' "$here"
+}
+
+argus_project_root() {
+  local script_dir
+  script_dir="$(_argus_build_user_script_dir)"
+  (cd "$script_dir/../.." >/dev/null && pwd)
+}
+
+_argus_trim() {
+  local value="$1"
+  value="${value##+([[:space:]])}"
+  value="${value%%+([[:space:]])}"
+  printf '%s' "$value"
+}
+
+# Succeed only when $1 is a non-empty string of decimal digits.
+_argus_is_number() {
+  local digits_only='^[0-9]+$'
+  [[ "$1" =~ $digits_only ]]
+}
+
+load_build_user() {
+  if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then
+    return 0
+  fi
+
+  local project_root config_files config uid gid
+  project_root="$(argus_project_root)"
+  config_files=(
+    "$project_root/configs/build_user.local.conf"
+    "$project_root/configs/build_user.conf"
+  )
+
+  uid="$ARGUS_BUILD_UID_DEFAULT"
+  gid="$ARGUS_BUILD_GID_DEFAULT"
+
+  for config in "${config_files[@]}"; do
+    if [[ -f "$config" ]]; then
+      while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do
+        local line key value
+        line="${raw_line%%#*}"
+        line="$(_argus_trim "${line}")"
+        [[ -z "$line" ]] && continue
+        if [[ "$line" != *=* ]]; then
+          echo "[ARGUS build_user] Ignoring malformed line in $config: $raw_line" >&2
+          continue
+        fi
+        key="${line%%=*}"
+        value="${line#*=}"
+        key="$(_argus_trim "$key")"
+        value="$(_argus_trim "$value")"
+        case "$key" in
+          UID)
+            uid="$value"
+            ;;
+          GID)
+            gid="$value"
+            ;;
+          *)
+            echo "[ARGUS build_user] Unknown key '$key' in $config" >&2
+            ;;
+        esac
+      done < "$config"
+      break
+    fi
+  done
+
+  if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then
+    uid="$ARGUS_BUILD_UID"
+  fi
+  if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then
+    gid="$ARGUS_BUILD_GID"
+  fi
+
+  if ! _argus_is_number "$uid"; then
+    echo "[ARGUS build_user] Invalid UID '$uid'" >&2
+    return 1
+  fi
+  if ! _argus_is_number "$gid"; then
+    echo "[ARGUS build_user] Invalid GID '$gid'" >&2
+    return 1
+  fi
+
+  export ARGUS_BUILD_UID="$uid"
+  export ARGUS_BUILD_GID="$gid"
+  _ARGUS_BUILD_USER_LOADED=1
+}
+
+argus_build_user_args() {
+  load_build_user
+  printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}"
+}
+
+# Print a one-line summary of the resolved build UID/GID.
+print_build_user() {
+  load_build_user
+  printf 'ARGUS build user: UID=%s GID=%s\n' "${ARGUS_BUILD_UID}" "${ARGUS_BUILD_GID}"
+}
--- a/src/.gitignore
+++ b/src/.gitignore
@ -0,0 +1,2 @@
+
+__pycache__/
--- a/src/agent/.gitignore
+++ b/src/agent/.gitignore
@ -0,0 +1,5 @@
+build/
+*.egg-info/
+__pycache__/
+
+.env
--- a/src/agent/README.md
+++ b/src/agent/README.md
@ -0,0 +1,66 @@
+# Argus Agent 模块
+
+Argus Agent 是一个轻量级 Python 进程，负责向 Argus Master 注册节点、汇报健康数据，并维护本地持久化信息。模块现以 PyInstaller 打包为独立可执行文件，便于在普通容器或虚机中直接运行。
+
+## 构建可执行文件
+
+```bash
+cd src/agent
+./scripts/build_binary.sh  # 生成 dist/argus-agent
+```
+
+脚本默认会在 Docker 容器 (`python:3.11-slim-bullseye`) 内执行 PyInstaller，确保产物运行时兼容 glibc 2.31+（覆盖 2.35 环境）。构建流程注意事项：
+
+- 每次构建前会清理 `build/`、`dist/` 并在容器内重新创建虚拟环境。
+- 需要使用内网 Python 镜像时，可通过 `PIP_INDEX_URL`、`PIP_EXTRA_INDEX_URL`、`PIP_TRUSTED_HOST` 等环境变量传入，脚本会自动透传给容器。
+- 如果宿主机无法运行 Docker，可设置 `AGENT_BUILD_USE_DOCKER=0` 回退到本地构建；此时代码必须在 glibc ≤ 2.35 的机器上执行。
+
+构建结束后脚本会在 `build/compat_check/` 下解包关键动态库并输出最高 `GLIBC_x.y` 版本，便于快速核对兼容性。如果结果中缺少 `libssl.so.3` / `libcrypto.so.3`，表示系统会在目标宿主机上使用本地 OpenSSL 库，无需额外处理。
+
+例如：
+
+```bash
+strings build/compat_check/libpython*.so.1.0 | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n1
+```
+
+如遇构建失败，常见原因是 Docker 不可用（请改用 `AGENT_BUILD_USE_DOCKER=0`）或无法访问 Python 包镜像（先设置上述镜像环境变量后重试）。
+
+## 运行时配置
+
+Agent 不再依赖配置文件；所有参数均由环境变量与主机名推导：
+
+| 变量 | 必填 | 默认值 | 说明 |
+| --- | --- | --- | --- |
+| `MASTER_ENDPOINT` | 是 | N/A | Master 基础地址，可写 `http://host:3000` 或 `host:3000`（自动补全 `http://`）。 |
+| `REPORT_INTERVAL_SECONDS` | 否 | `60` | 状态上报间隔（秒）。必须为正整数。 |
+| `AGENT_HOSTNAME` | 否 | `$(hostname)` | 覆盖容器内主机名，便于测试或特殊命名需求。 |
+
+派生路径：
+
+- 节点信息：`/private/argus/agent/<hostname>/node.json`
+- 子模块健康目录：`/private/argus/agent/<hostname>/health/`
+
+健康目录中的文件需遵循 `<模块前缀>-*.json` 命名（例如 `log-fluentbit.json`、`metric-node-exporter.json`），文件内容会原样并入上报的 `health` 字段。
+
+## 日志与持久化
+
+- Agent 会在成功注册、状态上报、异常重试等关键节点输出结构化日志，便于聚合分析。
+- `node.json` 保存 Master 返回的最新节点对象，用于重启后继续使用既有节点 ID。
+
+## 端到端测试
+
+仓库内提供 Docker Compose 测试栈（master + ubuntu 容器）：
+
+```bash
+cd src/agent/tests
+./scripts/00_e2e_test.sh
+```
+
+测试脚本会：
+
+1. 构建 master 镜像与 agent 可执行文件。
+2. 以 `ubuntu:24.04` 启动 agent 容器，并通过环境变量注入 `MASTER_ENDPOINT`、`REPORT_INTERVAL_SECONDS`。
+3. 验证注册、健康上报、nodes.json 生成、统计接口，以及“容器重启 + IP 变化”重注册流程。
+4. 清理 `tests/private/` 与临时容器网络。
+
+如需在真实环境部署，只需将 `dist/argus-agent` 连同健康目录挂载到目标主机，并按上表设置环境变量即可。
--- a/src/agent/app/__init__.py
+++ b/src/agent/app/__init__.py
--- a/src/agent/app/client.py
+++ b/src/agent/app/client.py
@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, Optional
+
+import requests
+
+from .log import get_logger
+
+LOGGER = get_logger("argus.agent.client")
+
+
+class MasterAPIError(Exception):
+    def __init__(self, message: str, status_code: int, payload: Optional[Dict[str, Any]] = None) -> None:
+        super().__init__(message)
+        self.status_code = status_code
+        self.payload = payload or {}
+
+
+class AgentClient:
+    def __init__(self, base_url: str, *, timeout: int = 10) -> None:
+        self._base_url = base_url.rstrip("/")
+        self._timeout = timeout
+        self._session = requests.Session()
+
+    def register_node(self, body: Dict[str, Any]) -> Dict[str, Any]:
+        """调用 master 注册接口，返回节点对象。"""
+        url = f"{self._base_url}/api/v1/master/nodes"
+        response = self._session.post(url, json=body, timeout=self._timeout)
+        return self._parse_response(response, "Failed to register node")
+
+    def update_status(self, node_id: str, body: Dict[str, Any]) -> Dict[str, Any]:
+        """上报健康信息，由 master 更新 last_report。"""
+        url = f"{self._base_url}/api/v1/master/nodes/{node_id}/status"
+        response = self._session.put(url, json=body, timeout=self._timeout)
+        return self._parse_response(response, "Failed to update node status")
+
+    def _parse_response(self, response: requests.Response, error_prefix: str) -> Dict[str, Any]:
+        content_type = response.headers.get("Content-Type", "")
+        payload: Dict[str, Any] | None = None
+        if "application/json" in content_type:
+            try:
+                payload = response.json()
+            except json.JSONDecodeError:
+                LOGGER.warning("Response contained invalid JSON", extra={"status": response.status_code})
+
+        if response.status_code >= 400:
+            message = payload.get("error") if isinstance(payload, dict) else response.text
+            raise MasterAPIError(
+                f"{error_prefix}: {message}",
+                status_code=response.status_code,
+                payload=payload if isinstance(payload, dict) else None,
+            )
+
+        if payload is None:
+            try:
+                payload = response.json()
+            except json.JSONDecodeError as exc:
+                raise MasterAPIError("Master returned non-JSON payload", response.status_code) from exc
+        return payload
--- a/src/agent/app/collector.py
+++ b/src/agent/app/collector.py
@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import os
+import re
+import socket
+import subprocess
+from pathlib import Path
+from typing import Any, Dict
+
+from .config import AgentConfig
+from .log import get_logger
+
+LOGGER = get_logger("argus.agent.collector")
+
+_HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
+
+
+def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
+    """汇总节点注册需要的静态信息。"""
+    hostname = config.hostname
+    env, user, instance = _parse_hostname(hostname)
+    meta = {
+        "hostname": hostname,
+        "ip": _detect_ip_address(),
+        "env": env,
+        "user": user,
+        "instance": instance,
+        "cpu_number": _detect_cpu_count(),
+        "memory_in_bytes": _detect_memory_bytes(),
+        "gpu_number": _detect_gpu_count(),
+    }
+    return meta
+
+
+def _parse_hostname(hostname: str) -> tuple[str, str, str]:
+    """按照约定的 env-user-instance 前缀拆解主机名。"""
+    match = _HOSTNAME_PATTERN.match(hostname)
+    if not match:
+        LOGGER.warning("Hostname does not match expected pattern", extra={"hostname": hostname})
+        return "", "", ""
+    return match.group(1), match.group(2), match.group(3)
+
+
+def _detect_cpu_count() -> int:
+    count = os.cpu_count()
+    return count if count is not None else 0
+
+
+def _detect_memory_bytes() -> int:
+    """优先读取 cgroup 限额，失败时退回 /proc/meminfo。"""
+    cgroup_path = Path("/sys/fs/cgroup/memory.max")
+    try:
+        raw = cgroup_path.read_text(encoding="utf-8").strip()
+        if raw and raw != "max":
+            return int(raw)
+    except FileNotFoundError:
+        LOGGER.debug("cgroup memory.max not found, falling back to /proc/meminfo")
+    except ValueError:
+        LOGGER.warning("Failed to parse memory.max, falling back", extra={"value": raw})
+
+    try:
+        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
+            for line in handle:
+                if line.startswith("MemTotal:"):
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        return int(parts[1]) * 1024
+    except FileNotFoundError:
+        LOGGER.error("/proc/meminfo not found; defaulting memory to 0")
+    return 0
+
+
+def _detect_gpu_count() -> int:
+    """采集 GPU 数量，如无法探测则默认为 0。"""
+    try:
+        proc = subprocess.run(
+            ["nvidia-smi", "-L"],
+            check=False,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            timeout=5,
+        )
+    except FileNotFoundError:
+        LOGGER.debug("nvidia-smi not available; assuming 0 GPUs")
+        return 0
+    except subprocess.SubprocessError as exc:
+        LOGGER.warning("nvidia-smi invocation failed", extra={"error": str(exc)})
+        return 0
+
+    if proc.returncode != 0:
+        LOGGER.debug("nvidia-smi returned non-zero", extra={"stderr": proc.stderr.strip()})
+        return 0
+
+    count = sum(1 for line in proc.stdout.splitlines() if line.strip())
+    return count
+
+
+def _detect_ip_address() -> str:
+    """尝试通过 UDP socket 获得容器出口 IP，失败则回退解析主机名。"""
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
+            sock.connect(("8.8.8.8", 80))
+            return sock.getsockname()[0]
+    except OSError:
+        LOGGER.debug("UDP socket trick failed; falling back to hostname lookup")
+    try:
+        return socket.gethostbyname(socket.gethostname())
+    except OSError:
+        LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
+        return "127.0.0.1"
--- a/src/agent/app/config.py
+++ b/src/agent/app/config.py
@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import os
+import socket
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Final
+
+from .version import VERSION
+
+DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
+
+
+@dataclass(frozen=True)
+class AgentConfig:
+    """Immutable runtime settings for the agent, built by ``load_config``."""
+
+    # Hostname: AGENT_HOSTNAME when set, otherwise socket.gethostname().
+    hostname: str
+    # Path of the node.json file under /private/argus/agent/<hostname>/.
+    node_file: str
+    # Agent version, taken from the VERSION constant of the version module.
+    version: str
+    # Normalised MASTER_ENDPOINT (scheme added if missing, trailing slashes removed).
+    master_endpoint: str
+    # Positive number of seconds taken from REPORT_INTERVAL_SECONDS (default 60).
+    report_interval_seconds: int
+    # Health directory under /private/argus/agent/<hostname>/.
+    health_dir: str
+    # Request timeout; load_config leaves it at this default.
+    request_timeout_seconds: int = 10
+
+
+def _normalise_master_endpoint(value: str) -> str:
+    value = value.strip()
+    if not value:
+        raise ValueError("MASTER_ENDPOINT environment variable is required")
+    if not value.startswith("http://") and not value.startswith("https://"):
+        value = f"http://{value}"
+    return value.rstrip("/")
+
+
+def _read_report_interval(raw_value: str | None) -> int:
+    if raw_value is None or raw_value.strip() == "":
+        return DEFAULT_REPORT_INTERVAL_SECONDS
+    try:
+        interval = int(raw_value)
+    except ValueError as exc:
+        raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
+    if interval <= 0:
+        raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
+    return interval
+
+
+def _resolve_hostname() -> str:
+    return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
+
+
+def load_config() -> AgentConfig:
+    """从环境变量推导配置，移除了外部配置文件依赖。"""
+
+    hostname = _resolve_hostname()
+    node_file = f"/private/argus/agent/{hostname}/node.json"
+    health_dir = f"/private/argus/agent/{hostname}/health/"
+
+    master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
+    if master_endpoint_env is None:
+        raise ValueError("MASTER_ENDPOINT environment variable is not set")
+    master_endpoint = _normalise_master_endpoint(master_endpoint_env)
+
+    report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))
+
+    Path(node_file).parent.mkdir(parents=True, exist_ok=True)
+    Path(health_dir).mkdir(parents=True, exist_ok=True)
+
+    return AgentConfig(
+        hostname=hostname,
+        node_file=node_file,
+        version=VERSION,
+        master_endpoint=master_endpoint,
+        report_interval_seconds=report_interval_seconds,
+        health_dir=health_dir,
+    )
--- a/src/agent/app/health_reader.py
+++ b/src/agent/app/health_reader.py
@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict
+
+from .log import get_logger
+
+LOGGER = get_logger("argus.agent.health")
+
+
+def read_health_directory(path: str) -> Dict[str, Any]:
+    """读取目录中所有 <prefix>-*.json 文件并返回 JSON 映射。"""
+    result: Dict[str, Any] = {}
+    directory = Path(path)
+    if not directory.exists():
+        LOGGER.debug("Health directory does not exist", extra={"path": str(directory)})
+        return result
+
+    for health_file in sorted(directory.glob("*.json")):
+        if "-" not in health_file.stem:
+            LOGGER.debug("Skipping non-prefixed health file", extra={"file": health_file.name})
+            continue
+        try:
+            with health_file.open("r", encoding="utf-8") as handle:
+                content = json.load(handle)
+            result[health_file.stem] = content
+        except json.JSONDecodeError as exc:
+            LOGGER.warning("Failed to parse health file", extra={"file": health_file.name, "error": str(exc)})
+        except OSError as exc:
+            LOGGER.warning("Failed to read health file", extra={"file": health_file.name, "error": str(exc)})
+    return result
--- a/src/agent/app/log.py
+++ b/src/agent/app/log.py
@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import logging
+import os
+
+
+_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s - %(message)s"
+
+
+def setup_logging() -> None:
+    level_name = os.environ.get("AGENT_LOG_LEVEL", "INFO").upper()
+    level = getattr(logging, level_name, logging.INFO)
+    logging.basicConfig(level=level, format=_LOG_FORMAT)
+
+
+def get_logger(name: str) -> logging.Logger:
+    setup_logging()
+    return logging.getLogger(name)
--- a/src/agent/app/main.py
+++ b/src/agent/app/main.py
@ -0,0 +1,163 @@
+from __future__ import annotations
+
+import signal
+import time
+from datetime import datetime, timezone
+from typing import Optional
+
+from .client import AgentClient, MasterAPIError
+from .collector import collect_metadata
+from .config import AgentConfig, load_config
+from .health_reader import read_health_directory
+from .log import get_logger, setup_logging
+from .state import clear_node_state, load_node_state, save_node_state
+
+LOGGER = get_logger("argus.agent")
+
+
+def _current_timestamp() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+class StopSignal:
+    def __init__(self) -> None:
+        self._stop = False
+
+    def set(self, *_args) -> None:  # type: ignore[override]
+        self._stop = True
+
+    def is_set(self) -> bool:
+        return self._stop
+
+
+def main(argv: Optional[list[str]] = None) -> int:  # noqa: ARG001 - 保留签名以兼容入口调用
+    setup_logging()
+
+    stop_signal = StopSignal()
+    signal.signal(signal.SIGTERM, stop_signal.set)
+    signal.signal(signal.SIGINT, stop_signal.set)
+
+    try:
+        config = load_config()
+    except Exception as exc:
+        LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
+        return 1
+
+    LOGGER.info(
+        "Agent starting",
+        extra={
+            "hostname": config.hostname,
+            "master_endpoint": config.master_endpoint,
+            "node_file": config.node_file,
+        },
+    )
+
+    client = AgentClient(config.master_endpoint, timeout=config.request_timeout_seconds)
+
+    node_state = load_node_state(config.node_file) or {}
+    node_id = node_state.get("id")
+
+    # 与 master 建立注册关系（支持重注册），失败则重试
+    register_response = _register_with_retry(client, config, node_id, stop_signal)
+    if register_response is None:
+        LOGGER.info("Registration aborted due to shutdown signal")
+        return 0
+
+    node_id = register_response.get("id")
+    if not node_id:
+        LOGGER.error("Master did not return node id; aborting")
+        return 1
+    save_node_state(config.node_file, register_response)
+
+    LOGGER.info("Entering status report loop", extra={"node_id": node_id})
+    _status_loop(client, config, node_id, stop_signal)
+    return 0
+
+
+def _register_with_retry(
+    client: AgentClient,
+    config: AgentConfig,
+    node_id: Optional[str],
+    stop_signal: StopSignal,
+):
+    backoff = 5
+    while not stop_signal.is_set():
+        payload = {
+            "name": config.hostname,
+            "type": "agent",
+            "meta_data": collect_metadata(config),
+            "version": config.version,
+        }
+        if node_id:
+            payload["id"] = node_id
+
+        try:
+            response = client.register_node(payload)
+            LOGGER.info("Registration successful", extra={"node_id": response.get("id")})
+            save_node_state(config.node_file, response)
+            return response
+        except MasterAPIError as exc:
+            if exc.status_code == 404 and node_id:
+                LOGGER.warning(
+                    "Master does not recognise node id; clearing local node state",
+                    extra={"node_id": node_id},
+                )
+                clear_node_state(config.node_file)
+                node_id = None
+            elif exc.status_code == 500 and node_id:
+                # id 与 name 不匹配通常意味着配置异常，记录但继续重试
+                LOGGER.error(
+                    "Master rejected node due to id/name mismatch; will retry",
+                    extra={"node_id": node_id},
+                )
+            else:
+                LOGGER.error("Registration failed", extra={"status_code": exc.status_code, "error": str(exc)})
+            time.sleep(min(backoff, 60))
+            backoff = min(backoff * 2, 60)
+        except Exception as exc:  # pragma: no cover - defensive
+            LOGGER.exception("Unexpected error during registration", extra={"error": str(exc)})
+            time.sleep(min(backoff, 60))
+            backoff = min(backoff * 2, 60)
+    return None
+
+
+def _status_loop(
+    client: AgentClient,
+    config: AgentConfig,
+    node_id: str,
+    stop_signal: StopSignal,
+) -> None:
+    interval = config.report_interval_seconds
+    while not stop_signal.is_set():
+        timestamp = _current_timestamp()
+        health_payload = read_health_directory(config.health_dir)
+        body = {
+            "timestamp": timestamp,
+            "health": health_payload,
+        }
+        try:
+            response = client.update_status(node_id, body)
+            LOGGER.info(
+                "Status report succeeded",
+                extra={"node_id": node_id, "health_keys": list(health_payload.keys())},
+            )
+            save_node_state(config.node_file, response)
+        except MasterAPIError as exc:
+            # 保持循环继续执行，等待下一次重试
+            LOGGER.error(
+                "Failed to report status",
+                extra={"status_code": exc.status_code, "error": str(exc)},
+            )
+        except Exception as exc:  # pragma: no cover - defensive
+            LOGGER.exception("Unexpected error during status report", extra={"error": str(exc)})
+
+        for _ in range(interval):
+            if stop_signal.is_set():
+                break
+            time.sleep(1)
+
+    LOGGER.info("Stop signal received; exiting status loop")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/src/agent/app/state.py
+++ b/src/agent/app/state.py
@ -0,0 +1,44 @@
+from __future__ import annotations
+
+import json
+import os
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from .log import get_logger
+
+LOGGER = get_logger("argus.agent.state")
+
+
+def load_node_state(path: str) -> Optional[Dict[str, Any]]:
+    """读取本地 node.json，容器重启后沿用之前的 ID。"""
+    try:
+        with open(path, "r", encoding="utf-8") as handle:
+            return json.load(handle)
+    except FileNotFoundError:
+        return None
+    except json.JSONDecodeError as exc:
+        LOGGER.warning("node.json is invalid JSON; ignoring", extra={"error": str(exc)})
+        return None
+
+
+def save_node_state(path: str, data: Dict[str, Any]) -> None:
+    """原子化写入 node.json，避免并发读取坏数据。"""
+    directory = Path(path).parent
+    directory.mkdir(parents=True, exist_ok=True)
+    with tempfile.NamedTemporaryFile("w", dir=directory, delete=False, encoding="utf-8") as tmp:
+        json.dump(data, tmp, separators=(",", ":"))
+        tmp.flush()
+        os.fsync(tmp.fileno())
+        temp_path = tmp.name
+    os.replace(temp_path, path)
+
+
+def clear_node_state(path: str) -> None:
+    try:
+        os.remove(path)
+    except FileNotFoundError:
+        return
+    except OSError as exc:
+        LOGGER.warning("Failed to remove node state file", extra={"error": str(exc), "path": path})
--- a/src/agent/app/version.py
+++ b/src/agent/app/version.py
@ -0,0 +1,69 @@
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+import importlib.metadata
+
+try:
+    import tomllib
+except ModuleNotFoundError:  # pragma: no cover
+    import tomli as tomllib  # type: ignore[no-redef]
+
+
+def _candidate_paths() -> list[Path]:
+    paths = []
+    bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None)
+    if bundle_dir:
+        paths.append(Path(bundle_dir) / "pyproject.toml")
+    paths.append(Path(__file__).resolve().parent.parent / "pyproject.toml")
+    paths.append(Path(__file__).resolve().parent / "pyproject.toml")
+    paths.append(Path.cwd() / "pyproject.toml")
+    return paths
+
+
+def _read_from_pyproject() -> Optional[str]:
+    for path in _candidate_paths():
+        if not path.exists():
+            continue
+        try:
+            with path.open("rb") as handle:
+                data = tomllib.load(handle)
+        except (OSError, tomllib.TOMLDecodeError):
+            continue
+        project = data.get("project")
+        if isinstance(project, dict):
+            version = project.get("version")
+            if isinstance(version, str):
+                return version
+        tool = data.get("tool")
+        if isinstance(tool, dict):
+            argus_cfg = tool.get("argus")
+            if isinstance(argus_cfg, dict):
+                version = argus_cfg.get("version")
+                if isinstance(version, str):
+                    return version
+    return None
+
+
+def _detect_version() -> str:
+    try:
+        return importlib.metadata.version("argus-agent")
+    except importlib.metadata.PackageNotFoundError:
+        pass
+    override = os.environ.get("AGENT_VERSION_OVERRIDE")
+    if override:
+        return override
+    fallback = _read_from_pyproject()
+    if fallback:
+        return fallback
+    return "0.0.0"
+
+
+# Resolved once at import time; later environment changes are not picked up.
+VERSION: str = _detect_version()
+
+
+def get_version() -> str:
+    """Return the version string computed when this module was imported."""
+    return VERSION
--- a/src/agent/dist/argus-agent
+++ b/src/agent/dist/argus-agent
--- a/src/agent/entry.py
+++ b/src/agent/entry.py
@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+# Thin launcher script: delegates to app.main.main and exits with its result.
+from __future__ import annotations
+
+import sys
+
+from app.main import main as agent_main
+
+
+if __name__ == "__main__":
+    sys.exit(agent_main())
--- a/src/agent/pyproject.toml
+++ b/src/agent/pyproject.toml
@ -0,0 +1,19 @@
+[project]
+name = "argus-agent"
+version = "1.1.0"
+description = "Argus agent binary"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "requests==2.31.0"
+]
+
+[build-system]
+requires = ["setuptools>=69", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.argus]
+entry = "app.main:main"
+
+[tool.setuptools]
+packages = ["app"]
--- a/src/agent/scripts/agent_deployment_verify.sh
+++ b/src/agent/scripts/agent_deployment_verify.sh
@ -0,0 +1,690 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+LOG_PREFIX="[AGENT-VERIFY]"
+MASTER_ENDPOINT_DEFAULT=""
+AGENT_DATA_ROOT_DEFAULT="/private/argus/agent"
+AGENT_ETC_ROOT_DEFAULT="/private/argus/etc"
+REPORT_INTERVAL_DEFAULT="2"
+
+ALLOW_CONFIG_TOUCH="false"
+KEEP_TEST_HEALTH="false"
+
+# Logging helpers: INFO goes to stdout, WARN and ERROR go to stderr.
+log_info() {
+  printf '%s INFO  %s\n' "$LOG_PREFIX" "$*"
+}
+
+log_warn() {
+  printf '%s WARN  %s\n' "$LOG_PREFIX" "$*" >&2
+}
+
+log_error() {
+  printf '%s ERROR %s\n' "$LOG_PREFIX" "$*" >&2
+}
+
+# Print the help text to stdout (quoted heredoc: nothing inside is expanded).
+usage() {
+  cat <<'USAGE'
+Usage: agent_deployment_verify.sh [options]
+
+Options:
+  --allow-config-touch   Enable optional config PUT dry-run check.
+  --keep-test-health     Keep the temporary verify health file after checks.
+  -h, --help             Show this help message.
+
+Environment variables:
+  MASTER_ENDPOINT        (required) Master API base endpoint, e.g. http://master:3000
+  AGENT_DATA_ROOT        (default: /private/argus/agent)
+  AGENT_ETC_ROOT         (default: /private/argus/etc)
+  VERIFY_HOSTNAME        (default: output of hostname)
+  REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds
+USAGE
+}
+
+# Parse command-line flags; any unknown option aborts with exit code 2.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --allow-config-touch)
+      ALLOW_CONFIG_TOUCH="true"
+      shift
+      ;;
+    --keep-test-health)
+      KEEP_TEST_HEALTH="true"
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      log_error "Unknown option: $1"
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+# Resolve settings from the environment, falling back to the defaults above.
+MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}"
+AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}"
+AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}"
+VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}"
+REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}"
+
+# The master endpoint has an empty default, so it must come from the caller.
+if [[ -z "$MASTER_ENDPOINT" ]]; then
+  log_error "MASTER_ENDPOINT is required"
+  exit 2
+fi
+
+# Only positive integers are accepted; anything else reverts to the default.
+if ! [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] || [[ "$REPORT_INTERVAL_SECONDS" -le 0 ]]; then
+  log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
+  REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
+fi
+
+normalize_endpoint() {
+  local endpoint="$1"
+  if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then
+    endpoint="http://$endpoint"
+  fi
+  endpoint="${endpoint%/}"
+  echo "$endpoint"
+}
+
+# Base URL used for every master API request below.
+MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"
+
+# Per-node locations derived from the data/etc roots and the hostname.
+NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME"
+NODE_JSON="$NODE_DIR/node.json"
+HEALTH_DIR="$NODE_DIR/health"
+DNS_CONF="$AGENT_ETC_ROOT/dns.conf"
+UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh"
+
+# Result messages collected by add_result and printed in the final summary.
+declare -a RESULTS_PASS=()
+declare -a RESULTS_WARN=()
+declare -a RESULTS_FAIL=()
+
+add_result() {
+  local level="$1" message="$2"
+  case "$level" in
+    PASS)
+      RESULTS_PASS+=("$message")
+      log_info "$message"
+      ;;
+    WARN)
+      RESULTS_WARN+=("$message")
+      log_warn "$message"
+      ;;
+    FAIL)
+      RESULTS_FAIL+=("$message")
+      log_error "$message"
+      ;;
+  esac
+}
+
+# jq is preferred for JSON handling; python3 is the fallback when it is absent.
+HAS_JQ="0"
+if command -v jq >/dev/null 2>&1; then
+  HAS_JQ="1"
+fi
+
+# curl is mandatory: every master check goes through it.
+if ! command -v curl >/dev/null 2>&1; then
+  log_error "curl command not found; please install curl (e.g. apt-get install -y curl)"
+  exit 2
+fi
+
+if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then
+  log_error "Neither jq nor python3 is available for JSON processing"
+  exit 2
+fi
+
+# Shared curl flags: fail on HTTP errors, quiet output, 10 second cap.
+CURL_OPTS=(--fail --show-error --silent --max-time 10)
+
+curl_json() {
+  local url="$1"
+  if ! curl "${CURL_OPTS[@]}" "$url"; then
+    return 1
+  fi
+}
+
+json_query() {
+  local json="$1" jq_expr="$2" py_expr="$3"
+  if [[ "$HAS_JQ" == "1" ]]; then
+    if ! output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null); then
+      return 1
+    fi
+    printf '%s' "$output"
+    return 0
+  fi
+
+  python3 - "$py_expr" <<'PY'
+import json
+import sys
+
+expr = sys.argv[1]
+try:
+    data = json.load(sys.stdin)
+    value = eval(expr, {}, {"data": data})
+except Exception:
+    sys.exit(1)
+if value is None:
+    sys.exit(1)
+if isinstance(value, (dict, list)):
+    print(json.dumps(value))
+else:
+    print(value)
+PY
+}
+
+json_length() {
+  local json="$1" jq_expr="$2" py_expr="$3"
+  if [[ "$HAS_JQ" == "1" ]]; then
+    if ! output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null); then
+      return 1
+    fi
+    printf '%s' "$output"
+    return 0
+  fi
+
+  python3 - "$py_expr" <<'PY'
+import json
+import sys
+
+expr = sys.argv[1]
+try:
+    data = json.load(sys.stdin)
+    value = eval(expr, {}, {"data": data})
+except Exception:
+    sys.exit(1)
+try:
+    print(len(value))
+except Exception:
+    sys.exit(1)
+PY
+}
+
+json_has_key() {
+  local json="$1" jq_expr="$2" py_expr="$3"
+  if [[ "$HAS_JQ" == "1" ]]; then
+    if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then
+      return 0
+    fi
+    return 1
+  fi
+
+  python3 - "$py_expr" <<'PY'
+import json
+import sys
+
+expr = sys.argv[1]
+try:
+    data = json.load(sys.stdin)
+    value = eval(expr, {}, {"data": data})
+except Exception:
+    sys.exit(1)
+if value:
+    sys.exit(0)
+sys.exit(1)
+PY
+}
+
+# Convert an ISO-8601 timestamp to epoch seconds.
+# Arguments: $1 - timestamp string
+# Outputs:   epoch seconds on stdout
+# Returns:   0 on success, 1 when the value is empty or cannot be parsed
+iso_to_epoch() {
+  local value="$1"
+  # GNU `date -d ""` succeeds (it means "today 00:00"), so an empty timestamp
+  # would otherwise be reported as valid. Reject it up front.
+  if [[ -z "$value" ]]; then
+    return 1
+  fi
+  if command -v date >/dev/null 2>&1; then
+    date -d "$value" +%s 2>/dev/null && return 0
+  fi
+  # Fallback for `date` implementations that cannot parse the value.
+  if command -v python3 >/dev/null 2>&1; then
+    python3 - "$value" <<'PY'
+import sys
+from datetime import datetime
+
+value = sys.argv[1]
+if value is None or value == "":
+    sys.exit(1)
+if value.endswith('Z'):
+    value = value[:-1] + '+00:00'
+try:
+    dt = datetime.fromisoformat(value)
+except ValueError:
+    sys.exit(1)
+print(int(dt.timestamp()))
+PY
+    return $?
+  fi
+  return 1
+}
+
+validate_json_file() {
+  local path="$1"
+  if [[ "$HAS_JQ" == "1" ]]; then
+    jq empty "$path" >/dev/null 2>&1 && return 0
+    return 1
+  fi
+  if command -v python3 >/dev/null 2>&1; then
+    python3 - "$path" <<'PY'
+import json
+import sys
+path = sys.argv[1]
+with open(path, 'r', encoding='utf-8') as handle:
+    json.load(handle)
+PY
+    return $?
+  fi
+  return 0
+}
+
+ensure_directory() {
+  local dir="$1"
+  if [[ ! -d "$dir" ]]; then
+    log_warn "Creating missing directory $dir"
+    mkdir -p "$dir"
+  fi
+}
+
+# State for the temporary health file used by the health simulation.
+TEST_HEALTH_FILE=""
+TEST_HEALTH_BACKUP=""
+TEST_HEALTH_EXISTED="false"
+
+# EXIT handler: restore a pre-existing health file from its backup, otherwise
+# delete the test file unless --keep-test-health was requested.
+cleanup() {
+  [[ -n "$TEST_HEALTH_FILE" ]] || return 0
+  if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
+    printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
+    return
+  fi
+  if [[ "$KEEP_TEST_HEALTH" != "true" ]]; then
+    rm -f "$TEST_HEALTH_FILE"
+  fi
+}
+
+trap cleanup EXIT
+
+log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"
+
+# 4.2 Master health checks
+# curl's -w appends one final line "<http_code> <time_total>" after the body;
+# tail/head below split that line from the payload.
+# NOTE(review): stderr is captured in fixed /tmp file names, so concurrent
+# runs share them; `head -n -1` relies on GNU head's negative line count.
+health_resp=""
+if ! health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>/tmp/agent_verify_healthz.err); then
+  error_detail=$(cat /tmp/agent_verify_healthz.err || true)
+  add_result FAIL "GET /healthz failed: $error_detail"
+else
+  http_meta=$(tail -n1 <<<"$health_resp")
+  payload=$(head -n -1 <<<"$health_resp" || true)
+  # First word of the meta line is the status code, last word the elapsed time.
+  status_code=${http_meta%% *}
+  elapsed=${http_meta##* }
+  add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload"
+fi
+rm -f /tmp/agent_verify_healthz.err
+
+# Same probe against /readyz; the payload is kept but not logged.
+if ! readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>/tmp/agent_verify_readyz.err); then
+  error_detail=$(cat /tmp/agent_verify_readyz.err || true)
+  add_result FAIL "GET /readyz failed: $error_detail"
+  readyz_payload=""
+else
+  readyz_meta=$(tail -n1 <<<"$readyz_resp")
+  readyz_payload=$(head -n -1 <<<"$readyz_resp" || true)
+  readyz_status=${readyz_meta%% *}
+  readyz_elapsed=${readyz_meta##* }
+  add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s"
+fi
+rm -f /tmp/agent_verify_readyz.err
+
+# 4.3 Nodes list and detail
+if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>/tmp/agent_verify_nodes.err); then
+  error_detail=$(cat /tmp/agent_verify_nodes.err || true)
+  add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
+  nodes_json=""
+fi
+rm -f /tmp/agent_verify_nodes.err
+
+NODE_ENTRY=""
+NODE_ID=""
+NODE_IP=""
+if [[ -n "$nodes_json" ]]; then
+  if [[ "$HAS_JQ" == "1" ]]; then
+    NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY=""
+  else
+    NODE_ENTRY=$(python3 - "$VERIFY_HOSTNAME" <<'PY'
+import json
+import sys
+
+hostname = sys.argv[1]
+nodes = json.load(sys.stdin)
+for node in nodes:
+    if node.get("name") == hostname:
+        import json as _json
+        print(_json.dumps(node))
+        sys.exit(0)
+sys.exit(1)
+PY
+    ) || NODE_ENTRY=""
+  fi
+
+  if [[ -z "$NODE_ENTRY" ]]; then
+    add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
+  else
+    if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
+      add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
+    else
+      add_result FAIL "Failed to extract node id from master response"
+    fi
+  fi
+
+  if [[ -n "$NODE_ENTRY" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_node_detail.err); then
+    NODE_DETAIL_JSON="$NODE_DETAIL"
+    add_result PASS "Fetched node detail for $NODE_ID"
+    if NODE_IP=$(json_query "$NODE_DETAIL_JSON" '.meta_data.ip // .meta_data.host_ip // empty' 'data.get("meta_data", {}).get("ip") or data.get("meta_data", {}).get("host_ip") or ""'); then
+      if [[ -n "$NODE_IP" ]]; then
+        add_result PASS "Registered node IP=$NODE_IP"
+      else
+        add_result INFO "Node detail does not expose IP fields"
+      fi
+    fi
+  else
+    error_detail=$(cat /tmp/agent_verify_node_detail.err 2>/dev/null || true)
+    add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
+    NODE_DETAIL_JSON=""
+  fi
+  rm -f /tmp/agent_verify_node_detail.err
+
+  if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>/tmp/agent_verify_stats.err); then
+    if total_nodes=$(json_query "$stats_json" '.total // .total_nodes' 'data.get("total") or data.get("total_nodes")'); then
+      if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
+        add_result PASS "Statistics total=$total_nodes"
+      else
+        add_result WARN "Statistics total field not numeric: $total_nodes"
+      fi
+    else
+      add_result WARN "Unable to read total field from statistics"
+    fi
+
+    active_nodes=""
+    if [[ "$HAS_JQ" == "1" ]]; then
+      active_nodes=$(printf '%s' "$stats_json" | jq -e 'if .status_statistics then (.status_statistics[] | select(.status == "online") | .count) else empty end' 2>/dev/null | head -n1 || true)
+    elif command -v python3 >/dev/null 2>&1; then
+      active_nodes=$(printf '%s' "$stats_json" | python3 -c 'import json,sys; data=json.load(sys.stdin); print(next((row.get("count") for row in data.get("status_statistics", []) if row.get("status")=="online"), ""))' 2>/dev/null)
+    fi
+    if [[ -n "$active_nodes" ]]; then
+      add_result PASS "Online nodes reported by master: $active_nodes"
+    fi
+
+    if [[ "$HAS_JQ" == "1" ]]; then
+      node_count=$(printf '%s' "$nodes_json" | jq 'length')
+    else
+      node_count=$(json_length "$nodes_json" 'length' 'len(data)')
+    fi
+    if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -lt "$node_count" ]]; then
+      add_result WARN "Statistics total=$total_nodes less than nodes list count=$node_count"
+    fi
+  else
+    error_detail=$(cat /tmp/agent_verify_stats.err 2>/dev/null || true)
+    add_result FAIL "Failed to fetch node statistics: $error_detail"
+  fi
+  rm -f /tmp/agent_verify_stats.err
+else
+  NODE_DETAIL_JSON=""
+fi
+
+# 4.4 Agent persistence checks
+# Compare the local node.json against what the master reported: id, name,
+# and parseable register_time / last_updated timestamps.
+if [[ -f "$NODE_JSON" ]]; then
+  node_file_content="$(cat "$NODE_JSON")"
+  if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
+    # An empty NODE_ID (node not found on the master) also lands in the FAIL arm.
+    if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then
+      add_result PASS "node.json id matches master ($NODE_ID)"
+    else
+      add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
+    fi
+  else
+    add_result FAIL "Unable to extract id from node.json"
+  fi
+  if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
+    if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
+      add_result PASS "node.json name matches $VERIFY_HOSTNAME"
+    else
+      add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
+    fi
+  else
+    add_result FAIL "Unable to extract name from node.json"
+  fi
+
+  # Timestamp problems are reported as warnings, not failures.
+  if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then
+    if iso_to_epoch "$register_time" >/dev/null 2>&1; then
+      add_result PASS "node.json register_time valid ISO timestamp"
+    else
+      add_result WARN "node.json register_time invalid: $register_time"
+    fi
+  else
+    add_result WARN "node.json missing register_time"
+  fi
+
+  if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
+    if iso_to_epoch "$last_updated" >/dev/null 2>&1; then
+      add_result PASS "node.json last_updated valid ISO timestamp"
+    else
+      add_result WARN "node.json last_updated invalid: $last_updated"
+    fi
+  else
+    add_result WARN "node.json missing last_updated"
+  fi
+else
+  add_result FAIL "node.json not found at $NODE_JSON"
+  node_file_content=""
+fi
+
+# NOTE(review): the directory is created here if missing, so the "missing"
+# branch further down can only trigger if mkdir did not produce a directory.
+ensure_directory "$HEALTH_DIR"
+
+if [[ -d "$HEALTH_DIR" ]]; then
+  # nullglob makes an unmatched pattern expand to an empty array.
+  shopt -s nullglob
+  health_files=("$HEALTH_DIR"/*.json)
+  shopt -u nullglob
+  if [[ ${#health_files[@]} -eq 0 ]]; then
+    add_result WARN "Health directory $HEALTH_DIR is empty"
+  else
+    for hf in "${health_files[@]}"; do
+      base=$(basename "$hf")
+      # File names are expected to contain a '-' (<module>-*.json).
+      if [[ "$base" != *-* ]]; then
+        add_result WARN "Health file $base does not follow <module>-*.json"
+        continue
+      fi
+      if ! validate_json_file "$hf" >/dev/null 2>&1; then
+        add_result WARN "Health file $base is not valid JSON"
+      fi
+    done
+  fi
+else
+  add_result WARN "Health directory $HEALTH_DIR missing"
+fi
+
+# DNS check for a fixed name; it is independent of MASTER_ENDPOINT.
+if getent hosts master.argus.com >/dev/null 2>&1; then
+  resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs)
+  add_result PASS "master.argus.com resolves to $resolved_ips"
+else
+  add_result FAIL "Failed to resolve master.argus.com"
+fi
+
+# 4.5 Master-Node status consistency
+# Sample the node detail twice, one report interval (+2s slack) apart, and
+# expect both last_report fields to have changed in between.
+sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))
+
+# Reuse the detail fetched in 4.3 when available.
+if [[ -n "$NODE_DETAIL_JSON" ]]; then
+  detail_pre="$NODE_DETAIL_JSON"
+else
+  detail_pre=""
+fi
+
+if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
+  if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_pre.err); then
+    add_result PASS "Fetched node detail pre-check"
+  else
+    error_detail=$(cat /tmp/agent_verify_detail_pre.err 2>/dev/null || true)
+    add_result FAIL "Unable to fetch node detail for status check: $error_detail"
+  fi
+  rm -f /tmp/agent_verify_detail_pre.err
+fi
+
+server_ts_pre=""
+agent_ts_pre=""
+server_ts_post=""
+agent_ts_post=""
+
+if [[ -n "$detail_pre" ]]; then
+  # A missing field yields an empty string rather than aborting the script.
+  server_ts_pre=$(json_query "$detail_pre" '.last_report' 'data.get("last_report")' || echo "")
+  agent_ts_pre=$(json_query "$detail_pre" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
+  log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"
+
+  sleep "$sleep_interval"
+
+  if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_post.err); then
+    server_ts_post=$(json_query "$detail_post" '.last_report' 'data.get("last_report")' || echo "")
+    agent_ts_post=$(json_query "$detail_post" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
+    # Plain string inequality: any change counts as "advanced".
+    if [[ "$server_ts_post" != "$server_ts_pre" ]]; then
+      add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
+    else
+      add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
+    fi
+    if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
+      add_result PASS "last_report.agent_timestamp advanced"
+    else
+      add_result FAIL "last_report.agent_timestamp did not change"
+    fi
+
+    # Compare the master's timestamp with node.json's last_updated as read in
+    # 4.4 (the file is not re-read here); the check is skipped silently when
+    # either value cannot be converted to epoch seconds.
+    if [[ -n "$node_file_content" ]]; then
+      if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
+        if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then
+          if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
+            diff=$((epoch_post - node_epoch))
+            # Absolute difference.
+            [[ $diff -lt 0 ]] && diff=$((-diff))
+            tolerance=$((REPORT_INTERVAL_SECONDS * 2))
+            if [[ $diff -le $tolerance ]]; then
+              add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
+            else
+              add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
+            fi
+          fi
+        fi
+      fi
+    fi
+
+    NODE_DETAIL_JSON="$detail_post"
+  else
+    error_detail=$(cat /tmp/agent_verify_detail_post.err 2>/dev/null || true)
+    add_result FAIL "Failed to fetch node detail post-check: $error_detail"
+  fi
+  rm -f /tmp/agent_verify_detail_post.err
+fi
+
+# 4.6 Health simulation
+# Path of the temporary health file; setting it also arms the EXIT cleanup.
+TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json"
+ensure_directory "$HEALTH_DIR"
+
+# Remember any pre-existing file so it can be restored afterwards.
+# Command substitution drops trailing newlines from the backup.
+if [[ -f "$TEST_HEALTH_FILE" ]]; then
+  TEST_HEALTH_EXISTED="true"
+  TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")"
+else
+  TEST_HEALTH_EXISTED="false"
+fi
+
+create_health_file() {
+  local message="$1"
+  cat > "$TEST_HEALTH_FILE" <<HEALTHJSON
+{"status":"ok","message":"$message"}
+HEALTHJSON
+}
+
+validate_health_in_master() {
+  local expected_message="$1"
+  local detail_json="$2"
+  local message
+  if message=$(json_query "$detail_json" '.health["verify-master"].message' 'data.get("health", {}).get("verify-master", {}).get("message")'); then
+    if [[ "$message" == "$expected_message" ]]; then
+      return 0
+    fi
+  fi
+  return 1
+}
+
+remove_health_from_master() {
+  local detail_json="$1"
+  if json_has_key "$detail_json" '(.health | has("verify-master"))' '"verify-master" in data.get("health", {})'; then
+    return 1
+  fi
+  return 0
+}
+
+# Three phases, each followed by one wait interval: create the file, update
+# its message, delete it, and check that the master's node detail follows.
+# NOTE(review): this section has no NODE_ID guard, so with an empty id the
+# requests target ".../nodes/" - confirm whether it should be skipped then.
+health_message_one="verify $(date +%s)"
+create_health_file "$health_message_one"
+add_result PASS "Created test health file $TEST_HEALTH_FILE"
+
+sleep "$sleep_interval"
+if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health1.err); then
+  if validate_health_in_master "$health_message_one" "$detail_health_one"; then
+    add_result PASS "Master reflects verify-master health message"
+  else
+    add_result FAIL "Master health payload does not match test message"
+  fi
+else
+  error_detail=$(cat /tmp/agent_verify_health1.err 2>/dev/null || true)
+  add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
+  detail_health_one=""
+fi
+rm -f /tmp/agent_verify_health1.err
+
+# Phase 2: overwrite the file with a different message.
+health_message_two="verify $(date +%s)-update"
+create_health_file "$health_message_two"
+sleep "$sleep_interval"
+if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health2.err); then
+  if validate_health_in_master "$health_message_two" "$detail_health_two"; then
+    add_result PASS "Master health updated to new message"
+  else
+    add_result FAIL "Master health message did not update"
+  fi
+else
+  error_detail=$(cat /tmp/agent_verify_health2.err 2>/dev/null || true)
+  add_result FAIL "Failed to fetch node detail after health update: $error_detail"
+  detail_health_two=""
+fi
+rm -f /tmp/agent_verify_health2.err
+
+# Phase 3: delete the file and expect the entry to vanish from the master.
+# NOTE(review): the file is removed here unconditionally, so
+# --keep-test-health has no file left to keep - confirm intended behaviour.
+rm -f "$TEST_HEALTH_FILE"
+sleep "$sleep_interval"
+if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health3.err); then
+  if remove_health_from_master "$detail_health_three"; then
+    add_result PASS "Master health no longer lists verify-master after removal"
+  else
+    add_result FAIL "Master health still contains verify-master after file deletion"
+  fi
+else
+  error_detail=$(cat /tmp/agent_verify_health3.err 2>/dev/null || true)
+  add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
+fi
+rm -f /tmp/agent_verify_health3.err
+
+# Put back a health file that existed before the simulation started.
+if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
+  printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
+fi
+
+# Optional config touch
+# With --allow-config-touch and a known node id, PUT a fixed label payload to
+# the node's config endpoint; a failure is only a warning.
+# NOTE(review): messages call this a "dry-run", but the request shown here is
+# a real PUT - confirm the master treats it as non-persistent.
+if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then
+  if [[ -n "$NODE_ID" ]]; then
+    payload='{"label": {"verify": "true"}}'
+    if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >/tmp/agent_verify_config.log 2>&1; then
+      add_result PASS "Config PUT dry-run succeeded"
+    else
+      add_result WARN "Config PUT dry-run failed: $(cat /tmp/agent_verify_config.log)"
+    fi
+    rm -f /tmp/agent_verify_config.log
+  fi
+else
+  add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
+fi
+
+# Result summary
+# Print every recorded result grouped by level, then exit 1 if any check
+# failed and 0 otherwise.
+# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array; a
+# plain "${arr[@]}" is an "unbound variable" error under `set -u` on bash
+# releases before 4.4.
+echo
+echo "==== Verification Summary ===="
+for entry in ${RESULTS_PASS[@]+"${RESULTS_PASS[@]}"}; do
+  printf 'PASS: %s\n' "$entry"
+done
+for entry in ${RESULTS_WARN[@]+"${RESULTS_WARN[@]}"}; do
+  printf 'WARN: %s\n' "$entry"
+done
+for entry in ${RESULTS_FAIL[@]+"${RESULTS_FAIL[@]}"}; do
+  printf 'FAIL: %s\n' "$entry"
+done
+
+if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
+  exit 1
+fi
+
+exit 0
--- a/src/agent/scripts/build_binary.sh
+++ b/src/agent/scripts/build_binary.sh
@ -0,0 +1,269 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+BUILD_ROOT="$MODULE_ROOT/build"
+DIST_DIR="$MODULE_ROOT/dist"
+PYINSTALLER_BUILD="$BUILD_ROOT/pyinstaller"
+PYINSTALLER_SPEC="$PYINSTALLER_BUILD/spec"
+PYINSTALLER_WORK="$PYINSTALLER_BUILD/work"
+VENV_DIR="$BUILD_ROOT/venv"
+
+AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}"
+AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}"
+USED_DOCKER=0
+
+# Build the agent binary with the host's python3 inside a fresh virtualenv.
+# Globals: BUILD_ROOT, DIST_DIR, PYINSTALLER_*, VENV_DIR, MODULE_ROOT (read)
+# Removes any previous build/ and dist/ directories before building.
+run_host_build() {
+  echo "[INFO] Using host Python environment for build" >&2
+  rm -rf "$BUILD_ROOT" "$DIST_DIR"
+  mkdir -p "$PYINSTALLER_BUILD" "$DIST_DIR"
+  python3 -m venv --copies "$VENV_DIR"
+  # shellcheck disable=SC1091
+  source "$VENV_DIR/bin/activate"
+
+  pip install --upgrade pip
+  # Install the module by absolute path: `pip install .` would pick up
+  # whatever directory the script happened to be invoked from.
+  pip install "$MODULE_ROOT"
+  pip install "pyinstaller==6.6.0"
+
+  pyinstaller \
+    --clean \
+    --onefile \
+    --name argus-agent \
+    --distpath "$DIST_DIR" \
+    --workpath "$PYINSTALLER_WORK" \
+    --specpath "$PYINSTALLER_SPEC" \
+    --add-data "$MODULE_ROOT/pyproject.toml:." \
+    "$MODULE_ROOT/entry.py"
+
+  chmod +x "$DIST_DIR/argus-agent"
+  deactivate
+}
+
+run_docker_build() {
+  if ! command -v docker >/dev/null 2>&1; then
+    echo "[ERROR] docker 命令不存在，无法在容器内构建。请安装 Docker 或设置 AGENT_BUILD_USE_DOCKER=0" >&2
+    exit 1
+  fi
+
+  USED_DOCKER=1
+  echo "[INFO] Building agent binary inside $AGENT_BUILD_IMAGE" >&2
+
+  local host_uid host_gid
+  host_uid="$(id -u)"
+  host_gid="$(id -g)"
+  docker_env=("--rm" "-v" "$MODULE_ROOT:/workspace" "-w" "/workspace" "--env" "TARGET_UID=${host_uid}" "--env" "TARGET_GID=${host_gid}")
+
+  pass_env_if_set() {
+    local var="$1"
+    local value="${!var:-}"
+    if [[ -n "$value" ]]; then
+      docker_env+=("--env" "$var=$value")
+    fi
+  }
+
+  pass_env_if_set PIP_INDEX_URL
+  pass_env_if_set PIP_EXTRA_INDEX_URL
+  pass_env_if_set PIP_TRUSTED_HOST
+  pass_env_if_set HTTP_PROXY
+  pass_env_if_set HTTPS_PROXY
+  pass_env_if_set NO_PROXY
+  pass_env_if_set http_proxy
+  pass_env_if_set https_proxy
+  pass_env_if_set no_proxy
+
+build_script=$(cat <<'INNER'
+set -euo pipefail
+cd /workspace
+apt-get update >/dev/null
+apt-get install -y --no-install-recommends binutils >/dev/null
+rm -rf /var/lib/apt/lists/*
+rm -rf build dist
+mkdir -p build/pyinstaller dist
+python3 -m venv --copies build/venv
+source build/venv/bin/activate
+pip install --upgrade pip
+pip install .
+pip install pyinstaller==6.6.0
+pyinstaller \
+  --clean \
+  --onefile \
+  --name argus-agent \
+  --distpath dist \
+  --workpath build/pyinstaller/work \
+  --specpath build/pyinstaller/spec \
+  --add-data /workspace/pyproject.toml:. \
+  entry.py
+chmod +x dist/argus-agent
+
+TARGET_UID="${TARGET_UID:-0}"
+TARGET_GID="${TARGET_GID:-0}"
+chown -R "$TARGET_UID:$TARGET_GID" dist build 2>/dev/null || true
+
+python3 - <<'PY'
+from pathlib import Path
+from PyInstaller.archive.readers import CArchiveReader
+import sys
+
+archive = Path('dist/argus-agent')
+out_dir = Path('build/compat_check')
+out_dir.mkdir(parents=True, exist_ok=True)
+
+major, minor = sys.version_info[:2]
+libpython = f'libpython{major}.{minor}.so.1.0'
+expected_libs = [
+    libpython,
+    'libssl.so.3',
+    'libcrypto.so.3',
+]
+reader = CArchiveReader(str(archive))
+extracted = []
+missing = []
+for name in expected_libs:
+    try:
+        data = reader.extract(name)
+    except KeyError:
+        missing.append(name)
+        continue
+    (out_dir / name).write_bytes(data)
+    extracted.append(name)
+(out_dir / 'manifest').write_text('\n'.join(extracted))
+if extracted:
+    print('[INFO] Extracted libraries: ' + ', '.join(extracted))
+if missing:
+    print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
+PY
+
+compat_check() {
+  local lib_path="$1"
+  if [[ ! -f "$lib_path" ]]; then
+    echo "[WARN] Missing $lib_path for GLIBC check"
+    return
+  fi
+  local max_glibc
+  max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
+  if [[ -n "$max_glibc" ]]; then
+    echo "[INFO] $lib_path references up to $max_glibc"
+  else
+    echo "[INFO] $lib_path does not expose GLIBC version strings"
+  fi
+}
+
+compat_libs=()
+if [[ -f build/compat_check/manifest ]]; then
+  mapfile -t compat_libs < build/compat_check/manifest
+fi
+
+if [[ ${#compat_libs[@]} -eq 0 ]]; then
+  echo "[WARN] No libraries captured for GLIBC inspection"
+else
+  for lib in "${compat_libs[@]}"; do
+    compat_check "build/compat_check/$lib"
+  done
+fi
+
+deactivate
+INNER
+  )
+
+  if ! docker run "${docker_env[@]}" "$AGENT_BUILD_IMAGE" bash -lc "$build_script"; then
+    echo "[ERROR] Docker 构建失败，请检查 Docker 权限或设置 AGENT_BUILD_USE_DOCKER=0 在兼容主机上构建" >&2
+    exit 1
+  fi
+}
+
+# Pick the build strategy: only the exact value "1" selects Docker.
+if [[ "$AGENT_BUILD_USE_DOCKER" == "1" ]]; then
+  run_docker_build
+else
+  run_host_build
+fi
+
+# Either strategy must have produced the binary in dist/.
+if [[ ! -f "$DIST_DIR/argus-agent" ]]; then
+  echo "[ERROR] Agent binary was not produced" >&2
+  exit 1
+fi
+
+if [[ "$USED_DOCKER" != "1" ]]; then
+  if [[ ! -x "$VENV_DIR/bin/python" ]]; then
+    echo "[WARN] PyInstaller virtualenv missing at $VENV_DIR; skipping compatibility check" >&2
+  else
+    COMPAT_DIR="$BUILD_ROOT/compat_check"
+    rm -rf "$COMPAT_DIR"
+    mkdir -p "$COMPAT_DIR"
+
+    EXTRACT_SCRIPT=$(cat <<'PY'
+from pathlib import Path
+from PyInstaller.archive.readers import CArchiveReader
+import sys
+
+archive = Path('dist/argus-agent')
+out_dir = Path('build/compat_check')
+out_dir.mkdir(parents=True, exist_ok=True)
+
+major, minor = sys.version_info[:2]
+libpython = f'libpython{major}.{minor}.so.1.0'
+expected_libs = [
+    libpython,
+    'libssl.so.3',
+    'libcrypto.so.3',
+]
+reader = CArchiveReader(str(archive))
+extracted = []
+missing = []
+for name in expected_libs:
+    try:
+        data = reader.extract(name)
+    except KeyError:
+        missing.append(name)
+        continue
+    (out_dir / name).write_bytes(data)
+    extracted.append(name)
+(out_dir / 'manifest').write_text('\n'.join(extracted))
+if extracted:
+    print('[INFO] Extracted libraries: ' + ', '.join(extracted))
+if missing:
+    print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
+PY
+)
+
+    "$VENV_DIR/bin/python" - <<PY
+$EXTRACT_SCRIPT
+PY
+
+    compat_libs=()
+    if [[ -f "$COMPAT_DIR/manifest" ]]; then
+      mapfile -t compat_libs < "$COMPAT_DIR/manifest"
+    fi
+
+    check_glibc_version() {
+      local lib_path="$1"
+      if [[ ! -f "$lib_path" ]]; then
+        echo "[WARN] Skipping GLIBC check; file not found: $lib_path" >&2
+        return
+      fi
+      if command -v strings >/dev/null 2>&1; then
+        local max_glibc
+        max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
+        if [[ -n "$max_glibc" ]]; then
+          echo "[INFO] $lib_path references up to $max_glibc"
+        else
+          echo "[INFO] $lib_path does not expose GLIBC version strings"
+        fi
+      else
+        echo "[WARN] strings command unavailable; cannot inspect $lib_path" >&2
+      fi
+    }
+
+    if [[ ${#compat_libs[@]} -eq 0 ]]; then
+      echo "[WARN] No libraries captured for GLIBC inspection" >&2
+    else
+      for lib in "${compat_libs[@]}"; do
+        check_glibc_version "$COMPAT_DIR/$lib"
+      done
+    fi
+  fi
+else
+  echo "[INFO] Compatibility check executed inside container"
+fi
+
+echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent"
--- a/src/agent/tests/.gitignore
+++ b/src/agent/tests/.gitignore
@ -0,0 +1,2 @@
+private/
+tmp/
--- a/src/agent/tests/docker-compose.yml
+++ b/src/agent/tests/docker-compose.yml
@ -0,0 +1,69 @@
+services:
+  bind:
+    image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
+    container_name: argus-bind-agent-e2e
+    volumes:
+      - ./private:/private
+    networks:
+      default:
+        ipv4_address: 172.28.0.2
+    environment:
+      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
+      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
+    restart: always
+
+  master:
+    image: argus-master:latest
+    container_name: argus-master-agent-e2e
+    depends_on:
+      - bind
+    environment:
+      - OFFLINE_THRESHOLD_SECONDS=6
+      - ONLINE_THRESHOLD_SECONDS=2
+      - SCHEDULER_INTERVAL_SECONDS=1
+      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
+      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
+    ports:
+      - "32300:3000"
+    volumes:
+      - ./private/argus/master:/private/argus/master
+      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
+      - ./private/argus/etc:/private/argus/etc
+    networks:
+      default:
+        ipv4_address: 172.28.0.10
+    restart: always
+
+  agent:
+    image: ubuntu:22.04
+    container_name: argus-agent-e2e
+    hostname: dev-e2euser-e2einst-pod-0
+    depends_on:
+      - master
+      - bind
+    environment:
+      - MASTER_ENDPOINT=http://master.argus.com:3000
+      - REPORT_INTERVAL_SECONDS=2
+      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
+      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
+    volumes:
+      - ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
+      - ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
+      - ./private/argus/etc:/private/argus/etc
+      - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
+      - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
+      - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
+    entrypoint:
+      - /usr/local/bin/agent-entrypoint.sh
+    networks:
+      default:
+        ipv4_address: 172.28.0.20
+    restart: always
+
+networks:
+  default:
+    driver: bridge
+    ipam:
+      driver: default
+      config:
+        - subnet: 172.28.0.0/16
--- a/src/agent/tests/scripts/00_e2e_test.sh
+++ b/src/agent/tests/scripts/00_e2e_test.sh
@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPTS=(
+  "01_bootstrap.sh"
+  "02_up.sh"
+  "03_wait_and_assert_registration.sh"
+  "04_write_health_files.sh"
+  "08_verify_agent.sh"
+  "05_assert_status_on_master.sh"
+  "06_restart_agent_and_reregister.sh"
+  "07_down.sh"
+)
+
+for script in "${SCRIPTS[@]}"; do
+  echo "[TEST] Running $script"
+  "$SCRIPT_DIR/$script"
+  echo "[TEST] $script completed"
+  echo
+done
+
+echo "[TEST] Agent module E2E tests completed"
--- a/src/agent/tests/scripts/01_bootstrap.sh
+++ b/src/agent/tests/scripts/01_bootstrap.sh
@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+AGENT_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
+MASTER_ROOT="$(cd "$AGENT_ROOT/../master" && pwd)"
+REPO_ROOT="$(cd "$AGENT_ROOT/../.." && pwd)"
+PRIVATE_ROOT="$TEST_ROOT/private"
+TMP_ROOT="$TEST_ROOT/tmp"
+
+AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
+AGENT_CONFIG_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME"
+AGENT_HEALTH_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME/health"
+MASTER_PRIVATE_DIR="$PRIVATE_ROOT/argus/master"
+METRIC_PRIVATE_DIR="$PRIVATE_ROOT/argus/metric/prometheus"
+DNS_DIR="$PRIVATE_ROOT/argus/etc"
+BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:latest}"
+BIND_ROOT="$(cd "$MASTER_ROOT/../bind" && pwd)"
+
+ensure_image() {
+  local image="$1"
+  if ! docker image inspect "$image" >/dev/null 2>&1; then
+    echo "[ERROR] Docker image '$image' 未找到，请先运行统一构建脚本 (例如 ./build/build_images.sh) 生成所需镜像" >&2
+    exit 1
+  fi
+}
+
+mkdir -p "$AGENT_CONFIG_DIR"
+mkdir -p "$AGENT_HEALTH_DIR"
+mkdir -p "$MASTER_PRIVATE_DIR"
+mkdir -p "$METRIC_PRIVATE_DIR"
+mkdir -p "$TMP_ROOT"
+mkdir -p "$DNS_DIR"
+
+touch "$AGENT_HEALTH_DIR/.keep"
+
+# 中文提示：准备 bind 模块提供的 update-dns.sh，模拟生产下发
+if [[ -f "$BIND_ROOT/build/update-dns.sh" ]]; then
+  cp "$BIND_ROOT/build/update-dns.sh" "$DNS_DIR/update-dns.sh"
+  chmod +x "$DNS_DIR/update-dns.sh"
+else
+  echo "[WARN] bind update script missing at $BIND_ROOT/build/update-dns.sh"
+fi
+
+ensure_image "argus-master:latest"
+ensure_image "$BIND_IMAGE_TAG"
+
+AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"
+
+pushd "$AGENT_ROOT" >/dev/null
+./scripts/build_binary.sh
+popd >/dev/null
+
+if [[ ! -x "$AGENT_BINARY" ]]; then
+  echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
+  exit 1
+fi
+
+echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
+echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag"
+
+echo "[INFO] Agent E2E bootstrap complete"
--- a/src/agent/tests/scripts/02_up.sh
+++ b/src/agent/tests/scripts/02_up.sh
@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
+
+TMP_ROOT="$TEST_ROOT/tmp"
+ENV_FILE="$TEST_ROOT/.env"
+
+source "$REPO_ROOT/scripts/common/build_user.sh"
+load_build_user
+export ARGUS_BUILD_UID ARGUS_BUILD_GID
+
+cat > "$ENV_FILE" <<EOF
+ARGUS_BUILD_UID=$ARGUS_BUILD_UID
+ARGUS_BUILD_GID=$ARGUS_BUILD_GID
+EOF
+
+if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
+  echo "[ERROR] Agent binary path missing; run 01_bootstrap.sh first" >&2
+  exit 1
+fi
+
+AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
+if [[ ! -x "$AGENT_BINARY" ]]; then
+  echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
+  exit 1
+fi
+
+BIND_IMAGE_TAG_VALUE="argus-bind9:latest"
+if [[ -f "$TMP_ROOT/bind_image_tag" ]]; then
+  BIND_IMAGE_TAG_VALUE="$(cat "$TMP_ROOT/bind_image_tag")"
+fi
+
+compose() {
+  if docker compose version >/dev/null 2>&1; then
+    docker compose "$@"
+  else
+    docker-compose "$@"
+  fi
+}
+
+docker container rm -f argus-agent-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true
+
+docker network rm tests_default >/dev/null 2>&1 || true
+
+pushd "$TEST_ROOT" >/dev/null
+compose down --remove-orphans || true
+BIND_IMAGE_TAG="$BIND_IMAGE_TAG_VALUE" compose up -d
+popd >/dev/null
+
+echo "[INFO] Master+Agent stack started"
--- a/src/agent/tests/scripts/03_wait_and_assert_registration.sh
+++ b/src/agent/tests/scripts/03_wait_and_assert_registration.sh
@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+TMP_ROOT="$TEST_ROOT/tmp"
+API_BASE="http://localhost:32300/api/v1/master"
+AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
+NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json"
+
+mkdir -p "$TMP_ROOT"
+
+node_id=""
+for _ in {1..30}; do
+  sleep 2
+  response=$(curl -sS "$API_BASE/nodes" || true)
+  if [[ -z "$response" ]]; then
+    continue
+  fi
+  list_file="$TMP_ROOT/nodes_list.json"
+  echo "$response" > "$list_file"
+  node_id=$(python3 - "$list_file" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    nodes = json.load(handle)
+print(nodes[0]["id"] if nodes else "")
+PY
+)
+  if [[ -n "$node_id" ]]; then
+    break
+  fi
+ done
+
+if [[ -z "$node_id" ]]; then
+  echo "[ERROR] Agent did not register within timeout" >&2
+  exit 1
+fi
+
+echo "$node_id" > "$TMP_ROOT/node_id"
+
+if [[ ! -f "$NODE_FILE" ]]; then
+  echo "[ERROR] node.json not created at $NODE_FILE" >&2
+  exit 1
+fi
+
+python3 - "$NODE_FILE" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+assert "id" in node and node["id"], "node.json missing id"
+PY
+
+detail_file="$TMP_ROOT/initial_detail.json"
+curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file"
+python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY'
+import json, sys, pathlib
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+ip = node["meta_data"].get("ip")
+if not ip:
+    raise SystemExit("meta_data.ip missing")
+pathlib.Path(sys.argv[2]).write_text(ip)
+PY
+
+echo "[INFO] Agent registered with node id $node_id"
--- a/src/agent/tests/scripts/04_write_health_files.sh
+++ b/src/agent/tests/scripts/04_write_health_files.sh
@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health"
+
+cat > "$HEALTH_DIR/log-fluentbit.json" <<JSON
+{
+  "status": "healthy",
+  "timestamp": "2023-10-05T12:05:00Z"
+}
+JSON
+
+cat > "$HEALTH_DIR/metric-node-exporter.json" <<JSON
+{
+  "status": "healthy",
+  "timestamp": "2023-10-05T12:05:00Z"
+}
+JSON
+
+echo "[INFO] Health files written"
--- a/src/agent/tests/scripts/05_assert_status_on_master.sh
+++ b/src/agent/tests/scripts/05_assert_status_on_master.sh
@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Assert that the master reports the registered node as online with both health
+# entries, and that the master wrote exactly one entry to nodes.json.
+# Requires tmp/node_id written by 03_wait_and_assert_registration.sh.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+TMP_ROOT="$TEST_ROOT/tmp"
+API_BASE="http://localhost:32300/api/v1/master"
+NODE_ID="$(cat "$TMP_ROOT/node_id")"
+NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
+
+# Poll the node detail up to 20 times, 2s apart. Because the Python check runs
+# as an "if" condition, any non-zero exit (including an exception) just means
+# "not ready yet" and triggers another attempt.
+success=false
+detail_file="$TMP_ROOT/agent_status_detail.json"
+for _ in {1..20}; do
+  sleep 2
+  if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
+    continue
+  fi
+  if python3 - "$detail_file" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+if node["status"] != "online":
+    raise SystemExit(1)
+health = node.get("health", {})
+if "log-fluentbit" not in health or "metric-node-exporter" not in health:
+    raise SystemExit(1)
+PY
+  then
+    success=true
+    break
+  fi
+done
+
+if [[ "$success" != true ]]; then
+  echo "[ERROR] Node did not report health data in time" >&2
+  exit 1
+fi
+
+# nodes.json lives in the directory mounted at /private/argus/metric/prometheus
+# in the master container.
+if [[ ! -f "$NODES_JSON" ]]; then
+  echo "[ERROR] nodes.json missing at $NODES_JSON" >&2
+  exit 1
+fi
+
+# Exactly one node is expected, and its node_id must be non-empty.
+python3 - "$NODES_JSON" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    nodes = json.load(handle)
+assert len(nodes) == 1, nodes
+entry = nodes[0]
+assert entry["node_id"], entry
+PY
+
+echo "[INFO] Master reflects agent health and nodes.json entries"
--- a/src/agent/tests/scripts/06_restart_agent_and_reregister.sh
+++ b/src/agent/tests/scripts/06_restart_agent_and_reregister.sh
@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+# Replace the compose-managed agent container with one started via "docker run"
+# on a different fixed IP, then assert that the master keeps the same node id
+# while reporting the new IP and a changed last_updated value.
+# Requires tmp/node_id, tmp/initial_ip and tmp/agent_binary_path from earlier steps.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+TMP_ROOT="$TEST_ROOT/tmp"
+API_BASE="http://localhost:32300/api/v1/master"
+NODE_ID="$(cat "$TMP_ROOT/node_id")"
+AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
+# NOTE(review): presumably the compose default network for the "tests" project
+# directory — confirm if the compose project name ever changes.
+NETWORK_NAME="tests_default"
+# New address inside the 172.28.0.0/16 test subnet; differs from the compose-assigned 172.28.0.20.
+NEW_AGENT_IP="172.28.0.200"
+ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
+ENV_FILE="$TEST_ROOT/.env"
+
+# The restarted container uses the same entrypoint script so that the DNS
+# registration logic stays identical to the compose-started agent.
+if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
+  echo "[ERROR] agent entrypoint script missing at $ENTRYPOINT_SCRIPT" >&2
+  exit 1
+fi
+
+if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
+  echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
+  exit 1
+fi
+
+AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
+if [[ ! -x "$AGENT_BINARY" ]]; then
+  echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
+  exit 1
+fi
+
+# Prefer the UID/GID that 02_up.sh stored in .env; otherwise load them through
+# the shared build_user helper.
+if [[ -f "$ENV_FILE" ]]; then
+  set -a
+  # shellcheck disable=SC1090
+  source "$ENV_FILE"
+  set +a
+else
+  REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
+  # shellcheck disable=SC1090
+  source "$REPO_ROOT/scripts/common/build_user.sh"
+  load_build_user
+fi
+
+AGENT_UID="${ARGUS_BUILD_UID:-2133}"
+AGENT_GID="${ARGUS_BUILD_GID:-2015}"
+
+# Run Docker Compose, preferring the "docker compose" plugin and falling back
+# to the standalone docker-compose binary.
+compose() {
+  if docker compose version >/dev/null 2>&1; then
+    docker compose "$@"
+  else
+    docker-compose "$@"
+  fi
+}
+
+# Snapshot the node state before the restart to compare against afterwards.
+before_file="$TMP_ROOT/before_restart.json"
+curl -sS "$API_BASE/nodes/$NODE_ID" -o "$before_file"
+prev_last_updated=$(python3 - "$before_file" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+print(node.get("last_updated", ""))
+PY
+)
+prev_ip=$(python3 - "$before_file" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+print(node["meta_data"].get("ip", ""))
+PY
+)
+# Sanity check: the IP must not have changed since step 03 recorded it.
+initial_ip=$(cat "$TMP_ROOT/initial_ip")
+if [[ "$prev_ip" != "$initial_ip" ]]; then
+  echo "[ERROR] Expected initial IP $initial_ip, got $prev_ip" >&2
+  exit 1
+fi
+
+# Stop and remove the compose-managed agent service.
+pushd "$TEST_ROOT" >/dev/null
+compose rm -sf agent
+popd >/dev/null
+
+# Best-effort: make sure the container name is free for the manual "docker run".
+docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
+
+AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
+HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
+
+# Start the replacement agent directly with "docker run" (same hostname, same
+# mounts and entrypoint, but a fixed new IP) so the network state at
+# re-registration time is fully under this script's control.
+if ! docker run -d \
+  --name argus-agent-e2e \
+  --hostname "$AGENT_HOSTNAME" \
+  --network "$NETWORK_NAME" \
+  --ip "$NEW_AGENT_IP" \
+  -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
+  -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \
+  -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
+  -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
+  -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
+  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
+  -e REPORT_INTERVAL_SECONDS=2 \
+  -e ARGUS_BUILD_UID="$AGENT_UID" \
+  -e ARGUS_BUILD_GID="$AGENT_GID" \
+  --entrypoint /usr/local/bin/agent-entrypoint.sh \
+  ubuntu:22.04 >/dev/null; then
+  echo "[ERROR] Failed to start agent container with custom IP" >&2
+  exit 1
+fi
+
+# Poll up to 20 times, 3s apart, until the master shows the same node id with
+# the new IP and a last_updated value different from the pre-restart one. The
+# Python check is an "if" condition, so any non-zero exit simply retries.
+success=false
+detail_file="$TMP_ROOT/post_restart.json"
+for _ in {1..20}; do
+  sleep 3
+  if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
+    continue
+  fi
+  if python3 - "$detail_file" "$prev_last_updated" "$NODE_ID" "$prev_ip" "$NEW_AGENT_IP" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+prev_last_updated = sys.argv[2]
+expected_id = sys.argv[3]
+old_ip = sys.argv[4]
+expected_ip = sys.argv[5]
+last_updated = node.get("last_updated")
+current_ip = node["meta_data"].get("ip")
+assert node["id"] == expected_id
+if current_ip != expected_ip:
+    raise SystemExit(1)
+if current_ip == old_ip:
+    raise SystemExit(1)
+if not last_updated or last_updated == prev_last_updated:
+    raise SystemExit(1)
+PY
+  then
+    success=true
+    break
+  fi
+done
+
+if [[ "$success" != true ]]; then
+  echo "[ERROR] Agent did not report expected new IP $NEW_AGENT_IP after restart" >&2
+  exit 1
+fi
+
+echo "[INFO] Agent restart produced successful re-registration with IP change"
--- a/src/agent/tests/scripts/07_down.sh
+++ b/src/agent/tests/scripts/07_down.sh
@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+ENV_FILE="$TEST_ROOT/.env"
+
+compose() {
+  if docker compose version >/dev/null 2>&1; then
+    docker compose "$@"
+  else
+    docker-compose "$@"
+  fi
+}
+
+docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
+
+pushd "$TEST_ROOT" >/dev/null
+compose down --remove-orphans
+popd >/dev/null
+
+if [[ -d "$TEST_ROOT/private" ]]; then
+  docker run --rm \
+    -v "$TEST_ROOT/private:/target" \
+    ubuntu:24.04 \
+    chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
+  rm -rf "$TEST_ROOT/private"
+fi
+
+rm -rf "$TEST_ROOT/tmp"
+
+if [[ -f "$ENV_FILE" ]]; then
+  rm -f "$ENV_FILE"
+fi
+
+echo "[INFO] Agent E2E environment cleaned up"
--- a/src/agent/tests/scripts/08_verify_agent.sh
+++ b/src/agent/tests/scripts/08_verify_agent.sh
@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+VERIFY_SCRIPT="$(cd "$TEST_ROOT/.." && pwd)/scripts/agent_deployment_verify.sh"
+
+if ! docker ps --format '{{.Names}}' | grep -q '^argus-agent-e2e$'; then
+  echo "[WARN] agent container not running; skip verification"
+  exit 0
+fi
+
+if docker exec -i argus-agent-e2e bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
+  echo "[INFO] curl/jq already installed in agent container"
+else
+  echo "[INFO] Installing curl/jq in agent container"
+  docker exec -i argus-agent-e2e bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
+fi
+
+if docker exec -i argus-agent-e2e bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then
+  docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh
+elif [[ -x "$VERIFY_SCRIPT" ]]; then
+  docker exec -i argus-agent-e2e "$VERIFY_SCRIPT"
+else
+  echo "[WARN] agent_deployment_verify.sh not found"
+fi
--- a/src/agent/tests/scripts/agent_entrypoint.sh
+++ b/src/agent/tests/scripts/agent_entrypoint.sh
@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+LOG_PREFIX="[AGENT-ENTRYPOINT]"
+DNS_SCRIPT="/private/argus/etc/update-dns.sh"
+DNS_CONF="/private/argus/etc/dns.conf"
+TARGET_DOMAIN="master.argus.com"
+AGENT_UID="${ARGUS_BUILD_UID:-2133}"
+AGENT_GID="${ARGUS_BUILD_GID:-2015}"
+AGENT_HOSTNAME="${HOSTNAME:-unknown}"
+AGENT_DATA_DIR="/private/argus/agent/${AGENT_HOSTNAME}"
+AGENT_HEALTH_DIR="${AGENT_DATA_DIR}/health"
+RUNTIME_GROUP="argusagent"
+RUNTIME_USER="argusagent"
+
+log() {
+  echo "${LOG_PREFIX} $*"
+}
+
+mkdir -p "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR"
+chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" 2>/dev/null || true
+chown -R "$AGENT_UID:$AGENT_GID" "/private/argus/etc" 2>/dev/null || true
+
+if ! getent group "$AGENT_GID" >/dev/null 2>&1; then
+  groupadd -g "$AGENT_GID" "$RUNTIME_GROUP"
+else
+  RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)"
+fi
+
+if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then
+  useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER"
+else
+  RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)"
+fi
+
+log "运行用户: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)"
+
+# 中文提示：等待 bind 下发的 update-dns.sh 脚本
+for _ in {1..30}; do
+  if [[ -x "$DNS_SCRIPT" ]]; then
+    break
+  fi
+  log "等待 update-dns.sh 准备就绪..."
+  sleep 1
+done
+
+if [[ -x "$DNS_SCRIPT" ]]; then
+  log "执行 update-dns.sh 更新容器 DNS"
+  while true; do
+    if "$DNS_SCRIPT"; then
+      log "update-dns.sh 执行成功"
+      break
+    fi
+    log "update-dns.sh 执行失败，3 秒后重试"
+    sleep 3
+  done
+else
+  log "未获取到 update-dns.sh，使用镜像默认 DNS"
+fi
+
+# 中文提示：记录当前 dns.conf 内容，便于排查
+if [[ -f "$DNS_CONF" ]]; then
+  log "dns.conf 内容: $(tr '\n' ' ' < "$DNS_CONF")"
+else
+  log "dns.conf 暂未生成"
+fi
+
+# 中文提示：尝试解析 master 域名，失败不阻塞但会打日志
+for _ in {1..30}; do
+  if getent hosts "$TARGET_DOMAIN" >/dev/null 2>&1; then
+    MASTER_IP=$(getent hosts "$TARGET_DOMAIN" | awk '{print $1}' | head -n 1)
+    log "master.argus.com 解析成功: $MASTER_IP"
+    break
+  fi
+  sleep 1
+done
+
+log "启动 argus-agent"
+exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
--- a/src/alert/README.md
+++ b/src/alert/README.md
@ -0,0 +1,13 @@
+# Alertmanager
+
+## 启动示例
+
+```bash
+docker run -d --name alertmanager \
+  -p 9093:9093 \
+  -v /opt/alertmanager/data:/alertmanager \
+  argus-alert:latest
+```
+
+## 动态配置
+修改alertmanager.yml后，调用`/-/reload`接口可以重新加载配置
--- a/src/alert/alertmanager/build/Dockerfile
+++ b/src/alert/alertmanager/build/Dockerfile
@ -0,0 +1,86 @@
+# Alertmanager image based on Ubuntu 24.04; processes are managed by supervisord.
+FROM ubuntu:24.04
+
+# Stay root for the build steps.
+USER root
+
+# Install required packages.
+RUN apt-get update && \
+    apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Alertmanager release to install.
+ARG ALERTMANAGER_VERSION=0.28.1
+
+# Download and unpack the Alertmanager binaries.
+RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
+    tar xvf alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
+    mv alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
+    rm alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
+
+# UID/GID of the runtime account. Declared as build args so that the values
+# passed via "build.args" (see src/alert/tests/docker-compose.yml) actually take
+# effect; without an ARG declaration Docker ignores them. The defaults keep the
+# previous hard-coded ids.
+ARG ARGUS_UID=2133
+ARG ARGUS_GID=2015
+
+ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
+ENV ARGUS_UID=${ARGUS_UID}
+ENV ARGUS_GID=${ARGUS_GID}
+
+RUN mkdir -p /usr/share/alertmanager && \
+    mkdir -p ${ALERTMANAGER_BASE_PATH} && \
+    mkdir -p /private/argus/etc && \
+    rm -rf /alertmanager && \
+    ln -s ${ALERTMANAGER_BASE_PATH} /alertmanager
+
+# Create the alertmanager group with the configured GID.
+RUN groupadd -g ${ARGUS_GID} alertmanager
+
+# Create the alertmanager user with the configured UID in that group.
+RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager
+
+RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \
+    chown -R alertmanager:alertmanager /alertmanager && \
+    chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \
+    chown -R alertmanager:alertmanager /private/argus/etc && \
+    chown -R alertmanager:alertmanager /usr/local/bin
+
+# Intranet switch. It must be declared as ARG, otherwise $USE_INTRANET is
+# always empty in the RUN steps below and the intranet branches never run.
+ARG USE_INTRANET=false
+
+# Configure the intranet apt source (only when USE_INTRANET=true).
+# NOTE(review): the mirror path is ubuntu2204/jammy while the base image is
+# 24.04 — confirm this is the intended mirror.
+RUN if [ "$USE_INTRANET" = "true" ]; then \
+        echo "Configuring intranet apt sources..." && \
+        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
+        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
+        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
+        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
+    fi
+
+
+# Configure the apt source used at deployment time.
+RUN if [ "$USE_INTRANET" = "true" ]; then \
+    echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
+    fi
+
+# Create the supervisor log directory.
+RUN mkdir -p /var/log/supervisor
+
+# Copy the supervisor configuration.
+COPY src/alert/alertmanager/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+
+# Copy the start script.
+COPY src/alert/alertmanager/build/start-am-supervised.sh /usr/local/bin/start-am-supervised.sh
+RUN chmod +x /usr/local/bin/start-am-supervised.sh
+
+# Copy the Alertmanager configuration. It is data, not a program: make it
+# world-readable (the alertmanager user must read it) instead of executable.
+COPY src/alert/alertmanager/build/alertmanager.yml /etc/alertmanager/alertmanager.yml
+RUN chmod 644 /etc/alertmanager/alertmanager.yml
+# COPY src/alert/alertmanager/build/alertmanager.yml ${ALERTMANAGER_BASE_PATH}/alertmanager.yml
+
+# Copy the DNS monitor script.
+COPY src/alert/alertmanager/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
+RUN chmod +x /usr/local/bin/dns-monitor.sh
+
+# Keep root; supervisor switches to the per-program user.
+USER root
+
+# Expose the Alertmanager default port.
+EXPOSE 9093
+
+# Use supervisor as the container's main process.
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
+
--- a/src/alert/alertmanager/build/alertmanager.yml
+++ b/src/alert/alertmanager/build/alertmanager.yml
@ -0,0 +1,19 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname', 'instance']   # 分组：相同 alertname + instance 的告警合并
+  group_wait: 30s        # 第一个告警后，等 30s 看是否有同组告警一起发
+  group_interval: 5m     # 同组告警变化后，至少 5 分钟再发一次
+  repeat_interval: 3h    # 相同告警，3 小时重复提醒一次
+  receiver: 'null'
+
+receivers:
+  - name: 'null'
+
+inhibit_rules:
+  - source_match:
+      severity: 'critical'     # critical 告警存在时
+    target_match:
+      severity: 'warning'      # 抑制相同 instance 的 warning 告警
+    equal: ['instance']
--- a/src/alert/alertmanager/build/dns-monitor.sh
+++ b/src/alert/alertmanager/build/dns-monitor.sh
@ -0,0 +1,68 @@
+#!/bin/bash
+
+# DNS监控脚本 - 每10秒检查dns.conf是否有变化
+# 如果有变化则执行update-dns.sh脚本
+
+DNS_CONF="/private/argus/etc/dns.conf"
+DNS_BACKUP="/tmp/dns.conf.backup"
+UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
+LOG_FILE="/var/log/supervisor/dns-monitor.log"
+
+# 确保日志文件存在
+touch "$LOG_FILE"
+
+log_message() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
+}
+
+log_message "DNS监控脚本启动"
+
+while true; do
+    if [ -f "$DNS_CONF" ]; then
+        if [ -f "$DNS_BACKUP" ]; then
+            # 比较文件内容
+            if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
+                log_message "检测到DNS配置变化"
+
+                # 更新备份文件
+                cp "$DNS_CONF" "$DNS_BACKUP"
+
+                # 执行更新脚本
+                if [ -x "$UPDATE_SCRIPT" ]; then
+                    log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
+                    "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
+                    if [ $? -eq 0 ]; then
+                        log_message "DNS更新脚本执行成功"
+                    else
+                        log_message "DNS更新脚本执行失败"
+                    fi
+                else
+                    log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
+                fi
+            fi
+        else
+
+            # 第一次检测到配置文件，执行更新脚本
+            if [ -x "$UPDATE_SCRIPT" ]; then
+                log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
+                "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
+                if [ $? -eq 0 ]; then
+                    log_message "DNS更新脚本执行成功"
+
+		    # 第一次运行，创建备份并执行更新
+		    cp "$DNS_CONF" "$DNS_BACKUP"
+		    log_message "创建DNS配置备份文件"
+
+                else
+                    log_message "DNS更新脚本执行失败"
+                fi
+            else
+                log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
+            fi
+        fi
+    else
+        log_message "警告: DNS配置文件不存在: $DNS_CONF"
+    fi
+
+    sleep 10
+done
--- a/src/alert/alertmanager/build/start-am-supervised.sh
+++ b/src/alert/alertmanager/build/start-am-supervised.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "[INFO] Starting Alertmanager under supervisor..."
+
+ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanager}
+
+echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}"
+
+# 生成配置文件
+echo "[INFO] Generating Alertmanager configuration file..."
+sed "s|\${ALERTMANAGER_BASE_PATH}|${ALERTMANAGER_BASE_PATH}|g" \
+    /etc/alertmanager/alertmanager.yml > ${ALERTMANAGER_BASE_PATH}/alertmanager.yml
+
+
+# 记录容器 IP 地址
+DOMAIN=alertmanager.alert.argus.com
+IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')
+echo "current IP: ${IP}"
+echo "${IP}" > /private/argus/etc/${DOMAIN}
+
+
+echo "[INFO] Starting Alertmanager process..."
+
+# 启动 Alertmanager 主进程
+exec /usr/local/alertmanager/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager --cluster.listen-address=""
--- a/src/alert/alertmanager/build/supervisord.conf
+++ b/src/alert/alertmanager/build/supervisord.conf
@ -0,0 +1,39 @@
+[supervisord]
+nodaemon=true
+logfile=/var/log/supervisor/supervisord.log
+pidfile=/var/run/supervisord.pid
+user=root
+
+[program:alertmanager]
+command=/usr/local/bin/start-am-supervised.sh
+user=alertmanager
+stdout_logfile=/var/log/supervisor/alertmanager.log
+stderr_logfile=/var/log/supervisor/alertmanager_error.log
+autorestart=true
+startretries=3
+startsecs=10
+stopwaitsecs=20
+killasgroup=true
+stopasgroup=true
+
+[program:dns-monitor]
+command=/usr/local/bin/dns-monitor.sh
+user=root
+stdout_logfile=/var/log/supervisor/dns-monitor.log
+stderr_logfile=/var/log/supervisor/dns-monitor_error.log
+autorestart=true
+startretries=3
+startsecs=5
+stopwaitsecs=10
+killasgroup=true
+stopasgroup=true
+
+[unix_http_server]
+file=/var/run/supervisor.sock
+chmod=0700
+
+[supervisorctl]
+serverurl=unix:///var/run/supervisor.sock
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
--- a/src/alert/alertmanager/config/rule_files/README.md
+++ b/src/alert/alertmanager/config/rule_files/README.md
@ -0,0 +1,60 @@
+# 告警配置
+
+> 参考：[自定义Prometheus告警规则](https://yunlzheng.gitbook.io/prometheus-book/parti-prometheus-ji-chu/alert/prometheus-alert-rule)
+
+在Prometheus中配置告警有两个步骤：
+
+1. 写告警规则文件（rules文件）
+2. 在prometheus.yml里加载规则，并配置Alertmanager
+
+## 1. 编写告警规则文件
+告警规则如下：
+```yml
+groups:
+  - name: example-rules
+    interval: 30s  # 每30秒评估一次
+    rules:
+      - alert: InstanceDown
+        expr: up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "实例 {{ $labels.instance }} 已宕机"
+          description: "{{ $labels.instance }} 在 {{ $labels.job }} 中无响应超过 1 分钟。"
+
+      - alert: HighCpuUsage
+        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CPU 使用率过高"
+          description: "实例 {{ $labels.instance }} CPU 使用率超过 80% 持续 5 分钟。"
+```
+
+其中：
+
+- `alert`：告警规则的名称。
+- `expr`：基于PromQL表达式告警触发条件，用于计算是否有时间序列满足该条件。
+- `for`：评估等待时间，可选参数。用于表示只有当触发条件持续一段时间后才发送告警。在等待期间新产生告警的状态为pending。
+- `labels`：自定义标签，允许用户指定要附加到告警上的一组附加标签，可以在Alertmanager中做路由和分组。
+- `annotations`：用于指定一组附加信息，比如用于描述告警详细信息的文字等，annotations的内容在告警产生时会一同作为参数发送到Alertmanager。可以提供告警摘要和详细信息。
+
+## 2. 在prometheus.yml里引用
+在prometheus.yml中加上`rule_files`和`alerting`:
+
+```yml
+global:
+  [ evaluation_interval: <duration> | default = 1m ]
+
+rule_files:
+  [ - <filepath_glob> ... ]
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - "localhost:9093"   # Alertmanager 地址
+
+```
--- a/src/alert/alertmanager/config/rule_files/example_rules.yml
+++ b/src/alert/alertmanager/config/rule_files/example_rules.yml
@ -0,0 +1,37 @@
+groups:
+  - name: example-rules
+    interval: 30s  # 每30秒评估一次
+    rules:
+      - alert: InstanceDown
+        expr: up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "实例 {{ $labels.instance }} 已宕机"
+          description: "{{ $labels.instance }} 在 {{ $labels.job }} 中无响应超过 1 分钟。"
+
+      - alert: HighCpuUsage
+        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CPU 使用率过高"
+          description: "实例 {{ $labels.instance }} CPU 使用率超过 80% 持续 5 分钟。"
+      - alert: HighMemoryUsage
+        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "内存使用率过高"
+          description: "实例 {{ $labels.instance }} 内存使用率超过 80% 持续 5 分钟。"
+      - alert: DiskSpaceLow
+        expr: (node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{fstype!~"tmpfs|overlay"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} * 100 > 90
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "磁盘空间不足"
+          description: "实例 {{ $labels.instance }} 磁盘空间不足超过 90% 持续 10 分钟。"
--- a/src/alert/tests/data/alertmanager/alertmanager.yml
+++ b/src/alert/tests/data/alertmanager/alertmanager.yml
@ -0,0 +1,19 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname', 'instance']   # 分组：相同 alertname + instance 的告警合并
+  group_wait: 30s        # 第一个告警后，等 30s 看是否有同组告警一起发
+  group_interval: 5m     # 同组告警变化后，至少 5 分钟再发一次
+  repeat_interval: 3h    # 相同告警，3 小时重复提醒一次
+  receiver: 'null'
+
+receivers:
+  - name: 'null'
+
+inhibit_rules:
+  - source_match:
+      severity: 'critical'     # critical 告警存在时
+    target_match:
+      severity: 'warning'      # 抑制相同 instance 的 warning 告警
+    equal: ['instance']
--- a/src/alert/tests/data/alertmanager/nflog
+++ b/src/alert/tests/data/alertmanager/nflog
--- a/src/alert/tests/data/alertmanager/silences
+++ b/src/alert/tests/data/alertmanager/silences
--- a/src/alert/tests/data/etc/alertmanager.alert.argus.com
+++ b/src/alert/tests/data/etc/alertmanager.alert.argus.com
@ -0,0 +1 @@
+172.18.0.2
--- a/src/alert/tests/docker-compose.yml
+++ b/src/alert/tests/docker-compose.yml
@ -0,0 +1,37 @@
+version: '3.8'
+services:
+  alertmanager:
+    build:
+      context: ../../../   # three directories above this compose file
+      dockerfile: src/alert/alertmanager/build/Dockerfile
+      args:
+        ARGUS_UID: ${ARGUS_UID:-2133}
+        ARGUS_GID: ${ARGUS_GID:-2015}
+        USE_INTRANET: ${USE_INTRANET:-false}
+    image: argus-alertmanager:latest
+    container_name: argus-alertmanager
+    environment:
+      - ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
+      - ARGUS_UID=${ARGUS_UID:-2133}
+      - ARGUS_GID=${ARGUS_GID:-2015}
+    ports:
+      - "${ARGUS_PORT:-9093}:9093"   # host port defaults to 9093; override with ARGUS_PORT
+    volumes:   # bind mounts; the host side defaults to ./data, override with DATA_ROOT
+      - ${DATA_ROOT:-./data}/alertmanager:/private/argus/alert/alertmanager
+      - ${DATA_ROOT:-./data}/etc:/private/argus/etc
+    networks:
+      - argus-network
+    restart: unless-stopped
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
+networks:
+  argus-network:
+    driver: bridge
+    name: argus-network
+
+volumes:
+  alertmanager_data:   # NOTE(review): no service in this file mounts this named volume
+    driver: local
--- a/src/alert/tests/scripts/01_bootstrap.sh
+++ b/src/alert/tests/scripts/01_bootstrap.sh
@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Create the host-side private/ tree for the alert tests and chown it to the build user.
+set -euo pipefail
+root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
+project_root="$(cd "$root/../../.." && pwd)"
+# load_build_user is expected to come from the sourced helper; ARGUS_BUILD_UID/GID are read below.
+source "$project_root/scripts/common/build_user.sh"
+load_build_user
+
+private_dirs=("$root/private/argus/alert/alertmanager" "$root/private/argus/etc")
+echo "[INFO] Creating private directory structure for supervisor-based containers..."
+for data_dir in "${private_dirs[@]}"; do mkdir -p "$data_dir"; done
+
+echo "[INFO] Setting permissions for data directories..."
+owner="${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"
+for data_dir in "${private_dirs[@]}"; do
+  chown -R "$owner" "$data_dir" 2>/dev/null || true   # best-effort: failures are ignored
+done
+echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
--- a/src/alert/tests/scripts/02_up.sh
+++ b/src/alert/tests/scripts/02_up.sh
@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# Start the alert test stack; prefers the `docker compose` plugin, falls back to docker-compose.
+set -euo pipefail
+cd "$(dirname "$0")/.."
+compose_cmd=(docker compose)
+"${compose_cmd[@]}" version >/dev/null 2>&1 || compose_cmd=(docker-compose)
+command -v "${compose_cmd[0]}" >/dev/null 2>&1 || { echo "需要 Docker Compose，请安装后重试" >&2; exit 1; }
+"${compose_cmd[@]}" -p alert-mvp up -d --remove-orphans
+# Report the published host port: docker-compose.yml maps ${ARGUS_PORT:-9093} to container port 9093.
+echo "[OK] 服务已启动：Alertmanager http://localhost:${ARGUS_PORT:-9093}"
--- a/src/alert/tests/scripts/03_alertmanager_add_alert.sh
+++ b/src/alert/tests/scripts/03_alertmanager_add_alert.sh
@ -0,0 +1,106 @@
+#!/bin/bash
+set -euo pipefail
+
+# ==========================================================
+# Alertmanager 测试脚本
+# ==========================================================
+
+# Endpoint and wait settings; each can be overridden from the environment.
+ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:${ARGUS_PORT:-9093}}"
+TEST_ALERT_NAME_CRITICAL="NodeDown"
+TEST_ALERT_NAME_WARNING="HighCPU"
+TMP_LOG="${TMP_LOG:-/tmp/test-alertmanager.log}"   # receives the POST response body
+am_wait_attempts="${AM_WAIT_ATTEMPTS:-30}"   # readiness polls before giving up
+am_wait_interval="${AM_WAIT_INTERVAL:-2}"    # seconds between polls
+
+# ANSI colour escapes, expanded by `echo -e`.
+GREEN="\033[1;32m"
+RED="\033[1;31m"
+YELLOW="\033[1;33m"
+RESET="\033[0m"
+
+# ==========================================================
+# 函数定义
+# ==========================================================
+
+wait_for_alertmanager() {
+  # Polls /api/v2/status; returns 0 on the first success, 1 once am_wait_attempts polls failed.
+  local try
+  echo "[INFO] 等待 Alertmanager 启动中..."
+  for (( try = 1; try <= am_wait_attempts; try++ )); do
+    curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1 && {
+      echo -e "${GREEN}[OK] Alertmanager 已就绪 (attempt=${try}/${am_wait_attempts})${RESET}"
+      return 0
+    }
+    echo "[..] Alertmanager 尚未就绪 (${try}/${am_wait_attempts})"
+    sleep "${am_wait_interval}"
+  done
+  echo -e "${RED}[ERROR] Alertmanager 在 ${am_wait_attempts} 次尝试后仍未就绪${RESET}"
+  return 1
+}
+
+log_step() {
+  printf '%b\n' "${YELLOW}==== $1 ====${RESET}"   # %b expands escapes the way `echo -e` does
+}
+
+# ==========================================================
+# 主流程
+# ==========================================================
+
+log_step "测试 Alertmanager 开始"
+echo "[INFO] Alertmanager 地址: $ALERTMANAGER_URL"
+
+# Step 1: wait until Alertmanager answers on its status endpoint.
+wait_for_alertmanager
+
+#######################################
+# POST one test alert for instance node-1 to the v2 alerts API.
+# curl runs as the `if` condition so that a failure reaches the error branch;
+# as a bare command under `set -e` it would abort the script before any
+# later `$?` check could run.
+# Globals:   ALERTMANAGER_URL, TMP_LOG, GREEN, RED, RESET (read)
+# Arguments: $1 - alertname label
+#            $2 - severity label, also used in the progress messages
+#            $3 - summary annotation
+# Outputs:   progress on stdout; on failure the message and any response
+#            body on stderr
+# Returns:   0 when curl succeeded; exits the script with 1 otherwise
+#######################################
+send_test_alert() {
+  local alert_name="$1"
+  local severity="$2"
+  local summary="$3"
+  local payload
+  payload='[
+    {
+      "labels": {
+        "alertname": "'"${alert_name}"'",
+        "instance": "node-1",
+        "severity": "'"${severity}"'"
+      },
+      "annotations": {
+        "summary": "'"${summary}"'"
+      }
+    }
+  ]'
+
+  echo "[INFO] 发送${severity}测试告警..."
+  # Start from an empty response file so a body left by an earlier run is never shown.
+  : >"$TMP_LOG"
+  if curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
+    -H "Content-Type: application/json" \
+    -d "$payload" \
+    -o "$TMP_LOG"; then
+    echo -e "${GREEN}[OK] 已成功发送${severity}测试告警${RESET}"
+  else
+    echo -e "${RED}[ERROR] ${severity}告警发送失败！${RESET}" >&2
+    cat "$TMP_LOG" >&2
+    exit 1
+  fi
+}
+
+# Step 2: critical alert.
+send_test_alert "${TEST_ALERT_NAME_CRITICAL}" "critical" "节点 node-1 宕机"
+
+# Step 3: warning alert for the same instance.
+send_test_alert "${TEST_ALERT_NAME_WARNING}" "warning" "节点 node-1 CPU 使用率过高"
--- a/src/alert/tests/scripts/04_query_alerts.sh
+++ b/src/alert/tests/scripts/04_query_alerts.sh
@ -0,0 +1,71 @@
+#!/bin/bash
+set -euo pipefail
+
+# ==========================================================
+# Alertmanager 测试脚本（含启动等待）
+# ==========================================================
+
+# Endpoint and wait settings; each can be overridden from the environment.
+ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:${ARGUS_PORT:-9093}}"
+TEST_ALERT_NAME_CRITICAL="NodeDown"
+TEST_ALERT_NAME_WARNING="HighCPU"
+TMP_LOG="${TMP_LOG:-/tmp/test-alertmanager.log}"
+am_wait_attempts="${AM_WAIT_ATTEMPTS:-30}"   # readiness polls before giving up
+am_wait_interval="${AM_WAIT_INTERVAL:-2}"    # seconds between polls
+
+# ANSI colour escapes, expanded by `echo -e`.
+GREEN="\033[1;32m"
+RED="\033[1;31m"
+YELLOW="\033[1;33m"
+RESET="\033[0m"
+
+# ==========================================================
+# 函数定义
+# ==========================================================
+
+wait_for_alertmanager() {
+  # Readiness probe: GET the status endpoint up to am_wait_attempts times; 0 = ready, 1 = gave up.
+  local n status_url="${ALERTMANAGER_URL}/api/v2/status"
+  echo "[INFO] 等待 Alertmanager 启动中..."
+  for (( n = 1; n <= am_wait_attempts; n++ )); do
+    if curl -fsS "$status_url" >/dev/null 2>&1; then
+      echo -e "${GREEN}[OK] Alertmanager 已就绪 (attempt=${n}/${am_wait_attempts})${RESET}"
+      return 0
+    fi
+    echo "[..] Alertmanager 尚未就绪 (${n}/${am_wait_attempts})"
+    sleep "${am_wait_interval}"
+  done
+  echo -e "${RED}[ERROR] Alertmanager 在 ${am_wait_attempts} 次尝试后仍未就绪${RESET}"
+  return 1
+}
+
+log_step() {
+  local title="$1"; echo -e "${YELLOW}==== ${title} ====${RESET}"   # yellow section banner
+}
+
+# ==========================================================
+# 主流程
+# ==========================================================
+
+log_step "查询 Alertmanager 当前告警列表开始"
+echo "[INFO] Alertmanager 地址: $ALERTMANAGER_URL"
+# Step 1: wait until Alertmanager answers on its status endpoint.
+wait_for_alertmanager
+
+# Step 2: fetch the alert list once; the same snapshot is printed and then checked.
+echo "[INFO] 查询当前告警..."
+sleep 1
+alerts_json="$(curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts")" || alerts_json=""
+jq '.' <<<"$alerts_json" || {
+  echo -e "${RED}[WARN] 无法解析返回 JSON，请检查 jq 是否安装${RESET}"
+  printf '%s\n' "$alerts_json"
+}
+
+# Step 3: the critical alert must be listed; exit non-zero otherwise so callers see the failure.
+if grep -qF -- "${TEST_ALERT_NAME_CRITICAL}" <<<"$alerts_json"; then
+  echo -e "${GREEN}✅ 测试通过：Alertmanager 已成功接收告警 ${TEST_ALERT_NAME_CRITICAL}${RESET}"
+else
+  echo -e "${RED}❌ 测试失败：未检测到告警 ${TEST_ALERT_NAME_CRITICAL}${RESET}"
+  exit 1
+fi
+log_step "测试结束"
--- a/src/alert/tests/scripts/05_down.sh
+++ b/src/alert/tests/scripts/05_down.sh
@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Stop the alert test stack and remove the private/ tree next to the scripts directory.
+set -euo pipefail
+# Resolve the tests directory once, as an absolute path. Repeating a relative
+# `cd "$(dirname "$0")/.."` after the first cd would climb one level too far
+# (or fail) whenever $0 is a relative path such as `./05_down.sh`.
+tests_dir="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$tests_dir"
+compose_cmd=(docker compose)
+"${compose_cmd[@]}" version >/dev/null 2>&1 || compose_cmd=(docker-compose)
+command -v "${compose_cmd[0]}" >/dev/null 2>&1 || { echo "需要 Docker Compose，请安装后重试" >&2; exit 1; }
+"${compose_cmd[@]}" -p alert-mvp down
+echo "[OK] 已停止所有容器"
+
+echo "[INFO] 清理private目录内容..."
+if [[ -d "$tests_dir/private" ]]; then
+  rm -rf -- "${tests_dir:?}/private"
+  echo "[OK] 已清理private目录"
+else
+  echo "[INFO] private目录不存在，无需清理"
+fi
--- a/src/alert/tests/scripts/e2e_test.sh
+++ b/src/alert/tests/scripts/e2e_test.sh
@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Banner, followed by one empty line.
+printf '%s\n' \
+  "=======================================" \
+  "ARGUS Alert System End-to-End Test" \
+  "=======================================" \
+  ""
+test_start_time=$(date +%s)   # epoch seconds; the closing summary subtracts this
+
+# Block until Alertmanager's status endpoint on localhost:9093 answers.
+# Polls every 5 seconds, at most SERVICE_WAIT_ATTEMPTS times (default 120).
+# Returns 0 when the endpoint answered, 1 when every attempt failed.
+wait_for_services() {
+    echo "[INFO] Waiting for all services to be ready..."
+    local limit=${SERVICE_WAIT_ATTEMPTS:-120}
+    local tries=1
+    while [ "$tries" -le "$limit" ]; do
+        curl -fs http://localhost:9093/api/v2/status >/dev/null 2>&1 && {
+            echo "[OK] All services are ready!"
+            return 0
+        }
+        echo "    Waiting for services... ($tries/$limit)"
+        sleep 5
+        tries=$((tries + 1))
+    done
+    echo "[ERROR] Services not ready after $limit attempts"
+    return 1
+}
+
+# Print an empty line, the "Step <id>: <title>" heading and a separator rule.
+show_step() {
+    local id="$1" title="$2"
+    printf '\n%s\n%s\n' "🔄 Step ${id}: ${title}" \
+        "----------------------------------------"
+}
+
+# Report the previous command's outcome: reads its exit status from $?, prints
+# "<name> - SUCCESS" for 0, otherwise "<name> - FAILED" and exits 1. $1 = step name.
+verify_step() {
+    if [ $? -eq 0 ]; then
+        echo "✅ $1 - SUCCESS"
+    else
+        echo "❌ $1 - FAILED"
+        exit 1
+    fi
+}
+
+# Run "$2.." as step "$1" and report it through verify_step. A bare failing
+# command would abort under `set -e` before FAILED could be printed; on the
+# left of `||` the failure is tolerated and verify_step finds its status in $?.
+run_step() {
+    local step_name="$1"
+    shift
+    "$@" || verify_step "$step_name"
+    verify_step "$step_name"
+}
+
+# The step scripts are addressed relative to the tests directory.
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+show_step "1" "Bootstrap - Initialize environment"
+run_step "Bootstrap" ./scripts/01_bootstrap.sh
+
+show_step "2" "Startup - Start all services"
+run_step "Service startup" ./scripts/02_up.sh
+wait_for_services || exit 1
+
+show_step "3" "Add alerts - Send test alerts to Alertmanager"
+run_step "Send test alerts" ./scripts/03_alertmanager_add_alert.sh
+
+show_step "4" "Verify data - Query Alertmanager"
+run_step "Data verification" ./scripts/04_query_alerts.sh
+
+show_step "Health" "Check service health"
+echo "[INFO] Checking service health..."
+if curl -fs "http://localhost:9093/api/v2/status" >/dev/null 2>&1; then
+    am_status="available"
+    echo "✅ Alertmanager status: $am_status"
+else
+    am_status="unavailable"
+    echo "⚠️  Alertmanager status: $am_status"
+fi
+# A verify_step placed directly after the if/else only ever saw echo's status 0.
+run_step "Service health check" test "$am_status" = "available"
+
+show_step "5" "Cleanup - Stop all services"
+run_step "Service cleanup" ./scripts/05_down.sh
+
+# Elapsed wall-clock seconds since test_start_time, followed by the closing summary.
+total_time=$(( $(date +%s) - test_start_time ))
+cat <<EOF
+
+=======================================
+🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!
+=======================================
+📊 Test Summary:
+   • Total time: ${total_time}s
+   • Alertmanager status: $am_status
+   • All services started and stopped successfully
+
+✅ The ARGUS Alert system is working correctly!
+
+EOF
--- a/src/bind/build/Dockerfile
+++ b/src/bind/build/Dockerfile
@ -6,6 +6,11 @@ ENV TZ=Asia/Shanghai

 # 设置构建参数
 ARG USE_INTRANET=false
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

 # 配置内网 apt 源 (如果指定了内网选项)
 RUN if [ "$USE_INTRANET" = "true" ]; then \
@ -29,6 +34,24 @@ RUN apt-get update && \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

+# 调整 bind 用户与用户组 ID 以匹配宿主机配置
+RUN set -eux; \
+    current_gid="$(getent group bind | awk -F: '{print $3}')"; \
+    if [ -z "$current_gid" ]; then \
+        groupadd -g "${ARGUS_BUILD_GID}" bind; \
+    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
+        groupmod -g "${ARGUS_BUILD_GID}" bind; \
+    fi; \
+    if id bind >/dev/null 2>&1; then \
+        current_uid="$(id -u bind)"; \
+        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
+            usermod -u "${ARGUS_BUILD_UID}" bind; \
+        fi; \
+    else \
+        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" bind; \
+    fi; \
+    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /var/cache/bind /var/lib/bind
+
 # 配置部署时使用的apt源
 RUN if [ "$USE_INTRANET" = "true" ]; then \
 	echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
--- a/src/bind/build/argus_dns_sync.sh
+++ b/src/bind/build/argus_dns_sync.sh
@ -9,6 +9,9 @@ SLEEP_SECONDS=10
 RELOAD_SCRIPT="/usr/local/bin/reload-bind9.sh"   # 这里放你已有脚本的路径

 mkdir -p "$(dirname "$LOCKFILE")" "$BACKUP_DIR"
+BACKUP_UID="${ARGUS_BUILD_UID:-2133}"
+BACKUP_GID="${ARGUS_BUILD_GID:-2015}"
+chown -R "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR" 2>/dev/null || true

 is_ipv4() {
  local ip="$1"
@ -33,6 +36,7 @@ upsert_record() {
  local changed=0

  cp -a "$ZONE_DB" "$BACKUP_DIR/db.argus.com.$ts.bak"
+  chown "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR/db.argus.com.$ts.bak" 2>/dev/null || true

  local cur_ip
  cur_ip="$(get_current_ip "$name" || true)"
@ -61,7 +65,10 @@ upsert_record() {
    echo "[SKIP] ${name} unchanged (${new_ip})"
  fi

-  return $changed
+  if [[ $changed -eq 1 ]]; then
+    return 0
+  fi
+  return 1
 }

 while true; do
@ -70,7 +77,7 @@ while true; do
    shopt -s nullglob
    NEED_RELOAD=0

-    for f in "$WATCH_DIR"/*.argus.com; do
+  for f in "$WATCH_DIR"/*.argus.com; do
      base="$(basename "$f")"
      name="${base%.argus.com}"
      ip="$(grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' "$f" | tail -n1 || true)"
@ -97,4 +104,3 @@ while true; do

  sleep "$SLEEP_SECONDS"
 done
-
--- a/src/bind/build/startup.sh
+++ b/src/bind/build/startup.sh
@ -6,6 +6,8 @@ chmod 777 /private 2>/dev/null || true
 # Create persistent directories for BIND9 configs and DNS sync
 mkdir -p /private/argus/bind
 mkdir -p /private/argus/etc
+chown bind:bind /private/argus 2>/dev/null || true
+chown -R bind:bind /private/argus/bind /private/argus/etc

 # Copy configuration files to persistent storage if they don't exist
 if [ ! -f /private/argus/bind/named.conf.local ]; then
--- a/src/bind/tests/docker-compose.yml
+++ b/src/bind/tests/docker-compose.yml
@ -3,8 +3,8 @@ services:
    image: argus-bind9:latest
    container_name: argus-bind9-test
    ports:
-      - "53:53/tcp"
-      - "53:53/udp"
+      - "${HOST_DNS_PORT:-1053}:53/tcp"
+      - "${HOST_DNS_PORT:-1053}:53/udp"
    volumes:
      - ./private:/private
    restart: unless-stopped
--- a/src/bind/tests/scripts/00_e2e_test.sh
+++ b/src/bind/tests/scripts/00_e2e_test.sh
@ -7,6 +7,9 @@
 set -e

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
+
+export HOST_DNS_PORT

 echo "=========================================="
 echo "BIND9 DNS Server End-to-End Test Suite"
--- a/src/bind/tests/scripts/01_start_container.sh
+++ b/src/bind/tests/scripts/01_start_container.sh
@ -7,13 +7,17 @@ set -e

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"
+HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
+
+export HOST_DNS_PORT

 cd "$TEST_DIR"

 echo "Starting BIND9 test container..."

 # Ensure private directory exists with proper permissions
-mkdir -p private
+mkdir -p private/argus/bind
+mkdir -p private/argus/etc
 chmod 777 private

 # Start the container
@ -35,4 +39,4 @@ fi

 echo ""
 echo "BIND9 test environment is ready!"
-echo "DNS server listening on localhost:53"
+echo "DNS server listening on localhost:${HOST_DNS_PORT}"
--- a/src/bind/tests/scripts/02_dig_test.sh
+++ b/src/bind/tests/scripts/02_dig_test.sh
@ -5,7 +5,10 @@

 set -e

+HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
+
 echo "Testing DNS resolution with dig..."
+echo "Using DNS server localhost:${HOST_DNS_PORT}"

 # Function to test DNS query
 test_dns_query() {
@ -19,7 +22,7 @@ test_dns_query() {
    echo "Expected IP: $expected_ip"

    # Perform dig query
-    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "QUERY_FAILED" ]; then
        echo "✗ DNS query failed"
--- a/src/bind/tests/scripts/03.5_dns_sync_test.sh
+++ b/src/bind/tests/scripts/03.5_dns_sync_test.sh
@ -6,10 +6,13 @@

 set -e

+HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"

 echo "=== DNS Auto-Sync Functionality Test ==="
+echo "Using DNS server localhost:${HOST_DNS_PORT}"

 # Check if container is running
 if ! docker compose ps | grep -q "Up"; then
@ -36,7 +39,7 @@ test_dns_query() {
    # Wait a moment for DNS cache
    sleep 2

-    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "$expected_ip" ]; then
        echo "✓ $result"
@ -90,7 +93,7 @@ echo ""
 echo "Step 2: Testing initial DNS configuration..."

 # Get current IP for web.argus.com (may have been changed by previous tests)
-current_web_ip=$(dig @localhost web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
+current_web_ip=$(dig @localhost -p "$HOST_DNS_PORT" web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
 echo "Current web.argus.com IP: $current_web_ip"

 # Test that DNS is working (regardless of specific IP)
@ -185,7 +188,7 @@ docker compose exec bind9 bash -c 'echo "this is not an IP address" > /private/a
 wait_for_sync

 # Verify invalid record was not added (should fail to resolve)
-result=$(dig @localhost invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
+result=$(dig @localhost -p "$HOST_DNS_PORT" invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
 if [ "$result" = "NO_RESULT" ] || [ -z "$result" ]; then
    echo "✓ Invalid IP correctly ignored"
 else
--- a/src/bind/tests/scripts/03_reload_test.sh
+++ b/src/bind/tests/scripts/03_reload_test.sh
@ -5,10 +5,13 @@

 set -e

+HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"

 echo "=== DNS Configuration Reload Test ==="
+echo "Using DNS server localhost:${HOST_DNS_PORT}"

 # Check if container is running
 if ! docker compose ps | grep -q "Up"; then
@ -32,7 +35,7 @@ test_dns_query() {
    echo "Testing: $description"
    echo "Query: $hostname.argus.com -> Expected: $expected_ip"

-    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "$expected_ip" ]; then
        echo "✓ $result"
--- a/src/bind/tests/scripts/04_persistence_test.sh
+++ b/src/bind/tests/scripts/04_persistence_test.sh
@ -5,10 +5,13 @@

 set -e

+HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"

 echo "=== Configuration Persistence Test ==="
+echo "Using DNS server localhost:${HOST_DNS_PORT}"

 # Check if dig is available
 if ! command -v dig &> /dev/null; then
@ -25,7 +28,7 @@ test_dns_query() {
    echo "Testing: $description"
    echo "Query: $hostname.argus.com -> Expected: $expected_ip"

-    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "$expected_ip" ]; then
        echo "✓ $result"
--- a/src/bind/tests/scripts/05_cleanup.sh
+++ b/src/bind/tests/scripts/05_cleanup.sh
@ -7,6 +7,9 @@ set -e

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"
+HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
+
+export HOST_DNS_PORT

 # Parse command line arguments
 FULL_CLEANUP=true
--- a/src/log/elasticsearch/build/Dockerfile
+++ b/src/log/elasticsearch/build/Dockerfile
@ -3,10 +3,29 @@ FROM docker.elastic.co/elasticsearch/elasticsearch:8.13.4
 # 切换到 root 用户进行系统级安装
 USER root

-# 修改elasticsearch用户的UID和GID
-RUN usermod -u 2133 elasticsearch && \
-    groupmod -g 2015 elasticsearch && \
-    chown -R elasticsearch:elasticsearch /usr/share/elasticsearch
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
+
+# 调整 elasticsearch 用户与用户组 ID 以匹配宿主机配置
+RUN set -eux; \
+    current_gid="$(getent group elasticsearch | awk -F: '{print $3}')"; \
+    if [ -z "$current_gid" ]; then \
+        groupadd -g "${ARGUS_BUILD_GID}" elasticsearch; \
+    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
+        groupmod -g "${ARGUS_BUILD_GID}" elasticsearch; \
+    fi; \
+    if id elasticsearch >/dev/null 2>&1; then \
+        current_uid="$(id -u elasticsearch)"; \
+        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
+            usermod -u "${ARGUS_BUILD_UID}" elasticsearch; \
+        fi; \
+    else \
+        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" elasticsearch; \
+    fi; \
+    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/elasticsearch

 # 设置构建参数
 ARG USE_INTRANET=false
--- a/src/log/fluent-bit/build/etc/parsers.conf
+++ b/src/log/fluent-bit/build/etc/parsers.conf
@ -25,3 +25,5 @@
    Regex  ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
    Time_Key    timestamp
    Time_Format %Y-%m-%d %H:%M:%S
+    Time_Offset +0800
+    Time_Keep On
--- a/src/log/kibana/build/Dockerfile
+++ b/src/log/kibana/build/Dockerfile
@ -3,10 +3,29 @@ FROM docker.elastic.co/kibana/kibana:8.13.4
 # 切换到 root 用户进行系统级安装
 USER root

-# 修改kibana用户的UID和GID
-RUN usermod -u 2133 kibana && \
-    groupmod -g 2015 kibana && \
-    chown -R kibana:kibana /usr/share/kibana
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
+
+# 调整 kibana 用户与用户组 ID 以匹配宿主机配置
+RUN set -eux; \
+    current_gid="$(getent group kibana | awk -F: '{print $3}')"; \
+    if [ -z "$current_gid" ]; then \
+        groupadd -g "${ARGUS_BUILD_GID}" kibana; \
+    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
+        groupmod -g "${ARGUS_BUILD_GID}" kibana; \
+    fi; \
+    if id kibana >/dev/null 2>&1; then \
+        current_uid="$(id -u kibana)"; \
+        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
+            usermod -u "${ARGUS_BUILD_UID}" kibana; \
+        fi; \
+    else \
+        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" kibana; \
+    fi; \
+    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/kibana

 # 设置构建参数
 ARG USE_INTRANET=false
--- a/src/log/tests/docker-compose.yml
+++ b/src/log/tests/docker-compose.yml
@ -17,6 +17,7 @@ services:
      interval: 10s
      timeout: 5s
      retries: 30
+    restart: always

  kibana:
    build:
@ -73,13 +74,11 @@ services:
      interval: 15s
      timeout: 10s
      retries: 30
+    restart: always

  bind9:
      image: argus-bind9:latest
-      ports:
-        - "53:53/tcp"
-        - "53:53/udp"
      volumes:
        - ./private/argus:/private/argus/
-      restart: unless-stopped
+      restart: always

--- a/src/log/tests/scripts/01_bootstrap.sh
+++ b/src/log/tests/scripts/01_bootstrap.sh
@ -1,6 +1,10 @@
 #!/usr/bin/env bash
 set -euo pipefail
 root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
+project_root="$(cd "$root/../../.." && pwd)"
+
+source "$project_root/scripts/common/build_user.sh"
+load_build_user

 # 创建新的private目录结构 (基于argus目录结构)
 echo "[INFO] Creating private directory structure for supervisor-based containers..."
@ -11,9 +15,9 @@ mkdir -p "$root/private/argus/etc/"

 # 设置数据目录权限（ES 和 Kibana 容器都使用 UID 1000）
 echo "[INFO] Setting permissions for data directories..."
-sudo chown -R 2133:2015 "$root/private/argus/log/elasticsearch" 2>/dev/null || true
-sudo chown -R 2133:2015 "$root/private/argus/log/kibana" 2>/dev/null || true
-sudo chown -R 2133:2015 "$root/private/argus/etc" 2>/dev/null || true
+chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/elasticsearch" 2>/dev/null || true
+chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/kibana" 2>/dev/null || true
+chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true

 echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"

--- a/src/log/tests/scripts/03_send_test_host01.sh
+++ b/src/log/tests/scripts/03_send_test_host01.sh
@ -4,8 +4,22 @@ set -euo pipefail
 # 获取fluent-bit-host01容器名称
 container_name="logging-mvp-fluent-bit-host01-1"

-# 检查容器是否存在并运行
-if ! docker ps | grep -q "$container_name"; then
+wait_for_container() {
+    # Poll `docker ps` until a running container is named exactly $1
+    # (fixed-string, whole-line match on the printed names).
+    # Up to 30 polls, 5 seconds apart.
+    # Returns 0 as soon as the name is listed, 1 when all polls are used up.
+    local target="$1" max_polls=30 pause=5 poll=1
+    while (( poll <= max_polls )); do
+        docker ps --format '{{.Names}}' | grep -Fx "$target" >/dev/null && return 0
+        echo "[INFO] 等待容器 $target 启动中... ($poll/$max_polls)"
+        sleep "$pause"
+        poll=$((poll + 1))
+    done
+    return 1
+}
+
+if ! wait_for_container "$container_name"; then
    echo "[ERROR] Fluent Bit容器 $container_name 未运行"
    exit 1
 fi
--- a/src/log/tests/scripts/03_send_test_host02.sh
+++ b/src/log/tests/scripts/03_send_test_host02.sh
@ -4,8 +4,22 @@ set -euo pipefail
 # 获取fluent-bit-host02容器名称
 container_name="logging-mvp-fluent-bit-host02-1"

-# 检查容器是否存在并运行
-if ! docker ps | grep -q "$container_name"; then
+wait_for_container() {
+    # Wait for a running container whose name equals $1 exactly; polls `docker ps`
+    # at most 30 times with 5 seconds between polls. Returns 0 when found, else 1.
+    local wanted="$1" total=30 gap=5 round
+    for ((round = 1; round <= total; round++)); do
+        if ! docker ps --format '{{.Names}}' | grep -Fx "$wanted" >/dev/null; then
+            echo "[INFO] 等待容器 $wanted 启动中... ($round/$total)"
+            sleep "$gap"
+            continue
+        fi
+        return 0
+    done
+    return 1
+}
+
+if ! wait_for_container "$container_name"; then
    echo "[ERROR] Fluent Bit容器 $container_name 未运行"
    exit 1
 fi
--- a/src/log/tests/scripts/04_query_es.sh
+++ b/src/log/tests/scripts/04_query_es.sh
@ -1,7 +1,42 @@
 #!/usr/bin/env bash
 set -euo pipefail
+
+# ES endpoint and wait strategy
 ES="${ES:-http://localhost:9200}"
+es_wait_attempts="${ES_WAIT_ATTEMPTS:-60}"      # total attempts to wait for ES
+es_wait_interval="${ES_WAIT_INTERVAL:-2}"        # seconds between attempts
+
 echo "[i] 查询 ES 端点：$ES"
+
+wait_for_es() {
+  # Ask for cluster health with wait_for_status=yellow; any curl failure counts
+  # as "not ready" and is retried, up to es_wait_attempts polls.
+  local try health_url="$ES/_cluster/health?wait_for_status=yellow&timeout=1s"
+  for (( try = 1; try <= es_wait_attempts; try++ )); do
+    curl -fsS "$health_url" >/dev/null 2>&1 && {
+      echo "[ok] Elasticsearch 已就绪 (attempt=${try}/${es_wait_attempts})"
+      return 0
+    }
+    echo "[..] 等待 Elasticsearch 可用中 (${try}/${es_wait_attempts})"
+    sleep "${es_wait_interval}"
+  done
+  echo "[err] Elasticsearch 在 ${es_wait_attempts} 次尝试后仍不可用"
+  return 1
+}
+
+safe_count() {
+  # Print the document count for index pattern $1; print 0 when the request fails or the
+  # reply has no "count" field. `sed -n ... p` emits only a matched count, so the '{}' fallback never leaks.
+  local pattern="$1" json
+  json=$(curl -fsS "$ES/${pattern}/_count?ignore_unavailable=true&allow_no_indices=true" 2>/dev/null || echo '{}')
+  sed -nE 's/.*"count":([0-9]+).*/\1/p' <<<"$json" | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
+}
+
+wait_for_es
+
+# 列出相关索引（可能为空，允许）
 curl -fsS "$ES/_cat/indices?v" | egrep 'train-|infer-|logstash' || true
-printf "train-* 计数："; curl -fsS "$ES/train-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
-printf "infer-* 计数："; curl -fsS "$ES/infer-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
+
+# 打印计数，缺失索引按 0 处理
+printf "train-* 计数："; safe_count "train-*"; echo
+printf "infer-* 计数："; safe_count "infer-*"; echo
--- a/src/log/tests/scripts/e2e_test.sh
+++ b/src/log/tests/scripts/e2e_test.sh
@ -19,7 +19,7 @@ get_log_count() {
 # 函数：等待服务就绪
 wait_for_services() {
    echo "[INFO] Waiting for all services to be ready..."
-    local max_attempts=60
+    local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
    local attempt=1

    while [ $attempt -le $max_attempts ]; do
--- a/src/master/Dockerfile
+++ b/src/master/Dockerfile
@ -0,0 +1,81 @@
+FROM python:3.11-slim
+
+SHELL ["/bin/bash", "-c"]
+
+ARG PIP_INDEX_URL=
+ARG USE_OFFLINE=0
+ARG USE_INTRANET=false
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
+
+ENV PIP_NO_CACHE_DIR=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app
+
+USER root
+
+WORKDIR /app
+
+COPY ./src/master/requirements.txt ./requirements.txt
+COPY ./src/master/offline_wheels/ /opt/offline_wheels/
+
+RUN set -euxo pipefail \
+    && if [[ "$USE_OFFLINE" == "1" ]]; then \
+         python -m pip install --no-index --find-links /opt/offline_wheels pip && \
+         python -m pip install --no-index --find-links /opt/offline_wheels -r requirements.txt; \
+       else \
+         python -m pip install --upgrade pip && \
+         if [[ -n "$PIP_INDEX_URL" ]]; then \
+              PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \
+         else \
+              python -m pip install -r requirements.txt; \
+         fi; \
+       fi
+
+# When USE_INTRANET=true, point apt at the intranet mirror first; then install supervisor, net-tools and inetutils-ping
+RUN if [[ "$USE_INTRANET" == "true" ]]; then \
+        echo "Configuring intranet apt sources" && \
+        if [[ -f /etc/apt/sources.list ]]; then cp /etc/apt/sources.list /etc/apt/sources.list.bak; fi && \
+        mkdir -p /etc/apt && \
+        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
+        rm -rf /etc/apt/sources.list.d && \
+        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
+        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
+    fi && \
+    apt-get update && \
+    apt-get install -y supervisor net-tools inetutils-ping && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# For intranet builds, switch sources.list to the apt source intended for use at runtime
+RUN if [[ "$USE_INTRANET" == "true" ]]; then \
+        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
+    fi
+
+RUN mkdir -p /var/log/supervisor
+
+RUN set -eux; \
+    if getent group argus >/dev/null; then \
+        groupmod -g "${ARGUS_BUILD_GID}" argus; \
+    else \
+        groupadd -g "${ARGUS_BUILD_GID}" argus; \
+    fi; \
+    if id argus >/dev/null 2>&1; then \
+        usermod -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" argus; \
+    else \
+        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" -s /bin/bash argus; \
+    fi
+
+COPY ./src/master/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+COPY ./src/master/build/start-master.sh /usr/local/bin/start-master.sh
+COPY ./src/master/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
+RUN chmod +x /usr/local/bin/start-master.sh /usr/local/bin/dns-monitor.sh
+
+COPY ./src/master/app ./app
+
+EXPOSE 3000
+
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
--- a/src/master/README.md
+++ b/src/master/README.md
@ -0,0 +1,186 @@
+# Argus Master 模块
+
+Argus Master 是基于 Flask + SQLite 的节点管理服务，负责：
+
+- 接收 agent 的注册与重注册请求，分配/校验节点 ID。
+- 存储节点元数据、配置、健康状态，并根据上报时间计算在线状态。
+- 输出仅包含在线节点的 `nodes.json`，供其他模块（如 metric）消费。
+- 提供查询、配置更新、统计等 REST API。
+
+## 构建与运行
+
+```bash
+cd src/master
+./scripts/build_images.sh            # 生成 argus-master:latest 镜像
+```
+
+如需离线构建，先在有网环境运行准备脚本：
+
+```bash
+cd src/master
+./scripts/prepare_offline_wheels.sh --pip-version 25.2  # 可选 --clean
+```
+
+脚本会把 `requirements.txt` 中列出的依赖以及指定版本的 pip 全部下载到 `offline_wheels/`。随后将源码目录（含该子目录）与基础镜像一并拷贝到内网，执行：
+
+```bash
+cd src/master
+./scripts/build_images.sh --offline --tag argus-master:latest
+```
+
+若内网缺少 `python:3.11-slim`，请提前在外网 `docker save` 后通过离线介质 `docker load`。
+
+本仓库提供的端到端测试会使用 `src/master/tests/docker-compose.yml` 启动示例环境：
+
+```bash
+cd src/master/tests
+./scripts/01_up_master.sh            # 构建镜像并启动容器，监听 http://localhost:31300
+```
+
+服务日志与数据默认写入 `tests/private/argus/master/`（或自定义的挂载目录）。
+
+## 运行时环境变量
+
+| 变量 | 默认值 | 说明 |
+| --- | --- | --- |
+| `DB_PATH` | `/private/argus/master/db.sqlite3` | SQLite 数据库存放路径。目录会在启动时自动创建。 |
+| `METRIC_NODES_JSON_PATH` | `/private/argus/metric/prometheus/nodes.json` | `nodes.json` 输出位置，仅包含在线节点。采用原子写入避免部分文件。 |
+| `OFFLINE_THRESHOLD_SECONDS` | `180` | 若距离最近一次上报时间超过该值，调度器会将节点标记为 `offline`。 |
+| `ONLINE_THRESHOLD_SECONDS` | `120` | 若最新上报时间距当前不超过该值，则标记为 `online`。范围处于两个阈值之间时保持原状态。 |
+| `SCHEDULER_INTERVAL_SECONDS` | `30` | 调度器检查节点状态与刷新 `nodes.json` 的周期。 |
+| `NODE_ID_PREFIX` | `A` | 新节点 ID 的前缀，实际 ID 形如 `A1`、`A2`。 |
+| `AUTH_MODE` | `disabled` | 预留的认证开关，当前固定为禁用。 |
+
+## 进程与监控
+
+镜像内通过 `supervisord` 管理进程：
+
+- `master`：执行 `/usr/local/bin/start-master.sh`，默认以 4 个 Gunicorn worker 监听 `0.0.0.0:3000`；可通过环境变量 `GUNICORN_WORKERS`、`GUNICORN_BIND`、`GUNICORN_EXTRA_ARGS` 调整。
+- `dns-monitor`：轮询 `/private/argus/etc/dns.conf`，若发现变更则调用 `/private/argus/etc/update-dns.sh`，日志输出在 `/var/log/supervisor/dns-monitor.log`。
+
+镜像构建阶段会安装 `supervisor`/`net-tools`/`inetutils-ping` 等基础工具；当以内网模式（`USE_INTRANET=true`）构建时，还会在构建结束前把 apt 源切换到运行期使用的内网镜像，方便容器内进一步运维。
+
+## 域名注册与 DNS 联动
+
+- Master 容器启动时会主动执行 `/private/argus/etc/update-dns.sh`（若存在），把自身 `/etc/resolv.conf` 指向 bind 服务提供的 DNS；随后解析 `eth0` 的 IPv4 地址并写入 `/private/argus/etc/master.argus.com`。该文件会被 bind 模块的 `argus_dns_sync.sh` 监控，用于生成 `master.argus.com` → 当前容器 IP 的 A 记录。
+- 测试与生产都需要将 bind 下发的 `update-dns.sh`、`dns.conf` 等文件挂载到 `/private/argus/etc/`。在 E2E 场景中，`tests/private/argus/etc` 会由脚本自动准备。
+- 其他模块（如 agent）在启动脚本中只需执行同一份 `update-dns.sh`，即可使用域名访问 master；若域名注册异常，agent 将无法成功上报，可据此快速定位问题。
+
+## REST API 详解
+
+基础路径：`/api/v1/master`，全部返回 JSON。
+
+### 1. `GET /nodes`
+- **用途**：获取所有节点的简要信息。
+- **响应示例**：
+  ```json
+  [
+    {"id": "A1", "name": "dev-user-inst-pod-0", "status": "online", "type": "agent", "version": "1.1.0"}
+  ]
+  ```
+
+### 2. `GET /nodes/{id}`
+- **用途**：获取节点详情（包含配置、健康、持久化时间戳等）。
+- **错误**：`404` 表示节点不存在。
+
+### 3. `POST /nodes`
+- **用途**：注册或重注册节点。
+- **请求体**（`id` 为可选字段，仅在重注册时携带）：
+  ```json
+  {
+    "id": "A1",
+    "name": "dev-user-inst-pod-0",
+    "type": "agent",
+    "version": "1.1.0",
+    "meta_data": {
+      "hostname": "dev-user-inst-pod-0",
+      "ip": "10.0.0.10",
+      "env": "dev",
+      "user": "testuser",
+      "instance": "testinst",
+      "cpu_number": 4,
+      "memory_in_bytes": 2147483648,
+      "gpu_number": 0
+    }
+  }
+  ```
+- **成功返回**：
+  - 新节点：`201 Created`，返回完整节点对象。
+  - 重注册：`200 OK`，返回更新后的节点对象。未携带 `id` 但 `name` 与已有节点相同时，同样按重注册处理并返回 `200 OK`。
+- **错误情况**：
+  - `404 Not Found`：携带的 ID 在 Master 中不存在。
+  - `500 Internal Server Error`：携带的 ID 与已有名称不匹配。
+  - `400 Bad Request`：请求体缺字段或类型不正确。
+
+### 4. `PUT /nodes/{id}/status`
+- **用途**：Agent 上报状态。Master 记录 `last_report`（服务器时间）与 `agent_last_report`（上报内时间），并更新 `health` 字段。
+- **请求体示例**：
+  ```json
+  {
+    "timestamp": "2025-09-24T03:24:59Z",
+    "health": {
+      "log-fluentbit": {"status": "healthy"},
+      "metric-node-exporter": {"status": "healthy"}
+    }
+  }
+  ```
+- **响应**：`200 OK`，返回最新节点对象。`404` 表示节点不存在。
+
+### 5. `PUT /nodes/{id}/config`
+- **用途**：局部更新节点配置与标签。
+- **请求体示例**：
+  ```json
+  {
+    "config": {"log_level": "debug"},
+    "label": ["gpu", "exp001"]
+  }
+  ```
+- **说明**：字段可任选其一；未提供的配置保持原值。更新标签会触发 `nodes.json` 重新生成。
+- **错误**：`404` 表示节点不存在；`400` 表示请求体不合法。
+
+### 6. `GET /nodes/statistics`
+- **用途**：统计节点总数及按状态分布。
+- **响应示例**：
+  ```json
+  {
+    "total": 2,
+    "status_statistics": [
+      {"status": "online", "count": 1},
+      {"status": "offline", "count": 1}
+    ]
+  }
+  ```
+
+### 7. 健康探针
+- `GET /healthz`：进程存活检查。
+- `GET /readyz`：数据库可用性检查（会尝试访问 `DB_PATH`）。
+
+
+如需验证离线镜像，可使用自动化脚本：
+```bash
+cd src/master/tests
+./scripts/00_e2e_test_offline.sh    # 构建离线镜像并执行完整 E2E
+```
+
+## 端到端测试场景
+
+执行 `src/master/tests/scripts/00_e2e_test.sh` 会串联以下用例（脚本 01–10）：
+
+1. **01_up_master**：构建镜像、启动容器、初始化目录与卷。 
+2. **02_verify_ready_and_nodes_json**：轮询 `/readyz`，校验初始 `nodes.json` 为 `[]`。
+3. **03_register_via_curl**：模拟 agent 注册，保存返回的节点 ID，并确认节点出现在列表接口中。
+4. **04_reregister_and_error_cases**：覆盖重注册成功、携带未知 ID 的 `404`、ID/名称不匹配触发 `500` 等场景。
+5. **05_status_report_via_curl**：上报健康信息并验证状态自动从 `initialized`→`online`→`offline`→`online` 的转换。
+6. **06_config_update_and_nodes_json**：更新配置/标签，检查 `nodes.json` 中的标签同步，并确保离线节点不会出现在文件里。
+7. **07_stats_single_node**：等待节点掉线，验证统计接口与 `nodes.json` 为空列表。
+8. **08_multi_node_stats**：注册第二节点，使一在线一离线，校验统计聚合和 `nodes.json` 仅包含在线节点。
+9. **09_restart_persistence**：重启 master 容器，确认节点数据、统计结果与 `nodes.json` 在持久化目录中保持不变。
+10. **10_down**：停止并清理容器、网络与临时目录。
+
+## 相关持久化文件
+
+- SQLite：默认位于 `DB_PATH`，包含 `nodes` 与 `kv` 两张表。
+- `nodes.json`：由调度器周期生成，仅保留状态为 `online` 的节点信息。
+- 测试用例中的 `tests/private/`、`tests/tmp/` 会随脚本自动清理，避免污染后续运行。
+
+如需在生产环境运行，可将镜像推送到私有仓库，或参考测试 Compose 配置自行部署；只需确保上述环境变量在容器内正确设置即可。
--- a/src/master/app/init.py
+++ b/src/master/app/init.py
@ -0,0 +1,41 @@
+from __future__ import annotations
+
+import atexit
+import logging
+
+from flask import Flask
+
+from .config import AppConfig, load_config
+from .routes import register_routes
+from .scheduler import StatusScheduler
+from .storage import Storage
+
+
+def create_app(config: AppConfig | None = None) -> Flask:
+    """Assemble the Flask application together with its storage and scheduler.
+
+    Args:
+        config: Settings to use; when omitted (or falsy) they are read via ``load_config()``.
+
+    Returns:
+        The Flask app with routes registered. The scheduler has already been started and a
+        process-exit hook that stops it and closes the storage has been installed.
+    """
+    settings = config or load_config()
+    store = Storage(settings.db_path, settings.node_id_prefix)
+    status_scheduler = StatusScheduler(store, settings)
+
+    flask_app = Flask(__name__)
+    # Expose the collaborators through app.config under the same keys as before.
+    flask_app.config.update(
+        APP_CONFIG=settings,
+        STORAGE=store,
+        SCHEDULER=status_scheduler,
+    )
+
+    register_routes(flask_app, store, status_scheduler, settings)
+
+    status_scheduler.start()
+
+    def _cleanup() -> None:
+        """Stop the scheduler, then close the storage; a failure of one step is only logged."""
+        log = logging.getLogger("argus.master")
+        log.info("Shutting down master app")
+        shutdown_steps = (
+            (status_scheduler.stop, "Failed to stop scheduler"),
+            (store.close, "Failed to close storage"),
+        )
+        for step, failure_message in shutdown_steps:
+            try:
+                step()
+            except Exception:  # pragma: no cover - defensive
+                log.exception(failure_message)
+
+    atexit.register(_cleanup)
+
+    return flask_app
--- a/src/master/app/config.py
+++ b/src/master/app/config.py
@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class AppConfig:
+    # Immutable bundle of runtime settings; load_config() fills it from environment variables.
+
+    # Path of the SQLite database file (env DB_PATH).
+    db_path: str
+    # Path where nodes.json is written (env METRIC_NODES_JSON_PATH).
+    metric_nodes_json_path: str
+    # Seconds without a report after which a node is considered offline (env OFFLINE_THRESHOLD_SECONDS).
+    offline_threshold_seconds: int
+    # Maximum report age, in seconds, for a node to be considered online (env ONLINE_THRESHOLD_SECONDS).
+    online_threshold_seconds: int
+    # Pause between two scheduler passes, in seconds (env SCHEDULER_INTERVAL_SECONDS).
+    scheduler_interval_seconds: int
+    # Prefix put in front of the numeric part of newly allocated node IDs (env NODE_ID_PREFIX).
+    node_id_prefix: str
+    # Authentication mode string (env AUTH_MODE); no code in this module interprets it.
+    auth_mode: str
+
+
+def _get_int_env(name: str, default: int) -> int:
+    """Read environment variable ``name`` as an integer.
+
+    Returns ``default`` when the variable is unset or contains only whitespace.
+
+    Raises:
+        ValueError: the variable is set but cannot be parsed by ``int()``.
+    """
+    raw = os.environ.get(name)
+    has_value = raw is not None and raw.strip() != ""
+    if has_value:
+        try:
+            parsed = int(raw)
+        except ValueError as exc:
+            raise ValueError(f"Environment variable {name} must be an integer, got {raw!r}") from exc
+        return parsed
+    return default
+
+
+def load_config() -> AppConfig:
+    """Build an AppConfig from environment variables, applying the built-in defaults.
+
+    Raises:
+        ValueError: one of the integer settings is set to a non-integer value.
+    """
+    env = os.environ
+    # Values are read in the same order as the AppConfig fields.
+    db_path = env.get("DB_PATH", "/private/argus/master/db.sqlite3")
+    nodes_json_path = env.get("METRIC_NODES_JSON_PATH", "/private/argus/metric/prometheus/nodes.json")
+    offline_after = _get_int_env("OFFLINE_THRESHOLD_SECONDS", 180)
+    online_within = _get_int_env("ONLINE_THRESHOLD_SECONDS", 120)
+    interval = _get_int_env("SCHEDULER_INTERVAL_SECONDS", 30)
+    id_prefix = env.get("NODE_ID_PREFIX", "A")
+    auth = env.get("AUTH_MODE", "disabled")
+    return AppConfig(
+        db_path=db_path,
+        metric_nodes_json_path=nodes_json_path,
+        offline_threshold_seconds=offline_after,
+        online_threshold_seconds=online_within,
+        scheduler_interval_seconds=interval,
+        node_id_prefix=id_prefix,
+        auth_mode=auth,
+    )
--- a/src/master/app/models.py
+++ b/src/master/app/models.py
@ -0,0 +1,171 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, Mapping
+
+from .util import parse_iso
+
+
+class ValidationError(Exception):
+    """Raised when user payload fails validation.
+
+    The ``validate_*`` helpers in this module raise it with a human-readable message.
+    """
+
+
+@dataclass
+class Node:
+    # Plain record describing one node. The field names mirror the keys produced by
+    # serialize_node_row(), except that `labels` is serialized under the key "label".
+    # NOTE(review): nothing in this module constructs Node — confirm where it is used.
+    id: str
+    name: str
+    type: str
+    version: str | None
+    status: str
+    config: Dict[str, Any]
+    labels: Iterable[str]
+    meta_data: Dict[str, Any]
+    health: Dict[str, Any]
+    # The remaining fields are timestamps kept as strings (presumably ISO-8601 — confirm), or None.
+    register_time: str | None
+    last_report: str | None
+    agent_last_report: str | None
+    last_updated: str | None
+
+
+def serialize_node_row(row: Mapping[str, Any]) -> Dict[str, Any]:
+    """Turn a full ``nodes`` row into the JSON-ready dict returned by the API.
+
+    The ``*_json`` columns are decoded; a column that is NULL, empty, not valid JSON, or of
+    the wrong JSON type is replaced by an empty dict (or an empty list for labels).
+    """
+
+    def _decode(column: str, expected: type) -> Any:
+        """Decode one JSON column, falling back to an empty ``expected`` instance."""
+        text = row[column]
+        if text is None or text == "":
+            return expected()
+        try:
+            parsed = json.loads(text)
+        except json.JSONDecodeError:
+            return expected()
+        return parsed if isinstance(parsed, expected) else expected()
+
+    # Decode the JSON columns first, in the same order as before.
+    config = _decode("config_json", dict)
+    labels = _decode("labels_json", list)
+    meta = _decode("meta_json", dict)
+    health = _decode("health_json", dict)
+
+    serialized: Dict[str, Any] = {key: row[key] for key in ("id", "name", "type", "version", "status")}
+    serialized["config"] = config
+    serialized["label"] = labels
+    serialized["meta_data"] = meta
+    serialized["health"] = health
+    for key in ("register_time", "last_report", "agent_last_report", "last_updated"):
+        serialized[key] = row[key]
+    return serialized
+
+
+def serialize_node_summary(row: Mapping[str, Any]) -> Dict[str, Any]:
+    """Return the short form of a node row: id, name, status, type and version."""
+    summary_fields = ("id", "name", "status", "type", "version")
+    return {field: row[field] for field in summary_fields}
+
+
+def validate_registration_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
+    """Validate the body of a node (re-)registration request.
+
+    Args:
+        payload: Decoded JSON body. ``name`` and ``meta_data`` are required; ``type`` defaults
+            to ``"agent"``; ``version`` and ``id`` are optional.
+
+    Returns:
+        A dict with the keys ``id``, ``name``, ``type``, ``version`` and ``meta_data``
+        (``meta_data`` is a shallow copy of the submitted object).
+
+    Raises:
+        ValidationError: a field is missing or has the wrong type or value.
+    """
+    if not isinstance(payload, Mapping):
+        raise ValidationError("Request body must be a JSON object")
+
+    name = payload.get("name")
+    if not isinstance(name, str) or not name.strip():
+        raise ValidationError("Field 'name' is required and must be a non-empty string")
+
+    node_type = payload.get("type", "agent")
+    if not isinstance(node_type, str) or not node_type:
+        raise ValidationError("Field 'type' must be a string")
+
+    version = payload.get("version")
+    if version is not None and not isinstance(version, str):
+        raise ValidationError("Field 'version' must be a string if provided")
+
+    meta = payload.get("meta_data")
+    if not isinstance(meta, Mapping):
+        raise ValidationError("Field 'meta_data' must be an object")
+
+    required_meta = ["hostname", "ip", "env", "user", "instance", "cpu_number", "memory_in_bytes", "gpu_number"]
+    for key in required_meta:
+        if key not in meta:
+            raise ValidationError(f"meta_data.{key} is required")
+
+    # bool is a subclass of int in Python, so JSON true/false would otherwise be accepted
+    # as a count; reject it explicitly.
+    for key in ("cpu_number", "memory_in_bytes", "gpu_number"):
+        value = meta[key]
+        if isinstance(value, bool) or not isinstance(value, int) or value < 0:
+            raise ValidationError(f"meta_data.{key} must be a non-negative integer")
+
+    node_id = payload.get("id")
+    if node_id is not None and (not isinstance(node_id, str) or not node_id.strip()):
+        raise ValidationError("Field 'id' must be a non-empty string when provided")
+
+    return {
+        "id": node_id,
+        "name": name,
+        "type": node_type,
+        "version": version,
+        "meta_data": dict(meta),
+    }
+
+
+def validate_status_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
+    """Validate the body of a status report.
+
+    Returns:
+        A dict with ``timestamp`` (the submitted string), ``parsed_timestamp`` (the result of
+        ``parse_iso``) and ``health`` (a copy of the submitted health object).
+
+    Raises:
+        ValidationError: the body is not an object, ``timestamp`` is missing or rejected by
+            ``parse_iso``, or ``health`` is not an object with string keys and
+            JSON-compatible values.
+    """
+    if not isinstance(payload, Mapping):
+        raise ValidationError("Request body must be a JSON object")
+
+    raw_timestamp = payload.get("timestamp")
+    if not (isinstance(raw_timestamp, str) and raw_timestamp):
+        raise ValidationError("Field 'timestamp' is required and must be a string")
+
+    moment = parse_iso(raw_timestamp)
+    if moment is None:
+        raise ValidationError("Field 'timestamp' must be an ISO8601 datetime string")
+
+    reported = payload.get("health", {})
+    if not isinstance(reported, Mapping):
+        raise ValidationError("Field 'health' must be an object if provided")
+
+    allowed_value_types = (Mapping, list, str, int, float, bool)
+    checked: Dict[str, Any] = {}
+    for component, state in reported.items():
+        if not isinstance(component, str):
+            raise ValidationError("Keys in 'health' must be strings")
+        if state is not None and not isinstance(state, allowed_value_types):
+            raise ValidationError("Values in 'health' must be JSON-compatible")
+        checked[component] = state
+
+    return {"timestamp": raw_timestamp, "parsed_timestamp": moment, "health": checked}
+
+
+def validate_config_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
+    """Validate the body of a config/label update.
+
+    Returns:
+        A dict holding ``config`` (copied dict) and/or ``label`` (copied list of strings),
+        containing only the keys that were present in the request.
+
+    Raises:
+        ValidationError: the body is not an object, a present field has the wrong type, or
+            neither ``config`` nor ``label`` was supplied.
+    """
+    if not isinstance(payload, Mapping):
+        raise ValidationError("Request body must be a JSON object")
+
+    accepted: Dict[str, Any] = {}
+
+    if "config" in payload:
+        raw_config = payload["config"]
+        if isinstance(raw_config, Mapping):
+            accepted["config"] = dict(raw_config)
+        else:
+            raise ValidationError("Field 'config' must be an object")
+
+    if "label" in payload:
+        raw_labels = payload["label"]
+        labels_ok = isinstance(raw_labels, list) and all(isinstance(entry, str) for entry in raw_labels)
+        if not labels_ok:
+            raise ValidationError("Field 'label' must be an array of strings")
+        accepted["label"] = list(raw_labels)
+
+    if len(accepted) == 0:
+        raise ValidationError("At least one of 'config' or 'label' must be provided")
+
+    return accepted
+
--- a/src/master/app/nodes_api.py
+++ b/src/master/app/nodes_api.py
@ -0,0 +1,155 @@
+from __future__ import annotations
+
+import logging
+from http import HTTPStatus
+from typing import Any, Mapping
+
+from flask import Blueprint, jsonify, request
+
+from .models import (
+    ValidationError,
+    validate_config_payload,
+    validate_registration_payload,
+    validate_status_payload,
+)
+from .scheduler import StatusScheduler
+from .storage import Storage
+from .util import to_iso, utcnow
+
+
+def create_nodes_blueprint(storage: Storage, scheduler: StatusScheduler) -> Blueprint:
+    # Build the "nodes" blueprint. Every handler below closes over `storage` (persistence)
+    # and `scheduler` (asked to refresh nodes.json after changes). A ValidationError raised
+    # inside a handler is turned into a 400 JSON response by the blueprint error handler.
+    bp = Blueprint("nodes", __name__)
+    logger = logging.getLogger("argus.master.api")
+
+    def _json_error(message: str, status: HTTPStatus, code: str) -> Any:
+        # Uniform error body: {"error": <message>, "code": <machine-readable code>}.
+        response = jsonify({"error": message, "code": code})
+        response.status_code = status
+        return response
+
+    @bp.errorhandler(ValidationError)
+    def _handle_validation_error(err: ValidationError):
+        return _json_error(str(err), HTTPStatus.BAD_REQUEST, "invalid_request")
+
+    @bp.get("/nodes")
+    def list_nodes():
+        # Summary list of all nodes.
+        nodes = storage.list_nodes()
+        return jsonify(nodes)
+
+    @bp.get("/nodes/<node_id>")
+    def get_node(node_id: str):
+        # Full detail of one node, or 404 when the id is unknown.
+        node = storage.get_node(node_id)
+        if node is None:
+            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+        return jsonify(node)
+
+    @bp.post("/nodes")
+    def register_node():
+        # Register a new node (201) or re-register an existing one (200).
+        payload = _get_json()
+        data = validate_registration_payload(payload)
+        now = utcnow()
+        now_iso = to_iso(now)
+        node_id = data["id"]
+        name = data["name"]
+        node_type = data["type"]
+        version = data["version"]
+        meta = data["meta_data"]
+
+        if node_id:
+            # An id in the request means re-registration; the name must match the stored one.
+            existing_row = storage.get_node_raw(node_id)
+            if existing_row is None:
+                return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+            if existing_row["name"] != name:
+                # The mismatch is reported as 500 (not a 4xx), matching the module README.
+                return _json_error(
+                    "Node id and name mismatch during re-registration",
+                    HTTPStatus.INTERNAL_SERVER_ERROR,
+                    "id_name_mismatch",
+                )
+            updated = storage.update_node_meta(
+                node_id,
+                node_type=node_type,
+                version=version,
+                meta_data=meta,
+                last_updated_iso=now_iso,
+            )
+            scheduler.trigger_nodes_json_refresh()
+            return jsonify(updated), HTTPStatus.OK
+
+        # No id provided → search by name
+        existing_by_name = storage.get_node_by_name(name)
+        if existing_by_name:
+            # A node with this name already exists: treat as re-registration without id.
+            updated = storage.update_node_meta(
+                existing_by_name["id"],
+                node_type=node_type,
+                version=version,
+                meta_data=meta,
+                last_updated_iso=now_iso,
+            )
+            scheduler.trigger_nodes_json_refresh()
+            return jsonify(updated), HTTPStatus.OK
+
+        # Unknown name: allocate a fresh id and create the node in status "initialized".
+        new_id = storage.allocate_node_id()
+        created = storage.create_node(
+            new_id,
+            name,
+            node_type,
+            version,
+            meta,
+            status="initialized",
+            register_time_iso=now_iso,
+            last_updated_iso=now_iso,
+        )
+        scheduler.trigger_nodes_json_refresh()
+        return jsonify(created), HTTPStatus.CREATED
+
+    @bp.put("/nodes/<node_id>/config")
+    def update_node_config(node_id: str):
+        # Partial update of config and/or labels; 404 when the node does not exist.
+        payload = _get_json()
+        updates = validate_config_payload(payload)
+        try:
+            updated = storage.update_config_and_labels(
+                node_id,
+                config=updates.get("config"),
+                labels=updates.get("label"),
+            )
+        except KeyError:
+            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+
+        # Only a label change requests a nodes.json refresh here.
+        if "label" in updates:
+            scheduler.trigger_nodes_json_refresh()
+        return jsonify(updated)
+
+    @bp.get("/nodes/statistics")
+    def node_statistics():
+        # Aggregated node statistics as returned by storage.get_statistics().
+        stats = storage.get_statistics()
+        return jsonify(stats)
+
+    @bp.put("/nodes/<node_id>/status")
+    def update_status(node_id: str):
+        # Record a status report from an agent; 404 when the node does not exist.
+        payload = _get_json()
+        data = validate_status_payload(payload)
+        try:
+            # The master stamps last_report itself; the status is computed by the scheduler.
+            updated = storage.update_last_report(
+                node_id,
+                server_timestamp_iso=to_iso(utcnow()),
+                agent_timestamp_iso=data["timestamp"],
+                health=data["health"],
+            )
+        except KeyError:
+            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+
+        scheduler.trigger_nodes_json_refresh()
+        return jsonify(updated)
+
+    return bp
+
+
+def _get_json() -> Mapping[str, Any]:
+    """Return the current request's JSON body, which must decode to an object.
+
+    Raises:
+        ValidationError: the body could not be decoded (``get_json(silent=True)`` returned
+            ``None``) or decoded to something other than an object.
+    """
+    body = request.get_json(silent=True)
+    if isinstance(body, Mapping):
+        return body
+    if body is None:
+        raise ValidationError("Request body must be valid JSON")
+    raise ValidationError("Request body must be a JSON object")
--- a/src/master/app/routes.py
+++ b/src/master/app/routes.py
@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from flask import Flask, jsonify
+
+from .config import AppConfig
+from .nodes_api import create_nodes_blueprint
+from .scheduler import StatusScheduler
+from .storage import Storage
+
+
+def register_routes(app: Flask, storage: Storage, scheduler: StatusScheduler, config: AppConfig) -> None:
+    """Attach the nodes API (under ``/api/v1/master``) and the two probe endpoints to ``app``.
+
+    ``config`` is accepted for interface symmetry; this function does not read it.
+    """
+    nodes_blueprint = create_nodes_blueprint(storage, scheduler)
+    app.register_blueprint(nodes_blueprint, url_prefix="/api/v1/master")
+
+    @app.get("/healthz")
+    def healthz():
+        # Liveness: answers without touching the database.
+        return jsonify({"status": "ok"})
+
+    @app.get("/readyz")
+    def readyz():
+        # Readiness: a cheap query proves the storage is usable.
+        try:
+            storage.list_nodes()  # simple readiness probe
+        except Exception as error:  # pragma: no cover - defensive
+            failure = {"status": "error", "error": str(error)}
+            return jsonify(failure), 500
+        else:
+            return jsonify({"status": "ok"})
--- a/src/master/app/scheduler.py
+++ b/src/master/app/scheduler.py
@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import logging
+import threading
+from typing import Optional
+
+from .config import AppConfig
+from .storage import Storage
+from .util import atomic_write_json, parse_iso, to_iso, utcnow
+
+
+class StatusScheduler:
+    """Background worker that keeps node statuses and nodes.json up to date.
+
+    A daemon thread wakes up every ``config.scheduler_interval_seconds`` seconds, recomputes
+    each node's status from its ``last_report`` timestamp and rewrites nodes.json when a
+    status changed or a refresh was requested through ``trigger_nodes_json_refresh()``.
+    """
+
+    def __init__(self, storage: Storage, config: AppConfig, logger: Optional[logging.Logger] = None) -> None:
+        """Prepare the worker; the thread is not started until ``start()`` is called.
+
+        Args:
+            storage: Node store queried and updated by the worker.
+            config: Supplies the thresholds, the interval and the nodes.json path.
+            logger: Optional logger; defaults to ``argus.master.scheduler``.
+        """
+        self._storage = storage
+        self._config = config
+        self._logger = logger or logging.getLogger("argus.master.scheduler")
+        self._stop_event = threading.Event()
+        self._thread = threading.Thread(target=self._run, name="status-scheduler", daemon=True)
+        self._nodes_json_lock = threading.Lock()
+        self._pending_nodes_json = threading.Event()
+
+    def start(self) -> None:
+        """Start the background thread that refreshes node statuses and nodes.json."""
+        if not self._thread.is_alive():
+            self._logger.info("Starting scheduler thread")
+            self._thread.start()
+
+    def stop(self) -> None:
+        """Ask the worker to exit and wait up to five seconds for it.
+
+        Safe to call when the thread was never started or has already finished.
+        """
+        self._stop_event.set()
+        self._pending_nodes_json.set()
+        # Thread.join() raises RuntimeError on a thread that was never started.
+        if self._thread.is_alive():
+            self._thread.join(timeout=5)
+
+    def trigger_nodes_json_refresh(self) -> None:
+        """Request that nodes.json be regenerated on the worker's next pass."""
+        self._pending_nodes_json.set()
+
+    def generate_nodes_json(self) -> None:
+        """Write the current online nodes to ``config.metric_nodes_json_path``."""
+        with self._nodes_json_lock:
+            online_nodes = self._storage.get_online_nodes()
+            atomic_write_json(self._config.metric_nodes_json_path, online_nodes)
+            self._logger.info("nodes.json updated", extra={"count": len(online_nodes)})
+
+    # ------------------------------------------------------------------
+    # internal loop
+    # ------------------------------------------------------------------
+
+    def _run(self) -> None:
+        """Worker loop: reconcile statuses, refresh nodes.json if needed, then sleep."""
+        # Make sure nodes.json is generated right after start-up.
+        self._pending_nodes_json.set()
+        while not self._stop_event.is_set():
+            try:
+                changed = self._reconcile_statuses()
+                if changed or self._pending_nodes_json.is_set():
+                    # Clear the flag before generating so a refresh requested while the file
+                    # is being written is kept for the next pass instead of being dropped.
+                    self._pending_nodes_json.clear()
+                    self.generate_nodes_json()
+            except Exception:
+                # One failed pass (e.g. a transient database or file error) must not end
+                # the worker for good; force a regeneration on the next pass.
+                self._pending_nodes_json.set()
+                self._logger.exception("Scheduler iteration failed; retrying on next tick")
+            self._stop_event.wait(self._config.scheduler_interval_seconds)
+
+    def _reconcile_statuses(self) -> bool:
+        """Compare each node's last_report with the current time and switch its status if needed.
+
+        Returns:
+            True when at least one node's status was changed.
+        """
+        any_status_changed = False
+        now = utcnow()
+        rows = self._storage.fetch_nodes_for_scheduler()
+        for row in rows:
+            node_id = row["id"]
+            last_report_iso = row["last_report"]
+            current_status = row["status"]
+            last_report_dt = parse_iso(last_report_iso)
+            if last_report_dt is None:
+                # No report yet; treat as initialized until report arrives
+                continue
+            delta_seconds = (now - last_report_dt).total_seconds()
+            new_status = current_status
+            if delta_seconds > self._config.offline_threshold_seconds:
+                new_status = "offline"
+            elif delta_seconds <= self._config.online_threshold_seconds:
+                new_status = "online"
+            # Between thresholds: keep current status (sticky)
+            if new_status != current_status:
+                any_status_changed = True
+                self._logger.info(
+                    "Updating node status",
+                    extra={
+                        "node_id": node_id,
+                        "previous": current_status,
+                        "new": new_status,
+                        "delta_seconds": delta_seconds,
+                    },
+                )
+                self._storage.update_status(node_id, new_status, last_updated_iso=to_iso(now))
+        return any_status_changed
--- a/src/master/app/storage.py
+++ b/src/master/app/storage.py
@ -0,0 +1,332 @@
+from __future__ import annotations
+
+import json
+import sqlite3
+import threading
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
+
+from .models import serialize_node_row, serialize_node_summary
+from .util import ensure_parent, to_iso, utcnow
+
+
+class Storage:
+    def __init__(self, db_path: str, node_id_prefix: str) -> None:
+        # Open the SQLite database at db_path and make sure the schema exists.
+        # node_id_prefix is prepended to the numeric sequence by allocate_node_id().
+        self._db_path = db_path
+        self._node_id_prefix = node_id_prefix
+        # Helper from .util; presumably creates the parent directory of db_path — confirm.
+        ensure_parent(db_path)
+        # A single lock serializes every use of the shared connection.
+        self._lock = threading.Lock()
+        # check_same_thread=False allows threads other than the creating one to use the connection.
+        self._conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
+        # Rows support access by column name (row["id"]).
+        self._conn.row_factory = sqlite3.Row
+        with self._lock:
+            self._conn.execute("PRAGMA foreign_keys = ON;")
+        self._ensure_schema()
+
+    # ------------------------------------------------------------------
+    # schema & helpers
+    # ------------------------------------------------------------------
+
+    def _ensure_schema(self) -> None:
+        """Create the tables and indexes if they are missing, so the schema is ready at start-up."""
+        # `nodes` holds one row per node; the *_json columns store JSON text.
+        # `kv` is a generic key/value table; allocate_node_id() keeps its sequence there.
+        # NOTE(review): idx_nodes_name looks redundant with the UNIQUE constraint on name — confirm.
+        with self._lock:
+            self._conn.executescript(
+                """
+                CREATE TABLE IF NOT EXISTS nodes (
+                    id TEXT PRIMARY KEY,
+                    name TEXT NOT NULL UNIQUE,
+                    type TEXT NOT NULL,
+                    version TEXT,
+                    status TEXT NOT NULL,
+                    config_json TEXT,
+                    labels_json TEXT,
+                    meta_json TEXT,
+                    health_json TEXT,
+                    register_time TEXT,
+                    last_report TEXT,
+                    agent_last_report TEXT,
+                    last_updated TEXT
+                );
+
+                CREATE TABLE IF NOT EXISTS kv (
+                    key TEXT PRIMARY KEY,
+                    value TEXT NOT NULL
+                );
+
+                CREATE INDEX IF NOT EXISTS idx_nodes_status ON nodes(status);
+                CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);
+                """
+            )
+            self._conn.commit()
+
+    def close(self) -> None:
+        # Close the shared connection; taking the lock waits for any in-flight statement to finish.
+        with self._lock:
+            self._conn.close()
+
+    # ------------------------------------------------------------------
+    # Node ID allocation
+    # ------------------------------------------------------------------
+
+    def allocate_node_id(self) -> str:
+        """Return the next node ID (prefix + sequence number, e.g. ``A1``).
+
+        The counter lives in the ``kv`` table under the key ``node_id_seq``. It is advanced
+        with a single UPSERT statement and read back inside the same write transaction, so
+        the increment stays atomic even when several processes share the database file
+        (the in-process lock alone only protects threads of one process).
+        """
+        with self._lock:
+            try:
+                # First call inserts '1'; later calls bump the stored value by one.
+                self._conn.execute(
+                    "INSERT INTO kv(key, value) VALUES(?, '1') "
+                    "ON CONFLICT(key) DO UPDATE SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT)",
+                    ("node_id_seq",),
+                )
+                row = self._conn.execute(
+                    "SELECT value FROM kv WHERE key = ?", ("node_id_seq",)
+                ).fetchone()
+                next_id = int(row["value"])
+                self._conn.commit()
+            except Exception:
+                # Do not leave a half-finished transaction open on the shared connection.
+                self._conn.rollback()
+                raise
+        return f"{self._node_id_prefix}{next_id}"
+
+    # ------------------------------------------------------------------
+    # Query helpers
+    # ------------------------------------------------------------------
+
+    def list_nodes(self) -> List[Dict[str, Any]]:
+        """Return the summary (id, name, status, type, version) of every node, ordered by id."""
+        query = "SELECT id, name, status, type, version FROM nodes ORDER BY id ASC"
+        with self._lock:
+            fetched = self._conn.execute(query).fetchall()
+        summaries: List[Dict[str, Any]] = []
+        for record in fetched:
+            summaries.append(serialize_node_summary(record))
+        return summaries
+
+    def get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
+        """Return the serialized node with the given id, or None when no such row exists."""
+        with self._lock:
+            record = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,)).fetchone()
+        return None if record is None else serialize_node_row(record)
+
+    def get_node_raw(self, node_id: str) -> Optional[sqlite3.Row]:
+        """Return the unserialized database row for ``node_id``, or None when it does not exist."""
+        with self._lock:
+            return self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,)).fetchone()
+
+    def get_node_by_name(self, name: str) -> Optional[Dict[str, Any]]:
+        """Return the serialized node whose name equals ``name``, or None when there is none."""
+        with self._lock:
+            record = self._conn.execute("SELECT * FROM nodes WHERE name = ?", (name,)).fetchone()
+        if record is not None:
+            return serialize_node_row(record)
+        return None
+
+    # ------------------------------------------------------------------
+    # Mutation helpers
+    # ------------------------------------------------------------------
+
+    def create_node(
+        self,
+        node_id: str,
+        name: str,
+        node_type: str,
+        version: str | None,
+        meta_data: Mapping[str, Any],
+        status: str,
+        register_time_iso: str,
+        last_updated_iso: str,
+    ) -> Dict[str, Any]:
+        """Insert a new node row and return it in serialized form.
+
+        config and health start as empty objects, labels as an empty list, and both report
+        timestamps as NULL.
+
+        Raises:
+            RuntimeError: the row could not be read back after the insert.
+        """
+        empty_object = json.dumps({})
+        # Values in the column order of the INSERT statement below.
+        row_values = (
+            node_id,
+            name,
+            node_type,
+            version,
+            status,
+            empty_object,
+            json.dumps([]),
+            json.dumps(dict(meta_data)),
+            empty_object,
+            register_time_iso,
+            None,
+            None,
+            last_updated_iso,
+        )
+        with self._lock:
+            self._conn.execute(
+                """
+                INSERT INTO nodes (
+                    id, name, type, version, status, config_json, labels_json, meta_json,
+                    health_json, register_time, last_report, agent_last_report, last_updated
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                row_values,
+            )
+            self._conn.commit()
+
+        node = self.get_node(node_id)
+        if node is None:
+            raise RuntimeError("Failed to read back created node")
+        return node
+
+    def update_node_meta(
+        self,
+        node_id: str,
+        *,
+        name: Optional[str] = None,
+        node_type: Optional[str] = None,
+        version: Optional[str | None] = None,
+        meta_data: Optional[Mapping[str, Any]] = None,
+        last_updated_iso: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Update a node's static information on re-registration.
+
+        Arguments left as ``None`` keep their stored value; if every argument is ``None``
+        no UPDATE is issued and the current node is returned.
+
+        Raises:
+            KeyError: no node with ``node_id`` exists.
+        """
+        encoded_meta = None if meta_data is None else json.dumps(dict(meta_data))
+        candidates = (
+            ("name = ?", name),
+            ("type = ?", node_type),
+            ("version = ?", version),
+            ("meta_json = ?", encoded_meta),
+            ("last_updated = ?", last_updated_iso),
+        )
+        chosen = [(clause, value) for clause, value in candidates if value is not None]
+
+        if chosen:
+            set_clause = ", ".join(clause for clause, _ in chosen)
+            bound = [value for _, value in chosen]
+            bound.append(node_id)
+            with self._lock:
+                self._conn.execute(
+                    f"UPDATE nodes SET {set_clause} WHERE id = ?",
+                    tuple(bound),
+                )
+                self._conn.commit()
+
+        node = self.get_node(node_id)
+        if node is None:
+            raise KeyError(node_id)
+        return node
+
+    def update_config_and_labels(
+        self, node_id: str, *, config: Optional[Mapping[str, Any]] = None, labels: Optional[Iterable[str]] = None
+    ) -> Dict[str, Any]:
+        """部分更新 config/label，并刷新 last_updated 时间戳。"""
+        updates: List[str] = []
+        params: List[Any] = []
+        if config is not None:
+            updates.append("config_json = ?")
+            params.append(json.dumps(dict(config)))
+        if labels is not None:
+            updates.append("labels_json = ?")
+            params.append(json.dumps(list(labels)))
+        updates.append("last_updated = ?")
+        params.append(to_iso(utcnow()))
+        params.append(node_id)
+        with self._lock:
+            self._conn.execute(
+                f"UPDATE nodes SET {', '.join(updates)} WHERE id = ?",
+                tuple(params),
+            )
+            if self._conn.total_changes == 0:
+                self._conn.rollback()
+                raise KeyError(node_id)
+            self._conn.commit()
+        updated = self.get_node(node_id)
+        if updated is None:
+            raise KeyError(node_id)
+        return updated
+
+    def update_last_report(
+        self,
+        node_id: str,
+        *,
+        server_timestamp_iso: str,
+        agent_timestamp_iso: str,
+        health: Mapping[str, Any],
+    ) -> Dict[str, Any]:
+        """记录最新上报时间和健康信息，用于后续状态计算。"""
+        with self._lock:
+            self._conn.execute(
+                """
+                UPDATE nodes
+                SET last_report = ?,
+                    agent_last_report = ?,
+                    health_json = ?,
+                    last_updated = ?
+                WHERE id = ?
+                """,
+                (
+                    server_timestamp_iso,
+                    agent_timestamp_iso,
+                    json.dumps(health),
+                    server_timestamp_iso,
+                    node_id,
+                ),
+            )
+            if self._conn.total_changes == 0:
+                self._conn.rollback()
+                raise KeyError(node_id)
+            self._conn.commit()
+        updated = self.get_node(node_id)
+        if updated is None:
+            raise KeyError(node_id)
+        return updated
+
+    def update_status(self, node_id: str, status: str, *, last_updated_iso: str) -> None:
+        with self._lock:
+            self._conn.execute(
+                "UPDATE nodes SET status = ?, last_updated = ? WHERE id = ?",
+                (status, last_updated_iso, node_id),
+            )
+            self._conn.commit()
+
+    # ------------------------------------------------------------------
+    # Reporting helpers
+    # ------------------------------------------------------------------
+
+    def get_statistics(self) -> Dict[str, Any]:
+        """统计节点总数及按状态聚合的数量。"""
+        with self._lock:
+            cur = self._conn.execute("SELECT COUNT(*) AS total FROM nodes")
+            total_row = cur.fetchone()
+            cur = self._conn.execute("SELECT status, COUNT(*) AS count FROM nodes GROUP BY status")
+            status_rows = cur.fetchall()
+        return {
+            "total": total_row["total"] if total_row else 0,
+            "status_statistics": [
+                {"status": row["status"], "count": row["count"]}
+                for row in status_rows
+            ],
+        }
+
+    def fetch_nodes_for_scheduler(self) -> List[sqlite3.Row]:
+        with self._lock:
+            cur = self._conn.execute(
+                "SELECT id, last_report, status FROM nodes"
+            )
+            return cur.fetchall()
+
+    def get_online_nodes(self) -> List[Dict[str, Any]]:
+        """返回在线节点列表，用于生成 nodes.json。"""
+        with self._lock:
+            cur = self._conn.execute(
+                "SELECT id, meta_json, labels_json, name FROM nodes WHERE status = ? ORDER BY id ASC",
+                ("online",),
+            )
+            rows = cur.fetchall()
+
+        result: List[Dict[str, Any]] = []
+        for row in rows:
+            meta = json.loads(row["meta_json"]) if row["meta_json"] else {}
+            labels = json.loads(row["labels_json"]) if row["labels_json"] else []
+            result.append(
+                {
+                    "node_id": row["id"],
+                    "user_id": meta.get("user"),
+                    "ip": meta.get("ip"),
+                    "hostname": meta.get("hostname", row["name"]),
+                    "labels": labels if isinstance(labels, list) else [],
+                }
+            )
+        return result
--- a/src/master/app/util.py
+++ b/src/master/app/util.py
@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import json
+import os
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterable
+
+
+ISO_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
+
+
+def utcnow() -> datetime:
+    """获取当前 UTC 时间，统一时间基准。"""
+    return datetime.now(timezone.utc)
+
+
+def to_iso(dt: datetime | None) -> str | None:
+    if dt is None:
+        return None
+    return dt.astimezone(timezone.utc).replace(microsecond=0).strftime(ISO_FORMAT)
+
+
+def parse_iso(value: str | None) -> datetime | None:
+    if not value:
+        return None
+    try:
+        if value.endswith("Z"):
+            return datetime.strptime(value, ISO_FORMAT).replace(tzinfo=timezone.utc)
+        # Fallback for ISO strings with offset
+        return datetime.fromisoformat(value).astimezone(timezone.utc)
+    except ValueError:
+        return None
+
+
+def ensure_parent(path: str) -> None:
+    """确保目标文件所在目录存在。"""
+    Path(path).parent.mkdir(parents=True, exist_ok=True)
+
+
+def atomic_write_json(path: str, data: Iterable[Any] | Any) -> None:
+    """原子化写 JSON，避免被其它进程读到半成品。"""
+    ensure_parent(path)
+    directory = Path(path).parent
+    with tempfile.NamedTemporaryFile("w", dir=directory, delete=False) as tmp:
+        json.dump(data, tmp, separators=(",", ":"))
+        tmp.flush()
+        os.fsync(tmp.fileno())
+        temp_path = tmp.name
+    os.replace(temp_path, path)
--- a/src/master/build/dns-monitor.sh
+++ b/src/master/build/dns-monitor.sh
@ -0,0 +1 @@
+../../bind/build/dns-monitor.sh
--- a/src/master/build/start-master.sh
+++ b/src/master/build/start-master.sh
@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Entry point for the master process: prepares the shared directories, runs
+# the optional DNS update script, publishes this container's IP for the bind
+# service and finally execs gunicorn as the runtime user.
+set -euo pipefail
+
+# Shared directories and DNS-related script locations.
+DNS_DIR="/private/argus/etc"
+DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
+MASTER_DOMAIN_FILE="${DNS_DIR}/master.argus.com"
+RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}"
+RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
+RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"
+MASTER_DATA_DIR="/private/argus/master"
+METRIC_DIR="/private/argus/metric/prometheus"
+
+# Ownership changes are best-effort: failures are silenced and ignored.
+mkdir -p "$DNS_DIR"
+chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true
+mkdir -p "$MASTER_DATA_DIR"
+mkdir -p "$METRIC_DIR"
+chown -R "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DATA_DIR" "$METRIC_DIR" 2>/dev/null || true
+
+if [[ -x "$DNS_SCRIPT" ]]; then
+  echo "[INFO] Running update-dns.sh before master starts"
+  # Run the script when present so the container uses bind as its DNS;
+  # a failure only produces a warning and does not abort startup.
+  "$DNS_SCRIPT" || echo "[WARN] update-dns.sh execution failed"
+else
+  echo "[WARN] DNS update script not found or not executable: $DNS_SCRIPT"
+fi
+
+# Record the master's current IP so the bind service can pick it up.
+# The trailing "|| true" keeps pipefail/errexit from aborting when no match is found.
+MASTER_IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}' || true)
+if [[ -n "${MASTER_IP}" ]]; then
+  echo "current IP: ${MASTER_IP}"
+  echo "${MASTER_IP}" > "$MASTER_DOMAIN_FILE"
+  chown "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DOMAIN_FILE" 2>/dev/null || true
+else
+  echo "[WARN] Failed to detect master IP via ifconfig"
+fi
+
+# gunicorn settings, each overridable through the environment.
+WORKERS=${GUNICORN_WORKERS:-4}
+BIND_ADDR=${GUNICORN_BIND:-0.0.0.0:3000}
+EXTRA_OPTS=${GUNICORN_EXTRA_ARGS:-}
+
+# Split the extra arguments on whitespace into an array (no quoting support).
+if [[ -n "$EXTRA_OPTS" ]]; then
+  read -r -a EXTRA_ARRAY <<< "$EXTRA_OPTS"
+else
+  EXTRA_ARRAY=()
+fi
+
+command=(gunicorn --bind "$BIND_ADDR" --workers "$WORKERS")
+if [[ ${#EXTRA_ARRAY[@]} -gt 0 ]]; then
+  command+=("${EXTRA_ARRAY[@]}")
+fi
+command+=("app:create_app()")
+
+# Replace this shell with gunicorn running as the runtime user; fall back to
+# su (with a shell-quoted command string) when runuser is unavailable.
+if command -v runuser >/dev/null 2>&1; then
+  exec runuser -u "$RUNTIME_USER" -- "${command[@]}"
+else
+  printf -v _cmd_str '%q ' "${command[@]}"
+  exec su -s /bin/bash -m "$RUNTIME_USER" -c "exec ${_cmd_str}"
+fi
--- a/src/master/build/supervisord.conf
+++ b/src/master/build/supervisord.conf
@ -0,0 +1,39 @@
+; supervisord configuration for the master container: keeps supervisord in
+; the foreground and manages two programs, "master" and "dns-monitor".
+[supervisord]
+nodaemon=true
+logfile=/var/log/supervisor/supervisord.log
+pidfile=/var/run/supervisord.pid
+user=root
+
+; Master service, launched through its start script.
+[program:master]
+command=/usr/local/bin/start-master.sh
+user=root
+stdout_logfile=/var/log/supervisor/master.log
+stderr_logfile=/var/log/supervisor/master_error.log
+autostart=true
+autorestart=true
+startsecs=5
+stopwaitsecs=30
+killasgroup=true
+stopasgroup=true
+
+; DNS monitor script; same restart policy as master, shorter stop wait.
+[program:dns-monitor]
+command=/usr/local/bin/dns-monitor.sh
+user=root
+stdout_logfile=/var/log/supervisor/dns-monitor.log
+stderr_logfile=/var/log/supervisor/dns-monitor_error.log
+autostart=true
+autorestart=true
+startsecs=5
+stopwaitsecs=10
+killasgroup=true
+stopasgroup=true
+
+; Control socket used by supervisorctl (see serverurl below).
+[unix_http_server]
+file=/var/run/supervisor.sock
+chmod=0700
+
+[supervisorctl]
+serverurl=unix:///var/run/supervisor.sock
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
--- a/src/master/images/.gitkeep
+++ b/src/master/images/.gitkeep
--- a/src/master/offline_wheels.tar.gz
+++ b/src/master/offline_wheels.tar.gz
--- a/src/master/offline_wheels/.gitkeep
+++ b/src/master/offline_wheels/.gitkeep
--- a/src/master/requirements.txt
+++ b/src/master/requirements.txt
@ -0,0 +1,2 @@
+Flask==2.3.3
+gunicorn==21.2.0
--- a/src/master/scripts/build_images.sh
+++ b/src/master/scripts/build_images.sh
@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'USAGE'
+Usage: $0 [--intranet] [--offline] [--tag <image_tag>]
+
+Options:
+  --intranet           使用指定的 PyPI 镜像源（默认清华镜像）。
+  --offline            完全离线构建，依赖 offline_wheels/ 目录中的离线依赖包。
+  --tag <image_tag>    自定义镜像标签，默认 argus-master:latest。
+USAGE
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+MODULE_ROOT="$PROJECT_ROOT/src/master"
+IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
+DOCKERFILE="src/master/Dockerfile"
+BUILD_ARGS=()
+OFFLINE_MODE=0
+
+source "$PROJECT_ROOT/scripts/common/build_user.sh"
+load_build_user
+BUILD_ARGS+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
+
+cd "$PROJECT_ROOT"
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --intranet)
+      INTRANET_INDEX="${INTRANET_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple}"
+      BUILD_ARGS+=("--build-arg" "PIP_INDEX_URL=${INTRANET_INDEX}")
+      BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
+      shift
+      ;;
+    --offline)
+      OFFLINE_MODE=1
+      BUILD_ARGS+=("--build-arg" "USE_OFFLINE=1")
+      BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
+      shift
+      ;;
+    --tag)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      IMAGE_TAG="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+ done
+
+if [[ "$OFFLINE_MODE" -eq 1 ]]; then
+  WHEELS_DIR="$MODULE_ROOT/offline_wheels"
+  if [[ ! -d "$WHEELS_DIR" ]]; then
+    echo "[ERROR] offline_wheels 目录不存在: $WHEELS_DIR" >&2
+    exit 1
+  fi
+  if ! find "$WHEELS_DIR" -maxdepth 1 -type f -name '*.whl' -print -quit >/dev/null; then
+    echo "[ERROR] offline_wheels 目录为空，请先在有网环境执行 scripts/prepare_offline_wheels.sh" >&2
+    exit 1
+  fi
+fi
+
+
+
+echo "[INFO] Building image $IMAGE_TAG"
+docker build -f "$DOCKERFILE" "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT"
+echo "[OK] Image $IMAGE_TAG built"
--- a/src/master/scripts/load_images.sh
+++ b/src/master/scripts/load_images.sh
@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  echo "Usage: $0 [--file <tar_path>]" >&2
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+DEFAULT_INPUT="$PROJECT_ROOT/images/argus-master-dev.tar"
+IMAGE_TAR="$DEFAULT_INPUT"
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --file)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      IMAGE_TAR="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+ done
+
+if [[ ! -f "$IMAGE_TAR" ]]; then
+  echo "[ERROR] Image tarball not found: $IMAGE_TAR" >&2
+  exit 1
+fi
+
+echo "[INFO] Loading image from $IMAGE_TAR"
+docker image load -i "$IMAGE_TAR"
+echo "[OK] Image loaded"
--- a/src/master/scripts/prepare_offline_wheels.sh
+++ b/src/master/scripts/prepare_offline_wheels.sh
@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'USAGE'
+Usage: $0 [--pip-version <version>] [--clean] [--local]
+
+Options:
+  --pip-version <version>   额外下载指定版本的 pip wheel（例如 25.2）。
+  --clean                   清理 offline_wheels/*.whl 后重新下载。
+  --local                   使用本地 python 执行下载（默认通过 docker python:3.11-slim）。
+USAGE
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+REQUIREMENTS_FILE="$PROJECT_ROOT/requirements.txt"
+WHEEL_DIR="$PROJECT_ROOT/offline_wheels"
+PIP_VERSION=""
+CLEAN=0
+USE_LOCAL=0
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --pip-version)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      PIP_VERSION="$2"
+      shift 2
+      ;;
+    --clean)
+      CLEAN=1
+      shift
+      ;;
+    --local)
+      USE_LOCAL=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+ done
+
+if [[ ! -f "$REQUIREMENTS_FILE" ]]; then
+  echo "[ERROR] requirements.txt not found at $REQUIREMENTS_FILE" >&2
+  exit 1
+fi
+
+mkdir -p "$WHEEL_DIR"
+
+if [[ "$CLEAN" -eq 1 ]]; then
+  echo "[INFO] Cleaning existing wheels in $WHEEL_DIR"
+  find "$WHEEL_DIR" -maxdepth 1 -type f -name '*.whl' -delete
+fi
+
+run_with_python() {
+  local cmd=("python" "-m" "pip" "$@")
+  eval "${cmd[@]}"
+}
+
+if [[ "$USE_LOCAL" -eq 1 ]]; then
+  PYTHON_BIN=${PYTHON_BIN:-python3}
+  if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
+    echo "[ERROR] $PYTHON_BIN not found" >&2
+    exit 1
+  fi
+  echo "[INFO] Using local python ($PYTHON_BIN) to download wheels"
+  "$PYTHON_BIN" -m pip download -r "$REQUIREMENTS_FILE" -d "$WHEEL_DIR"
+  if [[ -n "$PIP_VERSION" ]]; then
+    "$PYTHON_BIN" -m pip download "pip==${PIP_VERSION}" -d "$WHEEL_DIR"
+  fi
+else
+  if ! command -v docker >/dev/null 2>&1; then
+    echo "[ERROR] docker not found; rerun with --local or安装 docker" >&2
+    exit 1
+  fi
+  echo "[INFO] Using docker image python:3.11-slim 下载 wheel"
+  docker run --rm \
+    -v "$WHEEL_DIR":/wheels \
+    -v "$REQUIREMENTS_FILE":/tmp/requirements.txt \
+    python:3.11-slim \
+    bash -c "set -euo pipefail && python -m pip install --upgrade pip && python -m pip download -r /tmp/requirements.txt -d /wheels"
+  if [[ -n "$PIP_VERSION" ]]; then
+    docker run --rm \
+      -v "$WHEEL_DIR":/wheels \
+      python:3.11-slim \
+      bash -c "set -euo pipefail && python -m pip download pip==${PIP_VERSION} -d /wheels"
+  fi
+fi
+
+echo "[INFO] Offline wheels prepared at $WHEEL_DIR"
--- a/src/master/scripts/save_images.sh
+++ b/src/master/scripts/save_images.sh
@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  echo "Usage: $0 [--tag <image_tag>] [--output <tar_path>]" >&2
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+DEFAULT_OUTPUT="$PROJECT_ROOT/images/argus-master-dev.tar"
+IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
+OUTPUT_PATH="$DEFAULT_OUTPUT"
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --tag)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      IMAGE_TAG="$2"
+      shift 2
+      ;;
+    --output)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      OUTPUT_PATH="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+ done
+
+mkdir -p "$(dirname "$OUTPUT_PATH")"
+echo "[INFO] Saving image $IMAGE_TAG to $OUTPUT_PATH"
+docker image save "$IMAGE_TAG" -o "$OUTPUT_PATH"
+echo "[OK] Image saved"
--- a/src/master/tests/.gitignore
+++ b/src/master/tests/.gitignore
@ -0,0 +1,2 @@
+private/
+tmp/
--- a/src/master/tests/docker-compose.yml
+++ b/src/master/tests/docker-compose.yml
@ -0,0 +1,19 @@
+# Compose file for the master-module end-to-end tests: a single master
+# container on a bridge network, with its state bind-mounted under ./private.
+services:
+  master:
+    # Image tag is overridable through MASTER_IMAGE_TAG.
+    image: ${MASTER_IMAGE_TAG:-argus-master:latest}
+    container_name: argus-master-e2e
+    environment:
+      # NOTE(review): presumably shortened thresholds/interval (seconds) so the
+      # tests see status transitions quickly - confirm against the master config.
+      - OFFLINE_THRESHOLD_SECONDS=6
+      - ONLINE_THRESHOLD_SECONDS=2
+      - SCHEDULER_INTERVAL_SECONDS=1
+    ports:
+      # Host port 31300 -> container port 3000.
+      - "31300:3000"
+    volumes:
+      - ./private/argus/master:/private/argus/master
+      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
+      - ./private/argus/etc:/private/argus/etc
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
--- a/src/master/tests/scripts/00_e2e_test.sh
+++ b/src/master/tests/scripts/00_e2e_test.sh
@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Run every master-module end-to-end step in order. With errexit enabled the
+# run stops at the first step that exits non-zero.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Resolve the image tag once; each step receives it through its environment.
+image_tag="${MASTER_IMAGE_TAG:-argus-master:latest}"
+
+steps=(
+  "01_up_master.sh"
+  "02_verify_ready_and_nodes_json.sh"
+  "03_register_via_curl.sh"
+  "04_reregister_and_error_cases.sh"
+  "05_status_report_via_curl.sh"
+  "06_config_update_and_nodes_json.sh"
+  "07_stats_single_node.sh"
+  "08_multi_node_stats.sh"
+  "09_restart_persistence.sh"
+  "10_down.sh"
+)
+
+for step in "${steps[@]}"; do
+  printf '[TEST] Running %s\n' "$step"
+  MASTER_IMAGE_TAG="$image_tag" "$SCRIPT_DIR/$step"
+  printf '[TEST] %s completed\n\n' "$step"
+done
+
+echo "[TEST] Master module E2E tests completed"
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
xiuting.xu	abc739b1be	[#19 ] alert和web增加系统集成测试	2025-10-13 16:48:05 +08:00
xiuting.xu	cb213df6f8	[#6 ] 修改打包镜像	2025-10-13 10:13:36 +08:00
xiuting.xu	ade9dd7d62	[#5 ] 修改web页面；更新镜像打包	2025-10-13 10:13:35 +08:00
xiuting.xu	54f99b854c	[#6 ] 提供在Prometheus上部署Alertmanager配置，提供配置文件片段；	2025-10-13 10:13:35 +08:00
xiuting.xu	ac15595c8e	[#6 ] 修改Alertmanager的镜像打包，适配算力平台环境	2025-10-13 10:13:35 +08:00
xiuting.xu	f17bc6d312	[#6 ] alertmanager的容器化部署	2025-10-13 10:13:35 +08:00
xiuting.xu	ac0eb558e9	[#6 ] 修改web页面	2025-10-13 10:13:34 +08:00
xiuting.xu	ef89e5d7e6	[#5 ] web网页代码初始化	2025-10-13 10:13:34 +08:00
sundapeng	c098f1d3ce	dev_1.0.0_sundp 完成Metric模块及模块e2e测试 (#18 ) Co-authored-by: sundapeng.sdp <sundapeng@hashdata.cn> Reviewed-on: #18 Reviewed-by: xuxt <xuxt@zgclab.edu.cn> Reviewed-by: yuyr <yuyr@zgclab.edu.cn> Reviewed-by: huhy <husteryezi@163.com>	2025-10-11 17:15:06 +08:00
yuyr	1e5e91b193	dev_1.0.0_yuyr_2：重新提交 PR，增加 master/agent 以及系统集成测试 (#17 ) Reviewed-on: #17 Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn> Reviewed-by: xuxt <xuxt@zgclab.edu.cn>	2025-10-11 15:04:46 +08:00