diff --git a/README.md b/README.md
index 253aded..b4796ee 100644
--- a/README.md
+++ b/README.md
@@ -5,3 +5,10 @@
 Project documentation: [Tencent Docs] GPU Cluster Operations System
 https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk
+## Build Account Configuration
+
+The UID/GID of the image build and runtime accounts can be configured through `configs/build_user.conf`; see `doc/build-user-config.md` for details.
+
+## Local Port Conflict Note
+
+To run the BIND module end-to-end test while port 53 on the host is already in use, set the environment variable `HOST_DNS_PORT` (default 1053) to choose the externally mapped port, e.g. `HOST_DNS_PORT=12053 ./scripts/00_e2e_test.sh`.
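As a quick illustration of the precedence rules documented in `doc/build-user-config.md` (environment variables beat both config files), the build below uses illustrative UID/GID values:

```bash
# Highest-precedence override: env vars beat build_user.local.conf and build_user.conf.
# The values 4001:4001 are illustrative only.
ARGUS_BUILD_UID=4001 ARGUS_BUILD_GID=4001 ./build/build_images.sh --intranet
```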
-f "$master_offline_tar" ]]; then + echo "❌ offline wheels tar not found: $master_offline_tar" >&2 + echo " 请提前准备好 offline_wheels.tar.gz 后再执行 --master-offline" >&2 + exit 1 + fi + echo "📦 Preparing offline wheels for master (extracting $master_offline_tar)" + rm -rf "$master_offline_dir" + mkdir -p "$master_offline_dir" + tar -xzf "$master_offline_tar" -C "$master_root" + has_wheel=$(find "$master_offline_dir" -maxdepth 1 -type f -name '*.whl' -print -quit) + if [[ -z "$has_wheel" ]]; then + echo "❌ offline_wheels extraction failed或无 wheel: $master_offline_dir" >&2 + exit 1 + fi +fi + echo "=======================================" echo "ARGUS Unified Build System" echo "=======================================" if [[ "$use_intranet" == true ]]; then - echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)" - build_args="--build-arg USE_INTRANET=true" + echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)" else - echo "🌐 Mode: Public (Using default package sources)" - build_args="" + echo "🌐 Mode: Public (Using default package sources)" fi +echo "👤 Build user UID:GID -> ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" + echo "📁 Build context: $root" echo "" -# 构建镜像的函数 build_image() { - local image_name=$1 - local dockerfile_path=$2 - local tag=$3 + local image_name=$1 + local dockerfile_path=$2 + local tag=$3 + shift 3 + local extra_args=("$@") - echo "🔄 Building $image_name image..." - echo " Dockerfile: $dockerfile_path" - echo " Tag: $tag" + echo "🔄 Building $image_name image..." + echo " Dockerfile: $dockerfile_path" + echo " Tag: $tag" - if docker build $build_args -f "$dockerfile_path" -t "$tag" .; then - echo "✅ $image_name image built successfully" - return 0 - else - echo "❌ Failed to build $image_name image" - return 1 - fi + if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" .; then + echo "✅ $image_name image built successfully" + return 0 + else + echo "❌ Failed to build $image_name image" + return 1 + fi } -# 构建所有镜像 images_built=() build_failed=false -# 构建 Elasticsearch 镜像 if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then - images_built+=("argus-elasticsearch:latest") + images_built+=("argus-elasticsearch:latest") else - build_failed=true + build_failed=true fi echo "" -# 构建 Kibana 镜像 if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then - images_built+=("argus-kibana:latest") + images_built+=("argus-kibana:latest") else - build_failed=true + build_failed=true fi echo "" -# 构建 BIND9 镜像 if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then - images_built+=("argus-bind9:latest") + images_built+=("argus-bind9:latest") else - build_failed=true + build_failed=true fi echo "" + +if [[ "$build_master" == true ]]; then + echo "" + echo "🔄 Building Master image..." 
+ pushd "$master_root" >/dev/null + master_args=("--tag" "argus-master:latest") + if [[ "$use_intranet" == true ]]; then + master_args+=("--intranet") + fi + if [[ "$build_master_offline" == true ]]; then + master_args+=("--offline") + fi + if ./scripts/build_images.sh "${master_args[@]}"; then + if [[ "$build_master_offline" == true ]]; then + images_built+=("argus-master:offline") + else + images_built+=("argus-master:latest") + fi + else + build_failed=true + fi + popd >/dev/null +fi + echo "=======================================" echo "📦 Build Summary" echo "=======================================" if [[ ${#images_built[@]} -gt 0 ]]; then - echo "✅ Successfully built images:" - for image in "${images_built[@]}"; do - echo " • $image" - done + echo "✅ Successfully built images:" + for image in "${images_built[@]}"; do + echo " • $image" + done fi if [[ "$build_failed" == true ]]; then - echo "" - echo "❌ Some images failed to build. Please check the errors above." - exit 1 + echo "" + echo "❌ Some images failed to build. Please check the errors above." + exit 1 fi if [[ "$use_intranet" == true ]]; then - echo "" - echo "🌐 Built with intranet mirror configuration" + echo "" + echo "🌐 Built with intranet mirror configuration" +fi + +if [[ "$build_master_offline" == true ]]; then + echo "" + echo "🧳 Master offline wheels 已解压到 $master_offline_dir" fi echo "" echo "🚀 Next steps:" -echo " cd src/log && ./scripts/save_images.sh # Export log images" -echo " cd src/bind && ./scripts/save_images.sh # Export bind images" -echo " cd src/log/tests && ./scripts/02_up.sh # Start log services" -echo "" \ No newline at end of file +echo " ./build/save_images.sh --compress # 导出镜像" +echo " cd src/master/tests && MASTER_IMAGE_TAG=argus-master:offline ./scripts/00_e2e_test.sh" +echo "" diff --git a/build/save_images.sh b/build/save_images.sh index ffe7151..20d9c1b 100755 --- a/build/save_images.sh +++ b/build/save_images.sh @@ -67,6 +67,7 @@ declare -A images=( ["argus-elasticsearch:latest"]="argus-elasticsearch-latest.tar" ["argus-kibana:latest"]="argus-kibana-latest.tar" ["argus-bind9:latest"]="argus-bind9-latest.tar" + ["argus-master:offline"]="argus-master-offline.tar" ) # 函数:检查镜像是否存在 diff --git a/configs/.gitignore b/configs/.gitignore new file mode 100644 index 0000000..2f80b1e --- /dev/null +++ b/configs/.gitignore @@ -0,0 +1,2 @@ +# Local overrides for build user/group settings +build_user.local.conf diff --git a/configs/build_user.conf b/configs/build_user.conf new file mode 100644 index 0000000..e4df5be --- /dev/null +++ b/configs/build_user.conf @@ -0,0 +1,6 @@ +# Default build-time UID/GID for Argus images +# Override by creating configs/build_user.local.conf with the same format. +# Syntax: KEY=VALUE, supports UID/GID only. Whitespace and lines starting with # are ignored. 
+
+UID=2133
+GID=2015
diff --git a/doc/build-user-config.md b/doc/build-user-config.md
new file mode 100644
index 0000000..8b809a4
--- /dev/null
+++ b/doc/build-user-config.md
@@ -0,0 +1,38 @@
+# Argus Image Build UID/GID Configuration Guide
+
+A single shared configuration file assigns the runtime account for the Kibana, Elasticsearch, Bind, and Master containers, avoiding the permission problems caused by mismatched UID/GID across machines.
+
+## Configuration Entry Points
+
+- The default configuration lives in `configs/build_user.conf`; example contents:
+
+  ```bash
+  UID=2133
+  GID=2015
+  ```
+
+- For a local override, create `build_user.local.conf` under `configs/` with the same fields as the default file. It is listed in `.gitignore`, so it will not be committed by accident.
+- Values can also be forced through the environment variables `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID` before running a script; these take the highest precedence.
+
+## Scope
+
+- `build/build_images.sh` reads the configuration when building the log/bind/master images and passes `--build-arg ARGUS_BUILD_UID/GID`; the console prints the UID/GID in use.
+- `src/master/scripts/build_images.sh` uses the same configuration, so building the master image on its own behaves identically.
+- Each image's Dockerfile adjusts the in-container account (e.g. `elasticsearch`, `kibana`, `bind`, `argus`) according to the UID/GID passed in, and exposes the effective values as environment variables at runtime.
+- The master startup script drops privileges to the configured account before running `gunicorn` (after the DNS logic), so files written under `/private/argus/**` get the correct owner.
+- The log module test script `01_bootstrap.sh` fixes the ownership of mounted directories according to the configuration, so the end-to-end tests can run under any user.
+
+## Usage Recommendations
+
+1. After a fresh clone nothing needs to change; the default UID/GID stays backward compatible.
+2. To use a new account in the target environment (e.g. `uid=4001,gid=4001`):
+   - edit `configs/build_user.local.conf` with the new values;
+   - log in with the new account and make sure it belongs to the host's `docker` group;
+   - rerun `build/build_images.sh` or the relevant module build scripts.
+3. After switching the configuration, rerun the end-to-end scripts of the affected modules (e.g. `src/log/tests/scripts/01_bootstrap.sh`, `src/master/tests/scripts/00_e2e_test.sh`, `src/agent/tests/scripts/00_e2e_test.sh`) and verify that files under `/private/argus` are owned by the expected account.
+
+## Troubleshooting
+
+- **Image build fails with `groupmod: GID already in use`**: the chosen GID already exists in the base image; pick an unused value, or remove the conflicting group in a custom base image first.
+- **Write permission errors at container runtime**: check whether the mounted host directories were created by the target UID/GID; if necessary, rerun the module's preparation script such as `01_bootstrap.sh`.
+- **Old UID/GID still shows up**: make sure the script run does not inherit stale caches; `ARGUS_BUILD_UID=... ARGUS_BUILD_GID=... ./build/build_images.sh` forces an override.
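A sketch of how a module build script would consume the helper introduced below, based on the usage comment embedded in the file itself (the image tag is hypothetical):

```bash
#!/usr/bin/env bash
# Sketch: consume the shared build-user helper from a module build script.
set -euo pipefail
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
source "$root/scripts/common/build_user.sh"
load_build_user
echo "Building with ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"
# argus_build_user_args prints space-separated --build-arg flags, so the
# unquoted expansion below is intentional. argus-example:latest is hypothetical.
docker build $(argus_build_user_args) -t argus-example:latest .
```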
>/dev/null && pwd) +} + +_argus_trim() { + local value="$1" + value="${value##+([[:space:]])}" + value="${value%%+([[:space:]])}" + printf '%s' "$value" +} + +_argus_is_number() { + [[ "$1" =~ ^[0-9]+$ ]] +} + +load_build_user() { + if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then + return 0 + fi + + local project_root config_files config uid gid + project_root="$(argus_project_root)" + config_files=( + "$project_root/configs/build_user.local.conf" + "$project_root/configs/build_user.conf" + ) + + uid="$ARGUS_BUILD_UID_DEFAULT" + gid="$ARGUS_BUILD_GID_DEFAULT" + + for config in "${config_files[@]}"; do + if [[ -f "$config" ]]; then + while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do + local line key value + line="${raw_line%%#*}" + line="$(_argus_trim "${line}")" + [[ -z "$line" ]] && continue + if [[ "$line" != *=* ]]; then + echo "[ARGUS build_user] Ignoring malformed line in $config: $raw_line" >&2 + continue + fi + key="${line%%=*}" + value="${line#*=}" + key="$(_argus_trim "$key")" + value="$(_argus_trim "$value")" + case "$key" in + UID) + uid="$value" + ;; + GID) + gid="$value" + ;; + *) + echo "[ARGUS build_user] Unknown key '$key' in $config" >&2 + ;; + esac + done < "$config" + break + fi + done + + if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then + uid="$ARGUS_BUILD_UID" + fi + if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then + gid="$ARGUS_BUILD_GID" + fi + + if ! _argus_is_number "$uid"; then + echo "[ARGUS build_user] Invalid UID '$uid'" >&2 + return 1 + fi + if ! _argus_is_number "$gid"; then + echo "[ARGUS build_user] Invalid GID '$gid'" >&2 + return 1 + fi + + export ARGUS_BUILD_UID="$uid" + export ARGUS_BUILD_GID="$gid" + _ARGUS_BUILD_USER_LOADED=1 +} + +argus_build_user_args() { + load_build_user + printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}" +} + +print_build_user() { + load_build_user + echo "ARGUS build user: UID=${ARGUS_BUILD_UID} GID=${ARGUS_BUILD_GID}" +} diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..1b05740 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,2 @@ + +__pycache__/ diff --git a/src/agent/.gitignore b/src/agent/.gitignore new file mode 100644 index 0000000..60fe090 --- /dev/null +++ b/src/agent/.gitignore @@ -0,0 +1,5 @@ +build/ +*.egg-info/ +__pycache__/ + +.env diff --git a/src/agent/README.md b/src/agent/README.md index e69de29..f89334d 100644 --- a/src/agent/README.md +++ b/src/agent/README.md @@ -0,0 +1,66 @@ +# Argus Agent 模块 + +Argus Agent 是一个轻量级 Python 进程,负责向 Argus Master 注册节点、汇报健康数据,并维护本地持久化信息。模块现以 PyInstaller 打包为独立可执行文件,便于在普通容器或虚机中直接运行。 + +## 构建可执行文件 + +```bash +cd src/agent +./scripts/build_binary.sh # 生成 dist/argus-agent +``` + +脚本默认会在 Docker 容器 (`python:3.11-slim-bullseye`) 内执行 PyInstaller,确保产物运行时兼容 glibc 2.31+(覆盖 2.35 环境)。构建流程注意事项: + +- 每次构建前会清理 `build/`、`dist/` 并在容器内重新创建虚拟环境。 +- 需要使用内网 Python 镜像时,可通过 `PIP_INDEX_URL`、`PIP_EXTRA_INDEX_URL`、`PIP_TRUSTED_HOST` 等环境变量传入,脚本会自动透传给容器。 +- 如果宿主机无法运行 Docker,可设置 `AGENT_BUILD_USE_DOCKER=0` 回退到本地构建;此时代码必须在 glibc ≤ 2.35 的机器上执行。 + +构建结束后脚本会在 `build/compat_check/` 下解包关键动态库并输出最高 `GLIBC_x.y` 版本,便于快速核对兼容性。如果结果中缺少 `libssl.so.3` / `libcrypto.so.3`,表示系统会在目标宿主机上使用本地 OpenSSL 库,无需额外处理。 + +例如: + +```bash +strings build/compat_check/libpython*.so.1.0 | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n1 +``` + +如遇构建失败,常见原因是 Docker 不可用(请改用 `AGENT_BUILD_USE_DOCKER=0`)或无法访问 Python 包镜像(先设置上述镜像环境变量后重试)。 + +## 运行时配置 + +Agent 不再依赖配置文件;所有参数均由环境变量与主机名推导: + +| 变量 | 必填 | 默认值 | 说明 | +| --- | --- | --- | --- | +| `MASTER_ENDPOINT` | 是 | N/A | 
diff --git a/src/agent/README.md b/src/agent/README.md
index e69de29..f89334d 100644
--- a/src/agent/README.md
+++ b/src/agent/README.md
@@ -0,0 +1,66 @@
+# Argus Agent Module
+
+The Argus agent is a lightweight Python process that registers the node with the Argus master, reports health data, and maintains locally persisted state. The module is now packaged by PyInstaller into a standalone executable, so it can run directly in plain containers or VMs.
+
+## Building the Executable
+
+```bash
+cd src/agent
+./scripts/build_binary.sh        # produces dist/argus-agent
+```
+
+By default the script runs PyInstaller inside a Docker container (`python:3.11-slim-bullseye`), so the artifact is compatible at runtime with glibc 2.31+ (covering 2.35 environments). Build process notes:
+
+- Each build first cleans `build/` and `dist/` and recreates the virtual environment inside the container.
+- To use an internal Python mirror, pass `PIP_INDEX_URL`, `PIP_EXTRA_INDEX_URL`, `PIP_TRUSTED_HOST`, etc. as environment variables; the script forwards them to the container automatically.
+- If the host cannot run Docker, set `AGENT_BUILD_USE_DOCKER=0` to fall back to a local build; in that case the build must run on a machine with glibc ≤ 2.35.
+
+After the build, the script unpacks the key shared libraries under `build/compat_check/` and prints the highest `GLIBC_x.y` version they reference, for a quick compatibility check. If `libssl.so.3` / `libcrypto.so.3` are missing from the result, the binary will use the local OpenSSL libraries on the target host and no further action is needed.
+
+For example:
+
+```bash
+strings build/compat_check/libpython*.so.1.0 | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n1
+```
+
+If the build fails, the common causes are Docker being unavailable (switch to `AGENT_BUILD_USE_DOCKER=0`) or an unreachable Python package mirror (set the mirror variables above and retry).
+
+## Runtime Configuration
+
+The agent no longer depends on a configuration file; every parameter is derived from environment variables and the hostname:
+
+| Variable | Required | Default | Description |
+| --- | --- | --- | --- |
+| `MASTER_ENDPOINT` | yes | N/A | Master base address; `http://host:3000` or `host:3000` (the `http://` prefix is added automatically). |
+| `REPORT_INTERVAL_SECONDS` | no | `60` | Status report interval in seconds; must be a positive integer. |
+| `AGENT_HOSTNAME` | no | `$(hostname)` | Overrides the in-container hostname, useful for tests or special naming requirements. |
+
+Derived paths:
+
+- Node state: `/private/argus/agent/<hostname>/node.json`
+- Submodule health directory: `/private/argus/agent/<hostname>/health/`
+
+Files in the health directory must follow the `<module-prefix>-*.json` naming convention (e.g. `log-fluentbit.json`, `metric-node-exporter.json`); their contents are merged verbatim into the reported `health` field.
+
+## Logging and Persistence
+
+- The agent emits structured logs at the key points (successful registration, status reports, retries after errors) for aggregation and analysis.
+- `node.json` stores the latest node object returned by the master, so an existing node ID survives restarts.
+
+## End-to-End Tests
+
+The repository provides a Docker Compose test stack (master + ubuntu containers):
+
+```bash
+cd src/agent/tests
+./scripts/00_e2e_test.sh
+```
+
+The test script will:
+
+1. Build the master image and the agent executable.
+2. Start the agent container from `ubuntu:24.04` and inject `MASTER_ENDPOINT` and `REPORT_INTERVAL_SECONDS` via environment variables.
+3. Verify registration, health reporting, nodes.json generation, the statistics endpoint, and the "container restart + IP change" re-registration flow.
+4. Clean up `tests/private/` and the temporary container network.
+
+For a real deployment, mount `dist/argus-agent` together with the health directory onto the target host and set the environment variables per the table above.
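For illustration, a module health probe could publish a file that satisfies the naming and JSON rules above; the field names are illustrative, since the agent forwards whatever JSON it finds:

```bash
# Sketch: what a health probe might write (field names are illustrative assumptions).
mkdir -p "/private/argus/agent/$(hostname)/health"
cat > "/private/argus/agent/$(hostname)/health/log-fluentbit.json" <<'EOF'
{"status": "healthy", "checked_at": "2024-01-01T00:00:00Z"}
EOF
```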
diff --git a/src/agent/app/__init__.py b/src/agent/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/agent/app/client.py b/src/agent/app/client.py
new file mode 100644
index 0000000..f4f8bd6
--- /dev/null
+++ b/src/agent/app/client.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, Optional
+
+import requests
+
+from .log import get_logger
+
+LOGGER = get_logger("argus.agent.client")
+
+
+class MasterAPIError(Exception):
+    def __init__(self, message: str, status_code: int, payload: Optional[Dict[str, Any]] = None) -> None:
+        super().__init__(message)
+        self.status_code = status_code
+        self.payload = payload or {}
+
+
+class AgentClient:
+    def __init__(self, base_url: str, *, timeout: int = 10) -> None:
+        self._base_url = base_url.rstrip("/")
+        self._timeout = timeout
+        self._session = requests.Session()
+
+    def register_node(self, body: Dict[str, Any]) -> Dict[str, Any]:
+        """Call the master registration endpoint and return the node object."""
+        url = f"{self._base_url}/api/v1/master/nodes"
+        response = self._session.post(url, json=body, timeout=self._timeout)
+        return self._parse_response(response, "Failed to register node")
+
+    def update_status(self, node_id: str, body: Dict[str, Any]) -> Dict[str, Any]:
+        """Report health data; the master updates last_report from it."""
+        url = f"{self._base_url}/api/v1/master/nodes/{node_id}/status"
+        response = self._session.put(url, json=body, timeout=self._timeout)
+        return self._parse_response(response, "Failed to update node status")
+
+    def _parse_response(self, response: requests.Response, error_prefix: str) -> Dict[str, Any]:
+        content_type = response.headers.get("Content-Type", "")
+        payload: Dict[str, Any] | None = None
+        if "application/json" in content_type:
+            try:
+                payload = response.json()
+            except json.JSONDecodeError:
+                LOGGER.warning("Response contained invalid JSON", extra={"status": response.status_code})
+
+        if response.status_code >= 400:
+            message = payload.get("error") if isinstance(payload, dict) else response.text
+            raise MasterAPIError(
+                f"{error_prefix}: {message}",
+                status_code=response.status_code,
+                payload=payload if isinstance(payload, dict) else None,
+            )
+
+        if payload is None:
+            try:
+                payload = response.json()
+            except json.JSONDecodeError as exc:
+                raise MasterAPIError("Master returned non-JSON payload", response.status_code) from exc
+        return payload
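For reference, the two HTTP calls that `AgentClient` wraps can be exercised by hand. The endpoint shapes come from the code above; the payloads are trimmed-down versions of what `main.py` sends, and `<node-id>` is a placeholder:

```bash
# Register a node (body abbreviated; main.py also fills meta_data from the collector).
curl -s -X POST http://master.argus.com:3000/api/v1/master/nodes \
  -H 'Content-Type: application/json' \
  -d '{"name": "dev-e2euser-e2einst-pod-0", "type": "agent", "version": "1.1.0", "meta_data": {}}'

# Report status for an existing node.
curl -s -X PUT http://master.argus.com:3000/api/v1/master/nodes/<node-id>/status \
  -H 'Content-Type: application/json' \
  -d '{"timestamp": "2024-01-01T00:00:00Z", "health": {}}'
```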
diff --git a/src/agent/app/collector.py b/src/agent/app/collector.py
new file mode 100644
index 0000000..1b61caa
--- /dev/null
+++ b/src/agent/app/collector.py
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import os
+import re
+import socket
+import subprocess
+from pathlib import Path
+from typing import Any, Dict
+
+from .config import AgentConfig
+from .log import get_logger
+
+LOGGER = get_logger("argus.agent.collector")
+
+_HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
+
+
+def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
+    """Gather the static information needed for node registration."""
+    hostname = config.hostname
+    env, user, instance = _parse_hostname(hostname)
+    meta = {
+        "hostname": hostname,
+        "ip": _detect_ip_address(),
+        "env": env,
+        "user": user,
+        "instance": instance,
+        "cpu_number": _detect_cpu_count(),
+        "memory_in_bytes": _detect_memory_bytes(),
+        "gpu_number": _detect_gpu_count(),
+    }
+    return meta
+
+
+def _parse_hostname(hostname: str) -> tuple[str, str, str]:
+    """Split the hostname according to the agreed env-user-instance prefix."""
+    match = _HOSTNAME_PATTERN.match(hostname)
+    if not match:
+        LOGGER.warning("Hostname does not match expected pattern", extra={"hostname": hostname})
+        return "", "", ""
+    return match.group(1), match.group(2), match.group(3)
+
+
+def _detect_cpu_count() -> int:
+    count = os.cpu_count()
+    return count if count is not None else 0
+
+
+def _detect_memory_bytes() -> int:
+    """Prefer the cgroup limit; fall back to /proc/meminfo when unavailable."""
+    cgroup_path = Path("/sys/fs/cgroup/memory.max")
+    try:
+        raw = cgroup_path.read_text(encoding="utf-8").strip()
+        if raw and raw != "max":
+            return int(raw)
+    except FileNotFoundError:
+        LOGGER.debug("cgroup memory.max not found, falling back to /proc/meminfo")
+    except ValueError:
+        LOGGER.warning("Failed to parse memory.max, falling back", extra={"value": raw})
+
+    try:
+        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
+            for line in handle:
+                if line.startswith("MemTotal:"):
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        return int(parts[1]) * 1024
+    except FileNotFoundError:
+        LOGGER.error("/proc/meminfo not found; defaulting memory to 0")
+    return 0
+
+
+def _detect_gpu_count() -> int:
+    """Detect the GPU count, defaulting to 0 when probing is impossible."""
+    try:
+        proc = subprocess.run(
+            ["nvidia-smi", "-L"],
+            check=False,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            timeout=5,
+        )
+    except FileNotFoundError:
+        LOGGER.debug("nvidia-smi not available; assuming 0 GPUs")
+        return 0
+    except subprocess.SubprocessError as exc:
+        LOGGER.warning("nvidia-smi invocation failed", extra={"error": str(exc)})
+        return 0
+
+    if proc.returncode != 0:
+        LOGGER.debug("nvidia-smi returned non-zero", extra={"stderr": proc.stderr.strip()})
+        return 0
+
+    count = sum(1 for line in proc.stdout.splitlines() if line.strip())
+    return count
+
+
+def _detect_ip_address() -> str:
+    """Obtain the container egress IP via a UDP socket, falling back to hostname resolution."""
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
+            sock.connect(("8.8.8.8", 80))
+            return sock.getsockname()[0]
+    except OSError:
+        LOGGER.debug("UDP socket trick failed; falling back to hostname lookup")
+    try:
+        return socket.gethostbyname(socket.gethostname())
+    except OSError:
+        LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
+        return "127.0.0.1"
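The `env-user-instance` convention enforced by `_HOSTNAME_PATTERN` can be sanity-checked against the hostname the e2e compose file assigns to the agent container:

```bash
# The first three dash-separated fields map to env/user/instance; the rest is ignored.
hostname="dev-e2euser-e2einst-pod-0"
IFS=- read -r env user instance _ <<<"$hostname"
echo "$env/$user/$instance"   # -> dev/e2euser/e2einst
```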
diff --git a/src/agent/app/config.py b/src/agent/app/config.py
new file mode 100644
index 0000000..dae5d47
--- /dev/null
+++ b/src/agent/app/config.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import os
+import socket
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Final
+
+from .version import VERSION
+
+DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
+
+
+@dataclass(frozen=True)
+class AgentConfig:
+    hostname: str
+    node_file: str
+    version: str
+    master_endpoint: str
+    report_interval_seconds: int
+    health_dir: str
+    request_timeout_seconds: int = 10
+
+
+def _normalise_master_endpoint(value: str) -> str:
+    value = value.strip()
+    if not value:
+        raise ValueError("MASTER_ENDPOINT environment variable is required")
+    if not value.startswith("http://") and not value.startswith("https://"):
+        value = f"http://{value}"
+    return value.rstrip("/")
+
+
+def _read_report_interval(raw_value: str | None) -> int:
+    if raw_value is None or raw_value.strip() == "":
+        return DEFAULT_REPORT_INTERVAL_SECONDS
+    try:
+        interval = int(raw_value)
+    except ValueError as exc:
+        raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
+    if interval <= 0:
+        raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
+    return interval
+
+
+def _resolve_hostname() -> str:
+    return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
+
+
+def load_config() -> AgentConfig:
+    """Derive the configuration from environment variables; no external config file is needed."""
+
+    hostname = _resolve_hostname()
+    node_file = f"/private/argus/agent/{hostname}/node.json"
+    health_dir = f"/private/argus/agent/{hostname}/health/"
+
+    master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
+    if master_endpoint_env is None:
+        raise ValueError("MASTER_ENDPOINT environment variable is not set")
+    master_endpoint = _normalise_master_endpoint(master_endpoint_env)
+
+    report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))
+
+    Path(node_file).parent.mkdir(parents=True, exist_ok=True)
+    Path(health_dir).mkdir(parents=True, exist_ok=True)
+
+    return AgentConfig(
+        hostname=hostname,
+        node_file=node_file,
+        version=VERSION,
+        master_endpoint=master_endpoint,
+        report_interval_seconds=report_interval_seconds,
+        health_dir=health_dir,
+    )
diff --git a/src/agent/app/health_reader.py b/src/agent/app/health_reader.py
new file mode 100644
index 0000000..754ca24
--- /dev/null
+++ b/src/agent/app/health_reader.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict
+
+from .log import get_logger
+
+LOGGER = get_logger("argus.agent.health")
+
+
+def read_health_directory(path: str) -> Dict[str, Any]:
+    """Read every <prefix>-*.json file in the directory and return the merged JSON mapping."""
+    result: Dict[str, Any] = {}
+    directory = Path(path)
+    if not directory.exists():
+        LOGGER.debug("Health directory does not exist", extra={"path": str(directory)})
+        return result
+
+    for health_file in sorted(directory.glob("*.json")):
+        if "-" not in health_file.stem:
+            LOGGER.debug("Skipping non-prefixed health file", extra={"file": health_file.name})
+            continue
+        try:
+            with health_file.open("r", encoding="utf-8") as handle:
+                content = json.load(handle)
+            result[health_file.stem] = content
+        except json.JSONDecodeError as exc:
+            LOGGER.warning("Failed to parse health file", extra={"file": health_file.name, "error": str(exc)})
+        except OSError as exc:
+            LOGGER.warning("Failed to read health file", extra={"file": health_file.name, "error": str(exc)})
+    return result
diff --git a/src/agent/app/log.py b/src/agent/app/log.py
new file mode 100644
index 0000000..fffecbe
--- /dev/null
+++ b/src/agent/app/log.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import logging
+import os
+
+
+_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s - %(message)s"
+
+
+def setup_logging() -> None:
+    level_name = os.environ.get("AGENT_LOG_LEVEL", "INFO").upper()
+    level = getattr(logging, level_name, logging.INFO)
+    logging.basicConfig(level=level, format=_LOG_FORMAT)
+
+
+def get_logger(name: str) -> logging.Logger:
+    setup_logging()
+    return logging.getLogger(name)
diff --git a/src/agent/app/main.py b/src/agent/app/main.py
new file mode 100644
index 0000000..c5e2ba0
--- /dev/null
+++ b/src/agent/app/main.py
@@ -0,0 +1,164 @@
+from __future__ import annotations
+
+import signal
+import sys
+import time
+from datetime import datetime, timezone
+from typing import Optional
+
+from .client import AgentClient, MasterAPIError
+from .collector import collect_metadata
+from .config import AgentConfig, load_config
+from .health_reader import read_health_directory
+from .log import get_logger, setup_logging
+from .state import clear_node_state, load_node_state, save_node_state
+
+LOGGER = get_logger("argus.agent")
+
+
+def _current_timestamp() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+class StopSignal:
+    def __init__(self) -> None:
+        self._stop = False
+
+    def set(self, *_args) -> None:  # type: ignore[override]
+        self._stop = True
+
+    def is_set(self) -> bool:
+        return self._stop
+
+
+def main(argv: Optional[list[str]] = None) -> int:  # noqa: ARG001 - keep the signature for entry-point compatibility
+    setup_logging()
+
+    stop_signal = StopSignal()
+    signal.signal(signal.SIGTERM, stop_signal.set)
+    signal.signal(signal.SIGINT, stop_signal.set)
+
+    try:
+        config = load_config()
+    except Exception as exc:
+        LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
+        return 1
+
+    LOGGER.info(
+        "Agent starting",
+        extra={
+            "hostname": config.hostname,
+            "master_endpoint": config.master_endpoint,
+            "node_file": config.node_file,
+        },
+    )
+
+    client = AgentClient(config.master_endpoint, timeout=config.request_timeout_seconds)
+
+    node_state = load_node_state(config.node_file) or {}
+    node_id = node_state.get("id")
+
+    # Register with the master (re-registration supported); retry until it succeeds
+    register_response = _register_with_retry(client, config, node_id, stop_signal)
+    if register_response is None:
+        LOGGER.info("Registration aborted due to shutdown signal")
+        return 0
+
+    node_id = register_response.get("id")
+    if not node_id:
+        LOGGER.error("Master did not return node id; aborting")
+        return 1
+    save_node_state(config.node_file, register_response)
+
+    LOGGER.info("Entering status report loop", extra={"node_id": node_id})
+    _status_loop(client, config, node_id, stop_signal)
+    return 0
+
+
+def _register_with_retry(
+    client: AgentClient,
+    config: AgentConfig,
+    node_id: Optional[str],
+    stop_signal: StopSignal,
+):
+    backoff = 5
+    while not stop_signal.is_set():
+        payload = {
+            "name": config.hostname,
+            "type": "agent",
+            "meta_data": collect_metadata(config),
+            "version": config.version,
+        }
+        if node_id:
+            payload["id"] = node_id
+
+        try:
+            response = client.register_node(payload)
+            LOGGER.info("Registration successful", extra={"node_id": response.get("id")})
+            save_node_state(config.node_file, response)
+            return response
+        except MasterAPIError as exc:
+            if exc.status_code == 404 and node_id:
+                LOGGER.warning(
+                    "Master does not recognise node id; clearing local node state",
+                    extra={"node_id": node_id},
+                )
+                clear_node_state(config.node_file)
+                node_id = None
+            elif exc.status_code == 500 and node_id:
+                # An id/name mismatch usually indicates misconfiguration; log it and keep retrying
+                LOGGER.error(
+                    "Master rejected node due to id/name mismatch; will retry",
+                    extra={"node_id": node_id},
+                )
+            else:
+                LOGGER.error("Registration failed", extra={"status_code": exc.status_code, "error": str(exc)})
+            time.sleep(min(backoff, 60))
+            backoff = min(backoff * 2, 60)
+        except Exception as exc:  # pragma: no cover - defensive
+            LOGGER.exception("Unexpected error during 
registration", extra={"error": str(exc)}) + time.sleep(min(backoff, 60)) + backoff = min(backoff * 2, 60) + return None + + +def _status_loop( + client: AgentClient, + config: AgentConfig, + node_id: str, + stop_signal: StopSignal, +) -> None: + interval = config.report_interval_seconds + while not stop_signal.is_set(): + timestamp = _current_timestamp() + health_payload = read_health_directory(config.health_dir) + body = { + "timestamp": timestamp, + "health": health_payload, + } + try: + response = client.update_status(node_id, body) + LOGGER.info( + "Status report succeeded", + extra={"node_id": node_id, "health_keys": list(health_payload.keys())}, + ) + save_node_state(config.node_file, response) + except MasterAPIError as exc: + # 保持循环继续执行,等待下一次重试 + LOGGER.error( + "Failed to report status", + extra={"status_code": exc.status_code, "error": str(exc)}, + ) + except Exception as exc: # pragma: no cover - defensive + LOGGER.exception("Unexpected error during status report", extra={"error": str(exc)}) + + for _ in range(interval): + if stop_signal.is_set(): + break + time.sleep(1) + + LOGGER.info("Stop signal received; exiting status loop") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/agent/app/state.py b/src/agent/app/state.py new file mode 100644 index 0000000..5cf6211 --- /dev/null +++ b/src/agent/app/state.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import json +import os +import tempfile +from pathlib import Path +from typing import Any, Dict, Optional + +from .log import get_logger + +LOGGER = get_logger("argus.agent.state") + + +def load_node_state(path: str) -> Optional[Dict[str, Any]]: + """读取本地 node.json,容器重启后沿用之前的 ID。""" + try: + with open(path, "r", encoding="utf-8") as handle: + return json.load(handle) + except FileNotFoundError: + return None + except json.JSONDecodeError as exc: + LOGGER.warning("node.json is invalid JSON; ignoring", extra={"error": str(exc)}) + return None + + +def save_node_state(path: str, data: Dict[str, Any]) -> None: + """原子化写入 node.json,避免并发读取坏数据。""" + directory = Path(path).parent + directory.mkdir(parents=True, exist_ok=True) + with tempfile.NamedTemporaryFile("w", dir=directory, delete=False, encoding="utf-8") as tmp: + json.dump(data, tmp, separators=(",", ":")) + tmp.flush() + os.fsync(tmp.fileno()) + temp_path = tmp.name + os.replace(temp_path, path) + + +def clear_node_state(path: str) -> None: + try: + os.remove(path) + except FileNotFoundError: + return + except OSError as exc: + LOGGER.warning("Failed to remove node state file", extra={"error": str(exc), "path": path}) diff --git a/src/agent/app/version.py b/src/agent/app/version.py new file mode 100644 index 0000000..97a14f8 --- /dev/null +++ b/src/agent/app/version.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import os +import sys +from pathlib import Path +from typing import Optional + +import importlib.metadata + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover + import tomli as tomllib # type: ignore[no-redef] + + +def _candidate_paths() -> list[Path]: + paths = [] + bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None) + if bundle_dir: + paths.append(Path(bundle_dir) / "pyproject.toml") + paths.append(Path(__file__).resolve().parent.parent / "pyproject.toml") + paths.append(Path(__file__).resolve().parent / "pyproject.toml") + paths.append(Path.cwd() / "pyproject.toml") + return paths + + +def _read_from_pyproject() -> Optional[str]: + for path in _candidate_paths(): + if not path.exists(): + continue + 
try: + with path.open("rb") as handle: + data = tomllib.load(handle) + except (OSError, tomllib.TOMLDecodeError): + continue + project = data.get("project") + if isinstance(project, dict): + version = project.get("version") + if isinstance(version, str): + return version + tool = data.get("tool") + if isinstance(tool, dict): + argus_cfg = tool.get("argus") + if isinstance(argus_cfg, dict): + version = argus_cfg.get("version") + if isinstance(version, str): + return version + return None + + +def _detect_version() -> str: + try: + return importlib.metadata.version("argus-agent") + except importlib.metadata.PackageNotFoundError: + pass + override = os.environ.get("AGENT_VERSION_OVERRIDE") + if override: + return override + fallback = _read_from_pyproject() + if fallback: + return fallback + return "0.0.0" + + +VERSION: str = _detect_version() + + +def get_version() -> str: + return VERSION diff --git a/src/agent/dist/argus-agent b/src/agent/dist/argus-agent new file mode 100755 index 0000000..4fef67c Binary files /dev/null and b/src/agent/dist/argus-agent differ diff --git a/src/agent/entry.py b/src/agent/entry.py new file mode 100644 index 0000000..39197b1 --- /dev/null +++ b/src/agent/entry.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import sys + +from app.main import main as agent_main + + +if __name__ == "__main__": + sys.exit(agent_main()) diff --git a/src/agent/pyproject.toml b/src/agent/pyproject.toml new file mode 100644 index 0000000..627766e --- /dev/null +++ b/src/agent/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "argus-agent" +version = "1.1.0" +description = "Argus agent binary" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "requests==2.31.0" +] + +[build-system] +requires = ["setuptools>=69", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.argus] +entry = "app.main:main" + +[tool.setuptools] +packages = ["app"] diff --git a/src/agent/scripts/agent_deployment_verify.sh b/src/agent/scripts/agent_deployment_verify.sh new file mode 100755 index 0000000..bdea058 --- /dev/null +++ b/src/agent/scripts/agent_deployment_verify.sh @@ -0,0 +1,690 @@ +#!/usr/bin/env bash +set -euo pipefail + +LOG_PREFIX="[AGENT-VERIFY]" +MASTER_ENDPOINT_DEFAULT="" +AGENT_DATA_ROOT_DEFAULT="/private/argus/agent" +AGENT_ETC_ROOT_DEFAULT="/private/argus/etc" +REPORT_INTERVAL_DEFAULT="2" + +ALLOW_CONFIG_TOUCH="false" +KEEP_TEST_HEALTH="false" + +log_info() { + echo "${LOG_PREFIX} INFO $*" +} + +log_warn() { + echo "${LOG_PREFIX} WARN $*" >&2 +} + +log_error() { + echo "${LOG_PREFIX} ERROR $*" >&2 +} + +usage() { + cat <<'USAGE' +Usage: agent_deployment_verify.sh [options] + +Options: + --allow-config-touch Enable optional config PUT dry-run check. + --keep-test-health Keep the temporary verify health file after checks. + -h, --help Show this help message. + +Environment variables: + MASTER_ENDPOINT (required) Master API base endpoint, e.g. 
http://master:3000 + AGENT_DATA_ROOT (default: /private/argus/agent) + AGENT_ETC_ROOT (default: /private/argus/etc) + VERIFY_HOSTNAME (default: output of hostname) + REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds +USAGE +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --allow-config-touch) + ALLOW_CONFIG_TOUCH="true" + shift + ;; + --keep-test-health) + KEEP_TEST_HEALTH="true" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + log_error "Unknown option: $1" + usage >&2 + exit 2 + ;; + esac +done + +MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}" +AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}" +AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}" +VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}" +REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}" + +if [[ -z "$MASTER_ENDPOINT" ]]; then + log_error "MASTER_ENDPOINT is required" + exit 2 +fi + +if ! [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] || [[ "$REPORT_INTERVAL_SECONDS" -le 0 ]]; then + log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT" + REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT" +fi + +normalize_endpoint() { + local endpoint="$1" + if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then + endpoint="http://$endpoint" + fi + endpoint="${endpoint%/}" + echo "$endpoint" +} + +MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")" + +NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME" +NODE_JSON="$NODE_DIR/node.json" +HEALTH_DIR="$NODE_DIR/health" +DNS_CONF="$AGENT_ETC_ROOT/dns.conf" +UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh" + +declare -a RESULTS_PASS=() +declare -a RESULTS_WARN=() +declare -a RESULTS_FAIL=() + +add_result() { + local level="$1" message="$2" + case "$level" in + PASS) + RESULTS_PASS+=("$message") + log_info "$message" + ;; + WARN) + RESULTS_WARN+=("$message") + log_warn "$message" + ;; + FAIL) + RESULTS_FAIL+=("$message") + log_error "$message" + ;; + esac +} + +HAS_JQ="0" +if command -v jq >/dev/null 2>&1; then + HAS_JQ="1" +fi + +if ! command -v curl >/dev/null 2>&1; then + log_error "curl command not found; please install curl (e.g. apt-get install -y curl)" + exit 2 +fi + +if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then + log_error "Neither jq nor python3 is available for JSON processing" + exit 2 +fi + +CURL_OPTS=(--fail --show-error --silent --max-time 10) + +curl_json() { + local url="$1" + if ! curl "${CURL_OPTS[@]}" "$url"; then + return 1 + fi +} + +json_query() { + local json="$1" jq_expr="$2" py_expr="$3" + if [[ "$HAS_JQ" == "1" ]]; then + if ! output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null); then + return 1 + fi + printf '%s' "$output" + return 0 + fi + + python3 - "$py_expr" <<'PY' +import json +import sys + +expr = sys.argv[1] +try: + data = json.load(sys.stdin) + value = eval(expr, {}, {"data": data}) +except Exception: + sys.exit(1) +if value is None: + sys.exit(1) +if isinstance(value, (dict, list)): + print(json.dumps(value)) +else: + print(value) +PY +} + +json_length() { + local json="$1" jq_expr="$2" py_expr="$3" + if [[ "$HAS_JQ" == "1" ]]; then + if ! 
output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null); then + return 1 + fi + printf '%s' "$output" + return 0 + fi + + python3 - "$py_expr" <<'PY' +import json +import sys + +expr = sys.argv[1] +try: + data = json.load(sys.stdin) + value = eval(expr, {}, {"data": data}) +except Exception: + sys.exit(1) +try: + print(len(value)) +except Exception: + sys.exit(1) +PY +} + +json_has_key() { + local json="$1" jq_expr="$2" py_expr="$3" + if [[ "$HAS_JQ" == "1" ]]; then + if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then + return 0 + fi + return 1 + fi + + python3 - "$py_expr" <<'PY' +import json +import sys + +expr = sys.argv[1] +try: + data = json.load(sys.stdin) + value = eval(expr, {}, {"data": data}) +except Exception: + sys.exit(1) +if value: + sys.exit(0) +sys.exit(1) +PY +} + +iso_to_epoch() { + local value="$1" + if command -v date >/dev/null 2>&1; then + date -d "$value" +%s 2>/dev/null && return 0 + fi + if command -v python3 >/dev/null 2>&1; then + python3 - "$value" <<'PY' +import sys +from datetime import datetime + +value = sys.argv[1] +if value is None or value == "": + sys.exit(1) +if value.endswith('Z'): + value = value[:-1] + '+00:00' +try: + dt = datetime.fromisoformat(value) +except ValueError: + sys.exit(1) +print(int(dt.timestamp())) +PY + return $? + fi + return 1 +} + +validate_json_file() { + local path="$1" + if [[ "$HAS_JQ" == "1" ]]; then + jq empty "$path" >/dev/null 2>&1 && return 0 + return 1 + fi + if command -v python3 >/dev/null 2>&1; then + python3 - "$path" <<'PY' +import json +import sys +path = sys.argv[1] +with open(path, 'r', encoding='utf-8') as handle: + json.load(handle) +PY + return $? + fi + return 0 +} + +ensure_directory() { + local dir="$1" + if [[ ! -d "$dir" ]]; then + log_warn "Creating missing directory $dir" + mkdir -p "$dir" + fi +} + +TEST_HEALTH_FILE="" +TEST_HEALTH_BACKUP="" +TEST_HEALTH_EXISTED="false" + +cleanup() { + if [[ -n "$TEST_HEALTH_FILE" ]]; then + if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then + printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE" + elif [[ "$KEEP_TEST_HEALTH" == "true" ]]; then + : + else + rm -f "$TEST_HEALTH_FILE" + fi + fi +} + +trap cleanup EXIT + +log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'" + +# 4.2 Master health checks +health_resp="" +if ! health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>/tmp/agent_verify_healthz.err); then + error_detail=$(cat /tmp/agent_verify_healthz.err || true) + add_result FAIL "GET /healthz failed: $error_detail" +else + http_meta=$(tail -n1 <<<"$health_resp") + payload=$(head -n -1 <<<"$health_resp" || true) + status_code=${http_meta%% *} + elapsed=${http_meta##* } + add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload" +fi +rm -f /tmp/agent_verify_healthz.err + +if ! readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>/tmp/agent_verify_readyz.err); then + error_detail=$(cat /tmp/agent_verify_readyz.err || true) + add_result FAIL "GET /readyz failed: $error_detail" + readyz_payload="" +else + readyz_meta=$(tail -n1 <<<"$readyz_resp") + readyz_payload=$(head -n -1 <<<"$readyz_resp" || true) + readyz_status=${readyz_meta%% *} + readyz_elapsed=${readyz_meta##* } + add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s" +fi +rm -f /tmp/agent_verify_readyz.err + +# 4.3 Nodes list and detail +if ! 
nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>/tmp/agent_verify_nodes.err); then + error_detail=$(cat /tmp/agent_verify_nodes.err || true) + add_result FAIL "GET /api/v1/master/nodes failed: $error_detail" + nodes_json="" +fi +rm -f /tmp/agent_verify_nodes.err + +NODE_ENTRY="" +NODE_ID="" +NODE_IP="" +if [[ -n "$nodes_json" ]]; then + if [[ "$HAS_JQ" == "1" ]]; then + NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY="" + else + NODE_ENTRY=$(python3 - "$VERIFY_HOSTNAME" <<'PY' +import json +import sys + +hostname = sys.argv[1] +nodes = json.load(sys.stdin) +for node in nodes: + if node.get("name") == hostname: + import json as _json + print(_json.dumps(node)) + sys.exit(0) +sys.exit(1) +PY + ) || NODE_ENTRY="" + fi + + if [[ -z "$NODE_ENTRY" ]]; then + add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list" + else + if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then + add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'" + else + add_result FAIL "Failed to extract node id from master response" + fi + fi + + if [[ -n "$NODE_ENTRY" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_node_detail.err); then + NODE_DETAIL_JSON="$NODE_DETAIL" + add_result PASS "Fetched node detail for $NODE_ID" + if NODE_IP=$(json_query "$NODE_DETAIL_JSON" '.meta_data.ip // .meta_data.host_ip // empty' 'data.get("meta_data", {}).get("ip") or data.get("meta_data", {}).get("host_ip") or ""'); then + if [[ -n "$NODE_IP" ]]; then + add_result PASS "Registered node IP=$NODE_IP" + else + add_result INFO "Node detail does not expose IP fields" + fi + fi + else + error_detail=$(cat /tmp/agent_verify_node_detail.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail" + NODE_DETAIL_JSON="" + fi + rm -f /tmp/agent_verify_node_detail.err + + if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>/tmp/agent_verify_stats.err); then + if total_nodes=$(json_query "$stats_json" '.total // .total_nodes' 'data.get("total") or data.get("total_nodes")'); then + if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then + add_result PASS "Statistics total=$total_nodes" + else + add_result WARN "Statistics total field not numeric: $total_nodes" + fi + else + add_result WARN "Unable to read total field from statistics" + fi + + active_nodes="" + if [[ "$HAS_JQ" == "1" ]]; then + active_nodes=$(printf '%s' "$stats_json" | jq -e 'if .status_statistics then (.status_statistics[] | select(.status == "online") | .count) else empty end' 2>/dev/null | head -n1 || true) + elif command -v python3 >/dev/null 2>&1; then + active_nodes=$(printf '%s' "$stats_json" | python3 -c 'import json,sys; data=json.load(sys.stdin); print(next((row.get("count") for row in data.get("status_statistics", []) if row.get("status")=="online"), ""))' 2>/dev/null) + fi + if [[ -n "$active_nodes" ]]; then + add_result PASS "Online nodes reported by master: $active_nodes" + fi + + if [[ "$HAS_JQ" == "1" ]]; then + node_count=$(printf '%s' "$nodes_json" | jq 'length') + else + node_count=$(json_length "$nodes_json" 'length' 'len(data)') + fi + if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -lt "$node_count" ]]; then + add_result WARN "Statistics total=$total_nodes less than nodes list count=$node_count" + fi + else + error_detail=$(cat 
/tmp/agent_verify_stats.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node statistics: $error_detail" + fi + rm -f /tmp/agent_verify_stats.err +else + NODE_DETAIL_JSON="" +fi + +# 4.4 Agent persistence checks +if [[ -f "$NODE_JSON" ]]; then + node_file_content="$(cat "$NODE_JSON")" + if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then + if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then + add_result PASS "node.json id matches master ($NODE_ID)" + else + add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'" + fi + else + add_result FAIL "Unable to extract id from node.json" + fi + if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then + if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then + add_result PASS "node.json name matches $VERIFY_HOSTNAME" + else + add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'" + fi + else + add_result FAIL "Unable to extract name from node.json" + fi + + if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then + if iso_to_epoch "$register_time" >/dev/null 2>&1; then + add_result PASS "node.json register_time valid ISO timestamp" + else + add_result WARN "node.json register_time invalid: $register_time" + fi + else + add_result WARN "node.json missing register_time" + fi + + if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then + if iso_to_epoch "$last_updated" >/dev/null 2>&1; then + add_result PASS "node.json last_updated valid ISO timestamp" + else + add_result WARN "node.json last_updated invalid: $last_updated" + fi + else + add_result WARN "node.json missing last_updated" + fi +else + add_result FAIL "node.json not found at $NODE_JSON" + node_file_content="" +fi + +ensure_directory "$HEALTH_DIR" + +if [[ -d "$HEALTH_DIR" ]]; then + shopt -s nullglob + health_files=("$HEALTH_DIR"/*.json) + shopt -u nullglob + if [[ ${#health_files[@]} -eq 0 ]]; then + add_result WARN "Health directory $HEALTH_DIR is empty" + else + for hf in "${health_files[@]}"; do + base=$(basename "$hf") + if [[ "$base" != *-* ]]; then + add_result WARN "Health file $base does not follow -*.json" + continue + fi + if ! 
validate_json_file "$hf" >/dev/null 2>&1; then + add_result WARN "Health file $base is not valid JSON" + fi + done + fi +else + add_result WARN "Health directory $HEALTH_DIR missing" +fi + +if getent hosts master.argus.com >/dev/null 2>&1; then + resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs) + add_result PASS "master.argus.com resolves to $resolved_ips" +else + add_result FAIL "Failed to resolve master.argus.com" +fi + +# 4.5 Master-Node status consistency +sleep_interval=$((REPORT_INTERVAL_SECONDS + 2)) + +if [[ -n "$NODE_DETAIL_JSON" ]]; then + detail_pre="$NODE_DETAIL_JSON" +else + detail_pre="" +fi + +if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then + if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_pre.err); then + add_result PASS "Fetched node detail pre-check" + else + error_detail=$(cat /tmp/agent_verify_detail_pre.err 2>/dev/null || true) + add_result FAIL "Unable to fetch node detail for status check: $error_detail" + fi + rm -f /tmp/agent_verify_detail_pre.err +fi + +server_ts_pre="" +agent_ts_pre="" +server_ts_post="" +agent_ts_post="" + +if [[ -n "$detail_pre" ]]; then + server_ts_pre=$(json_query "$detail_pre" '.last_report' 'data.get("last_report")' || echo "") + agent_ts_pre=$(json_query "$detail_pre" '.agent_last_report' 'data.get("agent_last_report")' || echo "") + log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'" + + sleep "$sleep_interval" + + if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_post.err); then + server_ts_post=$(json_query "$detail_post" '.last_report' 'data.get("last_report")' || echo "") + agent_ts_post=$(json_query "$detail_post" '.agent_last_report' 'data.get("agent_last_report")' || echo "") + if [[ "$server_ts_post" != "$server_ts_pre" ]]; then + add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)" + else + add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s" + fi + if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then + add_result PASS "last_report.agent_timestamp advanced" + else + add_result FAIL "last_report.agent_timestamp did not change" + fi + + if [[ -n "$node_file_content" ]]; then + if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then + if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then + if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then + diff=$((epoch_post - node_epoch)) + [[ $diff -lt 0 ]] && diff=$((-diff)) + tolerance=$((REPORT_INTERVAL_SECONDS * 2)) + if [[ $diff -le $tolerance ]]; then + add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)" + else + add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s" + fi + fi + fi + fi + fi + + NODE_DETAIL_JSON="$detail_post" + else + error_detail=$(cat /tmp/agent_verify_detail_post.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail post-check: $error_detail" + fi + rm -f /tmp/agent_verify_detail_post.err +fi + +# 4.6 Health simulation +TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json" +ensure_directory "$HEALTH_DIR" + +if [[ -f "$TEST_HEALTH_FILE" ]]; then + TEST_HEALTH_EXISTED="true" + TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")" +else + TEST_HEALTH_EXISTED="false" +fi + +create_health_file() { + local message="$1" + cat > "$TEST_HEALTH_FILE" 
</tmp/agent_verify_health1.err); then + if validate_health_in_master "$health_message_one" "$detail_health_one"; then + add_result PASS "Master reflects verify-master health message" + else + add_result FAIL "Master health payload does not match test message" + fi +else + error_detail=$(cat /tmp/agent_verify_health1.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail during health validation: $error_detail" + detail_health_one="" +fi +rm -f /tmp/agent_verify_health1.err + +health_message_two="verify $(date +%s)-update" +create_health_file "$health_message_two" +sleep "$sleep_interval" +if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health2.err); then + if validate_health_in_master "$health_message_two" "$detail_health_two"; then + add_result PASS "Master health updated to new message" + else + add_result FAIL "Master health message did not update" + fi +else + error_detail=$(cat /tmp/agent_verify_health2.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail after health update: $error_detail" + detail_health_two="" +fi +rm -f /tmp/agent_verify_health2.err + +rm -f "$TEST_HEALTH_FILE" +sleep "$sleep_interval" +if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health3.err); then + if remove_health_from_master "$detail_health_three"; then + add_result PASS "Master health no longer lists verify-master after removal" + else + add_result FAIL "Master health still contains verify-master after file deletion" + fi +else + error_detail=$(cat /tmp/agent_verify_health3.err 2>/dev/null || true) + add_result FAIL "Failed to fetch node detail after health removal: $error_detail" +fi +rm -f /tmp/agent_verify_health3.err + +if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then + printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE" +fi + +# Optional config touch +if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then + if [[ -n "$NODE_ID" ]]; then + payload='{"label": {"verify": "true"}}' + if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >/tmp/agent_verify_config.log 2>&1; then + add_result PASS "Config PUT dry-run succeeded" + else + add_result WARN "Config PUT dry-run failed: $(cat /tmp/agent_verify_config.log)" + fi + rm -f /tmp/agent_verify_config.log + fi +else + add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)" +fi + +# Result summary +echo +echo "==== Verification Summary ====" +for entry in "${RESULTS_PASS[@]}"; do + printf 'PASS: %s\n' "$entry" +done +for entry in "${RESULTS_WARN[@]}"; do + printf 'WARN: %s\n' "$entry" +done +for entry in "${RESULTS_FAIL[@]}"; do + printf 'FAIL: %s\n' "$entry" +done + +if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then + exit 1 +fi + +exit 0 diff --git a/src/agent/scripts/build_binary.sh b/src/agent/scripts/build_binary.sh new file mode 100755 index 0000000..7e5a720 --- /dev/null +++ b/src/agent/scripts/build_binary.sh @@ -0,0 +1,269 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +MODULE_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +BUILD_ROOT="$MODULE_ROOT/build" +DIST_DIR="$MODULE_ROOT/dist" +PYINSTALLER_BUILD="$BUILD_ROOT/pyinstaller" +PYINSTALLER_SPEC="$PYINSTALLER_BUILD/spec" +PYINSTALLER_WORK="$PYINSTALLER_BUILD/work" +VENV_DIR="$BUILD_ROOT/venv" + +AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}" +AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}" +USED_DOCKER=0 + +run_host_build() { + echo "[INFO] Using host Python environment for build" >&2 + rm -rf "$BUILD_ROOT" "$DIST_DIR" + mkdir -p "$PYINSTALLER_BUILD" "$DIST_DIR" + python3 -m venv --copies "$VENV_DIR" + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + + pip install --upgrade pip + pip install . + pip install "pyinstaller==6.6.0" + + pyinstaller \ + --clean \ + --onefile \ + --name argus-agent \ + --distpath "$DIST_DIR" \ + --workpath "$PYINSTALLER_WORK" \ + --specpath "$PYINSTALLER_SPEC" \ + --add-data "$MODULE_ROOT/pyproject.toml:." \ + "$MODULE_ROOT/entry.py" + + chmod +x "$DIST_DIR/argus-agent" + deactivate +} + +run_docker_build() { + if ! command -v docker >/dev/null 2>&1; then + echo "[ERROR] docker 命令不存在,无法在容器内构建。请安装 Docker 或设置 AGENT_BUILD_USE_DOCKER=0" >&2 + exit 1 + fi + + USED_DOCKER=1 + echo "[INFO] Building agent binary inside $AGENT_BUILD_IMAGE" >&2 + + local host_uid host_gid + host_uid="$(id -u)" + host_gid="$(id -g)" + docker_env=("--rm" "-v" "$MODULE_ROOT:/workspace" "-w" "/workspace" "--env" "TARGET_UID=${host_uid}" "--env" "TARGET_GID=${host_gid}") + + pass_env_if_set() { + local var="$1" + local value="${!var:-}" + if [[ -n "$value" ]]; then + docker_env+=("--env" "$var=$value") + fi + } + + pass_env_if_set PIP_INDEX_URL + pass_env_if_set PIP_EXTRA_INDEX_URL + pass_env_if_set PIP_TRUSTED_HOST + pass_env_if_set HTTP_PROXY + pass_env_if_set HTTPS_PROXY + pass_env_if_set NO_PROXY + pass_env_if_set http_proxy + pass_env_if_set https_proxy + pass_env_if_set no_proxy + +build_script=$(cat <<'INNER' +set -euo pipefail +cd /workspace +apt-get update >/dev/null +apt-get install -y --no-install-recommends binutils >/dev/null +rm -rf /var/lib/apt/lists/* +rm -rf build dist +mkdir -p build/pyinstaller dist +python3 -m venv --copies build/venv +source build/venv/bin/activate +pip install --upgrade pip +pip install . +pip install pyinstaller==6.6.0 +pyinstaller \ + --clean \ + --onefile \ + --name argus-agent \ + --distpath dist \ + --workpath build/pyinstaller/work \ + --specpath build/pyinstaller/spec \ + --add-data /workspace/pyproject.toml:. 
\ + entry.py +chmod +x dist/argus-agent + +TARGET_UID="${TARGET_UID:-0}" +TARGET_GID="${TARGET_GID:-0}" +chown -R "$TARGET_UID:$TARGET_GID" dist build 2>/dev/null || true + +python3 - <<'PY' +from pathlib import Path +from PyInstaller.archive.readers import CArchiveReader +import sys + +archive = Path('dist/argus-agent') +out_dir = Path('build/compat_check') +out_dir.mkdir(parents=True, exist_ok=True) + +major, minor = sys.version_info[:2] +libpython = f'libpython{major}.{minor}.so.1.0' +expected_libs = [ + libpython, + 'libssl.so.3', + 'libcrypto.so.3', +] +reader = CArchiveReader(str(archive)) +extracted = [] +missing = [] +for name in expected_libs: + try: + data = reader.extract(name) + except KeyError: + missing.append(name) + continue + (out_dir / name).write_bytes(data) + extracted.append(name) +(out_dir / 'manifest').write_text('\n'.join(extracted)) +if extracted: + print('[INFO] Extracted libraries: ' + ', '.join(extracted)) +if missing: + print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing)) +PY + +compat_check() { + local lib_path="$1" + if [[ ! -f "$lib_path" ]]; then + echo "[WARN] Missing $lib_path for GLIBC check" + return + fi + local max_glibc + max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true) + if [[ -n "$max_glibc" ]]; then + echo "[INFO] $lib_path references up to $max_glibc" + else + echo "[INFO] $lib_path does not expose GLIBC version strings" + fi +} + +compat_libs=() +if [[ -f build/compat_check/manifest ]]; then + mapfile -t compat_libs < build/compat_check/manifest +fi + +if [[ ${#compat_libs[@]} -eq 0 ]]; then + echo "[WARN] No libraries captured for GLIBC inspection" +else + for lib in "${compat_libs[@]}"; do + compat_check "build/compat_check/$lib" + done +fi + +deactivate +INNER + ) + + if ! docker run "${docker_env[@]}" "$AGENT_BUILD_IMAGE" bash -lc "$build_script"; then + echo "[ERROR] Docker 构建失败,请检查 Docker 权限或设置 AGENT_BUILD_USE_DOCKER=0 在兼容主机上构建" >&2 + exit 1 + fi +} + +if [[ "$AGENT_BUILD_USE_DOCKER" == "1" ]]; then + run_docker_build +else + run_host_build +fi + +if [[ ! -f "$DIST_DIR/argus-agent" ]]; then + echo "[ERROR] Agent binary was not produced" >&2 + exit 1 +fi + +if [[ "$USED_DOCKER" != "1" ]]; then + if [[ ! 
-x "$VENV_DIR/bin/python" ]]; then + echo "[WARN] PyInstaller virtualenv missing at $VENV_DIR; skipping compatibility check" >&2 + else + COMPAT_DIR="$BUILD_ROOT/compat_check" + rm -rf "$COMPAT_DIR" + mkdir -p "$COMPAT_DIR" + + EXTRACT_SCRIPT=$(cat <<'PY' +from pathlib import Path +from PyInstaller.archive.readers import CArchiveReader +import sys + +archive = Path('dist/argus-agent') +out_dir = Path('build/compat_check') +out_dir.mkdir(parents=True, exist_ok=True) + +major, minor = sys.version_info[:2] +libpython = f'libpython{major}.{minor}.so.1.0' +expected_libs = [ + libpython, + 'libssl.so.3', + 'libcrypto.so.3', +] +reader = CArchiveReader(str(archive)) +extracted = [] +missing = [] +for name in expected_libs: + try: + data = reader.extract(name) + except KeyError: + missing.append(name) + continue + (out_dir / name).write_bytes(data) + extracted.append(name) +(out_dir / 'manifest').write_text('\n'.join(extracted)) +if extracted: + print('[INFO] Extracted libraries: ' + ', '.join(extracted)) +if missing: + print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing)) +PY +) + + "$VENV_DIR/bin/python" - <&2 + return + fi + if command -v strings >/dev/null 2>&1; then + local max_glibc + max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true) + if [[ -n "$max_glibc" ]]; then + echo "[INFO] $lib_path references up to $max_glibc" + else + echo "[INFO] $lib_path does not expose GLIBC version strings" + fi + else + echo "[WARN] strings command unavailable; cannot inspect $lib_path" >&2 + fi + } + + if [[ ${#compat_libs[@]} -eq 0 ]]; then + echo "[WARN] No libraries captured for GLIBC inspection" >&2 + else + for lib in "${compat_libs[@]}"; do + check_glibc_version "$COMPAT_DIR/$lib" + done + fi + fi +else + echo "[INFO] Compatibility check executed inside container" +fi + +echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent" diff --git a/src/agent/tests/.gitignore b/src/agent/tests/.gitignore new file mode 100644 index 0000000..285ed60 --- /dev/null +++ b/src/agent/tests/.gitignore @@ -0,0 +1,2 @@ +private/ +tmp/ diff --git a/src/agent/tests/docker-compose.yml b/src/agent/tests/docker-compose.yml new file mode 100644 index 0000000..e24e252 --- /dev/null +++ b/src/agent/tests/docker-compose.yml @@ -0,0 +1,69 @@ +services: + bind: + image: ${BIND_IMAGE_TAG:-argus-bind9:latest} + container_name: argus-bind-agent-e2e + volumes: + - ./private:/private + networks: + default: + ipv4_address: 172.28.0.2 + environment: + - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}" + - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}" + restart: always + + master: + image: argus-master:latest + container_name: argus-master-agent-e2e + depends_on: + - bind + environment: + - OFFLINE_THRESHOLD_SECONDS=6 + - ONLINE_THRESHOLD_SECONDS=2 + - SCHEDULER_INTERVAL_SECONDS=1 + - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}" + - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}" + ports: + - "32300:3000" + volumes: + - ./private/argus/master:/private/argus/master + - ./private/argus/metric/prometheus:/private/argus/metric/prometheus + - ./private/argus/etc:/private/argus/etc + networks: + default: + ipv4_address: 172.28.0.10 + restart: always + + agent: + image: ubuntu:22.04 + container_name: argus-agent-e2e + hostname: dev-e2euser-e2einst-pod-0 + depends_on: + - master + - bind + environment: + - MASTER_ENDPOINT=http://master.argus.com:3000 + - REPORT_INTERVAL_SECONDS=2 + - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}" + - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}" + volumes: + - 
diff --git a/src/agent/tests/.gitignore b/src/agent/tests/.gitignore
new file mode 100644
index 0000000..285ed60
--- /dev/null
+++ b/src/agent/tests/.gitignore
@@ -0,0 +1,2 @@
+private/
+tmp/
diff --git a/src/agent/tests/docker-compose.yml b/src/agent/tests/docker-compose.yml
new file mode 100644
index 0000000..e24e252
--- /dev/null
+++ b/src/agent/tests/docker-compose.yml
@@ -0,0 +1,69 @@
+services:
+ bind:
+ image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
+ container_name: argus-bind-agent-e2e
+ volumes:
+ - ./private:/private
+ networks:
+ default:
+ ipv4_address: 172.28.0.2
+ environment:
+ - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
+ - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
+ restart: always
+
+ master:
+ image: argus-master:latest
+ container_name: argus-master-agent-e2e
+ depends_on:
+ - bind
+ environment:
+ - OFFLINE_THRESHOLD_SECONDS=6
+ - ONLINE_THRESHOLD_SECONDS=2
+ - SCHEDULER_INTERVAL_SECONDS=1
+ - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
+ - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
+ ports:
+ - "32300:3000"
+ volumes:
+ - ./private/argus/master:/private/argus/master
+ - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
+ - ./private/argus/etc:/private/argus/etc
+ networks:
+ default:
+ ipv4_address: 172.28.0.10
+ restart: always
+
+ agent:
+ image: ubuntu:22.04
+ container_name: argus-agent-e2e
+ hostname: dev-e2euser-e2einst-pod-0
+ depends_on:
+ - master
+ - bind
+ environment:
+ - MASTER_ENDPOINT=http://master.argus.com:3000
+ - REPORT_INTERVAL_SECONDS=2
+ - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
+ - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
+ volumes:
+ - ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
+ - ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
+ - ./private/argus/etc:/private/argus/etc
+ - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
+ - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
+ - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
+ entrypoint:
+ - /usr/local/bin/agent-entrypoint.sh
+ networks:
+ default:
+ ipv4_address: 172.28.0.20
+ restart: always
+
+networks:
+ default:
+ driver: bridge
+ ipam:
+ driver: default
+ config:
+ - subnet: 172.28.0.0/16
diff --git a/src/agent/tests/scripts/00_e2e_test.sh b/src/agent/tests/scripts/00_e2e_test.sh
new file mode 100755
index 0000000..9515d34
--- /dev/null
+++ b/src/agent/tests/scripts/00_e2e_test.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPTS=(
+ "01_bootstrap.sh"
+ "02_up.sh"
+ "03_wait_and_assert_registration.sh"
+ "04_write_health_files.sh"
+ "08_verify_agent.sh"
+ "05_assert_status_on_master.sh"
+ "06_restart_agent_and_reregister.sh"
+ "07_down.sh"
+)
+
+for script in "${SCRIPTS[@]}"; do
+ echo "[TEST] Running $script"
+ "$SCRIPT_DIR/$script"
+ echo "[TEST] $script completed"
+ echo
+done
+
+echo "[TEST] Agent module E2E tests completed"
diff --git a/src/agent/tests/scripts/01_bootstrap.sh b/src/agent/tests/scripts/01_bootstrap.sh
new file mode 100755
index 0000000..b6b9e4f
--- /dev/null
+++ b/src/agent/tests/scripts/01_bootstrap.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+AGENT_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
+MASTER_ROOT="$(cd "$AGENT_ROOT/../master" && pwd)"
+REPO_ROOT="$(cd "$AGENT_ROOT/../.." && pwd)"
+PRIVATE_ROOT="$TEST_ROOT/private"
+TMP_ROOT="$TEST_ROOT/tmp"
+
+AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
+AGENT_CONFIG_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME"
+AGENT_HEALTH_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME/health"
+MASTER_PRIVATE_DIR="$PRIVATE_ROOT/argus/master"
+METRIC_PRIVATE_DIR="$PRIVATE_ROOT/argus/metric/prometheus"
+DNS_DIR="$PRIVATE_ROOT/argus/etc"
+BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:latest}"
+BIND_ROOT="$(cd "$MASTER_ROOT/../bind" && pwd)"
+
+ensure_image() {
+ local image="$1"
+ if ! docker image inspect "$image" >/dev/null 2>&1; then
+ echo "[ERROR] Docker image '$image' not found; run the unified build script first (e.g. ./build/build_images.sh) to produce the required images" >&2
+ exit 1
+ fi
+}
+
+mkdir -p "$AGENT_CONFIG_DIR"
+mkdir -p "$AGENT_HEALTH_DIR"
+mkdir -p "$MASTER_PRIVATE_DIR"
+mkdir -p "$METRIC_PRIVATE_DIR"
+mkdir -p "$TMP_ROOT"
+mkdir -p "$DNS_DIR"
+
+touch "$AGENT_HEALTH_DIR/.keep"
+
+# Stage the update-dns.sh shipped by the bind module, mimicking how production distributes it
+if [[ -f "$BIND_ROOT/build/update-dns.sh" ]]; then
+ cp "$BIND_ROOT/build/update-dns.sh" "$DNS_DIR/update-dns.sh"
+ chmod +x "$DNS_DIR/update-dns.sh"
+else
+ echo "[WARN] bind update script missing at $BIND_ROOT/build/update-dns.sh"
+fi
+
+ensure_image "argus-master:latest"
+ensure_image "$BIND_IMAGE_TAG"
+
+AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"
+
+pushd "$AGENT_ROOT" >/dev/null
+./scripts/build_binary.sh
+popd >/dev/null
+
+if [[ ! -x "$AGENT_BINARY" ]]; then
+ echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
+ exit 1
+fi
+
+echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
+echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag"
+
+echo "[INFO] Agent E2E bootstrap complete"
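The suite is meant to be run as a whole once the shared images exist, but individual stages can be re-run while debugging. A typical session (paths relative to the repo root):

```bash
# Build the shared images once, then run the whole agent E2E suite
./build/build_images.sh
cd src/agent/tests
./scripts/00_e2e_test.sh

# Or iterate on a single stage, e.g. bootstrap plus stack startup only
./scripts/01_bootstrap.sh && ./scripts/02_up.sh
```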
-x "$AGENT_BINARY" ]]; then + echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2 + exit 1 +fi + +echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path" +echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag" + +echo "[INFO] Agent E2E bootstrap complete" diff --git a/src/agent/tests/scripts/02_up.sh b/src/agent/tests/scripts/02_up.sh new file mode 100755 index 0000000..56c4cda --- /dev/null +++ b/src/agent/tests/scripts/02_up.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)" + +TMP_ROOT="$TEST_ROOT/tmp" +ENV_FILE="$TEST_ROOT/.env" + +source "$REPO_ROOT/scripts/common/build_user.sh" +load_build_user +export ARGUS_BUILD_UID ARGUS_BUILD_GID + +cat > "$ENV_FILE" <&2 + exit 1 +fi + +AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")" +if [[ ! -x "$AGENT_BINARY" ]]; then + echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2 + exit 1 +fi + +BIND_IMAGE_TAG_VALUE="argus-bind9:latest" +if [[ -f "$TMP_ROOT/bind_image_tag" ]]; then + BIND_IMAGE_TAG_VALUE="$(cat "$TMP_ROOT/bind_image_tag")" +fi + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +docker container rm -f argus-agent-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true + +docker network rm tests_default >/dev/null 2>&1 || true + +pushd "$TEST_ROOT" >/dev/null +compose down --remove-orphans || true +BIND_IMAGE_TAG="$BIND_IMAGE_TAG_VALUE" compose up -d +popd >/dev/null + +echo "[INFO] Master+Agent stack started" diff --git a/src/agent/tests/scripts/03_wait_and_assert_registration.sh b/src/agent/tests/scripts/03_wait_and_assert_registration.sh new file mode 100755 index 0000000..7e9c127 --- /dev/null +++ b/src/agent/tests/scripts/03_wait_and_assert_registration.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_ROOT="$TEST_ROOT/tmp" +API_BASE="http://localhost:32300/api/v1/master" +AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0" +NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json" + +mkdir -p "$TMP_ROOT" + +node_id="" +for _ in {1..30}; do + sleep 2 + response=$(curl -sS "$API_BASE/nodes" || true) + if [[ -z "$response" ]]; then + continue + fi + list_file="$TMP_ROOT/nodes_list.json" + echo "$response" > "$list_file" + node_id=$(python3 - "$list_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + nodes = json.load(handle) +print(nodes[0]["id"] if nodes else "") +PY +) + if [[ -n "$node_id" ]]; then + break + fi + done + +if [[ -z "$node_id" ]]; then + echo "[ERROR] Agent did not register within timeout" >&2 + exit 1 +fi + +echo "$node_id" > "$TMP_ROOT/node_id" + +if [[ ! 
-f "$NODE_FILE" ]]; then + echo "[ERROR] node.json not created at $NODE_FILE" >&2 + exit 1 +fi + +python3 - "$NODE_FILE" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +assert "id" in node and node["id"], "node.json missing id" +PY + +detail_file="$TMP_ROOT/initial_detail.json" +curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file" +python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY' +import json, sys, pathlib +with open(sys.argv[1]) as handle: + node = json.load(handle) +ip = node["meta_data"].get("ip") +if not ip: + raise SystemExit("meta_data.ip missing") +pathlib.Path(sys.argv[2]).write_text(ip) +PY + +echo "[INFO] Agent registered with node id $node_id" diff --git a/src/agent/tests/scripts/04_write_health_files.sh b/src/agent/tests/scripts/04_write_health_files.sh new file mode 100755 index 0000000..ba7128e --- /dev/null +++ b/src/agent/tests/scripts/04_write_health_files.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health" + +cat > "$HEALTH_DIR/log-fluentbit.json" < "$HEALTH_DIR/metric-node-exporter.json" <&2 + exit 1 +fi + +if [[ ! -f "$NODES_JSON" ]]; then + echo "[ERROR] nodes.json missing at $NODES_JSON" >&2 + exit 1 +fi + +python3 - "$NODES_JSON" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + nodes = json.load(handle) +assert len(nodes) == 1, nodes +entry = nodes[0] +assert entry["node_id"], entry +PY + +echo "[INFO] Master reflects agent health and nodes.json entries" diff --git a/src/agent/tests/scripts/06_restart_agent_and_reregister.sh b/src/agent/tests/scripts/06_restart_agent_and_reregister.sh new file mode 100755 index 0000000..78c6322 --- /dev/null +++ b/src/agent/tests/scripts/06_restart_agent_and_reregister.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_ROOT="$TEST_ROOT/tmp" +API_BASE="http://localhost:32300/api/v1/master" +NODE_ID="$(cat "$TMP_ROOT/node_id")" +AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0" +NETWORK_NAME="tests_default" +NEW_AGENT_IP="172.28.0.200" +ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh" +ENV_FILE="$TEST_ROOT/.env" + +# 中文提示:重启场景也需要同样的入口脚本,确保 DNS 注册逻辑一致 +if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then + echo "[ERROR] agent entrypoint script missing at $ENTRYPOINT_SCRIPT" >&2 + exit 1 +fi + +if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then + echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2 + exit 1 +fi + +AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")" +if [[ ! -x "$AGENT_BINARY" ]]; then + echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2 + exit 1 +fi + +if [[ -f "$ENV_FILE" ]]; then + set -a + # shellcheck disable=SC1090 + source "$ENV_FILE" + set +a +else + REPO_ROOT="$(cd "$TEST_ROOT/../../.." 
&& pwd)" + # shellcheck disable=SC1090 + source "$REPO_ROOT/scripts/common/build_user.sh" + load_build_user +fi + +AGENT_UID="${ARGUS_BUILD_UID:-2133}" +AGENT_GID="${ARGUS_BUILD_GID:-2015}" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +before_file="$TMP_ROOT/before_restart.json" +curl -sS "$API_BASE/nodes/$NODE_ID" -o "$before_file" +prev_last_updated=$(python3 - "$before_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +print(node.get("last_updated", "")) +PY +) +prev_ip=$(python3 - "$before_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +print(node["meta_data"].get("ip", "")) +PY +) +initial_ip=$(cat "$TMP_ROOT/initial_ip") +if [[ "$prev_ip" != "$initial_ip" ]]; then + echo "[ERROR] Expected initial IP $initial_ip, got $prev_ip" >&2 + exit 1 +fi + +pushd "$TEST_ROOT" >/dev/null +compose rm -sf agent +popd >/dev/null + +docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true + +AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME" +HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health" + +# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态 +if ! docker run -d \ + --name argus-agent-e2e \ + --hostname "$AGENT_HOSTNAME" \ + --network "$NETWORK_NAME" \ + --ip "$NEW_AGENT_IP" \ + -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \ + -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \ + -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \ + -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \ + -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \ + -e MASTER_ENDPOINT=http://master.argus.com:3000 \ + -e REPORT_INTERVAL_SECONDS=2 \ + -e ARGUS_BUILD_UID="$AGENT_UID" \ + -e ARGUS_BUILD_GID="$AGENT_GID" \ + --entrypoint /usr/local/bin/agent-entrypoint.sh \ + ubuntu:22.04 >/dev/null; then + echo "[ERROR] Failed to start agent container with custom IP" >&2 + exit 1 +fi + +success=false +detail_file="$TMP_ROOT/post_restart.json" +for _ in {1..20}; do + sleep 3 + if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then + continue + fi + if python3 - "$detail_file" "$prev_last_updated" "$NODE_ID" "$prev_ip" "$NEW_AGENT_IP" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +prev_last_updated = sys.argv[2] +expected_id = sys.argv[3] +old_ip = sys.argv[4] +expected_ip = sys.argv[5] +last_updated = node.get("last_updated") +current_ip = node["meta_data"].get("ip") +assert node["id"] == expected_id +if current_ip != expected_ip: + raise SystemExit(1) +if current_ip == old_ip: + raise SystemExit(1) +if not last_updated or last_updated == prev_last_updated: + raise SystemExit(1) +PY + then + success=true + break + fi +done + +if [[ "$success" != true ]]; then + echo "[ERROR] Agent did not report expected new IP $NEW_AGENT_IP after restart" >&2 + exit 1 +fi + +echo "[INFO] Agent restart produced successful re-registration with IP change" diff --git a/src/agent/tests/scripts/07_down.sh b/src/agent/tests/scripts/07_down.sh new file mode 100755 index 0000000..b9674ee --- /dev/null +++ b/src/agent/tests/scripts/07_down.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +ENV_FILE="$TEST_ROOT/.env" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true + +pushd "$TEST_ROOT" >/dev/null +compose down --remove-orphans +popd >/dev/null + +if [[ -d "$TEST_ROOT/private" ]]; then + docker run --rm \ + -v "$TEST_ROOT/private:/target" \ + ubuntu:24.04 \ + chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true + rm -rf "$TEST_ROOT/private" +fi + +rm -rf "$TEST_ROOT/tmp" + +if [[ -f "$ENV_FILE" ]]; then + rm -f "$ENV_FILE" +fi + +echo "[INFO] Agent E2E environment cleaned up" diff --git a/src/agent/tests/scripts/08_verify_agent.sh b/src/agent/tests/scripts/08_verify_agent.sh new file mode 100755 index 0000000..8b347b0 --- /dev/null +++ b/src/agent/tests/scripts/08_verify_agent.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +VERIFY_SCRIPT="$(cd "$TEST_ROOT/.." && pwd)/scripts/agent_deployment_verify.sh" + +if ! docker ps --format '{{.Names}}' | grep -q '^argus-agent-e2e$'; then + echo "[WARN] agent container not running; skip verification" + exit 0 +fi + +if docker exec -i argus-agent-e2e bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then + echo "[INFO] curl/jq already installed in agent container" +else + echo "[INFO] Installing curl/jq in agent container" + docker exec -i argus-agent-e2e bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true +fi + +if docker exec -i argus-agent-e2e bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then + docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh +elif [[ -x "$VERIFY_SCRIPT" ]]; then + docker exec -i argus-agent-e2e "$VERIFY_SCRIPT" +else + echo "[WARN] agent_deployment_verify.sh not found" +fi diff --git a/src/agent/tests/scripts/agent_entrypoint.sh b/src/agent/tests/scripts/agent_entrypoint.sh new file mode 100755 index 0000000..1823605 --- /dev/null +++ b/src/agent/tests/scripts/agent_entrypoint.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +set -euo pipefail + +LOG_PREFIX="[AGENT-ENTRYPOINT]" +DNS_SCRIPT="/private/argus/etc/update-dns.sh" +DNS_CONF="/private/argus/etc/dns.conf" +TARGET_DOMAIN="master.argus.com" +AGENT_UID="${ARGUS_BUILD_UID:-2133}" +AGENT_GID="${ARGUS_BUILD_GID:-2015}" +AGENT_HOSTNAME="${HOSTNAME:-unknown}" +AGENT_DATA_DIR="/private/argus/agent/${AGENT_HOSTNAME}" +AGENT_HEALTH_DIR="${AGENT_DATA_DIR}/health" +RUNTIME_GROUP="argusagent" +RUNTIME_USER="argusagent" + +log() { + echo "${LOG_PREFIX} $*" +} + +mkdir -p "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" +chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" 2>/dev/null || true +chown -R "$AGENT_UID:$AGENT_GID" "/private/argus/etc" 2>/dev/null || true + +if ! getent group "$AGENT_GID" >/dev/null 2>&1; then + groupadd -g "$AGENT_GID" "$RUNTIME_GROUP" +else + RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)" +fi + +if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then + useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER" +else + RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)" +fi + +log "运行用户: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)" + +# 中文提示:等待 bind 下发的 update-dns.sh 脚本 +for _ in {1..30}; do + if [[ -x "$DNS_SCRIPT" ]]; then + break + fi + log "等待 update-dns.sh 准备就绪..." 
diff --git a/src/agent/tests/scripts/agent_entrypoint.sh b/src/agent/tests/scripts/agent_entrypoint.sh
new file mode 100755
index 0000000..1823605
--- /dev/null
+++ b/src/agent/tests/scripts/agent_entrypoint.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+LOG_PREFIX="[AGENT-ENTRYPOINT]"
+DNS_SCRIPT="/private/argus/etc/update-dns.sh"
+DNS_CONF="/private/argus/etc/dns.conf"
+TARGET_DOMAIN="master.argus.com"
+AGENT_UID="${ARGUS_BUILD_UID:-2133}"
+AGENT_GID="${ARGUS_BUILD_GID:-2015}"
+AGENT_HOSTNAME="${HOSTNAME:-unknown}"
+AGENT_DATA_DIR="/private/argus/agent/${AGENT_HOSTNAME}"
+AGENT_HEALTH_DIR="${AGENT_DATA_DIR}/health"
+RUNTIME_GROUP="argusagent"
+RUNTIME_USER="argusagent"
+
+log() {
+ echo "${LOG_PREFIX} $*"
+}
+
+mkdir -p "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR"
+chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" 2>/dev/null || true
+chown -R "$AGENT_UID:$AGENT_GID" "/private/argus/etc" 2>/dev/null || true
+
+if ! getent group "$AGENT_GID" >/dev/null 2>&1; then
+ groupadd -g "$AGENT_GID" "$RUNTIME_GROUP"
+else
+ RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)"
+fi
+
+if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then
+ useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER"
+else
+ RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)"
+fi
+
+log "Runtime user: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)"
+
+# Wait for the update-dns.sh script distributed by the bind module
+for _ in {1..30}; do
+ if [[ -x "$DNS_SCRIPT" ]]; then
+ break
+ fi
+ log "Waiting for update-dns.sh to become available..."
+ sleep 1
+done
+
+if [[ -x "$DNS_SCRIPT" ]]; then
+ log "Running update-dns.sh to refresh container DNS"
+ while true; do
+ if "$DNS_SCRIPT"; then
+ log "update-dns.sh succeeded"
+ break
+ fi
+ log "update-dns.sh failed; retrying in 3 seconds"
+ sleep 3
+ done
+else
+ log "update-dns.sh unavailable; using the image's default DNS"
+fi
+
+# Log the current dns.conf content to ease troubleshooting
+if [[ -f "$DNS_CONF" ]]; then
+ log "dns.conf content: $(tr '\n' ' ' < "$DNS_CONF")"
+else
+ log "dns.conf not generated yet"
+fi
+
+# Try to resolve the master domain; failures are logged but do not block startup
+for _ in {1..30}; do
+ if getent hosts "$TARGET_DOMAIN" >/dev/null 2>&1; then
+ MASTER_IP=$(getent hosts "$TARGET_DOMAIN" | awk '{print $1}' | head -n 1)
+ log "master.argus.com resolved to $MASTER_IP"
+ break
+ fi
+ sleep 1
+done
+
+log "Starting argus-agent"
+exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
diff --git a/src/bind/build/Dockerfile b/src/bind/build/Dockerfile
index f743d86..c6293d3 100644
--- a/src/bind/build/Dockerfile
+++ b/src/bind/build/Dockerfile
@@ -6,6 +6,11 @@ ENV TZ=Asia/Shanghai
 # Build arguments
 ARG USE_INTRANET=false
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+ ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
 
 # Configure the intranet apt mirror (when the intranet option is set)
 RUN if [ "$USE_INTRANET" = "true" ]; then \
@@ -29,6 +34,24 @@ RUN apt-get update && \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
 
+# Align the bind user/group IDs with the host configuration
+RUN set -eux; \
+ current_gid="$(getent group bind | awk -F: '{print $3}')"; \
+ if [ -z "$current_gid" ]; then \
+ groupadd -g "${ARGUS_BUILD_GID}" bind; \
+ elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
+ groupmod -g "${ARGUS_BUILD_GID}" bind; \
+ fi; \
+ if id bind >/dev/null 2>&1; then \
+ current_uid="$(id -u bind)"; \
+ if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
+ usermod -u "${ARGUS_BUILD_UID}" bind; \
+ fi; \
+ else \
+ useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" bind; \
+ fi; \
+ chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /var/cache/bind /var/lib/bind
+
 # Configure the apt sources used at deploy time
 RUN if [ "$USE_INTRANET" = "true" ]; then \
 echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
diff --git a/src/bind/build/argus_dns_sync.sh b/src/bind/build/argus_dns_sync.sh
index 76c8f88..cfa4adc 100644
--- a/src/bind/build/argus_dns_sync.sh
+++ b/src/bind/build/argus_dns_sync.sh
@@ -9,6 +9,9 @@ SLEEP_SECONDS=10
 RELOAD_SCRIPT="/usr/local/bin/reload-bind9.sh" # path to your existing reload script
 
 mkdir -p "$(dirname "$LOCKFILE")" "$BACKUP_DIR"
+BACKUP_UID="${ARGUS_BUILD_UID:-2133}"
+BACKUP_GID="${ARGUS_BUILD_GID:-2015}"
+chown -R "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR" 2>/dev/null || true
 
 is_ipv4() {
 local ip="$1"
@@ -33,6 +36,7 @@ upsert_record() {
 local changed=0
 
 cp -a "$ZONE_DB" "$BACKUP_DIR/db.argus.com.$ts.bak"
+ chown "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR/db.argus.com.$ts.bak" 2>/dev/null || true
 
 local cur_ip
 cur_ip="$(get_current_ip "$name" || true)"
@@ -61,7 +65,10 @@ upsert_record() {
 echo "[SKIP] ${name} unchanged (${new_ip})"
 fi
 
- return $changed
+ if [[ $changed -eq 1 ]]; then
+ return 0
+ fi
+ return 1
 }
 
 while true; do
@@ -70,7 +77,7 @@ while true; do
 shopt -s nullglob
 NEED_RELOAD=0
 
- for f in "$WATCH_DIR"/*.argus.com; do
+ for f in "$WATCH_DIR"/*.argus.com; do
 base="$(basename "$f")"
 name="${base%.argus.com}"
 ip="$(grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' "$f" | tail -n1 || true)"
@@ -97,4 +104,3 @@ while true; do
 
 sleep "$SLEEP_SECONDS"
 done
-
diff --git a/src/bind/build/startup.sh b/src/bind/build/startup.sh
index 964867f..66a2e5d 100644
--- a/src/bind/build/startup.sh
+++ b/src/bind/build/startup.sh
@@ -6,6 +6,8 @@ chmod 777 /private 
2>/dev/null || true # Create persistent directories for BIND9 configs and DNS sync mkdir -p /private/argus/bind mkdir -p /private/argus/etc +chown bind:bind /private/argus 2>/dev/null || true +chown -R bind:bind /private/argus/bind /private/argus/etc # Copy configuration files to persistent storage if they don't exist if [ ! -f /private/argus/bind/named.conf.local ]; then diff --git a/src/bind/tests/docker-compose.yml b/src/bind/tests/docker-compose.yml index e2d4fc9..b01d33d 100644 --- a/src/bind/tests/docker-compose.yml +++ b/src/bind/tests/docker-compose.yml @@ -3,8 +3,8 @@ services: image: argus-bind9:latest container_name: argus-bind9-test ports: - - "53:53/tcp" - - "53:53/udp" + - "${HOST_DNS_PORT:-1053}:53/tcp" + - "${HOST_DNS_PORT:-1053}:53/udp" volumes: - ./private:/private restart: unless-stopped @@ -13,4 +13,4 @@ services: networks: bind-test-network: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/src/bind/tests/scripts/00_e2e_test.sh b/src/bind/tests/scripts/00_e2e_test.sh index 3a8a78a..6aa92b1 100755 --- a/src/bind/tests/scripts/00_e2e_test.sh +++ b/src/bind/tests/scripts/00_e2e_test.sh @@ -7,6 +7,9 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HOST_DNS_PORT="${HOST_DNS_PORT:-1053}" + +export HOST_DNS_PORT echo "==========================================" echo "BIND9 DNS Server End-to-End Test Suite" @@ -112,4 +115,4 @@ else echo " - Review BIND9 configuration files" echo " - Check system resources and port availability" exit 1 -fi \ No newline at end of file +fi diff --git a/src/bind/tests/scripts/01_start_container.sh b/src/bind/tests/scripts/01_start_container.sh index 2a501b9..407a88c 100755 --- a/src/bind/tests/scripts/01_start_container.sh +++ b/src/bind/tests/scripts/01_start_container.sh @@ -7,13 +7,17 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_DIR="$(dirname "$SCRIPT_DIR")" +HOST_DNS_PORT="${HOST_DNS_PORT:-1053}" + +export HOST_DNS_PORT cd "$TEST_DIR" echo "Starting BIND9 test container..." # Ensure private directory exists with proper permissions -mkdir -p private +mkdir -p private/argus/bind +mkdir -p private/argus/etc chmod 777 private # Start the container @@ -35,4 +39,4 @@ fi echo "" echo "BIND9 test environment is ready!" -echo "DNS server listening on localhost:53" \ No newline at end of file +echo "DNS server listening on localhost:${HOST_DNS_PORT}" diff --git a/src/bind/tests/scripts/02_dig_test.sh b/src/bind/tests/scripts/02_dig_test.sh index 5ea31ad..65c91df 100755 --- a/src/bind/tests/scripts/02_dig_test.sh +++ b/src/bind/tests/scripts/02_dig_test.sh @@ -5,7 +5,10 @@ set -e +HOST_DNS_PORT="${HOST_DNS_PORT:-1053}" + echo "Testing DNS resolution with dig..." 
+echo "Using DNS server localhost:${HOST_DNS_PORT}" # Function to test DNS query test_dns_query() { @@ -19,7 +22,7 @@ test_dns_query() { echo "Expected IP: $expected_ip" # Perform dig query - result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED") + result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED") if [ "$result" = "QUERY_FAILED" ]; then echo "✗ DNS query failed" @@ -69,4 +72,4 @@ if [ $failed_tests -eq 0 ]; then else echo "✗ $failed_tests test(s) failed" exit 1 -fi \ No newline at end of file +fi diff --git a/src/bind/tests/scripts/03.5_dns_sync_test.sh b/src/bind/tests/scripts/03.5_dns_sync_test.sh index 6e872bc..9a164c9 100755 --- a/src/bind/tests/scripts/03.5_dns_sync_test.sh +++ b/src/bind/tests/scripts/03.5_dns_sync_test.sh @@ -6,10 +6,13 @@ set -e +HOST_DNS_PORT="${HOST_DNS_PORT:-1053}" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_DIR="$(dirname "$SCRIPT_DIR")" echo "=== DNS Auto-Sync Functionality Test ===" +echo "Using DNS server localhost:${HOST_DNS_PORT}" # Check if container is running if ! docker compose ps | grep -q "Up"; then @@ -36,7 +39,7 @@ test_dns_query() { # Wait a moment for DNS cache sleep 2 - result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED") + result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED") if [ "$result" = "$expected_ip" ]; then echo "✓ $result" @@ -90,7 +93,7 @@ echo "" echo "Step 2: Testing initial DNS configuration..." # Get current IP for web.argus.com (may have been changed by previous tests) -current_web_ip=$(dig @localhost web.argus.com A +short 2>/dev/null || echo "UNKNOWN") +current_web_ip=$(dig @localhost -p "$HOST_DNS_PORT" web.argus.com A +short 2>/dev/null || echo "UNKNOWN") echo "Current web.argus.com IP: $current_web_ip" # Test that DNS is working (regardless of specific IP) @@ -185,7 +188,7 @@ docker compose exec bind9 bash -c 'echo "this is not an IP address" > /private/a wait_for_sync # Verify invalid record was not added (should fail to resolve) -result=$(dig @localhost invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT") +result=$(dig @localhost -p "$HOST_DNS_PORT" invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT") if [ "$result" = "NO_RESULT" ] || [ -z "$result" ]; then echo "✓ Invalid IP correctly ignored" else diff --git a/src/bind/tests/scripts/03_reload_test.sh b/src/bind/tests/scripts/03_reload_test.sh index 3dac886..e023a4b 100755 --- a/src/bind/tests/scripts/03_reload_test.sh +++ b/src/bind/tests/scripts/03_reload_test.sh @@ -5,10 +5,13 @@ set -e +HOST_DNS_PORT="${HOST_DNS_PORT:-1053}" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_DIR="$(dirname "$SCRIPT_DIR")" echo "=== DNS Configuration Reload Test ===" +echo "Using DNS server localhost:${HOST_DNS_PORT}" # Check if container is running if ! docker compose ps | grep -q "Up"; then @@ -32,7 +35,7 @@ test_dns_query() { echo "Testing: $description" echo "Query: $hostname.argus.com -> Expected: $expected_ip" - result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED") + result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED") if [ "$result" = "$expected_ip" ]; then echo "✓ $result" @@ -109,4 +112,4 @@ fi echo "" echo "✓ DNS configuration reload test completed successfully!" 
echo "✓ IP address changed from 12.4.5.6 to 192.168.1.100" -echo "✓ Configuration persisted and reloaded correctly" \ No newline at end of file +echo "✓ Configuration persisted and reloaded correctly" diff --git a/src/bind/tests/scripts/04_persistence_test.sh b/src/bind/tests/scripts/04_persistence_test.sh index 46db1eb..e3ccb21 100755 --- a/src/bind/tests/scripts/04_persistence_test.sh +++ b/src/bind/tests/scripts/04_persistence_test.sh @@ -5,10 +5,13 @@ set -e +HOST_DNS_PORT="${HOST_DNS_PORT:-1053}" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_DIR="$(dirname "$SCRIPT_DIR")" echo "=== Configuration Persistence Test ===" +echo "Using DNS server localhost:${HOST_DNS_PORT}" # Check if dig is available if ! command -v dig &> /dev/null; then @@ -25,7 +28,7 @@ test_dns_query() { echo "Testing: $description" echo "Query: $hostname.argus.com -> Expected: $expected_ip" - result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED") + result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED") if [ "$result" = "$expected_ip" ]; then echo "✓ $result" @@ -112,4 +115,4 @@ echo "" echo "✓ Configuration persistence test completed successfully!" echo "✓ Modified IP (192.168.1.100) persisted after container restart" echo "✓ Configuration files properly linked to persistent storage" -echo "✓ DNS resolution working correctly with persisted configuration" \ No newline at end of file +echo "✓ DNS resolution working correctly with persisted configuration" diff --git a/src/bind/tests/scripts/05_cleanup.sh b/src/bind/tests/scripts/05_cleanup.sh index 2ee0884..45e8cdb 100755 --- a/src/bind/tests/scripts/05_cleanup.sh +++ b/src/bind/tests/scripts/05_cleanup.sh @@ -7,6 +7,9 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_DIR="$(dirname "$SCRIPT_DIR")" +HOST_DNS_PORT="${HOST_DNS_PORT:-1053}" + +export HOST_DNS_PORT # Parse command line arguments FULL_CLEANUP=true diff --git a/src/log/elasticsearch/build/Dockerfile b/src/log/elasticsearch/build/Dockerfile index 9b80f84..7b05ac1 100644 --- a/src/log/elasticsearch/build/Dockerfile +++ b/src/log/elasticsearch/build/Dockerfile @@ -3,10 +3,29 @@ FROM docker.elastic.co/elasticsearch/elasticsearch:8.13.4 # 切换到 root 用户进行系统级安装 USER root -# 修改elasticsearch用户的UID和GID -RUN usermod -u 2133 elasticsearch && \ - groupmod -g 2015 elasticsearch && \ - chown -R elasticsearch:elasticsearch /usr/share/elasticsearch +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 + +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + ARGUS_BUILD_GID=${ARGUS_BUILD_GID} + +# 调整 elasticsearch 用户与用户组 ID 以匹配宿主机配置 +RUN set -eux; \ + current_gid="$(getent group elasticsearch | awk -F: '{print $3}')"; \ + if [ -z "$current_gid" ]; then \ + groupadd -g "${ARGUS_BUILD_GID}" elasticsearch; \ + elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \ + groupmod -g "${ARGUS_BUILD_GID}" elasticsearch; \ + fi; \ + if id elasticsearch >/dev/null 2>&1; then \ + current_uid="$(id -u elasticsearch)"; \ + if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \ + usermod -u "${ARGUS_BUILD_UID}" elasticsearch; \ + fi; \ + else \ + useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" elasticsearch; \ + fi; \ + chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/elasticsearch # 设置构建参数 ARG USE_INTRANET=false diff --git a/src/log/fluent-bit/build/etc/parsers.conf b/src/log/fluent-bit/build/etc/parsers.conf index d86fa06..32f5571 100644 --- a/src/log/fluent-bit/build/etc/parsers.conf +++ 
diff --git a/src/log/fluent-bit/build/etc/parsers.conf b/src/log/fluent-bit/build/etc/parsers.conf
index d86fa06..32f5571 100644
--- a/src/log/fluent-bit/build/etc/parsers.conf
+++ b/src/log/fluent-bit/build/etc/parsers.conf
@@ -25,3 +25,5 @@
 Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
 Time_Key timestamp
 Time_Format %Y-%m-%d %H:%M:%S
+ Time_Offset +0800
+ Time_Keep On
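The parser only matches `YYYY-MM-DD HH:MM:SS LEVEL message` lines; `Time_Offset +0800` makes Fluent Bit interpret the naive timestamp as UTC+8, and `Time_Keep On` retains the raw field on the record. A line the test helpers emit that should match this regex (a sketch, written inside the fluent-bit container):

```bash
echo "$(date '+%Y-%m-%d %H:%M:%S') INFO [host01] training step completed" >> /logs/train/train-demo.log
```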
diff --git a/src/log/kibana/build/Dockerfile b/src/log/kibana/build/Dockerfile
index 211440d..a8b16d7 100644
--- a/src/log/kibana/build/Dockerfile
+++ b/src/log/kibana/build/Dockerfile
@@ -3,10 +3,29 @@ FROM docker.elastic.co/kibana/kibana:8.13.4
 # Switch to root for system-level installation
 USER root
 
-# Change the kibana user's UID and GID
-RUN usermod -u 2133 kibana && \
- groupmod -g 2015 kibana && \
- chown -R kibana:kibana /usr/share/kibana
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+ ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
+
+# Align the kibana user/group IDs with the host configuration
+RUN set -eux; \
+ current_gid="$(getent group kibana | awk -F: '{print $3}')"; \
+ if [ -z "$current_gid" ]; then \
+ groupadd -g "${ARGUS_BUILD_GID}" kibana; \
+ elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
+ groupmod -g "${ARGUS_BUILD_GID}" kibana; \
+ fi; \
+ if id kibana >/dev/null 2>&1; then \
+ current_uid="$(id -u kibana)"; \
+ if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
+ usermod -u "${ARGUS_BUILD_UID}" kibana; \
+ fi; \
+ else \
+ useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" kibana; \
+ fi; \
+ chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/kibana
 
 # Build arguments
 ARG USE_INTRANET=false
diff --git a/src/log/tests/docker-compose.yml b/src/log/tests/docker-compose.yml
index 4f2c7fe..59d02f6 100644
--- a/src/log/tests/docker-compose.yml
+++ b/src/log/tests/docker-compose.yml
@@ -17,6 +17,7 @@ services:
 interval: 10s
 timeout: 5s
 retries: 30
+ restart: always
 
 kibana:
 build:
@@ -73,13 +74,11 @@ services:
 interval: 15s
 timeout: 10s
 retries: 30
+ restart: always
 
 bind9:
 image: argus-bind9:latest
- ports:
- - "53:53/tcp"
- - "53:53/udp"
 volumes:
 - ./private/argus:/private/argus/
- restart: unless-stopped
+ restart: always
diff --git a/src/log/tests/scripts/01_bootstrap.sh b/src/log/tests/scripts/01_bootstrap.sh
index ba3842b..93898e0 100755
--- a/src/log/tests/scripts/01_bootstrap.sh
+++ b/src/log/tests/scripts/01_bootstrap.sh
@@ -1,6 +1,10 @@
 #!/usr/bin/env bash
 set -euo pipefail
 root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
+project_root="$(cd "$root/../../.." && pwd)"
+
+source "$project_root/scripts/common/build_user.sh"
+load_build_user
 
 # Create the new private directory layout (based on the argus directory structure)
 echo "[INFO] Creating private directory structure for supervisor-based containers..."
@@ -11,9 +15,9 @@ mkdir -p "$root/private/argus/etc/"
 
 # Set permissions on the data directories (ES and Kibana run as the ARGUS build UID/GID)
 echo "[INFO] Setting permissions for data directories..."
-sudo chown -R 2133:2015 "$root/private/argus/log/elasticsearch" 2>/dev/null || true
-sudo chown -R 2133:2015 "$root/private/argus/log/kibana" 2>/dev/null || true
-sudo chown -R 2133:2015 "$root/private/argus/etc" 2>/dev/null || true
+chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/elasticsearch" 2>/dev/null || true
+chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/kibana" 2>/dev/null || true
+chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true
 
 echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
diff --git a/src/log/tests/scripts/03_send_test_host01.sh b/src/log/tests/scripts/03_send_test_host01.sh
index 8889b06..2fe11b8 100755
--- a/src/log/tests/scripts/03_send_test_host01.sh
+++ b/src/log/tests/scripts/03_send_test_host01.sh
@@ -4,8 +4,22 @@ set -euo pipefail
 # Name of the fluent-bit-host01 container
 container_name="logging-mvp-fluent-bit-host01-1"
 
-# Check that the container exists and is running
-if ! docker ps | grep -q "$container_name"; then
+wait_for_container() {
+ local name="$1"
+ local attempts=30
+ local delay=5
+ local i
+ for ((i = 1; i <= attempts; i++)); do
+ if docker ps --format '{{.Names}}' | grep -Fx "$name" >/dev/null; then
+ return 0
+ fi
+ echo "[INFO] Waiting for container $name to start... ($i/$attempts)"
+ sleep "$delay"
+ done
+ return 1
+}
+
+if ! wait_for_container "$container_name"; then
 echo "[ERROR] Fluent Bit container $container_name is not running"
 exit 1
 fi
@@ -28,4 +42,4 @@ STACK"
 
 echo "[OK] Test logs written into the host01 container via docker exec:"
 echo " - /logs/train/train-demo.log"
-echo " - /logs/infer/infer-demo.log"
\ No newline at end of file
+echo " - /logs/infer/infer-demo.log"
wait_for_container "$container_name"; then echo "[ERROR] Fluent Bit容器 $container_name 未运行" exit 1 fi @@ -24,4 +38,4 @@ docker exec "$container_name" sh -c "printf '%s INFO [host02] inference complete echo "[OK] 已通过docker exec写入测试日志到 host02 容器内:" echo " - /logs/train/train-demo.log" -echo " - /logs/infer/infer-demo.log" \ No newline at end of file +echo " - /logs/infer/infer-demo.log" diff --git a/src/log/tests/scripts/04_query_es.sh b/src/log/tests/scripts/04_query_es.sh index 2cf427e..73c8bb7 100755 --- a/src/log/tests/scripts/04_query_es.sh +++ b/src/log/tests/scripts/04_query_es.sh @@ -1,7 +1,42 @@ #!/usr/bin/env bash set -euo pipefail + +# ES endpoint and wait strategy ES="${ES:-http://localhost:9200}" +es_wait_attempts="${ES_WAIT_ATTEMPTS:-60}" # total attempts to wait for ES +es_wait_interval="${ES_WAIT_INTERVAL:-2}" # seconds between attempts + echo "[i] 查询 ES 端点:$ES" + +wait_for_es() { + local attempt=1 + while (( attempt <= es_wait_attempts )); do + # 等待集群达到至少 yellow 状态;请求失败则重试 + if curl -fsS "$ES/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then + echo "[ok] Elasticsearch 已就绪 (attempt=${attempt}/${es_wait_attempts})" + return 0 + fi + echo "[..] 等待 Elasticsearch 可用中 (${attempt}/${es_wait_attempts})" + sleep "${es_wait_interval}" + (( attempt++ )) + done + echo "[err] Elasticsearch 在 ${es_wait_attempts} 次尝试后仍不可用" + return 1 +} + +safe_count() { + # 对缺失索引返回 0,避免 404 触发失败 + local pattern="$1" + local json + json=$(curl -fsS "$ES/${pattern}/_count?ignore_unavailable=true&allow_no_indices=true" 2>/dev/null || echo '{}') + echo "$json" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}' +} + +wait_for_es + +# 列出相关索引(可能为空,允许) curl -fsS "$ES/_cat/indices?v" | egrep 'train-|infer-|logstash' || true -printf "train-* 计数:"; curl -fsS "$ES/train-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo -printf "infer-* 计数:"; curl -fsS "$ES/infer-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo + +# 打印计数,缺失索引按 0 处理 +printf "train-* 计数:"; safe_count "train-*"; echo +printf "infer-* 计数:"; safe_count "infer-*"; echo diff --git a/src/log/tests/scripts/e2e_test.sh b/src/log/tests/scripts/e2e_test.sh index c7748fe..fbe5197 100755 --- a/src/log/tests/scripts/e2e_test.sh +++ b/src/log/tests/scripts/e2e_test.sh @@ -19,7 +19,7 @@ get_log_count() { # 函数:等待服务就绪 wait_for_services() { echo "[INFO] Waiting for all services to be ready..." 
diff --git a/src/log/tests/scripts/e2e_test.sh b/src/log/tests/scripts/e2e_test.sh
index c7748fe..fbe5197 100755
--- a/src/log/tests/scripts/e2e_test.sh
+++ b/src/log/tests/scripts/e2e_test.sh
@@ -19,7 +19,7 @@ get_log_count() {
 # Helper: wait for all services to become ready
 wait_for_services() {
 echo "[INFO] Waiting for all services to be ready..."
- local max_attempts=60
+ local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
 local attempt=1
 
 while [ $attempt -le $max_attempts ]; do
diff --git a/src/master/Dockerfile b/src/master/Dockerfile
new file mode 100644
index 0000000..bcc932d
--- /dev/null
+++ b/src/master/Dockerfile
@@ -0,0 +1,81 @@
+FROM python:3.11-slim
+
+SHELL ["/bin/bash", "-c"]
+
+ARG PIP_INDEX_URL=
+ARG USE_OFFLINE=0
+ARG USE_INTRANET=false
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+ ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
+
+ENV PIP_NO_CACHE_DIR=1 \
+ PYTHONUNBUFFERED=1 \
+ PYTHONPATH=/app
+
+USER root
+
+WORKDIR /app
+
+COPY ./src/master/requirements.txt ./requirements.txt
+COPY ./src/master/offline_wheels/ /opt/offline_wheels/
+
+RUN set -euxo pipefail \
+ && if [[ "$USE_OFFLINE" == "1" ]]; then \
+ python -m pip install --no-index --find-links /opt/offline_wheels pip && \
+ python -m pip install --no-index --find-links /opt/offline_wheels -r requirements.txt; \
+ else \
+ python -m pip install --upgrade pip && \
+ if [[ -n "$PIP_INDEX_URL" ]]; then \
+ PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \
+ else \
+ python -m pip install -r requirements.txt; \
+ fi; \
+ fi
+
+# Configure intranet apt sources and install common tools
+RUN if [[ "$USE_INTRANET" == "true" ]]; then \
+ echo "Configuring intranet apt sources" && \
+ if [[ -f /etc/apt/sources.list ]]; then cp /etc/apt/sources.list /etc/apt/sources.list.bak; fi && \
+ mkdir -p /etc/apt && \
+ echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
+ rm -rf /etc/apt/sources.list.d && \
+ echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
+ echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
+ fi && \
+ apt-get update && \
+ apt-get install -y supervisor net-tools inetutils-ping && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+# Switch to the apt sources needed at runtime
+RUN if [[ "$USE_INTRANET" == "true" ]]; then \
+ echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
+ fi
+
+RUN mkdir -p /var/log/supervisor
+
+RUN set -eux; \
+ if getent group argus >/dev/null; then \
+ groupmod -g "${ARGUS_BUILD_GID}" argus; \
+ else \
+ groupadd -g "${ARGUS_BUILD_GID}" argus; \
+ fi; \
+ if id argus >/dev/null 2>&1; then \
+ usermod -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" argus; \
+ else \
+ useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" -s /bin/bash argus; \
+ fi
+
+COPY ./src/master/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+COPY ./src/master/build/start-master.sh /usr/local/bin/start-master.sh
+COPY ./src/master/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
+RUN chmod +x /usr/local/bin/start-master.sh /usr/local/bin/dns-monitor.sh
+
+COPY ./src/master/app ./app
+
+EXPOSE 3000
+
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
diff --git a/src/master/README.md b/src/master/README.md
index e69de29..9d5a231 100644
--- a/src/master/README.md
+++ b/src/master/README.md
@@ -0,0 +1,186 @@
+# Argus Master Module
+
+Argus Master is a Flask + SQLite node management service. It:
+
+- Accepts agent registration and re-registration requests and allocates/validates node IDs.
+- Stores node metadata, config, and health, and derives online status from report timestamps.
+- Emits a `nodes.json` containing only online nodes, for other modules (such as metric) to consume.
+- Provides REST APIs for queries, config updates, and statistics.
+
+## Build and Run
+
+```bash
+cd src/master
+./scripts/build_images.sh # produces the argus-master:latest image
+```
+
+For offline builds, first run the preparation script on a connected host:
+
+```bash
+cd src/master
+./scripts/prepare_offline_wheels.sh --pip-version 25.2 # optional --clean
+```
+
+The script downloads everything in `requirements.txt`, plus the pinned pip version, into `offline_wheels/`. Then copy the source tree (including that subdirectory) together with the base image to the intranet host and run:
+
+```bash
+cd src/master
+./scripts/build_images.sh --offline --tag argus-master:latest
+```
+
+If the intranet lacks `python:3.11-slim`, `docker save` it on a connected host ahead of time and `docker load` it from offline media.
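+The save/load round-trip mentioned above is the standard way to move the base image over offline media; a sketch:
+
+```bash
+# On the connected host
+docker save python:3.11-slim -o python-3.11-slim.tar
+# ...copy the tar over, then on the intranet host
+docker load -i python-3.11-slim.tar
+```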
+The end-to-end tests in this repo use `src/master/tests/docker-compose.yml` to start a sample environment:
+
+```bash
+cd src/master/tests
+./scripts/01_up_master.sh # builds the image and starts a container listening on http://localhost:31300
+```
+
+Service logs and data are written to `tests/private/argus/master/` by default (or to a custom mount directory).
+
+## Runtime Environment Variables
+
+| Variable | Default | Description |
+| --- | --- | --- |
+| `DB_PATH` | `/private/argus/master/db.sqlite3` | Location of the SQLite database. The directory is created automatically on startup. |
+| `METRIC_NODES_JSON_PATH` | `/private/argus/metric/prometheus/nodes.json` | Output path of `nodes.json`, which contains only online nodes. Written atomically to avoid partial files. |
+| `OFFLINE_THRESHOLD_SECONDS` | `180` | If the most recent report is older than this, the scheduler marks the node `offline`. |
+| `ONLINE_THRESHOLD_SECONDS` | `120` | If the latest report is no older than this, the node is marked `online`. Between the two thresholds the previous state is kept. |
+| `SCHEDULER_INTERVAL_SECONDS` | `30` | Interval at which the scheduler evaluates node states and refreshes `nodes.json`. |
+| `NODE_ID_PREFIX` | `A` | Prefix for newly allocated node IDs; actual IDs look like `A1`, `A2`. |
+| `AUTH_MODE` | `disabled` | Reserved auth switch; currently always disabled. |
+
+## Processes and Monitoring
+
+Inside the image, processes are managed by `supervisord`:
+
+- `master`: runs `/usr/local/bin/start-master.sh`, which by default serves `0.0.0.0:3000` with 4 Gunicorn workers; tunable via the `GUNICORN_WORKERS`, `GUNICORN_BIND`, and `GUNICORN_EXTRA_ARGS` environment variables.
+- `dns-monitor`: polls `/private/argus/etc/dns.conf` and invokes `/private/argus/etc/update-dns.sh` when it changes; logs go to `/var/log/supervisor/dns-monitor.log`.
+
+The image build installs base tooling such as `supervisor`/`net-tools`/`inetutils-ping`/`vim` and switches the apt sources to the intranet mirror before runtime, which eases further in-container maintenance.
+
+## Domain Registration and DNS Integration
+
+- On startup the master container runs `/private/argus/etc/update-dns.sh` (if present) to point its own `/etc/resolv.conf` at the DNS served by bind; it then resolves the IPv4 address of `eth0` and writes it to `/private/argus/etc/master.argus.com`. That file is watched by the bind module's `argus_dns_sync.sh`, which generates the `master.argus.com` → current-container-IP A record.
+- Both test and production deployments must mount the bind-provided `update-dns.sh`, `dns.conf`, and related files under `/private/argus/etc/`. In the E2E scenario, `tests/private/argus/etc` is prepared automatically by the scripts.
+- Other modules (such as the agent) only need to run the same `update-dns.sh` in their startup scripts to reach the master by domain name; if domain registration fails, the agent cannot report, which makes this a quick first thing to check.
+
+## REST API Reference
+
+Base path: `/api/v1/master`; all endpoints return JSON.
+
+### 1. `GET /nodes`
+- **Purpose**: list brief information for all nodes.
+- **Response example**:
+ ```json
+ [
+ {"id": "A1", "name": "dev-user-inst-pod-0", "status": "online", "type": "agent", "version": "1.1.0"}
+ ]
+ ```
+
+### 2. `GET /nodes/{id}`
+- **Purpose**: fetch node details (config, health, persisted timestamps, etc.).
+- **Errors**: `404` means the node does not exist.
+
+### 3. `POST /nodes`
+- **Purpose**: register or re-register a node.
+- **Request body**:
+ ```json
+ {
+ "id": "A1", // optional; provided on re-registration
+ "name": "dev-user-inst-pod-0",
+ "type": "agent",
+ "version": "1.1.0",
+ "meta_data": {
+ "hostname": "dev-user-inst-pod-0",
+ "ip": "10.0.0.10",
+ "env": "dev",
+ "user": "testuser",
+ "instance": "testinst",
+ "cpu_number": 4,
+ "memory_in_bytes": 2147483648,
+ "gpu_number": 0
+ }
+ }
+ ```
+- **Success**:
+ - New node: `201 Created` with the full node object.
+ - Re-registration: `200 OK` with the updated node object.
+- **Errors**:
+ - `404 Not Found`: the supplied ID does not exist on the master.
+ - `500 Internal Server Error`: the supplied ID does not match the stored name.
+ - `400 Bad Request`: missing fields or wrong types in the request body.
+
+### 4. `PUT /nodes/{id}/status`
+- **Purpose**: agent status report. The master records `last_report` (server time) and `agent_last_report` (the time in the report), and updates the `health` field.
+- **Request body example**:
+ ```json
+ {
+ "timestamp": "2025-09-24T03:24:59Z",
+ "health": {
+ "log-fluentbit": {"status": "healthy"},
+ "metric-node-exporter": {"status": "healthy"}
+ }
+ }
+ ```
+- **Response**: `200 OK` with the latest node object. `404` means the node does not exist.
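+For manual testing against the test deployment above (host port 31300), the status endpoint can be exercised directly; the `A1` node id is illustrative:
+
+```bash
+curl -s -X PUT http://localhost:31300/api/v1/master/nodes/A1/status \
+ -H 'Content-Type: application/json' \
+ -d '{"timestamp":"2025-09-24T03:24:59Z","health":{"log-fluentbit":{"status":"healthy"}}}'
+```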
+### 5. `PUT /nodes/{id}/config`
+- **Purpose**: partially update a node's config and labels.
+- **Request body example**:
+ ```json
+ {
+ "config": {"log_level": "debug"},
+ "label": ["gpu", "exp001"]
+ }
+ ```
+- **Notes**: either field may be supplied on its own; omitted config keys keep their current values. Updating labels triggers regeneration of `nodes.json`.
+- **Errors**: `404` means the node does not exist; `400` means the request body is invalid.
+
+### 6. `GET /nodes/statistics`
+- **Purpose**: total node count and per-status distribution.
+- **Response example**:
+ ```json
+ {
+ "total": 2,
+ "status_statistics": [
+ {"status": "online", "count": 1},
+ {"status": "offline", "count": 1}
+ ]
+ }
+ ```
+
+### 7. Health probes
+- `GET /healthz`: process liveness check.
+- `GET /readyz`: database availability check (attempts to access `DB_PATH`).
+
+
+To validate the offline image, use the automation script:
+```bash
+cd src/master/tests
+./scripts/00_e2e_test_offline.sh # builds the offline image and runs the full E2E
+```
+
+## End-to-End Test Scenarios
+
+Running `src/master/tests/scripts/00_e2e_test.sh` chains the following cases (scripts 01–10):
+
+1. **01_up_master**: build the image, start the container, initialize directories and volumes.
+2. **02_verify_ready_and_nodes_json**: poll `/readyz` and verify the initial `nodes.json` is `[]`.
+3. **03_register_via_curl**: simulate agent registration, save the returned node ID, and confirm the node appears in the list endpoint.
+4. **04_reregister_and_error_cases**: cover successful re-registration, `404` for an unknown ID, and `500` for an ID/name mismatch.
+5. **05_status_report_via_curl**: report health and verify the automatic `initialized`→`online`→`offline`→`online` transitions.
+6. **06_config_update_and_nodes_json**: update config/labels, check label sync in `nodes.json`, and ensure offline nodes never appear in the file.
+7. **07_stats_single_node**: wait for the node to drop offline, then verify the statistics endpoint and an empty `nodes.json` list.
+8. **08_multi_node_stats**: register a second node so one is online and one offline, then verify the aggregated statistics and that `nodes.json` contains only the online node.
+9. **09_restart_persistence**: restart the master container and confirm node data, statistics, and `nodes.json` survive in the persistent directory.
+10. **10_down**: stop and clean up containers, networks, and temporary directories.
+
+## Persisted Files
+
+- SQLite: located at `DB_PATH` by default, with two tables, `nodes` and `kv`.
+- `nodes.json`: regenerated periodically by the scheduler; keeps only nodes whose status is `online`.
+- The `tests/private/` and `tests/tmp/` directories used by the test cases are cleaned up automatically so they do not pollute later runs.
+
+To run in production, push the image to a private registry or adapt the test Compose configuration; the only requirement is that the environment variables above are set correctly inside the container.
diff --git a/src/master/app/__init__.py b/src/master/app/__init__.py
new file mode 100644
index 0000000..9e66eaa
--- /dev/null
+++ b/src/master/app/__init__.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+import atexit
+import logging
+
+from flask import Flask
+
+from .config import AppConfig, load_config
+from .routes import register_routes
+from .scheduler import StatusScheduler
+from .storage import Storage
+
+
+def create_app(config: AppConfig | None = None) -> Flask:
+ app_config = config or load_config()
+ storage = Storage(app_config.db_path, app_config.node_id_prefix)
+ scheduler = StatusScheduler(storage, app_config)
+
+ app = Flask(__name__)
+ app.config["APP_CONFIG"] = app_config
+ app.config["STORAGE"] = storage
+ app.config["SCHEDULER"] = scheduler
+
+ register_routes(app, storage, scheduler, app_config)
+
+ scheduler.start()
+
+ def _cleanup() -> None:
+ logging.getLogger("argus.master").info("Shutting down master app")
+ try:
+ scheduler.stop()
+ except Exception: # pragma: no cover - defensive
+ logging.getLogger("argus.master").exception("Failed to stop scheduler")
+ try:
+ storage.close()
+ except Exception: # pragma: no cover - defensive
+ logging.getLogger("argus.master").exception("Failed to close storage")
+
+ atexit.register(_cleanup)
+
+ return app
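The module only exposes a factory; under supervisord it is served by Gunicorn via `start-master.sh`. A minimal sketch of an equivalent manual launch, assuming the package is importable as `app` (as arranged by `PYTHONPATH=/app` in the Dockerfile) and mirroring the documented defaults of 4 workers on `0.0.0.0:3000`:

```bash
PYTHONPATH=/app gunicorn --workers 4 --bind 0.0.0.0:3000 'app:create_app()'
```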
diff --git a/src/master/app/config.py b/src/master/app/config.py
new file mode 100644
index 0000000..246d3bf
--- /dev/null
+++ b/src/master/app/config.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class AppConfig:
+ db_path: str
+ metric_nodes_json_path: str
+ offline_threshold_seconds: int
+ online_threshold_seconds: int
+ scheduler_interval_seconds: int
+ node_id_prefix: str
+ auth_mode: str
+
+
+def _get_int_env(name: str, default: int) -> int:
+ raw = os.environ.get(name)
+ if raw is None or raw.strip() == "":
+ return default
+ try:
+ return int(raw)
+ except ValueError as exc:
+ raise ValueError(f"Environment variable {name} must be an integer, got {raw!r}") from exc
+
+
+def load_config() -> AppConfig:
+ """Build the config object from environment variables so runtime parameters live in one place."""
+ return AppConfig(
+ db_path=os.environ.get("DB_PATH", "/private/argus/master/db.sqlite3"),
+ metric_nodes_json_path=os.environ.get(
+ "METRIC_NODES_JSON_PATH", "/private/argus/metric/prometheus/nodes.json"
+ ),
+ offline_threshold_seconds=_get_int_env("OFFLINE_THRESHOLD_SECONDS", 180),
+ online_threshold_seconds=_get_int_env("ONLINE_THRESHOLD_SECONDS", 120),
+ scheduler_interval_seconds=_get_int_env("SCHEDULER_INTERVAL_SECONDS", 30),
+ node_id_prefix=os.environ.get("NODE_ID_PREFIX", "A"),
+ auth_mode=os.environ.get("AUTH_MODE", "disabled"),
+ )
diff --git a/src/master/app/models.py b/src/master/app/models.py
new file mode 100644
index 0000000..f4e37a9
--- /dev/null
+++ b/src/master/app/models.py
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, Mapping
+
+from .util import parse_iso
+
+
+class ValidationError(Exception):
+ """Raised when user payload fails validation."""
+
+
+@dataclass
+class Node:
+ id: str
+ name: str
+ type: str
+ version: str | None
+ status: str
+ config: Dict[str, Any]
+ labels: Iterable[str]
+ meta_data: Dict[str, Any]
+ health: Dict[str, Any]
+ register_time: str | None
+ last_report: str | None
+ agent_last_report: str | None
+ last_updated: str | None
+
+
+def serialize_node_row(row: Mapping[str, Any]) -> Dict[str, Any]:
+ def _json_or_default(value: str | None, default: Any) -> Any:
+ if value is None or value == "":
+ return default
+ try:
+ return json.loads(value)
+ except json.JSONDecodeError:
+ return default
+
+ config = _json_or_default(row["config_json"], {})
+ labels = _json_or_default(row["labels_json"], [])
+ meta = _json_or_default(row["meta_json"], {})
+ health = _json_or_default(row["health_json"], {})
+ return {
+ "id": row["id"],
+ "name": row["name"],
+ "type": row["type"],
+ "version": row["version"],
+ "status": row["status"],
+ "config": config if isinstance(config, dict) else {},
+ "label": list(labels) if isinstance(labels, list) else [],
+ "meta_data": meta if isinstance(meta, dict) else {},
+ "health": health if isinstance(health, dict) else {},
+ "register_time": row["register_time"],
+ "last_report": row["last_report"],
+ "agent_last_report": row["agent_last_report"],
+ "last_updated": row["last_updated"],
+ }
+
+
+def serialize_node_summary(row: Mapping[str, Any]) -> Dict[str, Any]:
+ return {
+ "id": row["id"],
+ "name": row["name"],
+ "status": row["status"],
+ "type": row["type"],
+ "version": row["version"],
+ }
+
+
+def validate_registration_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
+ if not isinstance(payload, Mapping):
+ raise ValidationError("Request body must be a JSON object")
+
+ name = payload.get("name")
+ if not isinstance(name, str) or not name.strip():
+ raise ValidationError("Field 'name' is required and must be a non-empty string")
+
+ node_type = payload.get("type", "agent")
+ if not isinstance(node_type, str) or not node_type:
+ raise ValidationError("Field 'type' must be a string")
+
+ version = payload.get("version")
+ if version is not None and not isinstance(version, str):
+ raise ValidationError("Field 'version' must be a string if provided")
+
+ meta = payload.get("meta_data")
+ if not isinstance(meta, Mapping):
+ raise 
ValidationError("Field 'meta_data' must be an object") + + required_meta = ["hostname", "ip", "env", "user", "instance", "cpu_number", "memory_in_bytes", "gpu_number"] + for key in required_meta: + if key not in meta: + raise ValidationError(f"meta_data.{key} is required") + + cpu_number = meta["cpu_number"] + memory_in_bytes = meta["memory_in_bytes"] + gpu_number = meta["gpu_number"] + if not isinstance(cpu_number, int) or cpu_number < 0: + raise ValidationError("meta_data.cpu_number must be a non-negative integer") + if not isinstance(memory_in_bytes, int) or memory_in_bytes < 0: + raise ValidationError("meta_data.memory_in_bytes must be a non-negative integer") + if not isinstance(gpu_number, int) or gpu_number < 0: + raise ValidationError("meta_data.gpu_number must be a non-negative integer") + + node_id = payload.get("id") + if node_id is not None and (not isinstance(node_id, str) or not node_id.strip()): + raise ValidationError("Field 'id' must be a non-empty string when provided") + + return { + "id": node_id, + "name": name, + "type": node_type, + "version": version, + "meta_data": dict(meta), + } + + +def validate_status_payload(payload: Mapping[str, Any]) -> Dict[str, Any]: + if not isinstance(payload, Mapping): + raise ValidationError("Request body must be a JSON object") + + timestamp = payload.get("timestamp") + if not isinstance(timestamp, str) or not timestamp: + raise ValidationError("Field 'timestamp' is required and must be a string") + + parsed = parse_iso(timestamp) + if parsed is None: + raise ValidationError("Field 'timestamp' must be an ISO8601 datetime string") + + health = payload.get("health", {}) + if not isinstance(health, Mapping): + raise ValidationError("Field 'health' must be an object if provided") + + sanitized_health: Dict[str, Any] = {} + for key, value in health.items(): + if not isinstance(key, str): + raise ValidationError("Keys in 'health' must be strings") + if not isinstance(value, (Mapping, list, str, int, float, bool)) and value is not None: + raise ValidationError("Values in 'health' must be JSON-compatible") + sanitized_health[key] = value + + return { + "timestamp": timestamp, + "parsed_timestamp": parsed, + "health": sanitized_health, + } + + +def validate_config_payload(payload: Mapping[str, Any]) -> Dict[str, Any]: + if not isinstance(payload, Mapping): + raise ValidationError("Request body must be a JSON object") + + result: Dict[str, Any] = {} + if "config" in payload: + config = payload["config"] + if not isinstance(config, Mapping): + raise ValidationError("Field 'config' must be an object") + result["config"] = dict(config) + + if "label" in payload: + labels = payload["label"] + if not isinstance(labels, list) or not all(isinstance(item, str) for item in labels): + raise ValidationError("Field 'label' must be an array of strings") + result["label"] = list(labels) + + if not result: + raise ValidationError("At least one of 'config' or 'label' must be provided") + + return result + diff --git a/src/master/app/nodes_api.py b/src/master/app/nodes_api.py new file mode 100644 index 0000000..0a2f57f --- /dev/null +++ b/src/master/app/nodes_api.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import logging +from http import HTTPStatus +from typing import Any, Mapping + +from flask import Blueprint, jsonify, request + +from .models import ( + ValidationError, + validate_config_payload, + validate_registration_payload, + validate_status_payload, +) +from .scheduler import StatusScheduler +from .storage import Storage +from .util 
import to_iso, utcnow
+
+
+def create_nodes_blueprint(storage: Storage, scheduler: StatusScheduler) -> Blueprint:
+    bp = Blueprint("nodes", __name__)
+    logger = logging.getLogger("argus.master.api")
+
+    def _json_error(message: str, status: HTTPStatus, code: str) -> Any:
+        response = jsonify({"error": message, "code": code})
+        response.status_code = status
+        return response
+
+    @bp.errorhandler(ValidationError)
+    def _handle_validation_error(err: ValidationError):
+        return _json_error(str(err), HTTPStatus.BAD_REQUEST, "invalid_request")
+
+    @bp.get("/nodes")
+    def list_nodes():
+        nodes = storage.list_nodes()
+        return jsonify(nodes)
+
+    @bp.get("/nodes/<node_id>")
+    def get_node(node_id: str):
+        node = storage.get_node(node_id)
+        if node is None:
+            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+        return jsonify(node)
+
+    @bp.post("/nodes")
+    def register_node():
+        payload = _get_json()
+        data = validate_registration_payload(payload)
+        now = utcnow()
+        now_iso = to_iso(now)
+        node_id = data["id"]
+        name = data["name"]
+        node_type = data["type"]
+        version = data["version"]
+        meta = data["meta_data"]
+
+        if node_id:
+            # 携带 id 说明是重注册,需要校验名称一致性
+            existing_row = storage.get_node_raw(node_id)
+            if existing_row is None:
+                return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+            if existing_row["name"] != name:
+                return _json_error(
+                    "Node id and name mismatch during re-registration",
+                    HTTPStatus.INTERNAL_SERVER_ERROR,
+                    "id_name_mismatch",
+                )
+            updated = storage.update_node_meta(
+                node_id,
+                node_type=node_type,
+                version=version,
+                meta_data=meta,
+                last_updated_iso=now_iso,
+            )
+            scheduler.trigger_nodes_json_refresh()
+            return jsonify(updated), HTTPStatus.OK
+
+        # No id provided → search by name
+        existing_by_name = storage.get_node_by_name(name)
+        if existing_by_name:
+            # 同名节点已存在,视为无 id 重注册
+            updated = storage.update_node_meta(
+                existing_by_name["id"],
+                node_type=node_type,
+                version=version,
+                meta_data=meta,
+                last_updated_iso=now_iso,
+            )
+            scheduler.trigger_nodes_json_refresh()
+            return jsonify(updated), HTTPStatus.OK
+
+        new_id = storage.allocate_node_id()
+        created = storage.create_node(
+            new_id,
+            name,
+            node_type,
+            version,
+            meta,
+            status="initialized",
+            register_time_iso=now_iso,
+            last_updated_iso=now_iso,
+        )
+        scheduler.trigger_nodes_json_refresh()
+        return jsonify(created), HTTPStatus.CREATED
+
+    @bp.put("/nodes/<node_id>/config")
+    def update_node_config(node_id: str):
+        payload = _get_json()
+        updates = validate_config_payload(payload)
+        try:
+            updated = storage.update_config_and_labels(
+                node_id,
+                config=updates.get("config"),
+                labels=updates.get("label"),
+            )
+        except KeyError:
+            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+
+        if "label" in updates:
+            scheduler.trigger_nodes_json_refresh()
+        return jsonify(updated)
+
+    @bp.get("/nodes/statistics")
+    def node_statistics():
+        stats = storage.get_statistics()
+        return jsonify(stats)
+
+    @bp.put("/nodes/<node_id>/status")
+    def update_status(node_id: str):
+        payload = _get_json()
+        data = validate_status_payload(payload)
+        try:
+            # master 负责写入 last_report,状态由调度器计算
+            updated = storage.update_last_report(
+                node_id,
+                server_timestamp_iso=to_iso(utcnow()),
+                agent_timestamp_iso=data["timestamp"],
+                health=data["health"],
+            )
+        except KeyError:
+            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
+
+        scheduler.trigger_nodes_json_refresh()
+        return jsonify(updated)
+
+    return bp
+
+
+def _get_json() -> Mapping[str, Any]:
+    data = request.get_json(silent=True)
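+    # 说明:get_json(silent=True) 在请求体不是合法 JSON 时返回 None 而不是抛异常,
+    # 这里统一转换为 ValidationError,由蓝图的 errorhandler 以 400 返回。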
+ if data is None: + raise ValidationError("Request body must be valid JSON") + if not isinstance(data, Mapping): + raise ValidationError("Request body must be a JSON object") + return data diff --git a/src/master/app/routes.py b/src/master/app/routes.py new file mode 100644 index 0000000..10bbba6 --- /dev/null +++ b/src/master/app/routes.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from flask import Flask, jsonify + +from .config import AppConfig +from .nodes_api import create_nodes_blueprint +from .scheduler import StatusScheduler +from .storage import Storage + + +def register_routes(app: Flask, storage: Storage, scheduler: StatusScheduler, config: AppConfig) -> None: + app.register_blueprint(create_nodes_blueprint(storage, scheduler), url_prefix="/api/v1/master") + + @app.get("/healthz") + def healthz(): + return jsonify({"status": "ok"}) + + @app.get("/readyz") + def readyz(): + try: + storage.list_nodes() # simple readiness probe + except Exception as exc: # pragma: no cover - defensive + return jsonify({"status": "error", "error": str(exc)}), 500 + return jsonify({"status": "ok"}) diff --git a/src/master/app/scheduler.py b/src/master/app/scheduler.py new file mode 100644 index 0000000..8797b25 --- /dev/null +++ b/src/master/app/scheduler.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import logging +import threading +from typing import Optional + +from .config import AppConfig +from .storage import Storage +from .util import atomic_write_json, parse_iso, to_iso, utcnow + + +class StatusScheduler: + def __init__(self, storage: Storage, config: AppConfig, logger: Optional[logging.Logger] = None) -> None: + self._storage = storage + self._config = config + self._logger = logger or logging.getLogger("argus.master.scheduler") + self._stop_event = threading.Event() + self._thread = threading.Thread(target=self._run, name="status-scheduler", daemon=True) + self._nodes_json_lock = threading.Lock() + self._pending_nodes_json = threading.Event() + + def start(self) -> None: + """启动后台线程,定期刷新节点状态与 nodes.json。""" + if not self._thread.is_alive(): + self._logger.info("Starting scheduler thread") + self._thread.start() + + def stop(self) -> None: + self._stop_event.set() + self._pending_nodes_json.set() + self._thread.join(timeout=5) + + def trigger_nodes_json_refresh(self) -> None: + self._pending_nodes_json.set() + + def generate_nodes_json(self) -> None: + with self._nodes_json_lock: + online_nodes = self._storage.get_online_nodes() + atomic_write_json(self._config.metric_nodes_json_path, online_nodes) + self._logger.info("nodes.json updated", extra={"count": len(online_nodes)}) + + # ------------------------------------------------------------------ + # internal loop + # ------------------------------------------------------------------ + + def _run(self) -> None: + # 确保启动时 nodes.json 会立即生成 + self._pending_nodes_json.set() + while not self._stop_event.is_set(): + changed = self._reconcile_statuses() + if changed or self._pending_nodes_json.is_set(): + try: + self.generate_nodes_json() + finally: + self._pending_nodes_json.clear() + self._stop_event.wait(self._config.scheduler_interval_seconds) + + def _reconcile_statuses(self) -> bool: + """根据 last_report 与当前时间对比,决定是否切换状态。""" + any_status_changed = False + now = utcnow() + rows = self._storage.fetch_nodes_for_scheduler() + for row in rows: + node_id = row["id"] + last_report_iso = row["last_report"] + current_status = row["status"] + last_report_dt = parse_iso(last_report_iso) + if last_report_dt is None: + # No 
report yet; treat as initialized until report arrives + continue + delta_seconds = (now - last_report_dt).total_seconds() + new_status = current_status + if delta_seconds > self._config.offline_threshold_seconds: + new_status = "offline" + elif delta_seconds <= self._config.online_threshold_seconds: + new_status = "online" + # Between thresholds: keep current status (sticky) + if new_status != current_status: + any_status_changed = True + self._logger.info( + "Updating node status", + extra={ + "node_id": node_id, + "previous": current_status, + "new": new_status, + "delta_seconds": delta_seconds, + }, + ) + self._storage.update_status(node_id, new_status, last_updated_iso=to_iso(now)) + return any_status_changed diff --git a/src/master/app/storage.py b/src/master/app/storage.py new file mode 100644 index 0000000..3547066 --- /dev/null +++ b/src/master/app/storage.py @@ -0,0 +1,332 @@ +from __future__ import annotations + +import json +import sqlite3 +import threading +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple + +from .models import serialize_node_row, serialize_node_summary +from .util import ensure_parent, to_iso, utcnow + + +class Storage: + def __init__(self, db_path: str, node_id_prefix: str) -> None: + self._db_path = db_path + self._node_id_prefix = node_id_prefix + ensure_parent(db_path) + self._lock = threading.Lock() + self._conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False) + self._conn.row_factory = sqlite3.Row + with self._lock: + self._conn.execute("PRAGMA foreign_keys = ON;") + self._ensure_schema() + + # ------------------------------------------------------------------ + # schema & helpers + # ------------------------------------------------------------------ + + def _ensure_schema(self) -> None: + """初始化表结构,确保服务启动时数据库结构就绪。""" + with self._lock: + self._conn.executescript( + """ + CREATE TABLE IF NOT EXISTS nodes ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + type TEXT NOT NULL, + version TEXT, + status TEXT NOT NULL, + config_json TEXT, + labels_json TEXT, + meta_json TEXT, + health_json TEXT, + register_time TEXT, + last_report TEXT, + agent_last_report TEXT, + last_updated TEXT + ); + + CREATE TABLE IF NOT EXISTS kv ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL + ); + + CREATE INDEX IF NOT EXISTS idx_nodes_status ON nodes(status); + CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name); + """ + ) + self._conn.commit() + + def close(self) -> None: + with self._lock: + self._conn.close() + + # ------------------------------------------------------------------ + # Node ID allocation + # ------------------------------------------------------------------ + + def allocate_node_id(self) -> str: + """在 kv 表里维护自增序列,为新节点生成形如 A1 的 ID。""" + with self._lock: + cur = self._conn.execute("SELECT value FROM kv WHERE key = ?", ("node_id_seq",)) + row = cur.fetchone() + if row is None: + next_id = 1 + self._conn.execute("INSERT INTO kv(key, value) VALUES(?, ?)", ("node_id_seq", str(next_id))) + else: + next_id = int(row["value"]) + 1 + self._conn.execute("UPDATE kv SET value = ? 
WHERE key = ?", (str(next_id), "node_id_seq")) + self._conn.commit() + return f"{self._node_id_prefix}{next_id}" + + # ------------------------------------------------------------------ + # Query helpers + # ------------------------------------------------------------------ + + def list_nodes(self) -> List[Dict[str, Any]]: + with self._lock: + cur = self._conn.execute( + "SELECT id, name, status, type, version FROM nodes ORDER BY id ASC" + ) + rows = cur.fetchall() + return [serialize_node_summary(row) for row in rows] + + def get_node(self, node_id: str) -> Optional[Dict[str, Any]]: + with self._lock: + cur = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,)) + row = cur.fetchone() + if row is None: + return None + return serialize_node_row(row) + + def get_node_raw(self, node_id: str) -> Optional[sqlite3.Row]: + with self._lock: + cur = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,)) + row = cur.fetchone() + return row + + def get_node_by_name(self, name: str) -> Optional[Dict[str, Any]]: + with self._lock: + cur = self._conn.execute("SELECT * FROM nodes WHERE name = ?", (name,)) + row = cur.fetchone() + if row is None: + return None + return serialize_node_row(row) + + # ------------------------------------------------------------------ + # Mutation helpers + # ------------------------------------------------------------------ + + def create_node( + self, + node_id: str, + name: str, + node_type: str, + version: str | None, + meta_data: Mapping[str, Any], + status: str, + register_time_iso: str, + last_updated_iso: str, + ) -> Dict[str, Any]: + """插入节点初始记录,默认 config/label/health 为空。""" + now_iso = last_updated_iso + with self._lock: + self._conn.execute( + """ + INSERT INTO nodes ( + id, name, type, version, status, config_json, labels_json, meta_json, + health_json, register_time, last_report, agent_last_report, last_updated + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+            """,
+            (
+                node_id,
+                name,
+                node_type,
+                version,
+                status,
+                json.dumps({}),
+                json.dumps([]),
+                json.dumps(dict(meta_data)),
+                json.dumps({}),
+                register_time_iso,
+                None,
+                None,
+                now_iso,
+            ),
+        )
+        self._conn.commit()
+
+        created = self.get_node(node_id)
+        if created is None:
+            raise RuntimeError("Failed to read back created node")
+        return created
+
+    def update_node_meta(
+        self,
+        node_id: str,
+        *,
+        name: Optional[str] = None,
+        node_type: Optional[str] = None,
+        version: Optional[str | None] = None,
+        meta_data: Optional[Mapping[str, Any]] = None,
+        last_updated_iso: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """重注册时更新节点静态信息,缺省字段保持不变。"""
+        updates: List[str] = []
+        params: List[Any] = []
+        if name is not None:
+            updates.append("name = ?")
+            params.append(name)
+        if node_type is not None:
+            updates.append("type = ?")
+            params.append(node_type)
+        if version is not None:
+            updates.append("version = ?")
+            params.append(version)
+        if meta_data is not None:
+            updates.append("meta_json = ?")
+            params.append(json.dumps(dict(meta_data)))
+        if last_updated_iso is not None:
+            updates.append("last_updated = ?")
+            params.append(last_updated_iso)
+
+        if not updates:
+            result = self.get_node(node_id)
+            if result is None:
+                raise KeyError(node_id)
+            return result
+
+        params.append(node_id)
+        with self._lock:
+            self._conn.execute(
+                f"UPDATE nodes SET {', '.join(updates)} WHERE id = ?",
+                tuple(params),
+            )
+            self._conn.commit()
+        updated = self.get_node(node_id)
+        if updated is None:
+            raise KeyError(node_id)
+        return updated
+
+    def update_config_and_labels(
+        self, node_id: str, *, config: Optional[Mapping[str, Any]] = None, labels: Optional[Iterable[str]] = None
+    ) -> Dict[str, Any]:
+        """部分更新 config/label,并刷新 last_updated 时间戳。"""
+        updates: List[str] = []
+        params: List[Any] = []
+        if config is not None:
+            updates.append("config_json = ?")
+            params.append(json.dumps(dict(config)))
+        if labels is not None:
+            updates.append("labels_json = ?")
+            params.append(json.dumps(list(labels)))
+        updates.append("last_updated = ?")
+        params.append(to_iso(utcnow()))
+        params.append(node_id)
+        with self._lock:
+            cur = self._conn.execute(
+                f"UPDATE nodes SET {', '.join(updates)} WHERE id = ?",
+                tuple(params),
+            )
+            # cursor.rowcount 反映本条 UPDATE 命中的行数;
+            # conn.total_changes 是连接级累计值,不能用来判断节点是否存在
+            if cur.rowcount == 0:
+                self._conn.rollback()
+                raise KeyError(node_id)
+            self._conn.commit()
+        updated = self.get_node(node_id)
+        if updated is None:
+            raise KeyError(node_id)
+        return updated
+
+    def update_last_report(
+        self,
+        node_id: str,
+        *,
+        server_timestamp_iso: str,
+        agent_timestamp_iso: str,
+        health: Mapping[str, Any],
+    ) -> Dict[str, Any]:
+        """记录最新上报时间和健康信息,用于后续状态计算。"""
+        with self._lock:
+            cur = self._conn.execute(
+                """
+                UPDATE nodes
+                SET last_report = ?,
+                    agent_last_report = ?,
+                    health_json = ?,
+                    last_updated = ?
+                WHERE id = ?
+                """,
+                (
+                    server_timestamp_iso,
+                    agent_timestamp_iso,
+                    json.dumps(health),
+                    server_timestamp_iso,
+                    node_id,
+                ),
+            )
+            if cur.rowcount == 0:
+                self._conn.rollback()
+                raise KeyError(node_id)
+            self._conn.commit()
+        updated = self.get_node(node_id)
+        if updated is None:
+            raise KeyError(node_id)
+        return updated
+
+    def update_status(self, node_id: str, status: str, *, last_updated_iso: str) -> None:
+        with self._lock:
+            self._conn.execute(
+                "UPDATE nodes SET status = ?, last_updated = ? 
WHERE id = ?", + (status, last_updated_iso, node_id), + ) + self._conn.commit() + + # ------------------------------------------------------------------ + # Reporting helpers + # ------------------------------------------------------------------ + + def get_statistics(self) -> Dict[str, Any]: + """统计节点总数及按状态聚合的数量。""" + with self._lock: + cur = self._conn.execute("SELECT COUNT(*) AS total FROM nodes") + total_row = cur.fetchone() + cur = self._conn.execute("SELECT status, COUNT(*) AS count FROM nodes GROUP BY status") + status_rows = cur.fetchall() + return { + "total": total_row["total"] if total_row else 0, + "status_statistics": [ + {"status": row["status"], "count": row["count"]} + for row in status_rows + ], + } + + def fetch_nodes_for_scheduler(self) -> List[sqlite3.Row]: + with self._lock: + cur = self._conn.execute( + "SELECT id, last_report, status FROM nodes" + ) + return cur.fetchall() + + def get_online_nodes(self) -> List[Dict[str, Any]]: + """返回在线节点列表,用于生成 nodes.json。""" + with self._lock: + cur = self._conn.execute( + "SELECT id, meta_json, labels_json, name FROM nodes WHERE status = ? ORDER BY id ASC", + ("online",), + ) + rows = cur.fetchall() + + result: List[Dict[str, Any]] = [] + for row in rows: + meta = json.loads(row["meta_json"]) if row["meta_json"] else {} + labels = json.loads(row["labels_json"]) if row["labels_json"] else [] + result.append( + { + "node_id": row["id"], + "user_id": meta.get("user"), + "ip": meta.get("ip"), + "hostname": meta.get("hostname", row["name"]), + "labels": labels if isinstance(labels, list) else [], + } + ) + return result diff --git a/src/master/app/util.py b/src/master/app/util.py new file mode 100644 index 0000000..903846c --- /dev/null +++ b/src/master/app/util.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import json +import os +import tempfile +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Iterable + + +ISO_FORMAT = "%Y-%m-%dT%H:%M:%SZ" + + +def utcnow() -> datetime: + """获取当前 UTC 时间,统一时间基准。""" + return datetime.now(timezone.utc) + + +def to_iso(dt: datetime | None) -> str | None: + if dt is None: + return None + return dt.astimezone(timezone.utc).replace(microsecond=0).strftime(ISO_FORMAT) + + +def parse_iso(value: str | None) -> datetime | None: + if not value: + return None + try: + if value.endswith("Z"): + return datetime.strptime(value, ISO_FORMAT).replace(tzinfo=timezone.utc) + # Fallback for ISO strings with offset + return datetime.fromisoformat(value).astimezone(timezone.utc) + except ValueError: + return None + + +def ensure_parent(path: str) -> None: + """确保目标文件所在目录存在。""" + Path(path).parent.mkdir(parents=True, exist_ok=True) + + +def atomic_write_json(path: str, data: Iterable[Any] | Any) -> None: + """原子化写 JSON,避免被其它进程读到半成品。""" + ensure_parent(path) + directory = Path(path).parent + with tempfile.NamedTemporaryFile("w", dir=directory, delete=False) as tmp: + json.dump(data, tmp, separators=(",", ":")) + tmp.flush() + os.fsync(tmp.fileno()) + temp_path = tmp.name + os.replace(temp_path, path) diff --git a/src/master/build/dns-monitor.sh b/src/master/build/dns-monitor.sh new file mode 120000 index 0000000..dc3391b --- /dev/null +++ b/src/master/build/dns-monitor.sh @@ -0,0 +1 @@ +../../bind/build/dns-monitor.sh \ No newline at end of file diff --git a/src/master/build/start-master.sh b/src/master/build/start-master.sh new file mode 100755 index 0000000..deeb211 --- /dev/null +++ b/src/master/build/start-master.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +set -euo 
pipefail + +# 中文提示:确保共享目录与 DNS 相关脚本存在 +DNS_DIR="/private/argus/etc" +DNS_SCRIPT="${DNS_DIR}/update-dns.sh" +MASTER_DOMAIN_FILE="${DNS_DIR}/master.argus.com" +RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}" +RUNTIME_UID="${ARGUS_BUILD_UID:-2133}" +RUNTIME_GID="${ARGUS_BUILD_GID:-2015}" +MASTER_DATA_DIR="/private/argus/master" +METRIC_DIR="/private/argus/metric/prometheus" + +mkdir -p "$DNS_DIR" +chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true +mkdir -p "$MASTER_DATA_DIR" +mkdir -p "$METRIC_DIR" +chown -R "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DATA_DIR" "$METRIC_DIR" 2>/dev/null || true + +if [[ -x "$DNS_SCRIPT" ]]; then + echo "[INFO] Running update-dns.sh before master starts" + # 中文提示:若脚本存在则执行,保证容器使用 bind 作为 DNS + "$DNS_SCRIPT" || echo "[WARN] update-dns.sh execution failed" +else + echo "[WARN] DNS update script not found or not executable: $DNS_SCRIPT" +fi + +# 中文提示:记录 master 当前 IP,供 bind 服务同步 +MASTER_IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}' || true) +if [[ -n "${MASTER_IP}" ]]; then + echo "current IP: ${MASTER_IP}" + echo "${MASTER_IP}" > "$MASTER_DOMAIN_FILE" + chown "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DOMAIN_FILE" 2>/dev/null || true +else + echo "[WARN] Failed to detect master IP via ifconfig" +fi + +WORKERS=${GUNICORN_WORKERS:-4} +BIND_ADDR=${GUNICORN_BIND:-0.0.0.0:3000} +EXTRA_OPTS=${GUNICORN_EXTRA_ARGS:-} + +if [[ -n "$EXTRA_OPTS" ]]; then + read -r -a EXTRA_ARRAY <<< "$EXTRA_OPTS" +else + EXTRA_ARRAY=() +fi + +command=(gunicorn --bind "$BIND_ADDR" --workers "$WORKERS") +if [[ ${#EXTRA_ARRAY[@]} -gt 0 ]]; then + command+=("${EXTRA_ARRAY[@]}") +fi +command+=("app:create_app()") + +if command -v runuser >/dev/null 2>&1; then + exec runuser -u "$RUNTIME_USER" -- "${command[@]}" +else + printf -v _cmd_str '%q ' "${command[@]}" + exec su -s /bin/bash -m "$RUNTIME_USER" -c "exec ${_cmd_str}" +fi diff --git a/src/master/build/supervisord.conf b/src/master/build/supervisord.conf new file mode 100644 index 0000000..5d250a2 --- /dev/null +++ b/src/master/build/supervisord.conf @@ -0,0 +1,39 @@ +[supervisord] +nodaemon=true +logfile=/var/log/supervisor/supervisord.log +pidfile=/var/run/supervisord.pid +user=root + +[program:master] +command=/usr/local/bin/start-master.sh +user=root +stdout_logfile=/var/log/supervisor/master.log +stderr_logfile=/var/log/supervisor/master_error.log +autostart=true +autorestart=true +startsecs=5 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true + +[program:dns-monitor] +command=/usr/local/bin/dns-monitor.sh +user=root +stdout_logfile=/var/log/supervisor/dns-monitor.log +stderr_logfile=/var/log/supervisor/dns-monitor_error.log +autostart=true +autorestart=true +startsecs=5 +stopwaitsecs=10 +killasgroup=true +stopasgroup=true + +[unix_http_server] +file=/var/run/supervisor.sock +chmod=0700 + +[supervisorctl] +serverurl=unix:///var/run/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface diff --git a/src/master/images/.gitkeep b/src/master/images/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/master/offline_wheels.tar.gz b/src/master/offline_wheels.tar.gz new file mode 100644 index 0000000..c00f374 Binary files /dev/null and b/src/master/offline_wheels.tar.gz differ diff --git a/src/master/offline_wheels/.gitkeep b/src/master/offline_wheels/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/master/requirements.txt b/src/master/requirements.txt new file mode 100644 index 0000000..7eb4708 --- /dev/null +++ 
b/src/master/requirements.txt
@@ -0,0 +1,2 @@
+Flask==2.3.3
+gunicorn==21.2.0
diff --git a/src/master/scripts/build_images.sh b/src/master/scripts/build_images.sh
new file mode 100755
index 0000000..ebb8060
--- /dev/null
+++ b/src/master/scripts/build_images.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'USAGE'
+Usage: $0 [--intranet] [--offline] [--tag <tag>]
+
+Options:
+  --intranet     使用指定的 PyPI 镜像源(默认清华镜像)。
+  --offline      完全离线构建,依赖 offline_wheels/ 目录中的离线依赖包。
+  --tag <tag>    自定义镜像标签,默认 argus-master:latest。
+USAGE
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+MODULE_ROOT="$PROJECT_ROOT/src/master"
+IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
+DOCKERFILE="src/master/Dockerfile"
+BUILD_ARGS=()
+OFFLINE_MODE=0
+
+source "$PROJECT_ROOT/scripts/common/build_user.sh"
+load_build_user
+BUILD_ARGS+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
+
+cd "$PROJECT_ROOT"
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --intranet)
+      INTRANET_INDEX="${INTRANET_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple}"
+      BUILD_ARGS+=("--build-arg" "PIP_INDEX_URL=${INTRANET_INDEX}")
+      BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
+      shift
+      ;;
+    --offline)
+      OFFLINE_MODE=1
+      BUILD_ARGS+=("--build-arg" "USE_OFFLINE=1")
+      BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
+      shift
+      ;;
+    --tag)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      IMAGE_TAG="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+if [[ "$OFFLINE_MODE" -eq 1 ]]; then
+  WHEELS_DIR="$MODULE_ROOT/offline_wheels"
+  if [[ ! -d "$WHEELS_DIR" ]]; then
+    echo "[ERROR] offline_wheels 目录不存在: $WHEELS_DIR" >&2
+    exit 1
+  fi
+  if ! find "$WHEELS_DIR" -maxdepth 1 -type f -name '*.whl' -print -quit >/dev/null; then
+    echo "[ERROR] offline_wheels 目录为空,请先在有网环境执行 scripts/prepare_offline_wheels.sh" >&2
+    exit 1
+  fi
+fi
+
+echo "[INFO] Building image $IMAGE_TAG"
+docker build -f "$DOCKERFILE" "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT"
+echo "[OK] Image $IMAGE_TAG built"
diff --git a/src/master/scripts/load_images.sh b/src/master/scripts/load_images.sh
new file mode 100755
index 0000000..fb1e126
--- /dev/null
+++ b/src/master/scripts/load_images.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  echo "Usage: $0 [--file <path>]" >&2
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+DEFAULT_INPUT="$PROJECT_ROOT/images/argus-master-dev.tar"
+IMAGE_TAR="$DEFAULT_INPUT"
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --file)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      IMAGE_TAR="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+if [[ ! 
-f "$IMAGE_TAR" ]]; then
+  echo "[ERROR] Image tarball not found: $IMAGE_TAR" >&2
+  exit 1
+fi
+
+echo "[INFO] Loading image from $IMAGE_TAR"
+docker image load -i "$IMAGE_TAR"
+echo "[OK] Image loaded"
diff --git a/src/master/scripts/prepare_offline_wheels.sh b/src/master/scripts/prepare_offline_wheels.sh
new file mode 100755
index 0000000..08037ed
--- /dev/null
+++ b/src/master/scripts/prepare_offline_wheels.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'USAGE'
+Usage: $0 [--pip-version <version>] [--clean] [--local]
+
+Options:
+  --pip-version <version>  额外下载指定版本的 pip wheel(例如 25.2)。
+  --clean                  清理 offline_wheels/*.whl 后重新下载。
+  --local                  使用本地 python 执行下载(默认通过 docker python:3.11-slim)。
+USAGE
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+REQUIREMENTS_FILE="$PROJECT_ROOT/requirements.txt"
+WHEEL_DIR="$PROJECT_ROOT/offline_wheels"
+PIP_VERSION=""
+CLEAN=0
+USE_LOCAL=0
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --pip-version)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      PIP_VERSION="$2"
+      shift 2
+      ;;
+    --clean)
+      CLEAN=1
+      shift
+      ;;
+    --local)
+      USE_LOCAL=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+if [[ ! -f "$REQUIREMENTS_FILE" ]]; then
+  echo "[ERROR] requirements.txt not found at $REQUIREMENTS_FILE" >&2
+  exit 1
+fi
+
+mkdir -p "$WHEEL_DIR"
+
+if [[ "$CLEAN" -eq 1 ]]; then
+  echo "[INFO] Cleaning existing wheels in $WHEEL_DIR"
+  find "$WHEEL_DIR" -maxdepth 1 -type f -name '*.whl' -delete
+fi
+
+run_with_python() {
+  # 直接展开数组执行,避免 eval 带来的二次分词与注入风险
+  local cmd=("python" "-m" "pip" "$@")
+  "${cmd[@]}"
+}
+
+if [[ "$USE_LOCAL" -eq 1 ]]; then
+  PYTHON_BIN=${PYTHON_BIN:-python3}
+  if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
+    echo "[ERROR] $PYTHON_BIN not found" >&2
+    exit 1
+  fi
+  echo "[INFO] Using local python ($PYTHON_BIN) to download wheels"
+  "$PYTHON_BIN" -m pip download -r "$REQUIREMENTS_FILE" -d "$WHEEL_DIR"
+  if [[ -n "$PIP_VERSION" ]]; then
+    "$PYTHON_BIN" -m pip download "pip==${PIP_VERSION}" -d "$WHEEL_DIR"
+  fi
+else
+  if ! command -v docker >/dev/null 2>&1; then
+    echo "[ERROR] docker not found; rerun with --local or install docker" >&2
+    exit 1
+  fi
+  echo "[INFO] Using docker image python:3.11-slim to download wheels"
+  docker run --rm \
+    -v "$WHEEL_DIR":/wheels \
+    -v "$REQUIREMENTS_FILE":/tmp/requirements.txt \
+    python:3.11-slim \
+    bash -c "set -euo pipefail && python -m pip install --upgrade pip && python -m pip download -r /tmp/requirements.txt -d /wheels"
+  if [[ -n "$PIP_VERSION" ]]; then
+    docker run --rm \
+      -v "$WHEEL_DIR":/wheels \
+      python:3.11-slim \
+      bash -c "set -euo pipefail && python -m pip download pip==${PIP_VERSION} -d /wheels"
+  fi
+fi
+
+echo "[INFO] Offline wheels prepared at $WHEEL_DIR"
diff --git a/src/master/scripts/save_images.sh b/src/master/scripts/save_images.sh
new file mode 100755
index 0000000..cccfa77
--- /dev/null
+++ b/src/master/scripts/save_images.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  echo "Usage: $0 [--tag <tag>] [--output <path>]" >&2
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)"
+DEFAULT_OUTPUT="$PROJECT_ROOT/images/argus-master-dev.tar"
+IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
+OUTPUT_PATH="$DEFAULT_OUTPUT"
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --tag)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      IMAGE_TAG="$2"
+      shift 2
+      ;;
+    --output)
+      [[ $# -ge 2 ]] || { usage; exit 1; }
+      OUTPUT_PATH="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+mkdir -p "$(dirname "$OUTPUT_PATH")"
+echo "[INFO] Saving image $IMAGE_TAG to $OUTPUT_PATH"
+docker image save "$IMAGE_TAG" -o "$OUTPUT_PATH"
+echo "[OK] Image saved"
diff --git a/src/master/tests/.gitignore b/src/master/tests/.gitignore
new file mode 100644
index 0000000..285ed60
--- /dev/null
+++ b/src/master/tests/.gitignore
@@ -0,0 +1,2 @@
+private/
+tmp/
diff --git a/src/master/tests/docker-compose.yml b/src/master/tests/docker-compose.yml
new file mode 100644
index 0000000..9118d92
--- /dev/null
+++ b/src/master/tests/docker-compose.yml
@@ -0,0 +1,19 @@
+services:
+  master:
+    image: ${MASTER_IMAGE_TAG:-argus-master:latest}
+    container_name: argus-master-e2e
+    environment:
+      - OFFLINE_THRESHOLD_SECONDS=6
+      - ONLINE_THRESHOLD_SECONDS=2
+      - SCHEDULER_INTERVAL_SECONDS=1
+    ports:
+      - "31300:3000"
+    volumes:
+      - ./private/argus/master:/private/argus/master
+      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
+      - ./private/argus/etc:/private/argus/etc
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
diff --git a/src/master/tests/scripts/00_e2e_test.sh b/src/master/tests/scripts/00_e2e_test.sh
new file mode 100755
index 0000000..42fb733
--- /dev/null
+++ b/src/master/tests/scripts/00_e2e_test.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPTS=(
+  "01_up_master.sh"
+  "02_verify_ready_and_nodes_json.sh"
+  "03_register_via_curl.sh"
+  "04_reregister_and_error_cases.sh"
+  "05_status_report_via_curl.sh"
+  "06_config_update_and_nodes_json.sh"
+  "07_stats_single_node.sh"
+  "08_multi_node_stats.sh"
+  "09_restart_persistence.sh"
+  "10_down.sh"
+)
+
+for script in "${SCRIPTS[@]}"; do
+  echo "[TEST] Running $script"
+  MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:latest}" "$SCRIPT_DIR/$script"
+  echo "[TEST] $script completed"
+  echo
+done
+
+echo "[TEST] Master module E2E tests completed"
diff --git a/src/master/tests/scripts/00_e2e_test_offline.sh b/src/master/tests/scripts/00_e2e_test_offline.sh
new file mode 100755
index 0000000..1c3fc0d
--- /dev/null
+++ b/src/master/tests/scripts/00_e2e_test_offline.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+MASTER_ROOT="$(cd "$MODULE_ROOT/.." && pwd)"
+
+# 准备离线依赖并构建镜像
+pushd "$MASTER_ROOT" >/dev/null
+./scripts/prepare_offline_wheels.sh --clean --pip-version 25.2
+./scripts/build_images.sh --offline --tag argus-master:offline
+popd >/dev/null
+
+# 使用离线镜像执行既有端到端用例(通过 SCRIPT_DIR 定位脚本,避免依赖调用方的工作目录)
+MASTER_IMAGE_TAG="argus-master:offline" "$SCRIPT_DIR/00_e2e_test.sh"
+
diff --git a/src/master/tests/scripts/01_up_master.sh b/src/master/tests/scripts/01_up_master.sh
new file mode 100755
index 0000000..62eb218
--- /dev/null
+++ b/src/master/tests/scripts/01_up_master.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+MODULE_ROOT="$(cd "$TEST_ROOT/.." 
&& pwd)" +PRIVATE_ROOT="$TEST_ROOT/private" +TMP_ROOT="$TEST_ROOT/tmp" +DNS_ROOT="$PRIVATE_ROOT/argus/etc" +BIND_UPDATE_SCRIPT_SRC="$(cd "$MODULE_ROOT/../bind" && pwd)/build/update-dns.sh" +BIND_UPDATE_SCRIPT_DEST="$DNS_ROOT/update-dns.sh" + +mkdir -p "$PRIVATE_ROOT/argus/master" +mkdir -p "$PRIVATE_ROOT/argus/metric/prometheus" +mkdir -p "$TMP_ROOT" +mkdir -p "$DNS_ROOT" + +# 确保上一次运行留下的容器/数据被清理 +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +pushd "$TEST_ROOT" >/dev/null +compose down --remove-orphans || true +popd >/dev/null + +rm -rf "$TMP_ROOT" "$PRIVATE_ROOT" +mkdir -p "$PRIVATE_ROOT/argus/master" +mkdir -p "$PRIVATE_ROOT/argus/metric/prometheus" +mkdir -p "$TMP_ROOT" +mkdir -p "$DNS_ROOT" + +# 中文提示:将 bind 模块自带的 update-dns.sh 下发到共享目录,模拟实际环境 +if [[ -f "$BIND_UPDATE_SCRIPT_SRC" ]]; then + cp "$BIND_UPDATE_SCRIPT_SRC" "$BIND_UPDATE_SCRIPT_DEST" + chmod +x "$BIND_UPDATE_SCRIPT_DEST" +else + echo "[WARN] bind update script missing at $BIND_UPDATE_SCRIPT_SRC" +fi + +pushd "$TEST_ROOT" >/dev/null +compose down --remove-orphans || true +MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:latest}" compose up -d +popd >/dev/null + +echo "[INFO] Master container is up on http://localhost:31300" diff --git a/src/master/tests/scripts/02_verify_ready_and_nodes_json.sh b/src/master/tests/scripts/02_verify_ready_and_nodes_json.sh new file mode 100755 index 0000000..65142dc --- /dev/null +++ b/src/master/tests/scripts/02_verify_ready_and_nodes_json.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +PRIVATE_ROOT="$TEST_ROOT/private" +API_BASE="http://localhost:31300" +NODES_JSON_PATH="$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" +MASTER_DOMAIN_FILE="$PRIVATE_ROOT/argus/etc/master.argus.com" + +# 等待 readyz 返回 200,确保数据库初始化完成 +for _ in {1..30}; do + status=$(curl -s -o /dev/null -w '%{http_code}' "$API_BASE/readyz" || true) + if [[ "$status" == "200" ]]; then + break + fi + sleep 1 + done + +if [[ "${status:-}" != "200" ]]; then + echo "[ERROR] /readyz 未在预期时间内返回 200,实际=$status" >&2 + exit 1 +fi + +echo "[INFO] /readyz 已通过,就绪检查成功" + +# scheduler 启动时会产生空的 nodes.json,这里等待文件出现并校验内容 +for _ in {1..30}; do + if [[ -f "$NODES_JSON_PATH" ]]; then + break + fi + sleep 1 + done + +if [[ ! -f "$NODES_JSON_PATH" ]]; then + echo "[ERROR] 未在预期时间内生成 $NODES_JSON_PATH" >&2 + exit 1 +fi + +if ! python3 - "$NODES_JSON_PATH" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + data = json.load(handle) +if data != []: + raise SystemExit(f"nodes.json initial content should be [], got {data}") +PY +then + echo "[ERROR] nodes.json 初始内容不是空数组" >&2 + exit 1 +fi + +echo "[INFO] nodes.json 初始状态校验通过" + +# 中文提示:输出 master 写入的域名文件,失败不影响测试 +if [[ -f "$MASTER_DOMAIN_FILE" ]]; then + MASTER_IP=$(<"$MASTER_DOMAIN_FILE") + echo "[INFO] master.argus.com 记录: $MASTER_IP" +else + echo "[WARN] 未找到 master.argus.com 记录文件,目录=$MASTER_DOMAIN_FILE" +fi diff --git a/src/master/tests/scripts/03_register_via_curl.sh b/src/master/tests/scripts/03_register_via_curl.sh new file mode 100755 index 0000000..8bf5547 --- /dev/null +++ b/src/master/tests/scripts/03_register_via_curl.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TMP_ROOT="$TEST_ROOT/tmp" +API_BASE="http://localhost:31300/api/v1/master" + +mkdir -p "$TMP_ROOT" + +for _ in {1..30}; do + if curl -sf "$API_BASE/healthz" >/dev/null; then + break + fi + sleep 1 +done + +payload=$(cat <<'JSON' +{ + "name": "dev-testuser-testinst-pod-0", + "type": "agent", + "meta_data": { + "hostname": "dev-testuser-testinst-pod-0", + "ip": "10.0.0.10", + "env": "dev", + "user": "testuser", + "instance": "testinst", + "cpu_number": 4, + "memory_in_bytes": 2147483648, + "gpu_number": 0 + }, + "version": "1.1.0" +} +JSON +) + +body_file="$TMP_ROOT/register_body.json" +status=$(curl -sS -o "$body_file" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$payload") +body="$(cat "$body_file")" + +if [[ "$status" != "201" ]]; then + echo "[ERROR] Unexpected status code: $status" >&2 + echo "$body" >&2 + exit 1 +fi + +node_id=$(python3 - "$body_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + body = json.load(handle) +print(body["id"]) +PY +) + +echo "$body" > "$TMP_ROOT/last_response.json" +echo "$node_id" > "$TMP_ROOT/node_id" + +list_file="$TMP_ROOT/nodes_list.json" +curl -sS "$API_BASE/nodes" -o "$list_file" +python3 - "$list_file" "$node_id" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + data = json.load(handle) +node_id = sys.argv[2] +assert any(item.get("id") == node_id for item in data), "node not in list" +PY + +echo "[INFO] Registered node with id $node_id" diff --git a/src/master/tests/scripts/04_reregister_and_error_cases.sh b/src/master/tests/scripts/04_reregister_and_error_cases.sh new file mode 100755 index 0000000..58795a7 --- /dev/null +++ b/src/master/tests/scripts/04_reregister_and_error_cases.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)"
+TMP_ROOT="$TEST_ROOT/tmp"
+API_BASE="http://localhost:31300/api/v1/master"
+NODE_ID="$(cat "$TMP_ROOT/node_id")"
+
+# 使用相同 ID 重注册,同时修改部分 meta/version 字段
+payload=$(cat <<JSON
+{
+  "id": "$NODE_ID",
+  "name": "dev-testuser-testinst-pod-0",
+  "type": "agent",
+  "meta_data": {
+    "hostname": "dev-testuser-testinst-pod-0",
+    "ip": "10.0.0.11",
+    "env": "dev",
+    "user": "testuser",
+    "instance": "testinst",
+    "cpu_number": 8,
+    "memory_in_bytes": 2147483648,
+    "gpu_number": 0
+  },
+  "version": "1.2.0"
+}
+JSON
+)
+
+status=$(curl -sS -o "$TMP_ROOT/reregister_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$payload")
+if [[ "$status" != "200" ]]; then
+  echo "[ERROR] 重注册应返回 200,实际=$status" >&2
+  cat "$TMP_ROOT/reregister_response.json" >&2
+  exit 1
+fi
+
+python3 - "$TMP_ROOT/reregister_response.json" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+assert node["meta_data"]["ip"] == "10.0.0.11", node["meta_data"]
+assert node["meta_data"]["cpu_number"] == 8, node["meta_data"]
+assert node["version"] == "1.2.0", node
+PY
+
+echo "[INFO] 重注册成功,元数据已更新"
+
+# 未知 ID => 404
+unknown_payload=$(cat <<'JSON'
+{
+  "id": "A999",
+  "name": "dev-testuser-testinst-pod-0",
+  "type": "agent",
+  "meta_data": {
+    "hostname": "dev-testuser-testinst-pod-0",
+    "ip": "10.0.0.12",
+    "env": "dev",
+    "user": "testuser",
+    "instance": "testinst",
+    "cpu_number": 4,
+    "memory_in_bytes": 2147483648,
+    "gpu_number": 0
+  },
+  "version": "1.2.0"
+}
+JSON
+)
+
+status=$(curl -sS -o "$TMP_ROOT/unknown_id_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$unknown_payload")
+if [[ "$status" != "404" ]]; then
+  echo "[ERROR] 未知 ID 应返回 404,实际=$status" >&2
+  cat "$TMP_ROOT/unknown_id_response.json" >&2
+  exit 1
+fi
+
+echo "[INFO] 未知 ID 返回 404 验证通过"
+
+# id 与 name 不匹配 => 500,节点保持原名
+mismatch_payload=$(cat <<JSON
+{
+  "id": "$NODE_ID",
+  "name": "dev-testuser-testinst-pod-mismatch",
+  "type": "agent",
+  "meta_data": {
+    "hostname": "dev-testuser-testinst-pod-0",
+    "ip": "10.0.0.11",
+    "env": "dev",
+    "user": "testuser",
+    "instance": "testinst",
+    "cpu_number": 8,
+    "memory_in_bytes": 2147483648,
+    "gpu_number": 0
+  },
+  "version": "1.2.0"
+}
+JSON
+)
+
+status=$(curl -sS -o "$TMP_ROOT/mismatch_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$mismatch_payload")
+if [[ "$status" != "500" ]]; then
+  echo "[ERROR] id 与 name 不匹配应返回 500,实际=$status" >&2
+  cat "$TMP_ROOT/mismatch_response.json" >&2
+  exit 1
+fi
+
+# 验证名称仍保持正确
+curl -sS "$API_BASE/nodes/$NODE_ID" -o "$TMP_ROOT/post_mismatch_detail.json"
+python3 - "$TMP_ROOT/post_mismatch_detail.json" <<'PY'
+import json, sys
+with open(sys.argv[1]) as handle:
+    node = json.load(handle)
+assert node["name"] == "dev-testuser-testinst-pod-0", node["name"]
+PY
+
+echo "[INFO] 名称不匹配返回 500,且原始节点未被篡改"
diff --git a/src/master/tests/scripts/05_status_report_via_curl.sh b/src/master/tests/scripts/05_status_report_via_curl.sh
new file mode 100755
index 0000000..567cf69
--- /dev/null
+++ b/src/master/tests/scripts/05_status_report_via_curl.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TMP_ROOT="$TEST_ROOT/tmp" +API_BASE="http://localhost:31300/api/v1/master" + +node_id="$(cat "$TMP_ROOT/node_id")" + +payload=$(python3 - <<'PY' +import json +from datetime import datetime, timezone +body = { + "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"), + "health": { + "log-fluentbit": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"}, + "metric-node-exporter": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"} + } +} +print(json.dumps(body)) +PY +) + +response=$(curl -sS -w '\n%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$node_id/status" -d "$payload") +body="$(echo "$response" | head -n -1)" +status="$(echo "$response" | tail -n1)" + +if [[ "$status" != "200" ]]; then + echo "[ERROR] Status update failed with code $status" >&2 + echo "$body" >&2 + exit 1 +fi + +echo "$body" > "$TMP_ROOT/last_response.json" + +sleep 3 + +detail_file="$TMP_ROOT/status_detail.json" +curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file" +python3 - "$detail_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +assert node["status"] == "online", f"Expected online, got {node['status']}" +assert "log-fluentbit" in node["health"], node["health"].keys() +PY + +echo "[INFO] Status report successful and node is online" + +# 等待超过 offline 阈值,验证会自动转为 offline +sleep 7 + +offline_detail_file="$TMP_ROOT/status_offline.json" +curl -sS "$API_BASE/nodes/$node_id" -o "$offline_detail_file" +python3 - "$offline_detail_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +assert node["status"] == "offline", f"Expected offline, got {node['status']}" +PY + +echo "[INFO] Node transitioned to offline as expected" + +# 再次上报健康,触发状态回到 online +payload=$(python3 - <<'PY' +import json +from datetime import datetime, timezone +body = { + "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"), + "health": { + "log-fluentbit": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"}, + "metric-node-exporter": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"} + } +} +print(json.dumps(body)) +PY +) + +curl -sS -o "$TMP_ROOT/second_status_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$node_id/status" -d "$payload" > "$TMP_ROOT/second_status_code" +if [[ $(cat "$TMP_ROOT/second_status_code") != "200" ]]; then + echo "[ERROR] Second status update failed" >&2 + cat "$TMP_ROOT/second_status_response.json" >&2 + exit 1 +fi + +sleep 3 + +final_detail_file="$TMP_ROOT/status_back_online.json" +curl -sS "$API_BASE/nodes/$node_id" -o "$final_detail_file" +python3 - "$final_detail_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +assert node["status"] == "online", f"Expected online after second report, got {node['status']}" +PY + +echo "[INFO] Node transitioned back to online after new status report" diff --git a/src/master/tests/scripts/06_config_update_and_nodes_json.sh b/src/master/tests/scripts/06_config_update_and_nodes_json.sh new file mode 100755 index 0000000..ed08750 --- /dev/null +++ b/src/master/tests/scripts/06_config_update_and_nodes_json.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TMP_ROOT="$TEST_ROOT/tmp" +PRIVATE_ROOT="$TEST_ROOT/private" +API_BASE="http://localhost:31300/api/v1/master" +NODE_ID="$(cat "$TMP_ROOT/node_id")" + +payload='{"config":{"log_level":"debug"},"label":["gpu","exp001"]}' + +response=$(curl -sS -w '\n%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$NODE_ID/config" -d "$payload") +body="$(echo "$response" | head -n -1)" +status="$(echo "$response" | tail -n1)" + +if [[ "$status" != "200" ]]; then + echo "[ERROR] Config update failed: $status" >&2 + echo "$body" >&2 + exit 1 +fi + +sleep 2 + +nodes_json_path="$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" +if [[ ! -f "$nodes_json_path" ]]; then + echo "[ERROR] nodes.json not generated at $nodes_json_path" >&2 + exit 1 +fi + +# 确保节点处于 online 状态,避免因等待导致 nodes.json 为空 +curl -sS "$API_BASE/nodes/$NODE_ID" -o "$TMP_ROOT/config_detail.json" +if ! python3 - "$TMP_ROOT/config_detail.json" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +if node["status"] != "online": + raise SystemExit(1) +PY +then + payload='{"timestamp":"2025-09-24T00:00:00Z","health":{"log-fluentbit":{"status":"healthy"}}}' + curl -sS -o "$TMP_ROOT/config_second_report.json" -w '%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$NODE_ID/status" -d "$payload" > "$TMP_ROOT/config_second_code" + sleep 2 +fi + +python3 - "$nodes_json_path" <<'PY' +import json, sys +from pathlib import Path +path = Path(sys.argv[1]) +content = json.loads(path.read_text()) +assert isinstance(content, list) and len(content) == 1 +entry = content[0] +assert entry["labels"] == ["gpu", "exp001"], entry +PY + +echo "[INFO] Config updated and nodes.json verified" diff --git a/src/master/tests/scripts/07_stats_single_node.sh b/src/master/tests/scripts/07_stats_single_node.sh new file mode 100755 index 0000000..e2dfa9b --- /dev/null +++ b/src/master/tests/scripts/07_stats_single_node.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +PRIVATE_ROOT="$TEST_ROOT/private" +TMP_ROOT="$TEST_ROOT/tmp" +API_BASE="http://localhost:31300/api/v1/master" +NODE_ID="$(cat "$TMP_ROOT/node_id")" + +sleep 7 + +detail_file="$TMP_ROOT/offline_detail.json" +curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file" +python3 - "$detail_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +assert node["status"] == "offline", f"Expected offline, got {node['status']}" +PY + +stats_file="$TMP_ROOT/stats.json" +curl -sS "$API_BASE/nodes/statistics" -o "$stats_file" +python3 - "$stats_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + stats = json.load(handle) +assert stats["total"] == 1 +found = {item["status"]: item["count"] for item in stats["status_statistics"]} +assert found.get("offline") == 1 +PY + +nodes_json_path="$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" +python3 - "$nodes_json_path" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + content = json.load(handle) +assert content == [], content +PY + +echo "[INFO] Offline transition and statistics validated" diff --git a/src/master/tests/scripts/08_multi_node_stats.sh b/src/master/tests/scripts/08_multi_node_stats.sh new file mode 100755 index 0000000..e835857 --- /dev/null +++ b/src/master/tests/scripts/08_multi_node_stats.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +PRIVATE_ROOT="$TEST_ROOT/private" +TMP_ROOT="$TEST_ROOT/tmp" +API_BASE="http://localhost:31300/api/v1/master" + +# 注册第二个节点 A2(保持在线) +second_payload=$(cat <<'JSON' +{ + "name": "dev-testuser-testinst-pod-1", + "type": "agent", + "meta_data": { + "hostname": "dev-testuser-testinst-pod-1", + "ip": "10.0.0.11", + "env": "dev", + "user": "testuser", + "instance": "testinst", + "cpu_number": 8, + "memory_in_bytes": 2147483648, + "gpu_number": 0 + }, + "version": "1.1.0" +} +JSON +) + +status=$(curl -sS -o "$TMP_ROOT/second_register.json" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$second_payload") +if [[ "$status" != "201" ]]; then + echo "[ERROR] Second node registration failed: $status" >&2 + cat "$TMP_ROOT/second_register.json" >&2 + exit 1 +fi +SECOND_NODE_ID=$(python3 - "$TMP_ROOT/second_register.json" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + data = json.load(handle) +print(data["id"]) +PY +) + +echo "$SECOND_NODE_ID" > "$TMP_ROOT/second_node_id" + +echo "[INFO] Second node registered with id $SECOND_NODE_ID" + +# A2 上报健康信息,保持 online +status_payload='{"timestamp":"2025-09-24T00:00:00Z","health":{"log-fluentbit":{"status":"healthy"}}}' +status=$(curl -sS -o "$TMP_ROOT/second_status.json" -w '%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$SECOND_NODE_ID/status" -d "$status_payload") +if [[ "$status" != "200" ]]; then + echo "[ERROR] Second node status update failed: $status" >&2 + cat "$TMP_ROOT/second_status.json" >&2 + exit 1 +fi + +# 等待调度器把第二节点标记为 online +second_online=false +for _ in {1..10}; do + sleep 1 + curl -sS "$API_BASE/nodes/$SECOND_NODE_ID" -o "$TMP_ROOT/second_detail.json" || continue + if python3 - "$TMP_ROOT/second_detail.json" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +if node["status"] != "online": + raise SystemExit(1) +PY + then + second_online=true + break + fi +done + +if [[ "$second_online" != true ]]; then + echo "[ERROR] Second node did not become online" >&2 + exit 1 +fi 
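+
+# 排查提示(示意,假设本机装有 jq):可人工快速查看统计输出,与下方 python 断言等价
+#   curl -s "$API_BASE/nodes/statistics" | jq '.status_statistics'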
+ +# 再次获取统计信息 +stats_file="$TMP_ROOT/multi_stats.json" +curl -sS "$API_BASE/nodes/statistics" -o "$stats_file" +python3 - "$stats_file" "$TMP_ROOT/node_id" "$TMP_ROOT/second_node_id" <<'PY' +import json, sys, pathlib +with open(sys.argv[1]) as handle: + stats = json.load(handle) +first_id = pathlib.Path(sys.argv[2]).read_text().strip() +second_id = pathlib.Path(sys.argv[3]).read_text().strip() +assert stats["total"] == 2, stats +found = {item["status"]: item["count"] for item in stats["status_statistics"]} +assert found.get("offline") == 1, found +assert found.get("online") == 1, found +PY + +# 验证 nodes.json 只包含在线节点(应只有第二个 A2) +nodes_json_path="$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" +python3 - "$nodes_json_path" "$SECOND_NODE_ID" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + content = json.load(handle) +expected_id = sys.argv[2] +assert len(content) == 1, content +assert content[0]["node_id"] == expected_id, content +PY + +echo "[INFO] Multi-node statistics and nodes.json validated" diff --git a/src/master/tests/scripts/09_restart_persistence.sh b/src/master/tests/scripts/09_restart_persistence.sh new file mode 100755 index 0000000..3bcfa79 --- /dev/null +++ b/src/master/tests/scripts/09_restart_persistence.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +PRIVATE_ROOT="$TEST_ROOT/private" +TMP_ROOT="$TEST_ROOT/tmp" +API_BASE="http://localhost:31300/api/v1/master" +ROOT_BASE="http://localhost:31300" +DB_PATH="$PRIVATE_ROOT/argus/master/db.sqlite3" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +if [[ ! -f "$TMP_ROOT/node_id" ]]; then + echo "[ERROR] 主节点 ID 缺失,请先执行前置用例" >&2 + exit 1 +fi + +if [[ ! -f "$TMP_ROOT/second_node_id" ]]; then + echo "[ERROR] 第二个节点 ID 缺失,请先执行多节点场景脚本" >&2 + exit 1 +fi + +if [[ ! 
-f "$DB_PATH" ]]; then + echo "[ERROR] 持久化数据库缺失: $DB_PATH" >&2 + exit 1 +fi + +NODE_ID="$(cat "$TMP_ROOT/node_id")" +SECOND_NODE_ID="$(cat "$TMP_ROOT/second_node_id")" + +# 在重启前抓取节点详情与节点文件、统计信息,作为对比基线 +first_before="$TMP_ROOT/${NODE_ID}_pre_restart.json" +second_before="$TMP_ROOT/${SECOND_NODE_ID}_pre_restart.json" +curl -sS "$API_BASE/nodes/$NODE_ID" -o "$first_before" +curl -sS "$API_BASE/nodes/$SECOND_NODE_ID" -o "$second_before" + +nodes_json_before="$TMP_ROOT/nodes_json_pre_restart.json" +cp "$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" "$nodes_json_before" + +stats_before="$TMP_ROOT/stats_pre_restart.json" +curl -sS "$API_BASE/nodes/statistics" -o "$stats_before" + +# 重启 master 容器,模拟服务重启后的持久化场景 +pushd "$TEST_ROOT" >/dev/null +compose restart master +popd >/dev/null + +# 等待 /readyz 恢复 200 +for _ in {1..30}; do + status=$(curl -s -o /dev/null -w '%{http_code}' "$ROOT_BASE/readyz" || true) + if [[ "$status" == "200" ]]; then + break + fi + sleep 1 +done + +if [[ "${status:-}" != "200" ]]; then + echo "[ERROR] master 容器重启后未恢复健康状态,readyz=$status" >&2 + exit 1 +fi + +sleep 2 + +first_after="$TMP_ROOT/${NODE_ID}_post_restart.json" +second_after="$TMP_ROOT/${SECOND_NODE_ID}_post_restart.json" +curl -sS "$API_BASE/nodes/$NODE_ID" -o "$first_after" +curl -sS "$API_BASE/nodes/$SECOND_NODE_ID" -o "$second_after" + +# 对比重启前后的节点关键信息,确保无丢失 +python3 - "$first_before" "$first_after" <<'PY' +import json, sys +before_path, after_path = sys.argv[1:3] +with open(before_path, 'r', encoding='utf-8') as handle: + before = json.load(handle) +with open(after_path, 'r', encoding='utf-8') as handle: + after = json.load(handle) +keys = [ + "id", + "name", + "type", + "version", + "register_time", + "meta_data", + "config", + "label", + "health", + "last_report", + "agent_last_report", +] +for key in keys: + if before.get(key) != after.get(key): + raise AssertionError(f"Key {key} changed after restart: {before.get(key)} -> {after.get(key)}") +PY + +python3 - "$second_before" "$second_after" <<'PY' +import json, sys +before_path, after_path = sys.argv[1:3] +with open(before_path, 'r', encoding='utf-8') as handle: + before = json.load(handle) +with open(after_path, 'r', encoding='utf-8') as handle: + after = json.load(handle) +keys = [ + "id", + "name", + "type", + "version", + "register_time", + "meta_data", + "config", + "label", + "health", + "last_report", + "agent_last_report", +] +for key in keys: + if before.get(key) != after.get(key): + raise AssertionError(f"Key {key} changed after restart: {before.get(key)} -> {after.get(key)}") +PY + +payload=$(python3 - <<'PY' +import json +from datetime import datetime, timezone +body = { + "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"), + "health": { + "log-fluentbit": {"status": "healthy"} + } +} +print(json.dumps(body)) +PY +) + +curl -sS -o "$TMP_ROOT/restart_second_status.json" -w '%{http_code}' \ + -H 'Content-Type: application/json' -X PUT \ + "$API_BASE/nodes/$SECOND_NODE_ID/status" -d "$payload" > "$TMP_ROOT/restart_second_status_code" + +if [[ $(cat "$TMP_ROOT/restart_second_status_code") != "200" ]]; then + echo "[ERROR] Failed to restore second node status post-restart" >&2 + cat "$TMP_ROOT/restart_second_status.json" >&2 + exit 1 +fi + +sleep 3 + +# 对比重启前后的 nodes.json 与统计信息,验证持久化一致性 +nodes_json_after="$TMP_ROOT/nodes_json_post_restart.json" +cp "$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" "$nodes_json_after" + +stats_after="$TMP_ROOT/stats_after_restart.json" +curl -sS 
"$API_BASE/nodes/statistics" -o "$stats_after" + +python3 - "$nodes_json_before" "$nodes_json_after" <<'PY' +import json, sys +with open(sys.argv[1], 'r', encoding='utf-8') as handle: + before = json.load(handle) +with open(sys.argv[2], 'r', encoding='utf-8') as handle: + after = json.load(handle) +if before != after: + raise AssertionError(f"nodes.json changed after restart: {before} -> {after}") +PY + +python3 - "$stats_before" "$stats_after" <<'PY' +import json, sys +with open(sys.argv[1], 'r', encoding='utf-8') as handle: + before = json.load(handle) +with open(sys.argv[2], 'r', encoding='utf-8') as handle: + after = json.load(handle) +if before != after: + raise AssertionError(f"Statistics changed after restart: {before} -> {after}") +PY + +if [[ ! -s "$DB_PATH" ]]; then + echo "[ERROR] 数据库文件为空,疑似未持久化" >&2 + exit 1 +fi + +echo "[INFO] Master 重启后持久化数据校验通过" diff --git a/src/master/tests/scripts/10_down.sh b/src/master/tests/scripts/10_down.sh new file mode 100755 index 0000000..7afce88 --- /dev/null +++ b/src/master/tests/scripts/10_down.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +PRIVATE_ROOT="$TEST_ROOT/private" +TMP_ROOT="$TEST_ROOT/tmp" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +pushd "$TEST_ROOT" >/dev/null +compose down --remove-orphans +popd >/dev/null + +rm -rf "$TMP_ROOT" +rm -rf "$PRIVATE_ROOT" + +echo "[INFO] Master E2E environment cleaned up" diff --git a/src/sys/README.md b/src/sys/README.md new file mode 100644 index 0000000..139597f --- /dev/null +++ b/src/sys/README.md @@ -0,0 +1,2 @@ + + diff --git a/src/sys/tests/README.md b/src/sys/tests/README.md new file mode 100644 index 0000000..77435a5 --- /dev/null +++ b/src/sys/tests/README.md @@ -0,0 +1,138 @@ +# ARGUS 系统级端到端测试(Sys E2E) + +本目录包含将 log 与 agent 两线验证合并后的系统级端到端测试。依赖 bind/master/es/kibana + 两个“日志节点”(每个节点容器内同时运行 Fluent Bit 与 argus-agent)。 + +--- + +## 一、如何运行 + +- 前置条件 + - 已构建镜像:`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-bind9:latest`、`argus-master:latest` + - 可用根目录命令构建:`./build/build_images.sh [--intranet]` + - 主机具备 Docker 与 Docker Compose。 + + - UID/GID 配置(用于容器内文件属主与挂载卷写入权限) + - 默认值:`UID=2133`、`GID=2015`。 + - 方式 A(推荐):在仓库根目录创建 `configs/build_user.local.conf`: + + UID=<你的宿主用户UID> + GID=<你的宿主用户GID> + + 例如: + + UID=1000 + GID=1000 + + - 方式 B:通过环境变量覆盖(优先级最高): + + export ARGUS_BUILD_UID=1000 + export ARGUS_BUILD_GID=1000 + + - 说明:`scripts/common/build_user.sh` 会按顺序读取 `configs/build_user.local.conf` → `configs/build_user.conf` → 环境变量,最终值会用于镜像构建参数与测试脚本,并在 `01_bootstrap.sh` 中对 `src/sys/tests/private/argus/*` 进行 `chown` 以匹配容器内运行用户。 + +- 一键执行 + - `cd src/sys/tests` + - `./scripts/00_e2e_test.sh` + +- 分步执行(推荐用于排查) + - `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env` + - `./scripts/02_up.sh` 启动 Compose 栈(工程名 `argus-sys`) + - `./scripts/03_wait_ready.sh` 等待 ES/Kibana/Master/Fluent‑Bit/Bind 就绪(Kibana 必须返回 200 且 overall.level=available) + - `./scripts/04_verify_dns_routing.sh` 校验 bind 解析与节点内域名解析 + - `./scripts/05_agent_register.sh` 获取两个节点的 `node_id` 与初始 IP,检查本地 `node.json` + - `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点 + - `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长 + - `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.29.0.200`,验证保持同一节点 ID 且 IP/时间戳更新 + - `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/` 
+
+- Resetting the environment
+  - If any stage fails, run `./scripts/09_down.sh` and then rerun from `01` onward.
+
+---
+
+## 2. Test Deployment Architecture (docker-compose)
+
+- Network
+  - Custom bridge: `argus-sys-net`, subnet `172.29.0.0/16`
+  - Fixed addresses: bind=`172.29.0.2`, master=`172.29.0.10`
+
+- Services and ports
+  - `bind` (`argus-bind9:latest`): listens on 53/tcp+udp; keeps the `*.argus.com` records in sync
+  - `master` (`argus-master:latest`): published as `32300→3000`; API at `http://localhost:32300`
+  - `es` (`argus-elasticsearch:latest`): `9200→9200`; single node, security disabled
+  - `kibana` (`argus-kibana:latest`): `5601→5601`; reaches ES via `ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200`
+  - `node-a` (`ubuntu:22.04`): runs Fluent Bit + argus-agent, `hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`, `2020→2020`
+  - `node-b` (`ubuntu:22.04`): runs Fluent Bit + argus-agent, `hostname=dev-yyrshare-uuuu10-ep2f-pod-0`, `2021→2020`
+
+- Volumes and directories
+  - The core services (bind/master/es/kibana) share the host directory `./private`, mounted at `/private` in the containers
+  - The two nodes use their own data volumes and never share them with the core services:
+    - node-a: `./private-nodea/argus/agent/ → /private/argus/agent/`
+    - node-b: `./private-nodeb/argus/agent/ → /private/argus/agent/`
+  - The node containers mount the Fluent Bit/agent assets read-only at `/assets` and `/usr/local/bin/argus-agent`
+
+- DNS configuration
+  - The node containers point at bind via the compose setting `dns: [172.29.0.2]`; they neither mount `/etc/resolv.conf` nor depend on `update-dns.sh`
+  - master/es/kibana still share `./private`; on startup, master writes `/private/argus/etc/master.argus.com` so bind can sync the A record
+
+- Node entrypoint
+  - `scripts/node_entrypoint.sh`:
+    - copies `/assets/fluent-bit/*` into the container's `/private` and starts Fluent Bit in the background (listening on 2020)
+    - starts `argus-agent` in the foreground as the mapped runtime user (UID/GID)
+  - Node environment: `MASTER_ENDPOINT=http://master.argus.com:3000`, `REPORT_INTERVAL_SECONDS=2`, `ES_HOST=es`, `ES_PORT=9200`, `CLUSTER=local`, `RACK=dev`
+
+---
+
+## 3. Scripts and What They Verify
+
+- `01_bootstrap.sh`
+  - Purpose: prepare the directory tree, fix ownership of the ES/Kibana data directories, distribute `update-dns.sh` (used by the core services only), build the agent binary, write `.env`
+  - Troubleshooting: if ES cannot write its data, rerun this step to make sure the directories are owned by the configured UID/GID
+
+- `02_up.sh`
+  - Purpose: start the full stack under the project name `argus-sys`; stale stacks/networks are cleaned up automatically
+
+- `03_wait_ready.sh`
+  - Purpose: wait until the key ports/health endpoints respond
+  - Criteria:
+    - ES `/_cluster/health?wait_for_status=yellow` succeeds
+    - Kibana `GET /api/status` returns 200 with `overall.level=available`
+    - Master `/readyz` succeeds
+    - the Fluent Bit metrics endpoints on `:2020`/`:2021` respond
+    - bind passes `named-checkconf`
+
+- `04_verify_dns_routing.sh`
+  - Purpose: verify the resolution chain from bind to the node containers
+  - Criteria:
+    - `private/argus/etc/master.argus.com` exists and holds the master IP
+    - inside node-a/node-b, `getent hosts master.argus.com` resolves to the master IP
+
+- `05_agent_register.sh`
+  - Purpose: confirm that both nodes register with master and persist `node.json`
+  - Outputs: `tmp/node_id_a|b`, `tmp/initial_ip_a|b`, `tmp/detail_*.json`
+
+- `06_write_health_and_assert.sh`
+  - Purpose: simulate node health reports and check that they become visible on master; `nodes.json` keeps online nodes only
+  - Action: write `log-fluentbit.json` and `metric-node-exporter.json` into both nodes' health directories
+
+- `07_logs_send_and_assert.sh`
+  - Purpose: inject the two log types into ES through Fluent Bit; the counts must grow over the baseline and reach the threshold (≥4)
+  - Also checks that ES health is `green|yellow`
+
+- `08_restart_agent_reregister.sh`
+  - Purpose: verify that after a node restart with an IP change the node keeps the same `id` while `meta_data.ip` and `last_updated` are refreshed
+  - Action: recreate node-b with the static IP `172.29.0.200`, then poll and verify
+
+- `09_down.sh`
+  - Purpose: destroy the stack and clean the environment; if needed, use a throwaway container to fix ownership before deleting the `private*` directories
+
+---
+
+### Common Problems and Troubleshooting
+- Kibana returns 503 for a long time: initialization can be slow on weaker machines; the script waits up to ~15 minutes; confirm that ES is ready first.
+- Fluent Bit metrics not ready: check the node container logs and that `CLUSTER`/`RACK` are set; confirm the entrypoint copied the assets into `/private`.
+- ES fails to start: usually a host directory permission problem; rerun `01_bootstrap.sh`, or manually `chown -R` the directories under `src/sys/tests/private/argus/log/`.
+
+---
+
+If you need stricter assertions (for example that Kibana loads specific plugins, or field-level checks on ES documents), add queries and checks to `07_*.sh`, as sketched below.
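+
+For instance, a field-level spot check could look like this (a sketch only; the indexed field names depend on the Fluent Bit parser configuration, which is not covered here):
+
+    # hypothetical check: at least one train-* document matches the term "loss"
+    hits=$(curl -s 'http://localhost:9200/train-*/_search?q=loss&size=0' \
+      | sed -E 's/.*"value":([0-9]+).*/\1/')
+    [[ "${hits:-0}" -ge 1 ]] || echo "no matching train documents yet"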
diff --git a/src/sys/tests/docker-compose.yml b/src/sys/tests/docker-compose.yml
new file mode 100644
index 0000000..03b9f76
--- /dev/null
+++ b/src/sys/tests/docker-compose.yml
@@ -0,0 +1,139 @@
+version: "3.8"
+
+networks:
+  default:
+    name: argus-sys-net
+    driver: bridge
+    ipam:
+      driver: default
+      config:
+        - subnet: 172.29.0.0/16
+
+services:
+  bind:
+    image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
+    container_name: argus-bind-sys
+    networks:
+      default:
+        ipv4_address: 172.29.0.2
+    volumes:
+      - ./private:/private
+    restart: unless-stopped
+
+  master:
+    image: ${MASTER_IMAGE_TAG:-argus-master:latest}
+    container_name: argus-master-sys
+    depends_on:
+      - bind
+    environment:
+      - OFFLINE_THRESHOLD_SECONDS=6
+      - ONLINE_THRESHOLD_SECONDS=2
+      - SCHEDULER_INTERVAL_SECONDS=1
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+    ports:
+      - "32300:3000"
+    volumes:
+      - ./private/argus/master:/private/argus/master
+      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
+      - ./private/argus/etc:/private/argus/etc
+    networks:
+      default:
+        ipv4_address: 172.29.0.10
+    restart: unless-stopped
+
+  es:
+    image: argus-elasticsearch:latest
+    container_name: argus-es-sys
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - ES_JAVA_OPTS=-Xms512m -Xmx512m
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+    volumes:
+      - ./private/argus/log/elasticsearch:/private/argus/log/elasticsearch
+      - ./private/argus/etc:/private/argus/etc
+    ports:
+      - "9200:9200"
+    restart: unless-stopped
+
+  kibana:
+    image: argus-kibana:latest
+    container_name: argus-kibana-sys
+    environment:
+      - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+    volumes:
+      - ./private/argus/log/kibana:/private/argus/log/kibana
+      - ./private/argus/etc:/private/argus/etc
+    depends_on:
+      - es
+    ports:
+      - "5601:5601"
+    restart: unless-stopped
+
+  node-a:
+    image: ubuntu:22.04
+    container_name: argus-node-a
+    hostname: dev-yyrshare-nbnyx10-cp2f-pod-0
+    depends_on:
+      - master
+      - bind
+      - es
+    environment:
+      - MASTER_ENDPOINT=http://master.argus.com:3000
+      - REPORT_INTERVAL_SECONDS=2
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+      - ES_HOST=es
+      - ES_PORT=9200
+      - CLUSTER=local
+      - RACK=dev
+    volumes:
+      - ./private-nodea/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0:/private/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0
+      - ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
+      - ./scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
+      - ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
+      - ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
+      - ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
+    entrypoint:
+      - /usr/local/bin/node-entrypoint.sh
+    dns:
+      - 172.29.0.2
+    ports:
+      - "2020:2020"
+    restart: unless-stopped
+
+  node-b:
+    image: ubuntu:22.04
+    container_name: argus-node-b
+    hostname: dev-yyrshare-uuuu10-ep2f-pod-0
+    depends_on:
+      - master
+      - bind
+      - es
+    environment:
+      - MASTER_ENDPOINT=http://master.argus.com:3000
+      - REPORT_INTERVAL_SECONDS=2
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+      - ES_HOST=es
+      - ES_PORT=9200
+      - CLUSTER=local
+      - RACK=dev
+    volumes:
+      - ./private-nodeb/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0:/private/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0
+      - ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
+      - ./scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
+      - ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
+      - ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
+      - ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
+    entrypoint:
+      - /usr/local/bin/node-entrypoint.sh
+    dns:
+      - 172.29.0.2
+    ports:
+      - "2021:2020"
+    restart: unless-stopped
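+
+# The bind/master image tags above can be overridden per run through the
+# environment; e.g. (tag values are illustrative):
+#   BIND_IMAGE_TAG=argus-bind9:dev MASTER_IMAGE_TAG=argus-master:dev \
+#     docker compose -p argus-sys up -d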
diff --git a/src/sys/tests/scripts/00_e2e_test.sh b/src/sys/tests/scripts/00_e2e_test.sh
new file mode 100755
index 0000000..2079c4f
--- /dev/null
+++ b/src/sys/tests/scripts/00_e2e_test.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+SCRIPTS=(
+  "01_bootstrap.sh"
+  "02_up.sh"
+  "03_wait_ready.sh"
+  "04_verify_dns_routing.sh"
+  "05_agent_register.sh"
+  "06_write_health_and_assert.sh"
+  "07_logs_send_and_assert.sh"
+  "08_restart_agent_reregister.sh"
+  "09_down.sh"
+)
+
+for script in "${SCRIPTS[@]}"; do
+  echo "[SYS-E2E] Running $script"
+  "$SCRIPT_DIR/$script"
+  echo "[SYS-E2E] $script completed"
+  echo
+done
+
+echo "[SYS-E2E] All tests completed"
diff --git a/src/sys/tests/scripts/01_bootstrap.sh b/src/sys/tests/scripts/01_bootstrap.sh
new file mode 100755
index 0000000..e550a43
--- /dev/null
+++ b/src/sys/tests/scripts/01_bootstrap.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
+
+PRIVATE_CORE="$TEST_ROOT/private"
+PRIVATE_NODEA="$TEST_ROOT/private-nodea"
+PRIVATE_NODEB="$TEST_ROOT/private-nodeb"
+TMP_DIR="$TEST_ROOT/tmp"
+
+source "$REPO_ROOT/scripts/common/build_user.sh"
+load_build_user
+
+ensure_image() {
+  local image="$1"
+  if ! docker image inspect "$image" >/dev/null 2>&1; then
+    echo "[ERROR] Missing image: $image. Please run ./build/build_images.sh" >&2
+    exit 1
+  fi
+}
+
+echo "[INFO] Preparing directories..."
+mkdir -p \
+  "$PRIVATE_CORE/argus/etc" \
+  "$PRIVATE_CORE/argus/bind" \
+  "$PRIVATE_CORE/argus/master" \
+  "$PRIVATE_CORE/argus/metric/prometheus" \
+  "$PRIVATE_CORE/argus/log/elasticsearch" \
+  "$PRIVATE_CORE/argus/log/kibana" \
+  "$PRIVATE_NODEA/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0/health" \
+  "$PRIVATE_NODEB/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0/health" \
+  "$TMP_DIR"
+
+# Align ownership for supervisor-managed services (ES/Kibana expect this UID/GID inside the container)
+echo "[INFO] Fixing ownership for core private directories..."
+chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \
+  "$PRIVATE_CORE/argus/log/elasticsearch" \
+  "$PRIVATE_CORE/argus/log/kibana" \
+  "$PRIVATE_CORE/argus/etc" 2>/dev/null || true
+
+echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)"
+BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
+BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh"
+if [[ -f "$BIND_UPDATE_SRC" ]]; then
+  cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST"
+  chmod +x "$BIND_UPDATE_DEST"
+else
+  echo "[WARN] bind update-dns.sh not found at $BIND_UPDATE_SRC"
+fi
+
+echo "[INFO] Ensuring images present..."
+ensure_image "argus-elasticsearch:latest"
+ensure_image "argus-kibana:latest"
+ensure_image "argus-bind9:latest"
+ensure_image "argus-master:latest"
+
+echo "[INFO] Building agent binary..."
+pushd "$REPO_ROOT/src/agent" >/dev/null
+./scripts/build_binary.sh
+popd >/dev/null
+
+AGENT_BIN="$REPO_ROOT/src/agent/dist/argus-agent"
+if [[ ! -x "$AGENT_BIN" ]]; then
+  echo "[ERROR] Agent binary not found at $AGENT_BIN" >&2
+  exit 1
+fi
+echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path"
+
+echo "[INFO] Writing .env with UID/GID"
+cat > "$TEST_ROOT/.env" <<EOF
+ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
+ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
+EOF
+
+echo "[OK] Bootstrap complete"
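+# Note: docker compose automatically reads the .env file written above when it
+# is invoked from this directory, so ARGUS_BUILD_UID/GID flow into the
+# ${ARGUS_BUILD_UID:-2133}/${ARGUS_BUILD_GID:-2015} defaults in docker-compose.yml.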
+pushd "$TEST_ROOT" >/dev/null +compose -p argus-sys down --remove-orphans || true +compose -p argus-sys up -d +popd >/dev/null + +echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021" + diff --git a/src/sys/tests/scripts/03_wait_ready.sh b/src/sys/tests/scripts/03_wait_ready.sh new file mode 100755 index 0000000..4887181 --- /dev/null +++ b/src/sys/tests/scripts/03_wait_ready.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +service_id() { + compose -p argus-sys ps -q "$1" +} + +wait_http() { + local url="$1"; local attempts="${2:-120}"; local i=1 + while (( i <= attempts )); do + if curl -fsS "$url" >/dev/null 2>&1; then return 0; fi + echo "[..] waiting $url ($i/$attempts)"; sleep 5; ((i++)) + done + echo "[ERR] Timeout waiting for $url" >&2; return 1 +} + +echo "[INFO] Waiting for ES/Kibana/Master/Fluent Bit/Bind..." + +# ES (>= yellow) +attempt=1; max=120 +while (( attempt <= max )); do + if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then + break + fi + echo "[..] waiting ES ($attempt/$max)"; sleep 5; ((attempt++)) +done +[[ $attempt -le $max ]] || { echo "[ERR] ES not ready" >&2; exit 1; } + +# Kibana: must be HTTP 200 and overall.level=available +echo "[INFO] Waiting for Kibana to be available (HTTP 200)..." +kb_attempt=1; kb_max=180 +while (( kb_attempt <= kb_max )); do + body=$(curl -sS "http://localhost:5601/api/status" 2>/dev/null || true) + code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5601/api/status" || echo 000) + if [[ "$code" == "200" ]]; then + if echo "$body" | grep -q '"level":"available"'; then + echo "[OK] Kibana available (HTTP 200)" + break + fi + fi + echo "[..] waiting kibana 200 ($kb_attempt/$kb_max), last_code=$code" + sleep 5 + ((kb_attempt++)) +done +if (( kb_attempt > kb_max )); then + echo "[ERR] Kibana did not reach HTTP 200 available in time" >&2; exit 1 +fi + +# Master +wait_http "http://localhost:32300/readyz" 120 + +# Fluent Bit (host metrics on host ports) +wait_http "http://localhost:2020/api/v2/metrics" 120 +wait_http "http://localhost:2021/api/v2/metrics" 120 + +# Bind config check +BIND_ID="$(service_id bind)" +if [[ -n "$BIND_ID" ]]; then + docker exec "$BIND_ID" named-checkconf >/dev/null +else + echo "[WARN] bind container id not found" +fi + +echo "[OK] All services are ready" diff --git a/src/sys/tests/scripts/04_verify_dns_routing.sh b/src/sys/tests/scripts/04_verify_dns_routing.sh new file mode 100755 index 0000000..635c4fe --- /dev/null +++ b/src/sys/tests/scripts/04_verify_dns_routing.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +service_id() { + compose -p argus-sys ps -q "$1" +} + +echo "[INFO] Verifying DNS routing via bind..." + +# Check master IP file exists in shared private +MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com" +if [[ ! 
-f "$MASTER_FILE" ]]; then + echo "[ERR] master.argus.com file missing at $MASTER_FILE" >&2 + exit 1 +fi +MASTER_IP_HOST="$(cat "$MASTER_FILE" | tr -d '\r\n' || true)" +echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}" + +# dig inside bind container +BIN_ID="$(service_id bind)" +if [[ -n "$BIN_ID" ]]; then + DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)" + echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP" + if [[ -z "$DIG_IP" ]]; then + echo "[ERR] bind did not resolve master.argus.com" >&2; exit 1 + fi +else + echo "[WARN] bind container not found; skip dig" +fi + +for node in node-a node-b; do + CID="$(service_id "$node")" + echo "[INFO] Checking resolution inside $node..." + if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then + echo "[ERR] $node cannot resolve master.argus.com" >&2 + exit 1 + fi + RES="$(docker exec "$CID" getent hosts master.argus.com | awk '{print $1}' | head -n1)" + echo "[OK] $node resolved master.argus.com -> $RES" +done + +echo "[OK] DNS routing verified" + diff --git a/src/sys/tests/scripts/05_agent_register.sh b/src/sys/tests/scripts/05_agent_register.sh new file mode 100755 index 0000000..073d949 --- /dev/null +++ b/src/sys/tests/scripts/05_agent_register.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="$TEST_ROOT/tmp" + +API_BASE="http://localhost:32300/api/v1/master" + +HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0" +HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0" + +mkdir -p "$TMP_DIR" + +echo "[INFO] Waiting for agent nodes to register..." + +extract_node() { + local name="$1"; local output="$2"; local json_file="$3" + python3 - "$name" "$output" "$json_file" <<'PY' +import json, sys, pathlib +name = sys.argv[1] +out = pathlib.Path(sys.argv[2]) +json_file = sys.argv[3] +with open(json_file, 'r') as fh: + data = json.load(fh) +node = next((n for n in data if n.get("name") == name), None) +if node: + out.write_text(node["id"]) # save id + print(node["id"]) # also print for shell capture +PY +} + +ID_A=""; ID_B="" +for _ in {1..60}; do + sleep 2 + resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true) + if [[ -z "$resp" ]]; then + continue + fi + # only try to parse when it's a JSON array + if ! echo "$resp" | head -c1 | grep -q '\['; then + continue + fi + echo "$resp" > "$TMP_DIR/nodes_list.json" + ID_A=$(extract_node "$HOST_A" "$TMP_DIR/node_id_a" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + ID_B=$(extract_node "$HOST_B" "$TMP_DIR/node_id_b" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + if [[ -s "$TMP_DIR/node_id_a" && -s "$TMP_DIR/node_id_b" ]]; then + break + fi +done + +if [[ ! -s "$TMP_DIR/node_id_a" || ! 
-s "$TMP_DIR/node_id_b" ]]; then + echo "[ERR] Agents did not register in time" >&2 + exit 1 +fi + +node_detail() { + local id="$1"; local out="$2" + curl -fsS "$API_BASE/nodes/$id" -o "$out" +} + +node_detail "$(cat "$TMP_DIR/node_id_a")" "$TMP_DIR/detail_a.json" +node_detail "$(cat "$TMP_DIR/node_id_b")" "$TMP_DIR/detail_b.json" + +python3 - "$TMP_DIR/detail_a.json" "$TMP_DIR/initial_ip_a" <<'PY' +import json, sys, pathlib +node=json.load(open(sys.argv[1])) +ip=node.get("meta_data",{}).get("ip") +assert ip, "missing ip" +pathlib.Path(sys.argv[2]).write_text(ip) +PY + +python3 - "$TMP_DIR/detail_b.json" "$TMP_DIR/initial_ip_b" <<'PY' +import json, sys, pathlib +node=json.load(open(sys.argv[1])) +ip=node.get("meta_data",{}).get("ip") +assert ip, "missing ip" +pathlib.Path(sys.argv[2]).write_text(ip) +PY + +NODE_JSON_A="$TEST_ROOT/private-nodea/argus/agent/$HOST_A/node.json" +NODE_JSON_B="$TEST_ROOT/private-nodeb/argus/agent/$HOST_B/node.json" + +[[ -f "$NODE_JSON_A" ]] || { echo "[ERR] node.json missing for $HOST_A" >&2; exit 1; } +[[ -f "$NODE_JSON_B" ]] || { echo "[ERR] node.json missing for $HOST_B" >&2; exit 1; } + +echo "[OK] Agents registered: $(cat "$TMP_DIR/node_id_a") , $(cat "$TMP_DIR/node_id_b")" diff --git a/src/sys/tests/scripts/06_write_health_and_assert.sh b/src/sys/tests/scripts/06_write_health_and_assert.sh new file mode 100755 index 0000000..6f888e6 --- /dev/null +++ b/src/sys/tests/scripts/06_write_health_and_assert.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="$TEST_ROOT/tmp" + +API_BASE="http://localhost:32300/api/v1/master" + +HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0" +HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0" + +HEALTH_A="$TEST_ROOT/private-nodea/argus/agent/$HOST_A/health" +HEALTH_B="$TEST_ROOT/private-nodeb/argus/agent/$HOST_B/health" + +write_health() { + local dir="$1"; mkdir -p "$dir" + cat > "$dir/log-fluentbit.json" < "$dir/metric-node-exporter.json" </dev/null || true) + [[ -z "$resp" ]] && continue + echo "$resp" > "$TMP_DIR/node_${id}_detail.json" + if python3 - "$TMP_DIR/node_${id}_detail.json" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +h=node.get("health",{}) +sys.exit(0 if ("log-fluentbit" in h and "metric-node-exporter" in h) else 1) +PY + then return 0; fi + done + return 1 +} + +check_health "$ID_A" || { echo "[ERR] health keys not reported for node A" >&2; exit 1; } +check_health "$ID_B" || { echo "[ERR] health keys not reported for node B" >&2; exit 1; } + +NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json" +if [[ ! -f "$NODES_JSON" ]]; then + echo "[ERR] nodes.json missing at $NODES_JSON" >&2; exit 1 +fi + +python3 - "$NODES_JSON" <<'PY' +import json,sys +with open(sys.argv[1]) as h: + nodes=json.load(h) +assert isinstance(nodes,list) +assert len(nodes) == 2, f"expected 2 nodes online, got {len(nodes)}" +PY + +echo "[OK] Health reported and nodes.json has 2 online nodes" diff --git a/src/sys/tests/scripts/07_logs_send_and_assert.sh b/src/sys/tests/scripts/07_logs_send_and_assert.sh new file mode 100755 index 0000000..0363ebf --- /dev/null +++ b/src/sys/tests/scripts/07_logs_send_and_assert.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..." 
diff --git a/src/sys/tests/scripts/07_logs_send_and_assert.sh b/src/sys/tests/scripts/07_logs_send_and_assert.sh
new file mode 100755
index 0000000..0363ebf
--- /dev/null
+++ b/src/sys/tests/scripts/07_logs_send_and_assert.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..."
+
+get_count() {
+  local idx="$1"
+  curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" \
+    | sed -E 's/.*"count":([0-9]+).*/\1/' \
+    | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
+}
+
+train0=$(get_count "train-*")
+infer0=$(get_count "infer-*")
+base=$((train0 + infer0))
+echo "[INFO] initial counts: train=${train0} infer=${infer0} total=${base}"
+
+send_logs() {
+  local cname="$1"; local hosttag="$2"
+  docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
+  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
+  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
+  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
+}
+
+# Determine container names
+node_a=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-a$|argus-sys-node-a-1' | head -n1)
+node_b=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-b$|argus-sys-node-b-1' | head -n1)
+
+send_logs "$node_a" "host01"
+send_logs "$node_b" "host02"
+
+echo "[INFO] Waiting for ES to ingest..."
+sleep 10
+
+train1=$(get_count "train-*")
+infer1=$(get_count "infer-*")
+final=$((train1 + infer1))
+echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}"
+
+if (( final <= base )); then
+  echo "[ERR] ES total did not increase (${base} -> ${final})" >&2
+  exit 1
+fi
+
+if (( final < 4 )); then
+  echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
+  exit 1
+fi
+
+# Health endpoints
+es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
+if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then
+  echo "[ERR] ES health not green/yellow: $es_health" >&2
+  exit 1
+fi
+
+if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
+  echo "[WARN] Kibana status endpoint not available"
+fi
+
+echo "[OK] ES counts increased and services healthy"
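+
+# Equivalent count check with jq, if it happens to be installed on the host:
+#   curl -s 'http://localhost:9200/train-*/_count?ignore_unavailable=true' | jq .count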
+pushd "$TEST_ROOT" >/dev/null +compose -p argus-sys rm -sf node-b || true +popd >/dev/null + +docker rm -f argus-node-b >/dev/null 2>&1 || true + +AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")" + +docker run -d \ + --name argus-node-b \ + --hostname dev-yyrshare-uuuu10-ep2f-pod-0 \ + --network argus-sys-net \ + --ip 172.29.0.200 \ + --dns 172.29.0.2 \ + -e MASTER_ENDPOINT=http://master.argus.com:3000 \ + -e REPORT_INTERVAL_SECONDS=2 \ + -e ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} \ + -e ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} \ + -e ES_HOST=es \ + -e ES_PORT=9200 \ + -p 2021:2020 \ + -v "$TEST_ROOT/private-nodeb/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0:/private/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0" \ + -v "$AGENT_BIN_PATH:/usr/local/bin/argus-agent:ro" \ + -v "$SCRIPT_DIR/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro" \ + --entrypoint /usr/local/bin/node-entrypoint.sh \ + ubuntu:22.04 >/dev/null + +echo "[INFO] Waiting for node-b to re-register with new IP..." +for _ in {1..40}; do + sleep 3 + if curl -fsS "$API_BASE/nodes/$ID_B" -o "$TMP_DIR/node_b_after.json"; then + if python3 - "$TMP_DIR/node_b_after.json" "$LAST0" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +last0=sys.argv[2] +ip=node.get("meta_data",{}).get("ip") +lu=node.get("last_updated") +assert ip=="172.29.0.200" +assert lu and lu!=last0 +PY + then + echo "[OK] node-b re-registered with new IP 172.29.0.200" + exit 0 + fi + fi +done + +echo "[ERR] node-b did not update to IP 172.29.0.200 in time" >&2 +exit 1 diff --git a/src/sys/tests/scripts/09_down.sh b/src/sys/tests/scripts/09_down.sh new file mode 100755 index 0000000..d200540 --- /dev/null +++ b/src/sys/tests/scripts/09_down.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +compose() { + if docker compose version >/dev/null 2>&1; then + docker compose "$@" + else + docker-compose "$@" + fi +} + +docker rm -f argus-node-b >/dev/null 2>&1 || true + +pushd "$TEST_ROOT" >/dev/null +compose -p argus-sys down --remove-orphans || true +popd >/dev/null + +echo "[INFO] Cleaning private directories..." 
+if [[ -d "$TEST_ROOT/private" ]]; then + docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true + rm -rf "$TEST_ROOT/private" +fi +if [[ -d "$TEST_ROOT/private-nodea" ]]; then + docker run --rm -v "$TEST_ROOT/private-nodea:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true + rm -rf "$TEST_ROOT/private-nodea" +fi +if [[ -d "$TEST_ROOT/private-nodeb" ]]; then + docker run --rm -v "$TEST_ROOT/private-nodeb:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true + rm -rf "$TEST_ROOT/private-nodeb" +fi + +rm -rf "$TEST_ROOT/tmp" "$TEST_ROOT/.env" || true + +echo "[OK] Cleaned up system E2E" diff --git a/src/sys/tests/scripts/node_entrypoint.sh b/src/sys/tests/scripts/node_entrypoint.sh new file mode 100755 index 0000000..e1ed888 --- /dev/null +++ b/src/sys/tests/scripts/node_entrypoint.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -euo pipefail + +LOG_PREFIX="[NODE]" +RUNTIME_USER="argusagent" +RUNTIME_GROUP="argusagent" +AGENT_UID="${ARGUS_BUILD_UID:-2133}" +AGENT_GID="${ARGUS_BUILD_GID:-2015}" +HOSTNAME_VAL="${HOSTNAME:-unknown}" + +log() { echo "${LOG_PREFIX} $*"; } + +# Prepare runtime user +if ! getent group "$AGENT_GID" >/dev/null 2>&1; then + groupadd -g "$AGENT_GID" "$RUNTIME_GROUP" || true +else + RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)" +fi +if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then + useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER" || true +else + RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)" +fi +log "runtime user: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)" + +# Ensure agent data dirs exist (host volumes mounted) +AGENT_DIR="/private/argus/agent/${HOSTNAME_VAL}" +HEALTH_DIR="${AGENT_DIR}/health" +mkdir -p "$HEALTH_DIR" +chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DIR" 2>/dev/null || true + +# Stage Fluent Bit assets into /private to reuse existing startup script +mkdir -p /private +if [[ -f /assets/start-fluent-bit.sh ]]; then + cp /assets/start-fluent-bit.sh /private/start-fluent-bit.sh + chmod +x /private/start-fluent-bit.sh +fi +if [[ -d /assets/fluent-bit/etc ]]; then + rm -rf /private/etc && mkdir -p /private + cp -r /assets/fluent-bit/etc /private/ +fi +if [[ -d /assets/fluent-bit/packages ]]; then + cp -r /assets/fluent-bit/packages /private/ +fi + +# Start Fluent Bit in background (will block, so run via bash -lc &) +if [[ -x /private/start-fluent-bit.sh ]]; then + log "starting fluent-bit" + bash -lc '/private/start-fluent-bit.sh' & +else + log "missing /private/start-fluent-bit.sh; fluent-bit will not start" +fi + +# Start agent in foreground as runtime user +log "starting argus-agent" +exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER" +