Compare commits

...

5 Commits

Author SHA1 Message Date
sundapeng.sdp
d279d8a6a8 refactor: 优化argus-metric模块e2e测试;
refs #20
2025-10-13 18:12:29 +08:00
c098f1d3ce dev_1.0.0_sundp 完成Metric模块及模块e2e测试 (#18)
Co-authored-by: sundapeng.sdp <sundapeng@hashdata.cn>
Reviewed-on: #18
Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
Reviewed-by: yuyr <yuyr@zgclab.edu.cn>
Reviewed-by: huhy <husteryezi@163.com>
2025-10-11 17:15:06 +08:00
1e5e91b193 dev_1.0.0_yuyr_2:重新提交 PR,增加 master/agent 以及系统集成测试 (#17)
Reviewed-on: #17
Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn>
Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
2025-10-11 15:04:46 +08:00
8a38d3d0b2 dev_1.0.0_yuyr 完成 log和bind模块开发部署测试 (#8)
- [x] 完成log模块镜像构建、本地端到端写日志——收集——查询流程;
- [x] 完成bind模块构建;
- [x] 内置域名IP自动更新脚本,使用 /private/argus/etc目录下文件进行同步,容器启动时自动写IP,定时任务刷新更新DNS服务器IP和DNS规则;

Co-authored-by: root <root@curious.host.com>
Reviewed-on: #8
Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn>
2025-09-22 16:39:38 +08:00
26e1c964ed init project 2025-09-15 11:00:03 +08:00
205 changed files with 19175 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.idea/

View File

@ -5,3 +5,10 @@
项目文档【腾讯文档】GPU集群运维系统
https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk
## 构建账号配置
镜像构建和运行账号的 UID/GID 可通过 `configs/build_user.conf` 配置,详细说明见 `doc/build-user-config.md`
## 本地端口占用提示
如需运行 BIND 模块端到端测试且宿主机 53 端口已占用,可通过环境变量 `HOST_DNS_PORT`(默认 1053)指定对外映射端口,例如 `HOST_DNS_PORT=12053 ./scripts/00_e2e_test.sh`

205
build/build_images.sh Executable file
View File

@ -0,0 +1,205 @@
#!/usr/bin/env bash
set -euo pipefail
# Print CLI usage. The heredoc delimiter is intentionally unquoted so that
# "$0" expands to the actual script path (the quoted <<'EOF' form printed a
# literal "$0"; save_images.sh already expands it — this keeps the two
# build-tool help texts consistent).
show_help() {
  cat <<EOF
ARGUS Unified Build System - Image Build Tool
Usage: $0 [OPTIONS]
Options:
  --intranet          Use intranet mirror for log/bind builds
  --master-offline    Build master offline image (requires src/master/offline_wheels.tar.gz)
  -h, --help          Show this help message
Examples:
  $0                              # Build with default sources
  $0 --intranet                   # Build with intranet mirror
  $0 --master-offline             # Additionally build argus-master:offline
  $0 --intranet --master-offline
EOF
}
# Flag defaults: the master image is always built; intranet mirror and the
# offline master variant are opt-in.
use_intranet=false
build_master=true
build_master_offline=false
# Parse command-line options.
while [[ $# -gt 0 ]]; do
  case $1 in
    --intranet)
      use_intranet=true
      shift
      ;;
    --master)
      # Kept for backward compatibility; master builds are already enabled.
      build_master=true
      shift
      ;;
    --master-offline)
      # Offline build implies building the master image as well.
      build_master=true
      build_master_offline=true
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      show_help
      exit 1
      ;;
  esac
done
# Resolve the repository root relative to this script and load the shared
# UID/GID helper (provides load_build_user).
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
. "$root/scripts/common/build_user.sh"
declare -a build_args=()
if [[ "$use_intranet" == true ]]; then
  build_args+=("--build-arg" "USE_INTRANET=true")
fi
cd "$root"
load_build_user
# Propagate the configured build account into every docker build.
build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
master_root="$root/src/master"
master_offline_tar="$master_root/offline_wheels.tar.gz"
master_offline_dir="$master_root/offline_wheels"
# Offline master builds need the wheel archive unpacked next to src/master.
if [[ "$build_master_offline" == true ]]; then
  if [[ ! -f "$master_offline_tar" ]]; then
    echo "❌ offline wheels tar not found: $master_offline_tar" >&2
    echo " 请提前准备好 offline_wheels.tar.gz 后再执行 --master-offline" >&2
    exit 1
  fi
  echo "📦 Preparing offline wheels for master (extracting $master_offline_tar)"
  rm -rf "$master_offline_dir"
  mkdir -p "$master_offline_dir"
  tar -xzf "$master_offline_tar" -C "$master_root"
  # Sanity check: at least one wheel must exist after extraction.
  has_wheel=$(find "$master_offline_dir" -maxdepth 1 -type f -name '*.whl' -print -quit)
  if [[ -z "$has_wheel" ]]; then
    echo "❌ offline_wheels extraction failed或无 wheel: $master_offline_dir" >&2
    exit 1
  fi
fi
# Banner showing the effective build configuration.
echo "======================================="
echo "ARGUS Unified Build System"
echo "======================================="
if [[ "$use_intranet" == true ]]; then
  echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)"
else
  echo "🌐 Mode: Public (Using default package sources)"
fi
echo "👤 Build user UID:GID -> ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"
echo "📁 Build context: $root"
echo ""
# Build one docker image and report the outcome on stdout.
# $1 = display name, $2 = Dockerfile path, $3 = image tag; any remaining
# arguments are forwarded to `docker build` (in addition to the global
# build_args array). Returns 0 on success, 1 on failure.
build_image() {
  local display_name="$1" dockerfile="$2" image_tag="$3"
  shift 3
  local -a passthrough=("$@")
  echo "🔄 Building $display_name image..."
  echo " Dockerfile: $dockerfile"
  echo " Tag: $image_tag"
  if ! docker build "${build_args[@]}" "${passthrough[@]}" -f "$dockerfile" -t "$image_tag" .; then
    echo "❌ Failed to build $display_name image"
    return 1
  fi
  echo "$display_name image built successfully"
  return 0
}
# Track build results for the final summary.
images_built=()
build_failed=false
# Infrastructure images (ES / Kibana / BIND9) are always attempted even if an
# earlier one fails, so a single run reports every broken image.
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
  images_built+=("argus-elasticsearch:latest")
else
  build_failed=true
fi
echo ""
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
  images_built+=("argus-kibana:latest")
else
  build_failed=true
fi
echo ""
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
  images_built+=("argus-bind9:latest")
else
  build_failed=true
fi
echo ""
# The master image has its own build script; delegate and forward flags.
if [[ "$build_master" == true ]]; then
  echo ""
  echo "🔄 Building Master image..."
  pushd "$master_root" >/dev/null
  master_args=("--tag" "argus-master:latest")
  if [[ "$use_intranet" == true ]]; then
    master_args+=("--intranet")
  fi
  if [[ "$build_master_offline" == true ]]; then
    master_args+=("--offline")
  fi
  if ./scripts/build_images.sh "${master_args[@]}"; then
    if [[ "$build_master_offline" == true ]]; then
      images_built+=("argus-master:offline")
    else
      images_built+=("argus-master:latest")
    fi
  else
    build_failed=true
  fi
  popd >/dev/null
fi
# Summary: list successes, then exit non-zero if anything failed.
echo "======================================="
echo "📦 Build Summary"
echo "======================================="
if [[ ${#images_built[@]} -gt 0 ]]; then
  echo "✅ Successfully built images:"
  for image in "${images_built[@]}"; do
    echo "$image"
  done
fi
if [[ "$build_failed" == true ]]; then
  echo ""
  echo "❌ Some images failed to build. Please check the errors above."
  exit 1
fi
if [[ "$use_intranet" == true ]]; then
  echo ""
  echo "🌐 Built with intranet mirror configuration"
fi
if [[ "$build_master_offline" == true ]]; then
  echo ""
  echo "🧳 Master offline wheels 已解压到 $master_offline_dir"
fi
echo ""
echo "🚀 Next steps:"
echo " ./build/save_images.sh --compress # 导出镜像"
echo " cd src/master/tests && MASTER_IMAGE_TAG=argus-master:offline ./scripts/00_e2e_test.sh"
echo ""

223
build/save_images.sh Executable file
View File

@ -0,0 +1,223 @@
#!/usr/bin/env bash
set -euo pipefail
# 帮助信息
# Print usage text for the export tool ($0 expands to the invoked script).
show_help() {
  printf '%s\n' \
    "ARGUS Unified Build System - Image Export Tool" \
    "Usage: $0 [OPTIONS]" \
    "Options:" \
    "  --compress Compress exported images with gzip" \
    "  -h, --help Show this help message" \
    "Examples:" \
    "  $0 # Export all images without compression" \
    "  $0 --compress # Export all images with gzip compression"
}
# Parse command-line arguments.
use_compression=false
while [[ $# -gt 0 ]]; do
  case $1 in
    --compress)
      use_compression=true
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      show_help
      exit 1
      ;;
  esac
done
# Resolve the project root directory and work from there.
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$root"
# Ensure the image output directory exists.
images_dir="$root/images"
mkdir -p "$images_dir"
echo "======================================="
echo "ARGUS Unified Build System - Image Export"
echo "======================================="
echo ""
if [[ "$use_compression" == true ]]; then
  echo "🗜️ Mode: With gzip compression"
else
  echo "📦 Mode: No compression"
fi
echo "📁 Output directory: $images_dir"
echo ""
# Map: image name -> exported tar filename.
declare -A images=(
  ["argus-elasticsearch:latest"]="argus-elasticsearch-latest.tar"
  ["argus-kibana:latest"]="argus-kibana-latest.tar"
  ["argus-bind9:latest"]="argus-bind9-latest.tar"
  ["argus-master:offline"]="argus-master-offline.tar"
)
# Check whether the given image:tag exists in the local docker store.
# Prints a found/not-found line and returns 0 / 1 accordingly.
check_image() {
  local wanted="$1"
  if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^$wanted$"; then
    echo "❌ Image not found: $wanted"
    return 1
  fi
  echo "✅ Image found: $wanted"
}
# Print size / age / id details for an image known to exist locally.
show_image_info() {
  local image_name="$1"
  echo "📋 Image info for $image_name:"
  docker images "$image_name" --format " Size: {{.Size}}, Created: {{.CreatedSince}}, ID: {{.ID}}"
}
# Export one image into $images_dir, optionally gzip-compressing it.
# Stale output files from earlier runs are removed first so the export
# is idempotent. Prints the final file name and its size.
save_image() {
  local image_name="$1"
  local output_file="$2"
  local output_path="$images_dir/$output_file"
  echo "🔄 Saving $image_name to $output_file..."
  # Remove a previous uncompressed export, if present.
  if [[ -f "$output_path" ]]; then
    echo " Removing existing file: $output_file"
    rm "$output_path"
  fi
  # gzip refuses to overwrite, so drop an old .gz as well when compressing.
  if [[ "$use_compression" == true && -f "$output_path.gz" ]]; then
    echo " Removing existing compressed file: $output_file.gz"
    rm "$output_path.gz"
  fi
  # Export the image.
  docker save "$image_name" -o "$output_path"
  if [[ "$use_compression" == true ]]; then
    echo " Compressing with gzip..."
    gzip "$output_path"
    output_path="$output_path.gz"
    output_file="$output_file.gz"
  fi
  # Report the resulting file size.
  local file_size=$(du -h "$output_path" | cut -f1)
  echo "✅ Saved successfully: $output_file ($file_size)"
}
echo "🔍 Checking for ARGUS images..."
echo ""
# Partition the expected images into available / missing.
available_images=()
missing_images=()
for image_name in "${!images[@]}"; do
  if check_image "$image_name"; then
    show_image_info "$image_name"
    available_images+=("$image_name")
  else
    missing_images+=("$image_name")
  fi
  echo ""
done
# Nothing to export: point the user at the build script and bail out.
if [[ ${#available_images[@]} -eq 0 ]]; then
  echo "❌ No ARGUS images found to export."
  echo ""
  echo "🔧 Please build the images first with:"
  echo " ./build/build_images.sh"
  exit 1
fi
# List images that will be skipped.
if [[ ${#missing_images[@]} -gt 0 ]]; then
  echo "⚠️ Missing images (will be skipped):"
  for image_name in "${missing_images[@]}"; do
    echo "$image_name"
  done
  echo ""
fi
echo "💾 Starting image export process..."
echo ""
# Export every available image, remembering the output filenames.
exported_files=()
for image_name in "${available_images[@]}"; do
  output_file="${images[$image_name]}"
  save_image "$image_name" "$output_file"
  if [[ "$use_compression" == true ]]; then
    exported_files+=("$output_file.gz")
  else
    exported_files+=("$output_file")
  fi
  echo ""
done
echo "======================================="
echo "📦 Export Summary"
echo "======================================="
# List the exported files with their human-readable sizes.
echo "📁 Exported files in $images_dir:"
total_size=0
for file in "${exported_files[@]}"; do
  full_path="$images_dir/$file"
  if [[ -f "$full_path" ]]; then
    size=$(du -h "$full_path" | cut -f1)
    size_bytes=$(du -b "$full_path" | cut -f1)
    total_size=$((total_size + size_bytes))
    echo "$file ($size)"
  fi
done
# Print the combined size of all exports (GNU numfmt).
if [[ $total_size -gt 0 ]]; then
  total_size_human=$(numfmt --to=iec --suffix=B $total_size)
  echo ""
  echo "📊 Total size: $total_size_human"
fi
echo ""
echo "🚀 Usage instructions:"
echo " To load these images on another system:"
if [[ "$use_compression" == true ]]; then
  for file in "${exported_files[@]}"; do
    if [[ -f "$images_dir/$file" ]]; then
      base_name="${file%.gz}"
      echo " gunzip $file && docker load -i $base_name"
    fi
  done
else
  for file in "${exported_files[@]}"; do
    if [[ -f "$images_dir/$file" ]]; then
      echo " docker load -i $file"
    fi
  done
fi
echo ""
echo "✅ Image export completed successfully!"
echo ""

2
configs/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Local overrides for build user/group settings
build_user.local.conf

6
configs/build_user.conf Normal file
View File

@ -0,0 +1,6 @@
# Default build-time UID/GID for Argus images
# Override by creating configs/build_user.local.conf with the same format.
# Syntax: KEY=VALUE, supports UID/GID only. Whitespace and lines starting with # are ignored.
UID=2133
GID=2015

38
doc/build-user-config.md Normal file
View File

@ -0,0 +1,38 @@
# Argus 镜像构建 UID/GID 配置说明
通过统一配置文件可以为 Kibana、Elasticsearch、Bind、Master 等容器指定运行账号,解决跨机器部署时 UID/GID 不一致导致的权限问题。
## 配置入口
- 默认配置存放在 `configs/build_user.conf`,内容示例:
```bash
UID=2133
GID=2015
```
- 如果需要本地覆盖,可在 `configs/` 下新建 `build_user.local.conf`,字段与默认文件一致。该文件已列入 `.gitignore`,不会被意外提交。
- 亦可在执行脚本前通过环境变量 `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID` 强制指定值,优先级最高。
## 作用范围
- `build/build_images.sh` 在构建 log/bind/master 镜像时读取配置,并传递 `--build-arg ARGUS_BUILD_UID/GID`;控制台会输出当前使用的 UID/GID。
- `src/master/scripts/build_images.sh` 同步使用配置,确保单独构建 master 镜像时行为一致。
- 各镜像 Dockerfile 会根据传入的 UID/GID 调整容器内账号(如 `elasticsearch``kibana``bind``argus`),并以环境变量形式暴露运行时可见值。
- Master 启动脚本会在执行 DNS 逻辑后,降权到配置的账号运行 `gunicorn`,确保写入 `/private/argus/**` 的文件具备正确属主。
- Log 模块测试脚本 `01_bootstrap.sh` 会根据配置修正挂载目录属主,方便端到端测试在任意用户下运行。
## 使用建议
1. 初次克隆仓库后无需修改,默认 UID/GID 保持向后兼容。
2. 如果在目标环境中使用新的账号(例如 `uid=4001,gid=4001`):
- 编辑 `configs/build_user.local.conf` 填入新值;
- 使用新账号登录,并确保其加入宿主机的 `docker` 组;
- 重新执行 `build/build_images.sh` 或相关模块的构建脚本。
3. 切换配置后建议重新运行目标模块的端到端脚本(如 `src/log/tests/scripts/01_bootstrap.sh``src/master/tests/scripts/00_e2e_test.sh``src/agent/tests/scripts/00_e2e_test.sh`),验证 `/private/argus` 下文件属主是否为期望账号。
## 故障排查
- **镜像构建报错 `groupmod: GID already in use`**:说明所选 GID 已存在于基础镜像,建议换用未占用的值,或在自定义基础镜像中先移除冲突。
- **容器内运行时报写权限不足**:检查宿主机挂载目录是否已经由目标 UID/GID 创建;必要时重新执行模块的 `01_bootstrap.sh` 之类的准备脚本。
- **仍看到旧 UID/GID**:确认脚本执行时未继承旧缓存,可运行 `ARGUS_BUILD_UID=... ARGUS_BUILD_GID=... ./build/build_images.sh` 强制覆盖。

View File

@ -0,0 +1,115 @@
#!/usr/bin/env bash
set -euo pipefail
# Shared helper to load Argus build user/group configuration.
# Usage:
#   source "${PROJECT_ROOT}/scripts/common/build_user.sh"
#   load_build_user
#   echo "$ARGUS_BUILD_UID:$ARGUS_BUILD_GID"
# Fallback UID/GID used when no config file or env override is present.
ARGUS_BUILD_UID_DEFAULT=2133
ARGUS_BUILD_GID_DEFAULT=2015
# extglob is required by the +([[:space:]]) patterns in _argus_trim.
shopt -s extglob
# Guard so load_build_user parses configuration at most once per shell.
_ARGUS_BUILD_USER_LOADED="${_ARGUS_BUILD_USER_LOADED:-0}"
# Echo the absolute directory containing this script.
_argus_build_user_script_dir() {
  ( cd "$(dirname "${BASH_SOURCE[0]}")" && pwd )
}
# Echo the repository root: two levels above this script's directory.
argus_project_root() {
  local base
  base="$(_argus_build_user_script_dir)"
  ( cd "$base/../.." >/dev/null && pwd )
}
# Strip leading and trailing whitespace from $1 and print the result
# (no trailing newline). Whitespace-only input prints an empty string.
_argus_trim() {
  local text="$1"
  if [[ "$text" =~ ^[[:space:]]*(.*[^[:space:]])[[:space:]]*$ ]]; then
    printf '%s' "${BASH_REMATCH[1]}"
  else
    printf '%s' ""
  fi
}
# Succeed (0) iff $1 is a non-empty string of ASCII digits.
_argus_is_number() {
  [[ -n "$1" && "$1" != *[!0-9]* ]]
}
# Resolve ARGUS_BUILD_UID / ARGUS_BUILD_GID, in order of precedence:
#   1. pre-set ARGUS_BUILD_UID / ARGUS_BUILD_GID environment variables
#   2. configs/build_user.local.conf (first existing file wins)
#   3. configs/build_user.conf
#   4. built-in defaults
# Exports both values; cached after the first successful call.
# Returns 1 if the resolved UID or GID is not numeric.
load_build_user() {
  if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then
    return 0
  fi
  local project_root config_files config uid gid
  project_root="$(argus_project_root)"
  config_files=(
    "$project_root/configs/build_user.local.conf"
    "$project_root/configs/build_user.conf"
  )
  uid="$ARGUS_BUILD_UID_DEFAULT"
  gid="$ARGUS_BUILD_GID_DEFAULT"
  for config in "${config_files[@]}"; do
    if [[ -f "$config" ]]; then
      # Parse KEY=VALUE lines; '#' starts a comment, blank lines are skipped.
      # The `|| [[ -n "$raw_line" ]]` also processes a final unterminated line.
      while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do
        local line key value
        line="${raw_line%%#*}"
        line="$(_argus_trim "${line}")"
        [[ -z "$line" ]] && continue
        if [[ "$line" != *=* ]]; then
          echo "[ARGUS build_user] Ignoring malformed line in $config: $raw_line" >&2
          continue
        fi
        key="${line%%=*}"
        value="${line#*=}"
        key="$(_argus_trim "$key")"
        value="$(_argus_trim "$value")"
        case "$key" in
          UID)
            uid="$value"
            ;;
          GID)
            gid="$value"
            ;;
          *)
            echo "[ARGUS build_user] Unknown key '$key' in $config" >&2
            ;;
        esac
      done < "$config"
      # Only the first existing config file is honoured.
      break
    fi
  done
  # Environment variables override anything read from files.
  if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then
    uid="$ARGUS_BUILD_UID"
  fi
  if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then
    gid="$ARGUS_BUILD_GID"
  fi
  if ! _argus_is_number "$uid"; then
    echo "[ARGUS build_user] Invalid UID '$uid'" >&2
    return 1
  fi
  if ! _argus_is_number "$gid"; then
    echo "[ARGUS build_user] Invalid GID '$gid'" >&2
    return 1
  fi
  export ARGUS_BUILD_UID="$uid"
  export ARGUS_BUILD_GID="$gid"
  _ARGUS_BUILD_USER_LOADED=1
}
# Print docker build args for the configured UID/GID (single line,
# no trailing newline).
argus_build_user_args() {
  load_build_user
  local uid_arg="--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID}"
  local gid_arg="--build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}"
  printf '%s %s' "$uid_arg" "$gid_arg"
}
# Log the effective build UID/GID to stdout.
print_build_user() {
  load_build_user
  printf 'ARGUS build user: UID=%s GID=%s\n' "$ARGUS_BUILD_UID" "$ARGUS_BUILD_GID"
}

2
src/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
__pycache__/

5
src/agent/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
build/
*.egg-info/
__pycache__/
.env

66
src/agent/README.md Normal file
View File

@ -0,0 +1,66 @@
# Argus Agent 模块
Argus Agent 是一个轻量级 Python 进程,负责向 Argus Master 注册节点、汇报健康数据,并维护本地持久化信息。模块现以 PyInstaller 打包为独立可执行文件,便于在普通容器或虚机中直接运行。
## 构建可执行文件
```bash
cd src/agent
./scripts/build_binary.sh # 生成 dist/argus-agent
```
脚本默认会在 Docker 容器 (`python:3.11-slim-bullseye`) 内执行 PyInstaller确保产物运行时兼容 glibc 2.31+(覆盖 2.35 环境)。构建流程注意事项:
- 每次构建前会清理 `build/``dist/` 并在容器内重新创建虚拟环境。
- 需要使用内网 Python 镜像时,可通过 `PIP_INDEX_URL``PIP_EXTRA_INDEX_URL``PIP_TRUSTED_HOST` 等环境变量传入,脚本会自动透传给容器。
- 如果宿主机无法运行 Docker可设置 `AGENT_BUILD_USE_DOCKER=0` 回退到本地构建;此时代码必须在 glibc ≤ 2.35 的机器上执行。
构建结束后脚本会在 `build/compat_check/` 下解包关键动态库并输出最高 `GLIBC_x.y` 版本,便于快速核对兼容性。如果结果中缺少 `libssl.so.3` / `libcrypto.so.3`,表示系统会在目标宿主机上使用本地 OpenSSL 库,无需额外处理。
例如:
```bash
strings build/compat_check/libpython*.so.1.0 | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n1
```
如遇构建失败,常见原因是 Docker 不可用(请改用 `AGENT_BUILD_USE_DOCKER=0`)或无法访问 Python 包镜像(先设置上述镜像环境变量后重试)。
## 运行时配置
Agent 不再依赖配置文件;所有参数均由环境变量与主机名推导:
| 变量 | 必填 | 默认值 | 说明 |
| --- | --- | --- | --- |
| `MASTER_ENDPOINT` | 是 | N/A | Master 基础地址,可写 `http://host:3000``host:3000`(自动补全 `http://`)。 |
| `REPORT_INTERVAL_SECONDS` | 否 | `60` | 状态上报间隔(秒)。必须为正整数。 |
| `AGENT_HOSTNAME` | 否 | `$(hostname)` | 覆盖容器内主机名,便于测试或特殊命名需求。 |
派生路径:
- 节点信息:`/private/argus/agent/<hostname>/node.json`
- 子模块健康目录:`/private/argus/agent/<hostname>/health/`
健康目录中的文件需遵循 `<模块前缀>-*.json` 命名(例如 `log-fluentbit.json``metric-node-exporter.json`),文件内容会原样并入上报的 `health` 字段。
## 日志与持久化
- Agent 会在成功注册、状态上报、异常重试等关键节点输出结构化日志,便于聚合分析。
- `node.json` 保存 Master 返回的最新节点对象,用于重启后继续使用既有节点 ID。
## 端到端测试
仓库内提供 Docker Compose 测试栈master + ubuntu 容器):
```bash
cd src/agent/tests
./scripts/00_e2e_test.sh
```
测试脚本会:
1. 构建 master 镜像与 agent 可执行文件。
2. 以 `ubuntu:24.04` 启动 agent 容器,并通过环境变量注入 `MASTER_ENDPOINT``REPORT_INTERVAL_SECONDS`
3. 验证注册、健康上报、nodes.json 生成、统计接口,以及“容器重启 + IP 变化”重注册流程。
4. 清理 `tests/private/` 与临时容器网络。
如需在真实环境部署,只需将 `dist/argus-agent` 连同健康目录挂载到目标主机,并按上表设置环境变量即可。

View File

60
src/agent/app/client.py Normal file
View File

@ -0,0 +1,60 @@
from __future__ import annotations
import json
from typing import Any, Dict, Optional
import requests
from .log import get_logger
LOGGER = get_logger("argus.agent.client")
class MasterAPIError(Exception):
    """Raised when the master API returns an error response.

    Carries the HTTP status code and, when available, the decoded JSON
    error payload so callers can branch on specific failures.
    """

    def __init__(self, message: str, status_code: int, payload: Optional[Dict[str, Any]] = None) -> None:
        super().__init__(message)
        self.status_code = status_code
        # Normalise a missing payload to an empty dict for easy access.
        self.payload = payload if payload else {}
class AgentClient:
    """Thin HTTP wrapper around the master's node API."""

    def __init__(self, base_url: str, *, timeout: int = 10) -> None:
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        # Shared session enables connection reuse across repeated reports.
        self._session = requests.Session()

    def register_node(self, body: Dict[str, Any]) -> Dict[str, Any]:
        """Call the master registration endpoint; returns the node object."""
        url = f"{self._base_url}/api/v1/master/nodes"
        response = self._session.post(url, json=body, timeout=self._timeout)
        return self._parse_response(response, "Failed to register node")

    def update_status(self, node_id: str, body: Dict[str, Any]) -> Dict[str, Any]:
        """Report health data; the master updates the node's last_report."""
        url = f"{self._base_url}/api/v1/master/nodes/{node_id}/status"
        response = self._session.put(url, json=body, timeout=self._timeout)
        return self._parse_response(response, "Failed to update node status")

    def _parse_response(self, response: requests.Response, error_prefix: str) -> Dict[str, Any]:
        """Decode a master response, raising MasterAPIError on HTTP errors
        (>= 400) or on bodies that are not valid JSON."""
        content_type = response.headers.get("Content-Type", "")
        payload: Dict[str, Any] | None = None
        if "application/json" in content_type:
            try:
                payload = response.json()
            except json.JSONDecodeError:
                LOGGER.warning("Response contained invalid JSON", extra={"status": response.status_code})
        if response.status_code >= 400:
            message = payload.get("error") if isinstance(payload, dict) else response.text
            raise MasterAPIError(
                f"{error_prefix}: {message}",
                status_code=response.status_code,
                payload=payload if isinstance(payload, dict) else None,
            )
        if payload is None:
            # Success responses must still decode as JSON even when the
            # Content-Type header was missing or wrong.
            try:
                payload = response.json()
            except json.JSONDecodeError as exc:
                raise MasterAPIError("Master returned non-JSON payload", response.status_code) from exc
        return payload

111
src/agent/app/collector.py Normal file
View File

@ -0,0 +1,111 @@
from __future__ import annotations
import os
import re
import socket
import subprocess
from pathlib import Path
from typing import Any, Dict
from .config import AgentConfig
from .log import get_logger
LOGGER = get_logger("argus.agent.collector")
_HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
    """Gather the static node facts sent with a registration request.

    Combines hostname-derived fields (env/user/instance) with probed
    hardware facts (egress IP, CPU count, memory bytes, GPU count).
    """
    hostname = config.hostname
    env, user, instance = _parse_hostname(hostname)
    meta = {
        "hostname": hostname,
        "ip": _detect_ip_address(),
        "env": env,
        "user": user,
        "instance": instance,
        "cpu_number": _detect_cpu_count(),
        "memory_in_bytes": _detect_memory_bytes(),
        "gpu_number": _detect_gpu_count(),
    }
    return meta
def _parse_hostname(hostname: str) -> tuple[str, str, str]:
    """Split a hostname following the ``<env>-<user>-<instance>-...`` convention.

    Note the pattern requires a fourth ``-``-separated segment: a bare
    ``env-user-instance`` name does NOT match. Non-matching hostnames
    yield three empty strings (and a warning is logged).
    """
    match = _HOSTNAME_PATTERN.match(hostname)
    if not match:
        LOGGER.warning("Hostname does not match expected pattern", extra={"hostname": hostname})
        return "", "", ""
    return match.group(1), match.group(2), match.group(3)
def _detect_cpu_count() -> int:
count = os.cpu_count()
return count if count is not None else 0
def _detect_memory_bytes() -> int:
    """Best-effort total-memory detection in bytes.

    Prefers the cgroup v2 limit (/sys/fs/cgroup/memory.max) so containers
    report their quota rather than host memory; falls back to MemTotal in
    /proc/meminfo, and finally to 0 when neither source is readable.

    Fix: catch OSError (not just FileNotFoundError) for both reads so a
    restricted /sys or /proc (e.g. PermissionError in some sandboxes) does
    not crash metadata collection.
    """
    cgroup_path = Path("/sys/fs/cgroup/memory.max")
    raw = ""
    try:
        raw = cgroup_path.read_text(encoding="utf-8").strip()
        if raw and raw != "max":  # "max" means unlimited -> use host total
            return int(raw)
    except OSError:
        LOGGER.debug("cgroup memory.max not readable, falling back to /proc/meminfo")
    except ValueError:
        LOGGER.warning("Failed to parse memory.max, falling back", extra={"value": raw})
    try:
        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("MemTotal:"):
                    parts = line.split()
                    if len(parts) >= 2:
                        return int(parts[1]) * 1024  # MemTotal is reported in kB
    except OSError:
        LOGGER.error("/proc/meminfo not readable; defaulting memory to 0")
    return 0
def _detect_gpu_count() -> int:
    """Count GPUs via ``nvidia-smi -L``; defaults to 0 when undetectable."""
    try:
        proc = subprocess.run(
            ["nvidia-smi", "-L"],
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=5,
        )
    except FileNotFoundError:
        # nvidia-smi not installed -> assume a CPU-only node.
        LOGGER.debug("nvidia-smi not available; assuming 0 GPUs")
        return 0
    except subprocess.SubprocessError as exc:
        # Covers TimeoutExpired and other invocation failures.
        LOGGER.warning("nvidia-smi invocation failed", extra={"error": str(exc)})
        return 0
    if proc.returncode != 0:
        LOGGER.debug("nvidia-smi returned non-zero", extra={"stderr": proc.stderr.strip()})
        return 0
    # `nvidia-smi -L` prints one line per GPU; count non-empty lines.
    count = sum(1 for line in proc.stdout.splitlines() if line.strip())
    return count
def _detect_ip_address() -> str:
    """Best-effort detection of this node's outbound IPv4 address.

    Opens a connected (but packet-free) UDP socket to learn the local
    egress address; falls back to resolving the hostname, and finally
    to 127.0.0.1.
    """
    try:
        probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            probe.connect(("8.8.8.8", 80))
            return probe.getsockname()[0]
        finally:
            probe.close()
    except OSError:
        LOGGER.debug("UDP socket trick failed; falling back to hostname lookup")
    try:
        return socket.gethostbyname(socket.gethostname())
    except OSError:
        LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
        return "127.0.0.1"

74
src/agent/app/config.py Normal file
View File

@ -0,0 +1,74 @@
from __future__ import annotations
import os
import socket
from dataclasses import dataclass
from pathlib import Path
from typing import Final
from .version import VERSION
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
@dataclass(frozen=True)
class AgentConfig:
    """Immutable runtime configuration derived from environment variables."""

    hostname: str                      # effective node hostname (AGENT_HOSTNAME override or system hostname)
    node_file: str                     # path of the persisted node.json
    version: str                       # agent version string
    master_endpoint: str               # normalised master base URL (scheme included, no trailing slash)
    report_interval_seconds: int       # delay between status reports
    health_dir: str                    # directory scanned for health JSON files
    request_timeout_seconds: int = 10  # HTTP timeout for master API calls
def _normalise_master_endpoint(value: str) -> str:
value = value.strip()
if not value:
raise ValueError("MASTER_ENDPOINT environment variable is required")
if not value.startswith("http://") and not value.startswith("https://"):
value = f"http://{value}"
return value.rstrip("/")
def _read_report_interval(raw_value: str | None) -> int:
if raw_value is None or raw_value.strip() == "":
return DEFAULT_REPORT_INTERVAL_SECONDS
try:
interval = int(raw_value)
except ValueError as exc:
raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
if interval <= 0:
raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
return interval
def _resolve_hostname() -> str:
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
def load_config() -> AgentConfig:
    """Derive the agent configuration from environment variables only
    (the external config-file dependency has been removed).

    Raises ValueError when MASTER_ENDPOINT is missing/invalid or when the
    report interval is not a positive integer. Also creates the node-state
    and health directories under /private/argus/agent/<hostname>/.
    """
    hostname = _resolve_hostname()
    node_file = f"/private/argus/agent/{hostname}/node.json"
    health_dir = f"/private/argus/agent/{hostname}/health/"
    master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
    if master_endpoint_env is None:
        raise ValueError("MASTER_ENDPOINT environment variable is not set")
    master_endpoint = _normalise_master_endpoint(master_endpoint_env)
    report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))
    # Ensure the persistence locations exist before first use.
    Path(node_file).parent.mkdir(parents=True, exist_ok=True)
    Path(health_dir).mkdir(parents=True, exist_ok=True)
    return AgentConfig(
        hostname=hostname,
        node_file=node_file,
        version=VERSION,
        master_endpoint=master_endpoint,
        report_interval_seconds=report_interval_seconds,
        health_dir=health_dir,
    )

View File

@ -0,0 +1,32 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Dict
from .log import get_logger
LOGGER = get_logger("argus.agent.health")
def read_health_directory(path: str) -> Dict[str, Any]:
    """Load every ``<prefix>-*.json`` file under ``path``.

    Returns a mapping of file stem -> parsed JSON content. Files whose stem
    lacks a ``-`` separator are skipped, as are unparsable or unreadable
    files (each with a log entry); a missing directory yields {}.
    """
    directory = Path(path)
    collected: Dict[str, Any] = {}
    if not directory.exists():
        LOGGER.debug("Health directory does not exist", extra={"path": str(directory)})
        return collected
    for candidate in sorted(directory.glob("*.json")):
        if "-" not in candidate.stem:
            LOGGER.debug("Skipping non-prefixed health file", extra={"file": candidate.name})
            continue
        try:
            collected[candidate.stem] = json.loads(candidate.read_text(encoding="utf-8"))
        except json.JSONDecodeError as exc:
            LOGGER.warning("Failed to parse health file", extra={"file": candidate.name, "error": str(exc)})
        except OSError as exc:
            LOGGER.warning("Failed to read health file", extra={"file": candidate.name, "error": str(exc)})
    return collected

18
src/agent/app/log.py Normal file
View File

@ -0,0 +1,18 @@
from __future__ import annotations
import logging
import os
# Shared log-line layout used by every agent logger.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s - %(message)s"


def setup_logging() -> None:
    """Configure the root logger, honouring AGENT_LOG_LEVEL.

    Unknown level names silently fall back to INFO. Repeated calls are
    harmless because logging.basicConfig is a no-op once handlers exist.
    """
    requested = os.environ.get("AGENT_LOG_LEVEL", "INFO")
    resolved = getattr(logging, requested.upper(), logging.INFO)
    logging.basicConfig(format=_LOG_FORMAT, level=resolved)


def get_logger(name: str) -> logging.Logger:
    """Return a named logger, ensuring logging is configured first."""
    setup_logging()
    return logging.getLogger(name)

163
src/agent/app/main.py Normal file
View File

@ -0,0 +1,163 @@
from __future__ import annotations
import signal
import time
from datetime import datetime, timezone
from typing import Optional
from .client import AgentClient, MasterAPIError
from .collector import collect_metadata
from .config import AgentConfig, load_config
from .health_reader import read_health_directory
from .log import get_logger, setup_logging
from .state import clear_node_state, load_node_state, save_node_state
LOGGER = get_logger("argus.agent")
def _current_timestamp() -> str:
return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
class StopSignal:
    """Tiny latch flipped by signal handlers to request a graceful shutdown."""

    def __init__(self) -> None:
        self._stop_requested = False

    def set(self, *_args) -> None:  # type: ignore[override]
        # Extra args absorb the (signum, frame) pair passed by signal handlers.
        self._stop_requested = True

    def is_set(self) -> bool:
        return self._stop_requested
def main(argv: Optional[list[str]] = None) -> int:  # noqa: ARG001 - signature kept for entry-point compatibility
    """Agent entry point: register with the master, then report status forever.

    Returns 0 on clean shutdown (stop signal) and 1 on unrecoverable
    setup failures (bad configuration, master returned no node id).
    """
    setup_logging()
    stop_signal = StopSignal()
    # Translate SIGTERM/SIGINT into a cooperative stop flag.
    signal.signal(signal.SIGTERM, stop_signal.set)
    signal.signal(signal.SIGINT, stop_signal.set)
    try:
        config = load_config()
    except Exception as exc:
        LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
        return 1
    LOGGER.info(
        "Agent starting",
        extra={
            "hostname": config.hostname,
            "master_endpoint": config.master_endpoint,
            "node_file": config.node_file,
        },
    )
    client = AgentClient(config.master_endpoint, timeout=config.request_timeout_seconds)
    # Reuse a previously persisted node id (if any) so restarts keep identity.
    node_state = load_node_state(config.node_file) or {}
    node_id = node_state.get("id")
    # Establish registration with the master (supports re-registration);
    # retries internally until success or shutdown.
    register_response = _register_with_retry(client, config, node_id, stop_signal)
    if register_response is None:
        LOGGER.info("Registration aborted due to shutdown signal")
        return 0
    node_id = register_response.get("id")
    if not node_id:
        LOGGER.error("Master did not return node id; aborting")
        return 1
    save_node_state(config.node_file, register_response)
    LOGGER.info("Entering status report loop", extra={"node_id": node_id})
    _status_loop(client, config, node_id, stop_signal)
    return 0
def _register_with_retry(
    client: AgentClient,
    config: AgentConfig,
    node_id: Optional[str],
    stop_signal: StopSignal,
):
    """Register with the master, retrying with exponential backoff (capped at 60s).

    Returns the node object on success, or None when a stop signal arrived
    before registration succeeded. A 404 for a remembered node id clears the
    local state and re-registers fresh.
    """
    backoff = 5
    while not stop_signal.is_set():
        payload = {
            "name": config.hostname,
            "type": "agent",
            "meta_data": collect_metadata(config),
            "version": config.version,
        }
        if node_id:
            # Ask the master to re-attach us to the existing node record.
            payload["id"] = node_id
        try:
            response = client.register_node(payload)
            LOGGER.info("Registration successful", extra={"node_id": response.get("id")})
            save_node_state(config.node_file, response)
            return response
        except MasterAPIError as exc:
            if exc.status_code == 404 and node_id:
                # Master no longer knows this id: drop it and register fresh.
                LOGGER.warning(
                    "Master does not recognise node id; clearing local node state",
                    extra={"node_id": node_id},
                )
                clear_node_state(config.node_file)
                node_id = None
            elif exc.status_code == 500 and node_id:
                # An id/name mismatch usually means misconfiguration; log and keep retrying.
                LOGGER.error(
                    "Master rejected node due to id/name mismatch; will retry",
                    extra={"node_id": node_id},
                )
            else:
                LOGGER.error("Registration failed", extra={"status_code": exc.status_code, "error": str(exc)})
            time.sleep(min(backoff, 60))
            backoff = min(backoff * 2, 60)
        except Exception as exc:  # pragma: no cover - defensive
            LOGGER.exception("Unexpected error during registration", extra={"error": str(exc)})
            time.sleep(min(backoff, 60))
            backoff = min(backoff * 2, 60)
    return None
def _status_loop(
    client: AgentClient,
    config: AgentConfig,
    node_id: str,
    stop_signal: StopSignal,
) -> None:
    """Report health to the master every report_interval_seconds until stopped.

    Failures are logged and the loop keeps running; the inter-report sleep
    is chunked into 1-second slices so a stop signal is honoured promptly.
    """
    interval = config.report_interval_seconds
    while not stop_signal.is_set():
        timestamp = _current_timestamp()
        health_payload = read_health_directory(config.health_dir)
        body = {
            "timestamp": timestamp,
            "health": health_payload,
        }
        try:
            response = client.update_status(node_id, body)
            LOGGER.info(
                "Status report succeeded",
                extra={"node_id": node_id, "health_keys": list(health_payload.keys())},
            )
            save_node_state(config.node_file, response)
        except MasterAPIError as exc:
            # Keep the loop alive; the next iteration retries automatically.
            LOGGER.error(
                "Failed to report status",
                extra={"status_code": exc.status_code, "error": str(exc)},
            )
        except Exception as exc:  # pragma: no cover - defensive
            LOGGER.exception("Unexpected error during status report", extra={"error": str(exc)})
        # Sleep in 1s slices so shutdown is responsive.
        for _ in range(interval):
            if stop_signal.is_set():
                break
            time.sleep(1)
    LOGGER.info("Stop signal received; exiting status loop")
if __name__ == "__main__":
    # Bug fix: `sys` was used here without ever being imported in this
    # module, so running `python -m app.main` raised NameError. Import it
    # locally since only this entry path needs it.
    import sys

    sys.exit(main())

44
src/agent/app/state.py Normal file
View File

@ -0,0 +1,44 @@
from __future__ import annotations
import json
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional
from .log import get_logger
LOGGER = get_logger("argus.agent.state")
def load_node_state(path: str) -> Optional[Dict[str, Any]]:
    """Read the persisted node.json so a restarted agent reuses its node ID.

    Returns None when the file is absent or contains invalid JSON.
    """
    try:
        raw = Path(path).read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError as exc:
        LOGGER.warning("node.json is invalid JSON; ignoring", extra={"error": str(exc)})
        return None
def save_node_state(path: str, data: Dict[str, Any]) -> None:
    """Atomically persist node.json so concurrent readers never see a
    partial write (temp file + fsync + rename in the same directory)."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    tmp = tempfile.NamedTemporaryFile(
        "w", dir=target.parent, delete=False, encoding="utf-8"
    )
    try:
        json.dump(data, tmp, separators=(",", ":"))
        tmp.flush()
        os.fsync(tmp.fileno())
    finally:
        tmp.close()
    # os.replace is atomic on POSIX when source and destination share a filesystem.
    os.replace(tmp.name, path)
def clear_node_state(path: str) -> None:
    """Delete node.json; a missing file is fine, other OS errors are logged."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        LOGGER.warning("Failed to remove node state file", extra={"error": str(exc), "path": path})

69
src/agent/app/version.py Normal file
View File

@ -0,0 +1,69 @@
from __future__ import annotations
import os
import sys
from pathlib import Path
from typing import Optional
import importlib.metadata
try:
import tomllib
except ModuleNotFoundError: # pragma: no cover
import tomli as tomllib # type: ignore[no-redef]
def _candidate_paths() -> list[Path]:
paths = []
bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None)
if bundle_dir:
paths.append(Path(bundle_dir) / "pyproject.toml")
paths.append(Path(__file__).resolve().parent.parent / "pyproject.toml")
paths.append(Path(__file__).resolve().parent / "pyproject.toml")
paths.append(Path.cwd() / "pyproject.toml")
return paths
def _read_from_pyproject() -> Optional[str]:
    """Scan candidate pyproject.toml files for a version string.

    ``[project].version`` is preferred; ``[tool.argus].version`` is the
    fallback. Unreadable or malformed files are skipped silently, and the
    first string found wins.
    """
    for path in _candidate_paths():
        if not path.exists():
            continue
        try:
            with path.open("rb") as handle:
                data = tomllib.load(handle)
        except (OSError, tomllib.TOMLDecodeError):
            continue
        for table_path in (("project",), ("tool", "argus")):
            section = data
            for key in table_path:
                if not isinstance(section, dict):
                    section = None
                    break
                section = section.get(key)
            if isinstance(section, dict):
                version = section.get("version")
                if isinstance(version, str):
                    return version
    return None
def _detect_version() -> str:
try:
return importlib.metadata.version("argus-agent")
except importlib.metadata.PackageNotFoundError:
pass
override = os.environ.get("AGENT_VERSION_OVERRIDE")
if override:
return override
fallback = _read_from_pyproject()
if fallback:
return fallback
return "0.0.0"
VERSION: str = _detect_version()
def get_version() -> str:
    """Return the module-level VERSION resolved once at import time."""
    return VERSION

BIN
src/agent/dist/argus-agent vendored Executable file

Binary file not shown.

10
src/agent/entry.py Normal file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env python3
"""PyInstaller bundle entry point: delegates straight to app.main.main().

Kept as a top-level script (not inside the package) so PyInstaller has a
plain-file entry module to freeze; the process exit code is whatever the
agent main loop returns.
"""
from __future__ import annotations
import sys
from app.main import main as agent_main
if __name__ == "__main__":
    sys.exit(agent_main())

19
src/agent/pyproject.toml Normal file
View File

@ -0,0 +1,19 @@
[project]
name = "argus-agent"
version = "1.1.0"
description = "Argus agent binary"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"requests==2.31.0"
]
[build-system]
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"
[tool.argus]
entry = "app.main:main"
[tool.setuptools]
packages = ["app"]

View File

@ -0,0 +1,690 @@
#!/usr/bin/env bash
set -euo pipefail
LOG_PREFIX="[AGENT-VERIFY]"
MASTER_ENDPOINT_DEFAULT=""
AGENT_DATA_ROOT_DEFAULT="/private/argus/agent"
AGENT_ETC_ROOT_DEFAULT="/private/argus/etc"
REPORT_INTERVAL_DEFAULT="2"
ALLOW_CONFIG_TOUCH="false"
KEEP_TEST_HEALTH="false"
# Uniform log helpers: INFO goes to stdout; WARN/ERROR go to stderr.
log_info() {
  printf '%s INFO %s\n' "$LOG_PREFIX" "$*"
}
log_warn() {
  printf '%s WARN %s\n' "$LOG_PREFIX" "$*" >&2
}
log_error() {
  printf '%s ERROR %s\n' "$LOG_PREFIX" "$*" >&2
}
usage() {
cat <<'USAGE'
Usage: agent_deployment_verify.sh [options]
Options:
--allow-config-touch Enable optional config PUT dry-run check.
--keep-test-health Keep the temporary verify health file after checks.
-h, --help Show this help message.
Environment variables:
MASTER_ENDPOINT (required) Master API base endpoint, e.g. http://master:3000
AGENT_DATA_ROOT (default: /private/argus/agent)
AGENT_ETC_ROOT (default: /private/argus/etc)
VERIFY_HOSTNAME (default: output of hostname)
REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds
USAGE
}
while [[ $# -gt 0 ]]; do
case "$1" in
--allow-config-touch)
ALLOW_CONFIG_TOUCH="true"
shift
;;
--keep-test-health)
KEEP_TEST_HEALTH="true"
shift
;;
-h|--help)
usage
exit 0
;;
*)
log_error "Unknown option: $1"
usage >&2
exit 2
;;
esac
done
MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}"
AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}"
AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}"
VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}"
REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}"
if [[ -z "$MASTER_ENDPOINT" ]]; then
log_error "MASTER_ENDPOINT is required"
exit 2
fi
if ! [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] || [[ "$REPORT_INTERVAL_SECONDS" -le 0 ]]; then
log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
fi
# Canonicalise an endpoint: prepend http:// when no scheme is present and
# strip a single trailing slash. Result is emitted on stdout.
normalize_endpoint() {
  local endpoint="$1"
  case "$endpoint" in
    http://*|https://*) ;;
    *) endpoint="http://$endpoint" ;;
  esac
  printf '%s\n' "${endpoint%/}"
}
MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"
NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME"
NODE_JSON="$NODE_DIR/node.json"
HEALTH_DIR="$NODE_DIR/health"
DNS_CONF="$AGENT_ETC_ROOT/dns.conf"
UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh"
declare -a RESULTS_PASS=()
declare -a RESULTS_WARN=()
declare -a RESULTS_FAIL=()
# Record a check result and log it. PASS/WARN/FAIL are accumulated in the
# RESULTS_* arrays for the final summary; INFO is logged without being
# recorded. Fix: the script calls `add_result INFO ...` in one place, which
# the previous case statement dropped silently — INFO and unknown levels
# are now surfaced instead of vanishing.
add_result() {
  local level="$1" message="$2"
  case "$level" in
    PASS)
      RESULTS_PASS+=("$message")
      log_info "$message"
      ;;
    WARN)
      RESULTS_WARN+=("$message")
      log_warn "$message"
      ;;
    FAIL)
      RESULTS_FAIL+=("$message")
      log_error "$message"
      ;;
    INFO)
      log_info "$message"
      ;;
    *)
      log_warn "add_result called with unknown level '$level': $message"
      ;;
  esac
}
HAS_JQ="0"
if command -v jq >/dev/null 2>&1; then
HAS_JQ="1"
fi
if ! command -v curl >/dev/null 2>&1; then
log_error "curl command not found; please install curl (e.g. apt-get install -y curl)"
exit 2
fi
if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then
log_error "Neither jq nor python3 is available for JSON processing"
exit 2
fi
CURL_OPTS=(--fail --show-error --silent --max-time 10)
# Fetch a URL and emit the response body on stdout; returns 1 when curl
# fails for any reason (CURL_OPTS already carries --fail/--silent/timeout).
curl_json() {
  local url="$1"
  curl "${CURL_OPTS[@]}" "$url" || return 1
}
# Extract a value from a JSON string. $1=payload, $2=jq expression,
# $3=python expression evaluated with the parsed payload bound to `data`.
# Scalars are printed as-is; dict/list results are JSON-encoded. Returns
# non-zero when the value is missing or null.
#
# Fix: the python fallback previously supplied the *script* on stdin via a
# heredoc, so `json.load(sys.stdin)` saw EOF and the fallback always
# failed. The payload is now piped on stdin and the program passed via -c.
json_query() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    if ! output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null); then
      return 1
    fi
    printf '%s' "$output"
    return 0
  fi
  printf '%s' "$json" | python3 -c '
import json
import sys

expr = sys.argv[1]
try:
    data = json.load(sys.stdin)
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
if value is None:
    sys.exit(1)
if isinstance(value, (dict, list)):
    print(json.dumps(value))
else:
    print(value)
' "$py_expr"
}
# Print the length of a JSON collection. $1=payload, $2=jq expression,
# $3=python expression bound to `data`. Returns non-zero on parse errors.
#
# Fixes: (1) the python fallback fed its own script via heredoc stdin so
# json.load() always failed; the payload is now piped in. (2) the only call
# site passes py_expr='len(data)', which the old code wrapped in a second
# len() call and crashed on — integer results are now printed directly.
json_length() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    if ! output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null); then
      return 1
    fi
    printf '%s' "$output"
    return 0
  fi
  printf '%s' "$json" | python3 -c '
import json
import sys

expr = sys.argv[1]
try:
    data = json.load(sys.stdin)
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
try:
    print(value if isinstance(value, int) else len(value))
except Exception:
    sys.exit(1)
' "$py_expr"
}
# Truthiness test on a JSON payload. $1=payload, $2=jq expression,
# $3=python expression bound to `data`. Exit 0 when the expression is
# truthy, 1 otherwise.
#
# Fix: the python fallback previously consumed stdin with its own script
# heredoc, so json.load() always failed; the payload is now piped on stdin
# and the program passed via -c.
json_has_key() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then
      return 0
    fi
    return 1
  fi
  printf '%s' "$json" | python3 -c '
import json
import sys

expr = sys.argv[1]
try:
    data = json.load(sys.stdin)
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
sys.exit(0 if value else 1)
' "$py_expr"
}
# Convert an ISO-8601 timestamp to epoch seconds on stdout. GNU `date -d`
# is tried first; python3's datetime.fromisoformat is the fallback (with
# trailing-Z normalisation). Returns non-zero when the value cannot be
# parsed or neither tool is available.
iso_to_epoch() {
  local ts="$1"
  if command -v date >/dev/null 2>&1; then
    if date -d "$ts" +%s 2>/dev/null; then
      return 0
    fi
  fi
  command -v python3 >/dev/null 2>&1 || return 1
  python3 - "$ts" <<'PY'
import sys
from datetime import datetime

value = sys.argv[1]
if value is None or value == "":
    sys.exit(1)
if value.endswith('Z'):
    value = value[:-1] + '+00:00'
try:
    dt = datetime.fromisoformat(value)
except ValueError:
    sys.exit(1)
print(int(dt.timestamp()))
PY
}
# Return 0 when the file at $1 contains parseable JSON. Prefers jq, falls
# back to python3; when neither is available the check is skipped and the
# file is treated as valid.
validate_json_file() {
  local path="$1"
  if [[ "$HAS_JQ" == "1" ]]; then
    if jq empty "$path" >/dev/null 2>&1; then
      return 0
    fi
    return 1
  fi
  if ! command -v python3 >/dev/null 2>&1; then
    return 0
  fi
  python3 - "$path" <<'PY'
import json
import sys

path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as handle:
    json.load(handle)
PY
}
# Create the directory $1 (and parents) when absent, logging that we had
# to do so; an existing directory is a silent no-op.
ensure_directory() {
  local target="$1"
  if [[ -d "$target" ]]; then
    return 0
  fi
  log_warn "Creating missing directory $target"
  mkdir -p "$target"
}
TEST_HEALTH_FILE=""
TEST_HEALTH_BACKUP=""
TEST_HEALTH_EXISTED="false"
# EXIT-trap handler for the temporary verify health file:
# - if the file pre-existed the run, its original contents are restored;
# - if --keep-test-health was given, the file is left in place for debugging;
# - otherwise the temporary file is removed.
# TEST_HEALTH_FILE is empty until the health-simulation phase assigns it,
# so early exits are a no-op.
cleanup() {
  if [[ -n "$TEST_HEALTH_FILE" ]]; then
    if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
      printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
    elif [[ "$KEEP_TEST_HEALTH" == "true" ]]; then
      : # intentionally keep the file (--keep-test-health)
    else
      rm -f "$TEST_HEALTH_FILE"
    fi
  fi
}
trap cleanup EXIT
log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"
# 4.2 Master health checks
health_resp=""
if ! health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>/tmp/agent_verify_healthz.err); then
error_detail=$(cat /tmp/agent_verify_healthz.err || true)
add_result FAIL "GET /healthz failed: $error_detail"
else
http_meta=$(tail -n1 <<<"$health_resp")
payload=$(head -n -1 <<<"$health_resp" || true)
status_code=${http_meta%% *}
elapsed=${http_meta##* }
add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload"
fi
rm -f /tmp/agent_verify_healthz.err
if ! readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>/tmp/agent_verify_readyz.err); then
error_detail=$(cat /tmp/agent_verify_readyz.err || true)
add_result FAIL "GET /readyz failed: $error_detail"
readyz_payload=""
else
readyz_meta=$(tail -n1 <<<"$readyz_resp")
readyz_payload=$(head -n -1 <<<"$readyz_resp" || true)
readyz_status=${readyz_meta%% *}
readyz_elapsed=${readyz_meta##* }
add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s"
fi
rm -f /tmp/agent_verify_readyz.err
# 4.3 Nodes list and detail
if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>/tmp/agent_verify_nodes.err); then
error_detail=$(cat /tmp/agent_verify_nodes.err || true)
add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
nodes_json=""
fi
rm -f /tmp/agent_verify_nodes.err
NODE_ENTRY=""
NODE_ID=""
NODE_IP=""
if [[ -n "$nodes_json" ]]; then
if [[ "$HAS_JQ" == "1" ]]; then
NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY=""
else
NODE_ENTRY=$(python3 - "$VERIFY_HOSTNAME" <<'PY'
import json
import sys
hostname = sys.argv[1]
nodes = json.load(sys.stdin)
for node in nodes:
if node.get("name") == hostname:
import json as _json
print(_json.dumps(node))
sys.exit(0)
sys.exit(1)
PY
) || NODE_ENTRY=""
fi
if [[ -z "$NODE_ENTRY" ]]; then
add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
else
if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
else
add_result FAIL "Failed to extract node id from master response"
fi
fi
if [[ -n "$NODE_ENTRY" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_node_detail.err); then
NODE_DETAIL_JSON="$NODE_DETAIL"
add_result PASS "Fetched node detail for $NODE_ID"
if NODE_IP=$(json_query "$NODE_DETAIL_JSON" '.meta_data.ip // .meta_data.host_ip // empty' 'data.get("meta_data", {}).get("ip") or data.get("meta_data", {}).get("host_ip") or ""'); then
if [[ -n "$NODE_IP" ]]; then
add_result PASS "Registered node IP=$NODE_IP"
else
add_result INFO "Node detail does not expose IP fields"
fi
fi
else
error_detail=$(cat /tmp/agent_verify_node_detail.err 2>/dev/null || true)
add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
NODE_DETAIL_JSON=""
fi
rm -f /tmp/agent_verify_node_detail.err
if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>/tmp/agent_verify_stats.err); then
if total_nodes=$(json_query "$stats_json" '.total // .total_nodes' 'data.get("total") or data.get("total_nodes")'); then
if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
add_result PASS "Statistics total=$total_nodes"
else
add_result WARN "Statistics total field not numeric: $total_nodes"
fi
else
add_result WARN "Unable to read total field from statistics"
fi
active_nodes=""
if [[ "$HAS_JQ" == "1" ]]; then
active_nodes=$(printf '%s' "$stats_json" | jq -e 'if .status_statistics then (.status_statistics[] | select(.status == "online") | .count) else empty end' 2>/dev/null | head -n1 || true)
elif command -v python3 >/dev/null 2>&1; then
active_nodes=$(printf '%s' "$stats_json" | python3 -c 'import json,sys; data=json.load(sys.stdin); print(next((row.get("count") for row in data.get("status_statistics", []) if row.get("status")=="online"), ""))' 2>/dev/null)
fi
if [[ -n "$active_nodes" ]]; then
add_result PASS "Online nodes reported by master: $active_nodes"
fi
if [[ "$HAS_JQ" == "1" ]]; then
node_count=$(printf '%s' "$nodes_json" | jq 'length')
else
node_count=$(json_length "$nodes_json" 'length' 'len(data)')
fi
if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -lt "$node_count" ]]; then
add_result WARN "Statistics total=$total_nodes less than nodes list count=$node_count"
fi
else
error_detail=$(cat /tmp/agent_verify_stats.err 2>/dev/null || true)
add_result FAIL "Failed to fetch node statistics: $error_detail"
fi
rm -f /tmp/agent_verify_stats.err
else
NODE_DETAIL_JSON=""
fi
# 4.4 Agent persistence checks
if [[ -f "$NODE_JSON" ]]; then
node_file_content="$(cat "$NODE_JSON")"
if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then
add_result PASS "node.json id matches master ($NODE_ID)"
else
add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
fi
else
add_result FAIL "Unable to extract id from node.json"
fi
if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
add_result PASS "node.json name matches $VERIFY_HOSTNAME"
else
add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
fi
else
add_result FAIL "Unable to extract name from node.json"
fi
if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then
if iso_to_epoch "$register_time" >/dev/null 2>&1; then
add_result PASS "node.json register_time valid ISO timestamp"
else
add_result WARN "node.json register_time invalid: $register_time"
fi
else
add_result WARN "node.json missing register_time"
fi
if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
if iso_to_epoch "$last_updated" >/dev/null 2>&1; then
add_result PASS "node.json last_updated valid ISO timestamp"
else
add_result WARN "node.json last_updated invalid: $last_updated"
fi
else
add_result WARN "node.json missing last_updated"
fi
else
add_result FAIL "node.json not found at $NODE_JSON"
node_file_content=""
fi
ensure_directory "$HEALTH_DIR"
if [[ -d "$HEALTH_DIR" ]]; then
shopt -s nullglob
health_files=("$HEALTH_DIR"/*.json)
shopt -u nullglob
if [[ ${#health_files[@]} -eq 0 ]]; then
add_result WARN "Health directory $HEALTH_DIR is empty"
else
for hf in "${health_files[@]}"; do
base=$(basename "$hf")
if [[ "$base" != *-* ]]; then
add_result WARN "Health file $base does not follow <module>-*.json"
continue
fi
if ! validate_json_file "$hf" >/dev/null 2>&1; then
add_result WARN "Health file $base is not valid JSON"
fi
done
fi
else
add_result WARN "Health directory $HEALTH_DIR missing"
fi
if getent hosts master.argus.com >/dev/null 2>&1; then
resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs)
add_result PASS "master.argus.com resolves to $resolved_ips"
else
add_result FAIL "Failed to resolve master.argus.com"
fi
# 4.5 Master-Node status consistency
sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))
if [[ -n "$NODE_DETAIL_JSON" ]]; then
detail_pre="$NODE_DETAIL_JSON"
else
detail_pre=""
fi
if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_pre.err); then
add_result PASS "Fetched node detail pre-check"
else
error_detail=$(cat /tmp/agent_verify_detail_pre.err 2>/dev/null || true)
add_result FAIL "Unable to fetch node detail for status check: $error_detail"
fi
rm -f /tmp/agent_verify_detail_pre.err
fi
server_ts_pre=""
agent_ts_pre=""
server_ts_post=""
agent_ts_post=""
if [[ -n "$detail_pre" ]]; then
server_ts_pre=$(json_query "$detail_pre" '.last_report' 'data.get("last_report")' || echo "")
agent_ts_pre=$(json_query "$detail_pre" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"
sleep "$sleep_interval"
if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_post.err); then
server_ts_post=$(json_query "$detail_post" '.last_report' 'data.get("last_report")' || echo "")
agent_ts_post=$(json_query "$detail_post" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
if [[ "$server_ts_post" != "$server_ts_pre" ]]; then
add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
else
add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
fi
if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
add_result PASS "last_report.agent_timestamp advanced"
else
add_result FAIL "last_report.agent_timestamp did not change"
fi
if [[ -n "$node_file_content" ]]; then
if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then
if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
diff=$((epoch_post - node_epoch))
[[ $diff -lt 0 ]] && diff=$((-diff))
tolerance=$((REPORT_INTERVAL_SECONDS * 2))
if [[ $diff -le $tolerance ]]; then
add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
else
add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
fi
fi
fi
fi
fi
NODE_DETAIL_JSON="$detail_post"
else
error_detail=$(cat /tmp/agent_verify_detail_post.err 2>/dev/null || true)
add_result FAIL "Failed to fetch node detail post-check: $error_detail"
fi
rm -f /tmp/agent_verify_detail_post.err
fi
# 4.6 Health simulation
TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json"
ensure_directory "$HEALTH_DIR"
if [[ -f "$TEST_HEALTH_FILE" ]]; then
TEST_HEALTH_EXISTED="true"
TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")"
else
TEST_HEALTH_EXISTED="false"
fi
# Write the verify health probe file with the given message.
# NOTE(review): $message is interpolated into the JSON unescaped; callers
# only pass "verify <epoch>" style strings, which are JSON-safe.
create_health_file() {
  local message="$1"
  printf '{"status":"ok","message":"%s"}\n' "$message" > "$TEST_HEALTH_FILE"
}
# Return 0 when the node-detail JSON ($2) reports our verify-master health
# entry with exactly the expected message ($1).
validate_health_in_master() {
  local expected_message="$1" detail_json="$2"
  local actual
  actual=$(json_query "$detail_json" \
    '.health["verify-master"].message' \
    'data.get("health", {}).get("verify-master", {}).get("message")') || return 1
  [[ "$actual" == "$expected_message" ]]
}
# Return 0 when verify-master is absent from the node's health map,
# i.e. the master has observed the health file deletion.
remove_health_from_master() {
  local detail_json="$1"
  ! json_has_key "$detail_json" '(.health | has("verify-master"))' \
    '"verify-master" in data.get("health", {})'
}
health_message_one="verify $(date +%s)"
create_health_file "$health_message_one"
add_result PASS "Created test health file $TEST_HEALTH_FILE"
sleep "$sleep_interval"
if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health1.err); then
if validate_health_in_master "$health_message_one" "$detail_health_one"; then
add_result PASS "Master reflects verify-master health message"
else
add_result FAIL "Master health payload does not match test message"
fi
else
error_detail=$(cat /tmp/agent_verify_health1.err 2>/dev/null || true)
add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
detail_health_one=""
fi
rm -f /tmp/agent_verify_health1.err
health_message_two="verify $(date +%s)-update"
create_health_file "$health_message_two"
sleep "$sleep_interval"
if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health2.err); then
if validate_health_in_master "$health_message_two" "$detail_health_two"; then
add_result PASS "Master health updated to new message"
else
add_result FAIL "Master health message did not update"
fi
else
error_detail=$(cat /tmp/agent_verify_health2.err 2>/dev/null || true)
add_result FAIL "Failed to fetch node detail after health update: $error_detail"
detail_health_two=""
fi
rm -f /tmp/agent_verify_health2.err
rm -f "$TEST_HEALTH_FILE"
sleep "$sleep_interval"
if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health3.err); then
if remove_health_from_master "$detail_health_three"; then
add_result PASS "Master health no longer lists verify-master after removal"
else
add_result FAIL "Master health still contains verify-master after file deletion"
fi
else
error_detail=$(cat /tmp/agent_verify_health3.err 2>/dev/null || true)
add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
fi
rm -f /tmp/agent_verify_health3.err
if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
fi
# Optional config touch
if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then
if [[ -n "$NODE_ID" ]]; then
payload='{"label": {"verify": "true"}}'
if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >/tmp/agent_verify_config.log 2>&1; then
add_result PASS "Config PUT dry-run succeeded"
else
add_result WARN "Config PUT dry-run failed: $(cat /tmp/agent_verify_config.log)"
fi
rm -f /tmp/agent_verify_config.log
fi
else
add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
fi
# Result summary: print every recorded check grouped by severity, then
# exit non-zero when any FAIL was recorded so CI can gate on this script.
# NOTE(review): expanding an empty array with "${arr[@]}" errors under
# `set -u` on bash < 4.4 — confirm the minimum bash version this script
# must support.
echo
echo "==== Verification Summary ===="
for entry in "${RESULTS_PASS[@]}"; do
  printf 'PASS: %s\n' "$entry"
done
for entry in "${RESULTS_WARN[@]}"; do
  printf 'WARN: %s\n' "$entry"
done
for entry in "${RESULTS_FAIL[@]}"; do
  printf 'FAIL: %s\n' "$entry"
done
if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
  exit 1
fi
exit 0

269
src/agent/scripts/build_binary.sh Executable file
View File

@ -0,0 +1,269 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BUILD_ROOT="$MODULE_ROOT/build"
DIST_DIR="$MODULE_ROOT/dist"
PYINSTALLER_BUILD="$BUILD_ROOT/pyinstaller"
PYINSTALLER_SPEC="$PYINSTALLER_BUILD/spec"
PYINSTALLER_WORK="$PYINSTALLER_BUILD/work"
VENV_DIR="$BUILD_ROOT/venv"
AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}"
AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}"
USED_DOCKER=0
# Build the single-file agent binary using the host's python3.
# Recreates build/ and dist/ from scratch, installs the package plus a
# pinned PyInstaller into a fresh venv, and emits dist/argus-agent.
# NOTE(review): binaries built on a new glibc may not run on older hosts —
# the docker path exists for that reason; confirm before shipping host builds.
run_host_build() {
  echo "[INFO] Using host Python environment for build" >&2
  rm -rf "$BUILD_ROOT" "$DIST_DIR"
  mkdir -p "$PYINSTALLER_BUILD" "$DIST_DIR"
  # --copies avoids symlinks into the host python, keeping the venv relocatable.
  python3 -m venv --copies "$VENV_DIR"
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"
  pip install --upgrade pip
  pip install .
  pip install "pyinstaller==6.6.0"
  pyinstaller \
    --clean \
    --onefile \
    --name argus-agent \
    --distpath "$DIST_DIR" \
    --workpath "$PYINSTALLER_WORK" \
    --specpath "$PYINSTALLER_SPEC" \
    --add-data "$MODULE_ROOT/pyproject.toml:." \
    "$MODULE_ROOT/entry.py"
  chmod +x "$DIST_DIR/argus-agent"
  deactivate
}
run_docker_build() {
if ! command -v docker >/dev/null 2>&1; then
echo "[ERROR] docker 命令不存在,无法在容器内构建。请安装 Docker 或设置 AGENT_BUILD_USE_DOCKER=0" >&2
exit 1
fi
USED_DOCKER=1
echo "[INFO] Building agent binary inside $AGENT_BUILD_IMAGE" >&2
local host_uid host_gid
host_uid="$(id -u)"
host_gid="$(id -g)"
docker_env=("--rm" "-v" "$MODULE_ROOT:/workspace" "-w" "/workspace" "--env" "TARGET_UID=${host_uid}" "--env" "TARGET_GID=${host_gid}")
pass_env_if_set() {
local var="$1"
local value="${!var:-}"
if [[ -n "$value" ]]; then
docker_env+=("--env" "$var=$value")
fi
}
pass_env_if_set PIP_INDEX_URL
pass_env_if_set PIP_EXTRA_INDEX_URL
pass_env_if_set PIP_TRUSTED_HOST
pass_env_if_set HTTP_PROXY
pass_env_if_set HTTPS_PROXY
pass_env_if_set NO_PROXY
pass_env_if_set http_proxy
pass_env_if_set https_proxy
pass_env_if_set no_proxy
build_script=$(cat <<'INNER'
set -euo pipefail
cd /workspace
apt-get update >/dev/null
apt-get install -y --no-install-recommends binutils >/dev/null
rm -rf /var/lib/apt/lists/*
rm -rf build dist
mkdir -p build/pyinstaller dist
python3 -m venv --copies build/venv
source build/venv/bin/activate
pip install --upgrade pip
pip install .
pip install pyinstaller==6.6.0
pyinstaller \
--clean \
--onefile \
--name argus-agent \
--distpath dist \
--workpath build/pyinstaller/work \
--specpath build/pyinstaller/spec \
--add-data /workspace/pyproject.toml:. \
entry.py
chmod +x dist/argus-agent
TARGET_UID="${TARGET_UID:-0}"
TARGET_GID="${TARGET_GID:-0}"
chown -R "$TARGET_UID:$TARGET_GID" dist build 2>/dev/null || true
python3 - <<'PY'
from pathlib import Path
from PyInstaller.archive.readers import CArchiveReader
import sys
archive = Path('dist/argus-agent')
out_dir = Path('build/compat_check')
out_dir.mkdir(parents=True, exist_ok=True)
major, minor = sys.version_info[:2]
libpython = f'libpython{major}.{minor}.so.1.0'
expected_libs = [
libpython,
'libssl.so.3',
'libcrypto.so.3',
]
reader = CArchiveReader(str(archive))
extracted = []
missing = []
for name in expected_libs:
try:
data = reader.extract(name)
except KeyError:
missing.append(name)
continue
(out_dir / name).write_bytes(data)
extracted.append(name)
(out_dir / 'manifest').write_text('\n'.join(extracted))
if extracted:
print('[INFO] Extracted libraries: ' + ', '.join(extracted))
if missing:
print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
PY
compat_check() {
local lib_path="$1"
if [[ ! -f "$lib_path" ]]; then
echo "[WARN] Missing $lib_path for GLIBC check"
return
fi
local max_glibc
max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
if [[ -n "$max_glibc" ]]; then
echo "[INFO] $lib_path references up to $max_glibc"
else
echo "[INFO] $lib_path does not expose GLIBC version strings"
fi
}
compat_libs=()
if [[ -f build/compat_check/manifest ]]; then
mapfile -t compat_libs < build/compat_check/manifest
fi
if [[ ${#compat_libs[@]} -eq 0 ]]; then
echo "[WARN] No libraries captured for GLIBC inspection"
else
for lib in "${compat_libs[@]}"; do
compat_check "build/compat_check/$lib"
done
fi
deactivate
INNER
)
if ! docker run "${docker_env[@]}" "$AGENT_BUILD_IMAGE" bash -lc "$build_script"; then
echo "[ERROR] Docker 构建失败,请检查 Docker 权限或设置 AGENT_BUILD_USE_DOCKER=0 在兼容主机上构建" >&2
exit 1
fi
}
if [[ "$AGENT_BUILD_USE_DOCKER" == "1" ]]; then
run_docker_build
else
run_host_build
fi
if [[ ! -f "$DIST_DIR/argus-agent" ]]; then
echo "[ERROR] Agent binary was not produced" >&2
exit 1
fi
if [[ "$USED_DOCKER" != "1" ]]; then
if [[ ! -x "$VENV_DIR/bin/python" ]]; then
echo "[WARN] PyInstaller virtualenv missing at $VENV_DIR; skipping compatibility check" >&2
else
COMPAT_DIR="$BUILD_ROOT/compat_check"
rm -rf "$COMPAT_DIR"
mkdir -p "$COMPAT_DIR"
EXTRACT_SCRIPT=$(cat <<'PY'
from pathlib import Path
from PyInstaller.archive.readers import CArchiveReader
import sys
archive = Path('dist/argus-agent')
out_dir = Path('build/compat_check')
out_dir.mkdir(parents=True, exist_ok=True)
major, minor = sys.version_info[:2]
libpython = f'libpython{major}.{minor}.so.1.0'
expected_libs = [
libpython,
'libssl.so.3',
'libcrypto.so.3',
]
reader = CArchiveReader(str(archive))
extracted = []
missing = []
for name in expected_libs:
try:
data = reader.extract(name)
except KeyError:
missing.append(name)
continue
(out_dir / name).write_bytes(data)
extracted.append(name)
(out_dir / 'manifest').write_text('\n'.join(extracted))
if extracted:
print('[INFO] Extracted libraries: ' + ', '.join(extracted))
if missing:
print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
PY
)
"$VENV_DIR/bin/python" - <<PY
$EXTRACT_SCRIPT
PY
compat_libs=()
if [[ -f "$COMPAT_DIR/manifest" ]]; then
mapfile -t compat_libs < "$COMPAT_DIR/manifest"
fi
check_glibc_version() {
local lib_path="$1"
if [[ ! -f "$lib_path" ]]; then
echo "[WARN] Skipping GLIBC check; file not found: $lib_path" >&2
return
fi
if command -v strings >/dev/null 2>&1; then
local max_glibc
max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
if [[ -n "$max_glibc" ]]; then
echo "[INFO] $lib_path references up to $max_glibc"
else
echo "[INFO] $lib_path does not expose GLIBC version strings"
fi
else
echo "[WARN] strings command unavailable; cannot inspect $lib_path" >&2
fi
}
if [[ ${#compat_libs[@]} -eq 0 ]]; then
echo "[WARN] No libraries captured for GLIBC inspection" >&2
else
for lib in "${compat_libs[@]}"; do
check_glibc_version "$COMPAT_DIR/$lib"
done
fi
fi
else
echo "[INFO] Compatibility check executed inside container"
fi
echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent"

2
src/agent/tests/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
private/
tmp/

View File

@ -0,0 +1,69 @@
services:
bind:
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
container_name: argus-bind-agent-e2e
volumes:
- ./private:/private
networks:
default:
ipv4_address: 172.28.0.2
environment:
- "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
- "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
restart: always
master:
image: argus-master:latest
container_name: argus-master-agent-e2e
depends_on:
- bind
environment:
- OFFLINE_THRESHOLD_SECONDS=6
- ONLINE_THRESHOLD_SECONDS=2
- SCHEDULER_INTERVAL_SECONDS=1
- "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
- "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
ports:
- "32300:3000"
volumes:
- ./private/argus/master:/private/argus/master
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private/argus/etc:/private/argus/etc
networks:
default:
ipv4_address: 172.28.0.10
restart: always
agent:
image: ubuntu:22.04
container_name: argus-agent-e2e
hostname: dev-e2euser-e2einst-pod-0
depends_on:
- master
- bind
environment:
- MASTER_ENDPOINT=http://master.argus.com:3000
- REPORT_INTERVAL_SECONDS=2
- "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
- "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
volumes:
- ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
- ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
- ./private/argus/etc:/private/argus/etc
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro
- ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
- ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
entrypoint:
- /usr/local/bin/agent-entrypoint.sh
networks:
default:
ipv4_address: 172.28.0.20
restart: always
networks:
default:
driver: bridge
ipam:
driver: default
config:
- subnet: 172.28.0.0/16

View File

@ -0,0 +1,23 @@
#!/usr/bin/env bash
# Agent module E2E driver: runs the numbered stage scripts in order from
# this directory, stopping at the first failure (set -e).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# NOTE(review): 08_verify_agent.sh deliberately runs before 05/06/07 —
# confirm this ordering is intentional and not a leftover renumbering.
SCRIPTS=(
  "01_bootstrap.sh"
  "02_up.sh"
  "03_wait_and_assert_registration.sh"
  "04_write_health_files.sh"
  "08_verify_agent.sh"
  "05_assert_status_on_master.sh"
  "06_restart_agent_and_reregister.sh"
  "07_down.sh"
)
for script in "${SCRIPTS[@]}"; do
  echo "[TEST] Running $script"
  "$SCRIPT_DIR/$script"
  echo "[TEST] $script completed"
  echo
done
echo "[TEST] Agent module E2E tests completed"

View File

@ -0,0 +1,63 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
AGENT_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
MASTER_ROOT="$(cd "$AGENT_ROOT/../master" && pwd)"
REPO_ROOT="$(cd "$AGENT_ROOT/../.." && pwd)"
PRIVATE_ROOT="$TEST_ROOT/private"
TMP_ROOT="$TEST_ROOT/tmp"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
AGENT_CONFIG_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME"
AGENT_HEALTH_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME/health"
MASTER_PRIVATE_DIR="$PRIVATE_ROOT/argus/master"
METRIC_PRIVATE_DIR="$PRIVATE_ROOT/argus/metric/prometheus"
DNS_DIR="$PRIVATE_ROOT/argus/etc"
BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:latest}"
BIND_ROOT="$(cd "$MASTER_ROOT/../bind" && pwd)"
ensure_image() {
local image="$1"
if ! docker image inspect "$image" >/dev/null 2>&1; then
echo "[ERROR] Docker image '$image' 未找到,请先运行统一构建脚本 (例如 ./build/build_images.sh) 生成所需镜像" >&2
exit 1
fi
}
mkdir -p "$AGENT_CONFIG_DIR"
mkdir -p "$AGENT_HEALTH_DIR"
mkdir -p "$MASTER_PRIVATE_DIR"
mkdir -p "$METRIC_PRIVATE_DIR"
mkdir -p "$TMP_ROOT"
mkdir -p "$DNS_DIR"
touch "$AGENT_HEALTH_DIR/.keep"
# 中文提示:准备 bind 模块提供的 update-dns.sh模拟生产下发
if [[ -f "$BIND_ROOT/build/update-dns.sh" ]]; then
cp "$BIND_ROOT/build/update-dns.sh" "$DNS_DIR/update-dns.sh"
chmod +x "$DNS_DIR/update-dns.sh"
else
echo "[WARN] bind update script missing at $BIND_ROOT/build/update-dns.sh"
fi
ensure_image "argus-master:latest"
ensure_image "$BIND_IMAGE_TAG"
AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"
pushd "$AGENT_ROOT" >/dev/null
./scripts/build_binary.sh
popd >/dev/null
if [[ ! -x "$AGENT_BINARY" ]]; then
echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
exit 1
fi
echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag"
echo "[INFO] Agent E2E bootstrap complete"

View File

@ -0,0 +1,53 @@
#!/usr/bin/env bash
#
# Step 02: start the master + bind + agent stack via docker compose.
# Writes the build UID/GID into .env so compose substitutes them into the
# container environment.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
ENV_FILE="$TEST_ROOT/.env"
# Resolve the build-time UID/GID from the repo-wide helper and export them
# for docker compose variable substitution.
source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user
export ARGUS_BUILD_UID ARGUS_BUILD_GID
cat > "$ENV_FILE" <<EOF
ARGUS_BUILD_UID=$ARGUS_BUILD_UID
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
EOF
# The agent binary must have been produced by 01_bootstrap.sh.
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
    echo "[ERROR] Agent binary path missing; run 01_bootstrap.sh first" >&2
    exit 1
fi
AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
    echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
    exit 1
fi
# Bind image tag recorded by bootstrap; fall back to the default tag.
BIND_IMAGE_TAG_VALUE="argus-bind9:latest"
if [[ -f "$TMP_ROOT/bind_image_tag" ]]; then
    BIND_IMAGE_TAG_VALUE="$(cat "$TMP_ROOT/bind_image_tag")"
fi
# Prefer the docker compose plugin; fall back to the legacy binary.
compose() {
    if docker compose version >/dev/null 2>&1; then
        docker compose "$@"
    else
        docker-compose "$@"
    fi
}
# Remove leftovers from previous runs before bringing the stack up.
docker container rm -f argus-agent-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true
docker network rm tests_default >/dev/null 2>&1 || true
pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans || true
BIND_IMAGE_TAG="$BIND_IMAGE_TAG_VALUE" compose up -d
popd >/dev/null
echo "[INFO] Master+Agent stack started"

View File

@ -0,0 +1,65 @@
#!/usr/bin/env bash
#
# Step 03: wait until the agent self-registers with the master, then assert
# node.json exists locally and record the node id and initial IP for the
# restart test in step 06.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json"
mkdir -p "$TMP_ROOT"
# Poll the master for up to ~60s until at least one node is listed.
node_id=""
for _ in {1..30}; do
    sleep 2
    response=$(curl -sS "$API_BASE/nodes" || true)
    if [[ -z "$response" ]]; then
        continue
    fi
    list_file="$TMP_ROOT/nodes_list.json"
    echo "$response" > "$list_file"
    node_id=$(python3 - "$list_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    nodes = json.load(handle)
print(nodes[0]["id"] if nodes else "")
PY
)
    if [[ -n "$node_id" ]]; then
        break
    fi
done
if [[ -z "$node_id" ]]; then
    echo "[ERROR] Agent did not register within timeout" >&2
    exit 1
fi
echo "$node_id" > "$TMP_ROOT/node_id"
# The agent must also have persisted its identity locally.
if [[ ! -f "$NODE_FILE" ]]; then
    echo "[ERROR] node.json not created at $NODE_FILE" >&2
    exit 1
fi
python3 - "$NODE_FILE" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert "id" in node and node["id"], "node.json missing id"
PY
# Capture the node detail and remember the registration IP; step 06 asserts
# the IP changes after the agent container is recreated.
detail_file="$TMP_ROOT/initial_detail.json"
curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file"
python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY'
import json, sys, pathlib
with open(sys.argv[1]) as handle:
    node = json.load(handle)
ip = node["meta_data"].get("ip")
if not ip:
    raise SystemExit("meta_data.ip missing")
pathlib.Path(sys.argv[2]).write_text(ip)
PY
echo "[INFO] Agent registered with node id $node_id"

View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
#
# Step 04: simulate healthy module reports by writing one health JSON file
# per component into the agent's health directory; the agent forwards these
# to the master, which step 05 then verifies.
#
# Fixes vs. the original: the health dir is created if bootstrap was
# skipped, the timestamp is the current UTC time instead of a hard-coded
# stale 2023 value, and the duplicated heredocs are collapsed into a loop.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health"
mkdir -p "$HEALTH_DIR"
# ISO-8601 UTC timestamp shared by all components written in this run.
timestamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
for component in log-fluentbit metric-node-exporter; do
    cat > "$HEALTH_DIR/$component.json" <<JSON
{
  "status": "healthy",
  "timestamp": "$timestamp"
}
JSON
done
echo "[INFO] Health files written"

View File

@ -0,0 +1,53 @@
#!/usr/bin/env bash
#
# Step 05: verify the master reports the node online with both health
# entries from step 04, and that the Prometheus nodes.json was generated
# with exactly the one registered node.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
# Poll for up to ~40s: the node must be online and expose both health keys.
success=false
detail_file="$TMP_ROOT/agent_status_detail.json"
for _ in {1..20}; do
    sleep 2
    if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
        continue
    fi
    if python3 - "$detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
if node["status"] != "online":
    raise SystemExit(1)
health = node.get("health", {})
if "log-fluentbit" not in health or "metric-node-exporter" not in health:
    raise SystemExit(1)
PY
    then
        success=true
        break
    fi
done
if [[ "$success" != true ]]; then
    echo "[ERROR] Node did not report health data in time" >&2
    exit 1
fi
if [[ ! -f "$NODES_JSON" ]]; then
    echo "[ERROR] nodes.json missing at $NODES_JSON" >&2
    exit 1
fi
# Exactly one node entry is expected, carrying a non-empty node_id.
python3 - "$NODES_JSON" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    nodes = json.load(handle)
assert len(nodes) == 1, nodes
entry = nodes[0]
assert entry["node_id"], entry
PY
echo "[INFO] Master reflects agent health and nodes.json entries"

View File

@ -0,0 +1,143 @@
#!/usr/bin/env bash
#
# Step 06: recreate the agent container on a new, fixed IP and assert the
# agent re-registers with the SAME node id, that the master observes the
# new IP, and that last_updated advances.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
ENV_FILE="$TEST_ROOT/.env"
# The restart scenario reuses the same entrypoint script so the DNS
# registration logic stays identical to the initial launch.
if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
    echo "[ERROR] agent entrypoint script missing at $ENTRYPOINT_SCRIPT" >&2
    exit 1
fi
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
    echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
    exit 1
fi
AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
    echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
    exit 1
fi
# Load the build UID/GID from the .env written by step 02, or fall back to
# the repo-wide helper when the file is absent.
if [[ -f "$ENV_FILE" ]]; then
    set -a
    # shellcheck disable=SC1090
    source "$ENV_FILE"
    set +a
else
    REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
    # shellcheck disable=SC1090
    source "$REPO_ROOT/scripts/common/build_user.sh"
    load_build_user
fi
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
# Prefer the docker compose plugin; fall back to the legacy binary.
compose() {
    if docker compose version >/dev/null 2>&1; then
        docker compose "$@"
    else
        docker-compose "$@"
    fi
}
# Snapshot the node state before the restart for later comparison.
before_file="$TMP_ROOT/before_restart.json"
curl -sS "$API_BASE/nodes/$NODE_ID" -o "$before_file"
prev_last_updated=$(python3 - "$before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node.get("last_updated", ""))
PY
)
prev_ip=$(python3 - "$before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node["meta_data"].get("ip", ""))
PY
)
initial_ip=$(cat "$TMP_ROOT/initial_ip")
if [[ "$prev_ip" != "$initial_ip" ]]; then
    echo "[ERROR] Expected initial IP $initial_ip, got $prev_ip" >&2
    exit 1
fi
# Remove the compose-managed agent container before recreating it manually.
pushd "$TEST_ROOT" >/dev/null
compose rm -sf agent
popd >/dev/null
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
# Launch the replacement container with a pinned IP so we control the
# network state the agent sees when it re-registers.
if ! docker run -d \
    --name argus-agent-e2e \
    --hostname "$AGENT_HOSTNAME" \
    --network "$NETWORK_NAME" \
    --ip "$NEW_AGENT_IP" \
    -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
    -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \
    -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
    -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
    -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
    -e MASTER_ENDPOINT=http://master.argus.com:3000 \
    -e REPORT_INTERVAL_SECONDS=2 \
    -e ARGUS_BUILD_UID="$AGENT_UID" \
    -e ARGUS_BUILD_GID="$AGENT_GID" \
    --entrypoint /usr/local/bin/agent-entrypoint.sh \
    ubuntu:22.04 >/dev/null; then
    echo "[ERROR] Failed to start agent container with custom IP" >&2
    exit 1
fi
# Poll for up to ~60s until the master reflects the new IP for the SAME
# node id, with a fresher last_updated stamp.
success=false
detail_file="$TMP_ROOT/post_restart.json"
for _ in {1..20}; do
    sleep 3
    if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
        continue
    fi
    if python3 - "$detail_file" "$prev_last_updated" "$NODE_ID" "$prev_ip" "$NEW_AGENT_IP" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
prev_last_updated = sys.argv[2]
expected_id = sys.argv[3]
old_ip = sys.argv[4]
expected_ip = sys.argv[5]
last_updated = node.get("last_updated")
current_ip = node["meta_data"].get("ip")
assert node["id"] == expected_id
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
PY
    then
        success=true
        break
    fi
done
if [[ "$success" != true ]]; then
    echo "[ERROR] Agent did not report expected new IP $NEW_AGENT_IP after restart" >&2
    exit 1
fi
echo "[INFO] Agent restart produced successful re-registration with IP change"

View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
#
# Tear down the agent E2E environment: stop the compose stack, restore
# ownership of ./private (containers may have created root-owned files in
# it), and delete all generated state.
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
test_root="$(cd "$here/.." && pwd)"
env_file="$test_root/.env"

# Use the docker compose plugin when available, otherwise the legacy binary.
compose() {
    if docker compose version >/dev/null 2>&1; then
        docker compose "$@"
    else
        docker-compose "$@"
    fi
}

docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true

pushd "$test_root" >/dev/null
compose down --remove-orphans
popd >/dev/null

if [[ -d "$test_root/private" ]]; then
    # chown everything back to the invoking user inside a throwaway
    # container so the rm below cannot fail on root-owned files.
    docker run --rm \
        -v "$test_root/private:/target" \
        ubuntu:24.04 \
        chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
    rm -rf "$test_root/private"
fi

rm -rf "$test_root/tmp"
if [[ -f "$env_file" ]]; then
    rm -f "$env_file"
fi

echo "[INFO] Agent E2E environment cleaned up"

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
#
# Optional verification step: run agent_deployment_verify.sh inside the
# running agent container (installing curl/jq there first if needed).
# Skips quietly when the container is not running.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
VERIFY_SCRIPT="$(cd "$TEST_ROOT/.." && pwd)/scripts/agent_deployment_verify.sh"
if ! docker ps --format '{{.Names}}' | grep -q '^argus-agent-e2e$'; then
    echo "[WARN] agent container not running; skip verification"
    exit 0
fi
# The verifier needs curl and jq inside the container; install best-effort.
if docker exec -i argus-agent-e2e bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
    echo "[INFO] curl/jq already installed in agent container"
else
    echo "[INFO] Installing curl/jq in agent container"
    docker exec -i argus-agent-e2e bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
fi
# Prefer the copy baked into the container image; fall back to the host copy.
# NOTE(review): the fallback passes a host path to `docker exec` — this only
# works if that path is also mounted inside the container; confirm the mount.
if docker exec -i argus-agent-e2e bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then
    docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh
elif [[ -x "$VERIFY_SCRIPT" ]]; then
    docker exec -i argus-agent-e2e "$VERIFY_SCRIPT"
else
    echo "[WARN] agent_deployment_verify.sh not found"
fi

View File

@ -0,0 +1,79 @@
#!/usr/bin/env bash
#
# Entrypoint for the agent test container: fixes ownership of the shared
# /private tree, creates a runtime user matching the build UID/GID, applies
# the bind-managed DNS configuration, waits for the master domain to
# resolve, then drops privileges and execs the agent.
set -euo pipefail
LOG_PREFIX="[AGENT-ENTRYPOINT]"
DNS_SCRIPT="/private/argus/etc/update-dns.sh"
DNS_CONF="/private/argus/etc/dns.conf"
TARGET_DOMAIN="master.argus.com"
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
AGENT_HOSTNAME="${HOSTNAME:-unknown}"
AGENT_DATA_DIR="/private/argus/agent/${AGENT_HOSTNAME}"
AGENT_HEALTH_DIR="${AGENT_DATA_DIR}/health"
RUNTIME_GROUP="argusagent"
RUNTIME_USER="argusagent"
log() {
    echo "${LOG_PREFIX} $*"
}
mkdir -p "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR"
chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" 2>/dev/null || true
chown -R "$AGENT_UID:$AGENT_GID" "/private/argus/etc" 2>/dev/null || true
# Reuse an existing group/user with the target GID/UID when present,
# otherwise create one named argusagent.
if ! getent group "$AGENT_GID" >/dev/null 2>&1; then
    groupadd -g "$AGENT_GID" "$RUNTIME_GROUP"
else
    RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)"
fi
if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then
    useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER"
else
    RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)"
fi
log "运行用户: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)"
# Wait up to 30s for the update-dns.sh delivered by the bind module.
for _ in {1..30}; do
    if [[ -x "$DNS_SCRIPT" ]]; then
        break
    fi
    log "等待 update-dns.sh 准备就绪..."
    sleep 1
done
if [[ -x "$DNS_SCRIPT" ]]; then
    log "执行 update-dns.sh 更新容器 DNS"
    # Retry forever: the agent is useless until DNS points at the bind server.
    while true; do
        if "$DNS_SCRIPT"; then
            log "update-dns.sh 执行成功"
            break
        fi
        log "update-dns.sh 执行失败3 秒后重试"
        sleep 3
    done
else
    log "未获取到 update-dns.sh使用镜像默认 DNS"
fi
# Log the current dns.conf contents to ease troubleshooting.
if [[ -f "$DNS_CONF" ]]; then
    log "dns.conf 内容: $(tr '\n' ' ' < "$DNS_CONF")"
else
    log "dns.conf 暂未生成"
fi
# Try to resolve the master domain; failures only log, they do not block.
for _ in {1..30}; do
    if getent hosts "$TARGET_DOMAIN" >/dev/null 2>&1; then
        MASTER_IP=$(getent hosts "$TARGET_DOMAIN" | awk '{print $1}' | head -n 1)
        log "master.argus.com 解析成功: $MASTER_IP"
        break
    fi
    sleep 1
done
log "启动 argus-agent"
# Drop privileges to the runtime user; exec keeps PID 1 semantics.
exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"

0
src/alert/README.md Normal file
View File

2
src/bind/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
images/

89
src/bind/build/Dockerfile Normal file
View File

@ -0,0 +1,89 @@
FROM ubuntu:22.04

# Set timezone and avoid interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai

# Build arguments: optional intranet mirror plus the UID/GID the bind user
# must run as (kept in sync with the host via configs/build_user.conf).
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

# Configure intranet apt sources when the build requests it
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Update package list and install required packages
RUN apt-get update && \
    apt-get install -y \
    bind9 \
    bind9utils \
    bind9-doc \
    supervisor \
    net-tools \
    inetutils-ping \
    vim \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Align the bind user/group IDs with the host configuration so files under
# the shared /private volume keep consistent ownership.
RUN set -eux; \
    current_gid="$(getent group bind | awk -F: '{print $3}')"; \
    if [ -z "$current_gid" ]; then \
        groupadd -g "${ARGUS_BUILD_GID}" bind; \
    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
        groupmod -g "${ARGUS_BUILD_GID}" bind; \
    fi; \
    if id bind >/dev/null 2>&1; then \
        current_uid="$(id -u bind)"; \
        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
            usermod -u "${ARGUS_BUILD_UID}" bind; \
        fi; \
    else \
        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" bind; \
    fi; \
    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /var/cache/bind /var/lib/bind

# Configure the apt source used at deployment time
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi

# Create supervisor configuration directory
RUN mkdir -p /etc/supervisor/conf.d

# Copy supervisor configuration
COPY src/bind/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Copy BIND9 configuration files
COPY src/bind/build/named.conf.local /etc/bind/named.conf.local
COPY src/bind/build/db.argus.com /etc/bind/db.argus.com

# Copy startup and reload scripts
COPY src/bind/build/startup.sh /usr/local/bin/startup.sh
COPY src/bind/build/reload-bind9.sh /usr/local/bin/reload-bind9.sh
COPY src/bind/build/argus_dns_sync.sh /usr/local/bin/argus_dns_sync.sh
COPY src/bind/build/update-dns.sh /usr/local/bin/update-dns.sh

# Make scripts executable
RUN chmod +x /usr/local/bin/startup.sh /usr/local/bin/reload-bind9.sh /usr/local/bin/argus_dns_sync.sh /usr/local/bin/update-dns.sh

# Set proper ownership for BIND9 files
RUN chown bind:bind /etc/bind/named.conf.local /etc/bind/db.argus.com

# Expose DNS port
EXPOSE 53/tcp 53/udp

# Use root user as requested
USER root

# Start with startup script
CMD ["/usr/local/bin/startup.sh"]

View File

@ -0,0 +1,106 @@
#!/usr/bin/env bash
#
# argus_dns_sync.sh — watch /private/argus/etc for <name>.argus.com files,
# sync the IPs they contain into the BIND zone db, and trigger a bind
# reload whenever a record actually changed.
set -euo pipefail

WATCH_DIR="/private/argus/etc"
ZONE_DB="/private/argus/bind/db.argus.com"
LOCKFILE="/var/lock/argus_dns_sync.lock"
BACKUP_DIR="/private/argus/bind/.backup"
SLEEP_SECONDS=10
RELOAD_SCRIPT="/usr/local/bin/reload-bind9.sh"  # path of the existing reload helper

mkdir -p "$(dirname "$LOCKFILE")" "$BACKUP_DIR"
# Backups are chowned to the build user so the host can clean them up.
BACKUP_UID="${ARGUS_BUILD_UID:-2133}"
BACKUP_GID="${ARGUS_BUILD_GID:-2015}"
chown -R "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR" 2>/dev/null || true
# Return 0 iff $1 is a dotted-quad IPv4 address with every octet in 0-255.
# Fix vs. the original: octets are forced to base 10 — previously a
# leading-zero octet such as "08" was parsed as octal by (( )) and aborted
# the arithmetic evaluation with an error instead of being validated.
is_ipv4() {
    local ip="$1"
    [[ "$ip" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]] || return 1
    local -a octets
    local octet
    IFS='.' read -r -a octets <<<"$ip"
    for octet in "${octets[@]}"; do
        (( 10#$octet >= 0 && 10#$octet <= 255 )) || return 1
    done
    return 0
}
# Print the current A-record IP for host $1 in $ZONE_DB (first match only);
# prints nothing when no matching record exists. Matches only lines of the
# exact shape "<name> IN A <dotted-ip>".
get_current_ip() {
    local record_name="$1"
    sed -n -E "s/^${record_name}[[:space:]]+IN[[:space:]]+A[[:space:]]+([0-9.]+)[[:space:]]*$/\1/p" "$ZONE_DB" | head -n1
}
# Insert or update the A record for name $1 with IP $2 in $ZONE_DB.
# Always writes a timestamped backup of the zone file first. Returns 0 when
# the zone file changed (caller must reload bind), 1 when the record was
# already up to date.
upsert_record() {
    local name="$1"
    local new_ip="$2"
    local ts
    ts="$(date +%Y%m%d-%H%M%S)"
    local changed=0
    cp -a "$ZONE_DB" "$BACKUP_DIR/db.argus.com.$ts.bak"
    chown "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR/db.argus.com.$ts.bak" 2>/dev/null || true
    local cur_ip
    cur_ip="$(get_current_ip "$name" || true)"
    if [[ -z "$cur_ip" ]]; then
        # Ensure the file ends with a newline before adding new record
        if [[ -s "$ZONE_DB" ]] && [[ $(tail -c1 "$ZONE_DB" | wc -l) -eq 0 ]]; then
            echo "" >> "$ZONE_DB"
        fi
        printf "%-20s IN A %s\n" "$name" "$new_ip" >> "$ZONE_DB"
        echo "[ADD] ${name} -> ${new_ip}"
        changed=1
    elif [[ "$cur_ip" != "$new_ip" ]]; then
        # Rewrite only the matching "<name> IN A <ip>" line; keep the rest.
        awk -v n="$name" -v ip="$new_ip" '
        {
            if ($1==n && $2=="IN" && $3=="A") {
                printf "%-20s IN A %s\n", n, ip
            } else {
                print
            }
        }
        ' "$ZONE_DB" > "${ZONE_DB}.tmp" && mv "${ZONE_DB}.tmp" "$ZONE_DB"
        echo "[UPDATE] ${name}: ${cur_ip} -> ${new_ip}"
        changed=1
    else
        echo "[SKIP] ${name} unchanged (${new_ip})"
    fi
    if [[ $changed -eq 1 ]]; then
        return 0
    fi
    return 1
}
# Main loop: every $SLEEP_SECONDS, scan the watch dir under a non-blocking
# flock so overlapping runs cannot corrupt the zone file, and reload bind
# only when at least one record actually changed.
while true; do
    exec 9>"$LOCKFILE"
    if flock -n 9; then
        shopt -s nullglob
        NEED_RELOAD=0
        for f in "$WATCH_DIR"/*.argus.com; do
            base="$(basename "$f")"
            name="${base%.argus.com}"
            # Take the LAST IPv4-looking token in the file.
            ip="$(grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' "$f" | tail -n1 || true)"
            if [[ -z "$ip" ]] || ! is_ipv4 "$ip"; then
                echo "[WARN] $f 未找到有效 IPv4跳过"
                continue
            fi
            if upsert_record "$name" "$ip"; then
                NEED_RELOAD=1
            fi
        done
        if [[ $NEED_RELOAD -eq 1 ]]; then
            echo "[INFO] 检测到 db.argus.com 变更,执行 reload-bind9.sh"
            bash "$RELOAD_SCRIPT"
        fi
        flock -u 9
    else
        echo "[INFO] 已有同步任务在运行,跳过本轮"
    fi
    sleep "$SLEEP_SECONDS"
done

View File

@ -0,0 +1,16 @@
$TTL 604800
@ IN SOA ns1.argus.com. admin.argus.com. (
2 ; Serial
604800 ; Refresh
86400 ; Retry
2419200 ; Expire
604800 ) ; Negative Cache TTL
; 定义 DNS 服务器
@ IN NS ns1.argus.com.
; 定义 ns1 主机
ns1 IN A 127.0.0.1
; 定义 web 指向 12.4.5.6
web IN A 12.4.5.6

View File

@ -0,0 +1,68 @@
#!/bin/bash
#
# DNS monitor: every 10 seconds compare dns.conf against the last-seen
# backup and run update-dns.sh whenever the file first appears or changes.
#
# Fix vs. the original: the "first run" and "file changed" branches
# duplicated the run-the-script-and-log logic and tested `$?` in a separate
# statement; both paths now go through run_update_script, with identical
# log messages, message order, and backup semantics.

DNS_CONF="/private/argus/etc/dns.conf"
DNS_BACKUP="/tmp/dns.conf.backup"
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
LOG_FILE="/var/log/supervisor/dns-monitor.log"

# Make sure the log file exists before we append to it.
touch "$LOG_FILE"

# Append a timestamped line to the monitor log.
log_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
}

# Run update-dns.sh with its output appended to the log.
# Returns 0 on success, 1 when the script failed or is missing/not executable.
run_update_script() {
    if [ ! -x "$UPDATE_SCRIPT" ]; then
        log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
        return 1
    fi
    log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
    if "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1; then
        log_message "DNS更新脚本执行成功"
        return 0
    fi
    log_message "DNS更新脚本执行失败"
    return 1
}

log_message "DNS监控脚本启动"

while true; do
    if [ -f "$DNS_CONF" ]; then
        if [ -f "$DNS_BACKUP" ]; then
            # A backup exists: only act when the content actually changed.
            if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
                log_message "检测到DNS配置变化"
                # Refresh the backup first, then apply the change.
                cp "$DNS_CONF" "$DNS_BACKUP"
                run_update_script || true
            fi
        else
            # First time we see dns.conf: apply it, and only create the
            # backup on success so a failed update is retried next round.
            if run_update_script; then
                cp "$DNS_CONF" "$DNS_BACKUP"
                log_message "创建DNS配置备份文件"
            fi
        fi
    else
        log_message "警告: DNS配置文件不存在: $DNS_CONF"
    fi
    sleep 10
done

View File

@ -0,0 +1,4 @@
zone "argus.com" {
type master;
file "/etc/bind/db.argus.com";
};

View File

@ -0,0 +1,27 @@
#!/bin/bash
#
# Validate the BIND9 configuration and zone file, then restart the bind9
# program via supervisor. Exits non-zero when validation or restart fails.
# Fix vs. the original: the restart result is tested directly with `if`
# instead of a separate `[ $? -eq 0 ]` check.

echo "Reloading BIND9 configuration..."

# Check if configuration files are valid
echo "Checking named.conf.local syntax..."
if ! named-checkconf /etc/bind/named.conf.local; then
    echo "ERROR: named.conf.local has syntax errors!"
    exit 1
fi

echo "Checking zone file syntax..."
if ! named-checkzone argus.com /etc/bind/db.argus.com; then
    echo "ERROR: db.argus.com has syntax errors!"
    exit 1
fi

# Reload BIND9 via supervisor
echo "Reloading BIND9 service..."
if supervisorctl restart bind9; then
    echo "BIND9 reloaded successfully!"
else
    echo "ERROR: Failed to reload BIND9!"
    exit 1
fi

42
src/bind/build/startup.sh Normal file
View File

@ -0,0 +1,42 @@
#!/bin/bash
#
# Container startup for argus-bind9: seed persistent config under /private,
# publish update-dns.sh and this container's IP for the other modules, then
# hand control to supervisord.
#
# Fix vs. the original: the container IP was captured with backticks and
# `ifconfig | grep -A 1 eth0 | grep inet`, which can match an `inet6` line
# or an unrelated interface; the pipeline is now scoped to eth0 and IPv4.

# Set /private permissions to 777 as requested
chmod 777 /private 2>/dev/null || true

# Create persistent directories for BIND9 configs and DNS sync
mkdir -p /private/argus/bind
mkdir -p /private/argus/etc
chown bind:bind /private/argus 2>/dev/null || true
chown -R bind:bind /private/argus/bind /private/argus/etc

# Copy configuration files to persistent storage if they don't exist
if [ ! -f /private/argus/bind/named.conf.local ]; then
    cp /etc/bind/named.conf.local /private/argus/bind/named.conf.local
fi
if [ ! -f /private/argus/bind/db.argus.com ]; then
    cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com
fi

# Publish update-dns.sh so other containers can adopt this DNS server
cp /usr/local/bin/update-dns.sh /private/argus/etc/update-dns.sh
chown bind:bind /private/argus/etc/update-dns.sh
chmod a+x /private/argus/etc/update-dns.sh

# Create symlinks to use persistent configs
ln -sf /private/argus/bind/named.conf.local /etc/bind/named.conf.local
ln -sf /private/argus/bind/db.argus.com /etc/bind/db.argus.com

# Set proper ownership
chown bind:bind /private/argus/bind/named.conf.local /private/argus/bind/db.argus.com

# Record this container's IPv4 on eth0 into dns.conf for other services
IP=$(ifconfig eth0 | awk '/inet / {print $2; exit}')
echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/dns.conf

# Create supervisor log directory
mkdir -p /var/log/supervisor

# Start supervisor (exec keeps it as PID 1)
exec /usr/bin/supervisord -c /etc/supervisor/conf.d/supervisord.conf

View File

@ -0,0 +1,37 @@
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700
[supervisord]
nodaemon=true
user=root
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
[supervisorctl]
serverurl=unix:///var/run/supervisor.sock
[program:bind9]
command=/usr/sbin/named -g -c /etc/bind/named.conf -u bind
user=bind
autostart=true
autorestart=true
stderr_logfile=/var/log/supervisor/bind9.err.log
stdout_logfile=/var/log/supervisor/bind9.out.log
priority=10
[program:argus-dns-sync]
command=/usr/local/bin/argus_dns_sync.sh
autostart=true
autorestart=true
startsecs=3
stopsignal=TERM
user=root
stdout_logfile=/var/log/argus_dns_sync.out.log
stderr_logfile=/var/log/argus_dns_sync.err.log
; 根据环境调整环境变量(可选)
; environment=RNDC_RELOAD="yes"

31
src/bind/build/update-dns.sh Executable file
View File

@ -0,0 +1,31 @@
#!/bin/sh
# update-dns.sh
# Read nameserver IPs from /private/argus/etc/dns.conf (one per line) and
# rewrite /etc/resolv.conf accordingly. Delivered to consumer containers
# through the shared /private/argus/etc directory.

DNS_CONF="/private/argus/etc/dns.conf"
RESOLV_CONF="/etc/resolv.conf"

# Bail out when the config file has not been generated yet.
if [ ! -f "$DNS_CONF" ]; then
    echo "配置文件不存在: $DNS_CONF" >&2
    exit 1
fi

# Build the new resolv.conf: one "nameserver" line per non-comment entry.
{
    while IFS= read -r ip; do
        # Skip blank lines and comments.
        case "$ip" in
            \#*) continue ;;
            "") continue ;;
        esac
        echo "nameserver $ip"
    done < "$DNS_CONF"
} > "$RESOLV_CONF".tmp

# Overwrite /etc/resolv.conf in place: `cat` rather than `mv` on purpose,
# since /etc/resolv.conf is usually a bind mount that cannot be replaced.
cat "$RESOLV_CONF".tmp > "$RESOLV_CONF"
rm -f "$RESOLV_CONF".tmp

echo "已更新 $RESOLV_CONF"

View File

@ -0,0 +1,16 @@
services:
bind9:
image: argus-bind9:latest
container_name: argus-bind9-test
ports:
- "${HOST_DNS_PORT:-1053}:53/tcp"
- "${HOST_DNS_PORT:-1053}:53/udp"
volumes:
- ./private:/private
restart: unless-stopped
networks:
- bind-test-network
networks:
bind-test-network:
driver: bridge

View File

@ -0,0 +1,118 @@
#!/bin/bash

# End-to-end test for BIND9 DNS server
# This script runs all tests in sequence to validate the complete functionality
# Usage: ./00_e2e_test.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Host port the container's DNS port 53 is published on (53 is often taken).
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
export HOST_DNS_PORT

echo "=========================================="
echo "BIND9 DNS Server End-to-End Test Suite"
echo "=========================================="

# Track test results
total_tests=0
passed_tests=0
failed_tests=0
# Run one test step script and update the pass/fail counters.
# Globals:   SCRIPT_DIR (read); total_tests/passed_tests/failed_tests (written)
# Arguments: $1 step name, $2 script filename under SCRIPT_DIR, $3 description
# Returns:   0 when the step passed, 1 when the script is missing or failed
run_test_step() {
    local step_name="$1"
    local script_name="$2"
    local description="$3"

    echo ""
    echo "[$step_name] $description"
    echo "$(printf '=%.0s' {1..50})"

    # Use var=$((var + 1)) instead of ((var++)): the latter returns status 1
    # when the variable was 0, which trips `set -e` callers.
    total_tests=$((total_tests + 1))

    if [ ! -f "$SCRIPT_DIR/$script_name" ]; then
        echo "✗ Test script not found: $script_name"
        failed_tests=$((failed_tests + 1))
        return 1
    fi

    # Make sure script is executable
    chmod +x "$SCRIPT_DIR/$script_name"

    # Run the test
    echo "Executing: $SCRIPT_DIR/$script_name"
    if "$SCRIPT_DIR/$script_name"; then
        echo "$step_name completed successfully"
        passed_tests=$((passed_tests + 1))
        return 0
    else
        echo "$step_name failed"
        failed_tests=$((failed_tests + 1))
        return 1
    fi
}
# Cleanup any previous test environment (but preserve the Docker image)
echo ""
echo "[SETUP] Cleaning up any previous test environment..."
if [ -f "$SCRIPT_DIR/05_cleanup.sh" ]; then
    chmod +x "$SCRIPT_DIR/05_cleanup.sh"
    "$SCRIPT_DIR/05_cleanup.sh" || true
fi

echo ""
echo "Starting BIND9 DNS server end-to-end test sequence..."

# Test sequence — each step is best-effort (|| true) so the summary below
# always runs and reports the aggregate result.
run_test_step "TEST-01" "01_start_container.sh" "Start BIND9 container" || true
run_test_step "TEST-02" "02_dig_test.sh" "Initial DNS resolution test" || true
run_test_step "TEST-03" "03_reload_test.sh" "Configuration reload with IP modification" || true
run_test_step "TEST-03.5" "03.5_dns_sync_test.sh" "DNS auto-sync functionality test" || true
run_test_step "TEST-04" "04_persistence_test.sh" "Configuration persistence after restart" || true

# Final cleanup (but preserve logs for review)
echo ""
echo "[CLEANUP] Cleaning up test environment..."
run_test_step "CLEANUP" "05_cleanup.sh" "Clean up containers and networks" || true

# Test summary
echo ""
echo "=========================================="
echo "TEST SUMMARY"
echo "=========================================="
echo "Total tests: $total_tests"
echo "Passed: $passed_tests"
echo "Failed: $failed_tests"

if [ $failed_tests -eq 0 ]; then
    echo ""
    echo "✅ ALL TESTS PASSED!"
    echo ""
    echo "BIND9 DNS server functionality validated:"
    echo "  ✓ Container startup and basic functionality"
    echo "  ✓ DNS resolution for configured domains"
    echo "  ✓ Configuration modification and reload"
    echo "  ✓ DNS auto-sync from IP files"
    echo "  ✓ Configuration persistence across restarts"
    echo "  ✓ Cleanup and resource management"
    echo ""
    echo "The BIND9 DNS server is ready for production use."
    exit 0
else
    echo ""
    echo "❌ SOME TESTS FAILED!"
    echo ""
    echo "Please review the test output above to identify and fix issues."
    echo "You may need to:"
    echo "  - Check Docker installation and permissions"
    echo "  - Verify network connectivity"
    echo "  - Review BIND9 configuration files"
    echo "  - Check system resources and port availability"
    exit 1
fi

View File

@ -0,0 +1,42 @@
#!/bin/bash

# Start BIND9 test container
# Usage: ./01_start_container.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
# Host port that docker-compose maps to the container's DNS port 53.
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
export HOST_DNS_PORT

cd "$TEST_DIR"

echo "Starting BIND9 test container..."

# Ensure private directory exists with proper permissions
mkdir -p private/argus/bind
mkdir -p private/argus/etc
chmod 777 private

# Start the container
docker compose up -d

echo "Waiting for container to be ready..."
sleep 5

# Check if container is running
if docker compose ps | grep -q "Up"; then
    echo "✓ Container started successfully"
    echo "Container status:"
    docker compose ps
else
    echo "✗ Failed to start container"
    docker compose logs
    exit 1
fi

echo ""
echo "BIND9 test environment is ready!"
echo "DNS server listening on localhost:${HOST_DNS_PORT}"

View File

@ -0,0 +1,75 @@
#!/bin/bash

# Test DNS resolution using dig
# Usage: ./02_dig_test.sh
#
# Fix vs. the original: the failure counters used ((failed_tests++)), which
# returns status 1 when the counter is 0 and — because of `set -e` — killed
# the script on the FIRST failed query, skipping the remaining test and the
# summary. They now use failed_tests=$((failed_tests + 1)).

set -e

HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

echo "Testing DNS resolution with dig..."
echo "Using DNS server localhost:${HOST_DNS_PORT}"

# Query $1.argus.com against the local server and compare to expected IP $2.
# Returns 0 on match, 1 on query failure or mismatch.
test_dns_query() {
    local hostname="$1"
    local expected_ip="$2"
    local description="$3"

    echo ""
    echo "Testing: $description"
    echo "Query: $hostname.argus.com"
    echo "Expected IP: $expected_ip"

    # Perform dig query
    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "QUERY_FAILED" ]; then
        echo "✗ DNS query failed"
        return 1
    elif [ "$result" = "$expected_ip" ]; then
        echo "✓ DNS query successful: $result"
        return 0
    else
        echo "✗ DNS query returned unexpected result: $result"
        return 1
    fi
}

# Check if dig is available
if ! command -v dig &> /dev/null; then
    echo "Installing dig (dnsutils)..."
    apt-get update && apt-get install -y dnsutils
fi

# Check if container is running
if ! docker compose ps | grep -q "Up"; then
    echo "Error: BIND9 container is not running"
    echo "Please start the container first with: ./01_start_container.sh"
    exit 1
fi

echo "=== DNS Resolution Tests ==="

# Test cases based on current configuration
failed_tests=0

# Test ns1.argus.com -> 127.0.0.1
if ! test_dns_query "ns1" "127.0.0.1" "Name server resolution"; then
    failed_tests=$((failed_tests + 1))
fi

# Test web.argus.com -> 12.4.5.6
if ! test_dns_query "web" "12.4.5.6" "Web server resolution"; then
    failed_tests=$((failed_tests + 1))
fi

echo ""
echo "=== Test Summary ==="
if [ $failed_tests -eq 0 ]; then
    echo "✓ All DNS tests passed!"
    exit 0
else
    echo "$failed_tests test(s) failed"
    exit 1
fi

View File

@ -0,0 +1,259 @@
#!/bin/bash
# Test DNS auto-sync functionality using argus_dns_sync.sh
# This test validates the automatic DNS record updates from IP files
# Usage: ./03.5_dns_sync_test.sh
set -e
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
echo "=== DNS Auto-Sync Functionality Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"
# Check if container is running
if ! docker compose ps | grep -q "Up"; then
echo "Error: BIND9 container is not running"
echo "Please start the container first with: ./01_start_container.sh"
exit 1
fi
# Check if dig is available
if ! command -v dig &> /dev/null; then
echo "Installing dig (dnsutils)..."
apt-get update && apt-get install -y dnsutils
fi
# Function to test DNS query
# Resolve <hostname>.argus.com through the local BIND port and verify the
# A record equals expected_ip. Pauses briefly first so freshly synced
# records have time to propagate. Returns 0 on match, 1 otherwise.
test_dns_query() {
    local hostname="$1" expected_ip="$2" description="$3"

    echo "Testing: $description"
    echo "Query: $hostname.argus.com -> Expected: $expected_ip"

    # Wait a moment for DNS cache
    sleep 2

    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" != "$expected_ip" ]; then
        echo "✗ Got: $result, Expected: $expected_ip"
        return 1
    fi
    echo "$result"
    return 0
}
# Function to wait for sync to complete
# Poll for the argus_dns_sync lock file inside the bind9 container every
# 2s, up to a fixed budget. When the lock is gone the sync cycle is done
# (plus a short settle delay for DNS propagation). Always returns 0;
# hitting the budget only emits a warning.
wait_for_sync() {
    local max_wait=15
    local waited=0
    echo "Waiting for DNS sync to complete (max ${max_wait}s)..."
    while [ "$waited" -lt "$max_wait" ]; do
        if ! docker compose exec bind9 test -f /var/lock/argus_dns_sync.lock; then
            echo "Sync completed"
            sleep 2  # Extra wait for DNS propagation
            return 0
        fi
        echo "Sync process is running..."
        sleep 2
        waited=$((waited + 2))
    done
    echo "Warning: Sync may still be running after ${max_wait}s"
    return 0
}
echo ""
echo "Step 1: Preparing test environment..."
# Ensure required directories exist
docker compose exec bind9 mkdir -p /private/argus/etc
docker compose exec bind9 mkdir -p /private/argus/bind/.backup
# Backup original configuration if it exists
docker compose exec bind9 test -f /private/argus/bind/db.argus.com && \
docker compose exec bind9 cp /private/argus/bind/db.argus.com /private/argus/bind/db.argus.com.backup.test || true
# Ensure initial configuration is available (may already be symlinked)
docker compose exec bind9 test -f /private/argus/bind/db.argus.com || \
docker compose exec bind9 cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com
echo "✓ Test environment prepared"
echo ""
echo "Step 2: Testing initial DNS configuration..."
# Get current IP for web.argus.com (may have been changed by previous tests)
current_web_ip=$(dig @localhost -p "$HOST_DNS_PORT" web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
echo "Current web.argus.com IP: $current_web_ip"
# Test that DNS is working (regardless of specific IP)
if [ "$current_web_ip" = "UNKNOWN" ] || [ -z "$current_web_ip" ]; then
echo "DNS resolution not working for web.argus.com"
exit 1
fi
echo "✓ DNS resolution is working"
echo ""
echo "Step 3: Creating IP files for auto-sync..."
# Create test IP files in the watch directory
echo "Creating test1.argus.com with IP 10.0.0.100"
docker compose exec bind9 bash -c 'echo "10.0.0.100" > /private/argus/etc/test1.argus.com'
echo "Creating test2.argus.com with IP 10.0.0.200"
docker compose exec bind9 bash -c 'echo "test2 service running on 10.0.0.200" > /private/argus/etc/test2.argus.com'
echo "Creating api.argus.com with IP 192.168.1.50"
docker compose exec bind9 bash -c 'echo "API server: 192.168.1.50 port 8080" > /private/argus/etc/api.argus.com'
echo "✓ IP files created"
echo ""
echo "Step 4: Checking DNS sync process..."
# Check if DNS sync process is already running (via supervisord)
if docker compose exec bind9 pgrep -f argus_dns_sync.sh > /dev/null; then
echo "✓ DNS sync process already running (via supervisord)"
else
echo "Starting DNS sync process manually..."
# Start the DNS sync process in background if not running
docker compose exec -d bind9 /usr/local/bin/argus_dns_sync.sh
echo "✓ DNS sync process started manually"
fi
# Wait for first sync cycle
wait_for_sync
echo ""
echo "Step 5: Testing auto-synced DNS records..."
failed_tests=0
# Test new DNS records created by auto-sync
if ! test_dns_query "test1" "10.0.0.100" "Auto-synced test1.argus.com"; then
((failed_tests++))
fi
if ! test_dns_query "test2" "10.0.0.200" "Auto-synced test2.argus.com"; then
((failed_tests++))
fi
if ! test_dns_query "api" "192.168.1.50" "Auto-synced api.argus.com"; then
((failed_tests++))
fi
# Verify original records still work (use current IP from earlier)
if ! test_dns_query "web" "$current_web_ip" "Original web.argus.com still working"; then
((failed_tests++))
fi
if ! test_dns_query "ns1" "127.0.0.1" "Original ns1.argus.com still working"; then
((failed_tests++))
fi
echo ""
echo "Step 6: Testing IP update functionality..."
# Update an existing IP file
echo "Updating test1.argus.com IP from 10.0.0.100 to 10.0.0.150"
docker compose exec bind9 bash -c 'echo "10.0.0.150" > /private/argus/etc/test1.argus.com'
# Wait for sync
wait_for_sync
# Test updated record
if ! test_dns_query "test1" "10.0.0.150" "Updated test1.argus.com IP"; then
((failed_tests++))
fi
echo ""
echo "Step 7: Testing invalid IP handling..."
# Create file with invalid IP
echo "Creating invalid.argus.com with invalid IP"
docker compose exec bind9 bash -c 'echo "this is not an IP address" > /private/argus/etc/invalid.argus.com'
# Wait for sync (should skip invalid IP)
wait_for_sync
# Verify invalid record was not added (should fail to resolve)
result=$(dig @localhost -p "$HOST_DNS_PORT" invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
if [ "$result" = "NO_RESULT" ] || [ -z "$result" ]; then
echo "✓ Invalid IP correctly ignored"
else
echo "✗ Invalid IP was processed: $result"
((failed_tests++))
fi
echo ""
echo "Step 8: Verifying backup functionality..."
# Check if backups were created
backup_count=$(docker compose exec bind9 ls -1 /private/argus/bind/.backup/ | wc -l || echo "0")
if [ "$backup_count" -gt 0 ]; then
echo "✓ Configuration backups created ($backup_count files)"
# Show latest backup
docker compose exec bind9 ls -la /private/argus/bind/.backup/ | tail -1
else
echo "✗ No backup files found"
((failed_tests++))
fi
echo ""
echo "Step 9: Cleanup..."
# Note: We don't stop the DNS sync process since it's managed by supervisord
echo "Note: DNS sync process will continue running (managed by supervisord)"
# Clean up test files
docker compose exec bind9 rm -f /private/argus/etc/test1.argus.com
docker compose exec bind9 rm -f /private/argus/etc/test2.argus.com
docker compose exec bind9 rm -f /private/argus/etc/api.argus.com
docker compose exec bind9 rm -f /private/argus/etc/invalid.argus.com
# Restore original configuration if backup exists
docker compose exec bind9 test -f /private/argus/bind/db.argus.com.backup.test && \
docker compose exec bind9 cp /private/argus/bind/db.argus.com.backup.test /private/argus/bind/db.argus.com && \
docker compose exec bind9 rm /private/argus/bind/db.argus.com.backup.test || true
# Reload original configuration
docker compose exec bind9 /usr/local/bin/reload-bind9.sh
echo "✓ Cleanup completed"
echo ""
echo "=== DNS Auto-Sync Test Summary ==="
if [ $failed_tests -eq 0 ]; then
echo "✅ All DNS auto-sync tests passed!"
echo ""
echo "Validated functionality:"
echo " ✓ Automatic DNS record creation from IP files"
echo " ✓ IP address extraction from various file formats"
echo " ✓ Dynamic DNS record updates"
echo " ✓ Invalid IP address handling"
echo " ✓ Configuration backup mechanism"
echo " ✓ Preservation of existing DNS records"
echo ""
echo "The DNS auto-sync functionality is working correctly!"
exit 0
else
echo "$failed_tests DNS auto-sync test(s) failed!"
echo ""
echo "Please check:"
echo " - argus_dns_sync.sh script configuration"
echo " - File permissions in /private/argus/etc/"
echo " - BIND9 reload functionality"
echo " - Network connectivity and DNS resolution"
exit 1
fi

View File

@ -0,0 +1,115 @@
#!/bin/bash
# Test DNS configuration reload with IP modification
# Usage: ./03_reload_test.sh
set -e
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
echo "=== DNS Configuration Reload Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"
# Check if container is running
if ! docker compose ps | grep -q "Up"; then
echo "Error: BIND9 container is not running"
echo "Please start the container first with: ./01_start_container.sh"
exit 1
fi
# Check if dig is available
if ! command -v dig &> /dev/null; then
echo "Installing dig (dnsutils)..."
apt-get update && apt-get install -y dnsutils
fi
# Function to test DNS query
# Resolve <hostname>.argus.com via the local BIND port and check the
# returned A record against expected_ip. Returns 0 on match, 1 otherwise.
test_dns_query() {
    local query_host="$1"
    local expected_ip="$2"
    local description="$3"

    echo "Testing: $description"
    echo "Query: $query_host.argus.com -> Expected: $expected_ip"

    # A dig failure yields the sentinel QUERY_FAILED, which can never
    # equal a valid IP, so it lands in the mismatch branch below.
    result=$(dig @localhost -p "$HOST_DNS_PORT" "$query_host".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" != "$expected_ip" ]; then
        echo "✗ Got: $result, Expected: $expected_ip"
        return 1
    fi
    echo "$result"
    return 0
}
echo ""
echo "Step 1: Testing initial DNS configuration..."
# Test initial configuration
if ! test_dns_query "web" "12.4.5.6" "Initial web.argus.com resolution"; then
echo "Initial DNS test failed"
exit 1
fi
echo ""
echo "Step 2: Modifying DNS configuration..."
# Backup original configuration
cp "$TEST_DIR/private/argus/bind/db.argus.com" "$TEST_DIR/private/argus/bind/db.argus.com.backup" 2>/dev/null || true
# Create new configuration with modified IP
DB_FILE="$TEST_DIR/private/argus/bind/db.argus.com"
# Check if persistent config exists, if not use from container
if [ ! -f "$DB_FILE" ]; then
echo "Persistent config not found, copying from container..."
docker compose exec bind9 cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com
docker compose exec bind9 chown bind:bind /private/argus/bind/db.argus.com
fi
# Modify the IP address (12.4.5.6 -> 192.168.1.100)
sed -i 's/12\.4\.5\.6/192.168.1.100/g' "$DB_FILE"
# Increment serial number for DNS cache invalidation
current_serial=$(grep -o "2[[:space:]]*;" "$DB_FILE" | grep -o "2")
new_serial=$((current_serial + 1))
sed -i "s/2[[:space:]]*;/${new_serial} ;/" "$DB_FILE"
echo "Modified configuration:"
echo "- Changed web.argus.com IP: 12.4.5.6 -> 192.168.1.100"
echo "- Updated serial number: $current_serial -> $new_serial"
echo ""
echo "Step 3: Reloading BIND9 configuration..."
# Reload BIND9 configuration
docker compose exec bind9 /usr/local/bin/reload-bind9.sh
echo "Configuration reloaded"
# Wait a moment for changes to take effect
sleep 3
echo ""
echo "Step 4: Testing modified DNS configuration..."
# Test modified configuration
if ! test_dns_query "web" "192.168.1.100" "Modified web.argus.com resolution"; then
echo "Modified DNS test failed"
exit 1
fi
# Also verify ns1 still works
if ! test_dns_query "ns1" "127.0.0.1" "ns1.argus.com still working"; then
echo "ns1 DNS test failed after reload"
exit 1
fi
echo ""
echo "✓ DNS configuration reload test completed successfully!"
echo "✓ IP address changed from 12.4.5.6 to 192.168.1.100"
echo "✓ Configuration persisted and reloaded correctly"

View File

@ -0,0 +1,118 @@
#!/bin/bash
# Test configuration persistence after container restart
# Usage: ./04_persistence_test.sh
set -e
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
echo "=== Configuration Persistence Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"
# Check if dig is available
if ! command -v dig &> /dev/null; then
echo "Installing dig (dnsutils)..."
apt-get update && apt-get install -y dnsutils
fi
# Function to test DNS query
# Ask the local BIND server for <hostname>.argus.com and compare the
# answer against the expected A record. Returns 0 on match, 1 otherwise.
test_dns_query() {
    local host_label="$1"
    local want_ip="$2"
    local description="$3"

    echo "Testing: $description"
    echo "Query: $host_label.argus.com -> Expected: $want_ip"

    result=$(dig @localhost -p "$HOST_DNS_PORT" "$host_label".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    # Quoted pattern => literal comparison (covers QUERY_FAILED too).
    case "$result" in
        "$want_ip")
            echo "$result"
            return 0
            ;;
        *)
            echo "✗ Got: $result, Expected: $want_ip"
            return 1
            ;;
    esac
}
echo ""
echo "Step 1: Stopping current container..."
# Stop the container
docker compose down
echo "Container stopped"
echo ""
echo "Step 2: Verifying persistent configuration exists..."
# Check if modified configuration exists
DB_FILE="$TEST_DIR/private/argus/bind/db.argus.com"
if [ ! -f "$DB_FILE" ]; then
echo "✗ Persistent configuration file not found: $DB_FILE"
exit 1
fi
# Check if the modified IP is in the configuration
if grep -q "192.168.1.100" "$DB_FILE"; then
echo "✓ Modified IP (192.168.1.100) found in persistent configuration"
else
echo "✗ Modified IP not found in persistent configuration"
echo "Configuration content:"
cat "$DB_FILE"
exit 1
fi
echo ""
echo "Step 3: Restarting container with persistent configuration..."
# Start the container again
docker compose up -d
echo "Waiting for container to be ready..."
sleep 5
# Check if container is running
if ! docker compose ps | grep -q "Up"; then
echo "✗ Failed to restart container"
docker compose logs
exit 1
fi
echo "✓ Container restarted successfully"
echo ""
echo "Step 4: Testing DNS resolution after restart..."
# Wait a bit more for DNS to be fully ready
sleep 5
# Test that the modified configuration is still active
if ! test_dns_query "web" "192.168.1.100" "Persistent web.argus.com resolution"; then
echo "✗ Persistent configuration test failed"
exit 1
fi
# Also verify ns1 still works
if ! test_dns_query "ns1" "127.0.0.1" "ns1.argus.com still working"; then
echo "✗ ns1 DNS test failed after restart"
exit 1
fi
echo ""
echo "Step 5: Verifying configuration files are linked correctly..."
# Check that the persistent files are properly linked
echo "Checking file links in container:"
docker compose exec bind9 ls -la /etc/bind/named.conf.local /etc/bind/db.argus.com
echo ""
echo "✓ Configuration persistence test completed successfully!"
echo "✓ Modified IP (192.168.1.100) persisted after container restart"
echo "✓ Configuration files properly linked to persistent storage"
echo "✓ DNS resolution working correctly with persisted configuration"

View File

@ -0,0 +1,90 @@
#!/bin/bash
# Clean up the BIND9 test environment.
#
# Usage: ./05_cleanup.sh [--full]
#   Default: stop and remove containers/networks, but keep the persistent
#   data directory and the built Docker image so tests can be rerun.
#   --full:  additionally delete the persistent data directory.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"

HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
export HOST_DNS_PORT

# Parse command line arguments.
# BUG FIX: the default was 'true', which made --full a no-op, always
# deleted persistent data, and dead-coded the "preserved" branches below —
# contradicting the usage text. Persistent data is now only removed when
# --full is passed explicitly.
FULL_CLEANUP=false
while [[ $# -gt 0 ]]; do
    case $1 in
        --full)
            FULL_CLEANUP=true
            shift
            ;;
        *)
            echo "Unknown option: $1"
            echo "Usage: $0 [--full]"
            echo "  --full: Also remove persistent data "
            exit 1
            ;;
    esac
done

cd "$TEST_DIR"

echo "=== Cleaning up BIND9 test environment ==="
echo ""
echo "Step 1: Stopping and removing containers..."
# -v also removes anonymous volumes created by the compose project.
docker compose down -v
echo "✓ Containers stopped and removed"

echo ""
echo "Step 2: Removing Docker networks..."
# Best effort: prune may fail or be a no-op; errors are intentionally ignored.
docker network prune -f > /dev/null 2>&1 || true
echo "✓ Docker networks cleaned"

if [ "$FULL_CLEANUP" = true ]; then
    echo ""
    echo "Step 3: Removing persistent data..."
    # Remove the persistent data directory (bind zone files, etc.).
    if [ -d "private" ]; then
        rm -rf private
        echo "✓ Persistent data directory removed"
    else
        echo "✓ No persistent data directory found"
    fi
else
    echo ""
    echo "Step 3: Preserving persistent data and Docker image..."
    echo "✓ Persistent data preserved in: private/"
    echo "✓ Docker image 'argus-bind9:latest' preserved"
    echo ""
    echo "To perform full cleanup including persistent data and image, run:"
    echo "  $0 --full"
fi

echo ""
echo "=== Cleanup Summary ==="
echo "✓ Containers stopped and removed"
echo "✓ Docker networks cleaned"
if [ "$FULL_CLEANUP" = true ]; then
    echo "✓ Persistent data removed"
    echo ""
    echo "Full cleanup completed! Test environment completely removed."
else
    echo "✓ Persistent data preserved"
    echo "✓ Docker image preserved"
    echo ""
    echo "Basic cleanup completed! Run './01_start_container.sh' to restart testing."
fi
echo ""
echo "Test environment cleanup finished."

5
src/log/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
private/
images/

8
src/log/README.md Normal file
View File

@ -0,0 +1,8 @@
测试log模块开发
elasticsearch: 部署镜像构建及启动脚本解决账号问题、挂载目录、使用supervisor守护
kibana: 镜像构建
fluent-bit: 安装包,脚本准备, 交付给大鹏统一组织客户端侧安装流程
init: EK初始化脚本数据视图创建脚本等

View File

@ -0,0 +1,75 @@
FROM docker.elastic.co/elasticsearch/elasticsearch:8.13.4
# 切换到 root 用户进行系统级安装
USER root
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 调整 elasticsearch 用户与用户组 ID 以匹配宿主机配置
RUN set -eux; \
current_gid="$(getent group elasticsearch | awk -F: '{print $3}')"; \
if [ -z "$current_gid" ]; then \
groupadd -g "${ARGUS_BUILD_GID}" elasticsearch; \
elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
groupmod -g "${ARGUS_BUILD_GID}" elasticsearch; \
fi; \
if id elasticsearch >/dev/null 2>&1; then \
current_uid="$(id -u elasticsearch)"; \
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
usermod -u "${ARGUS_BUILD_UID}" elasticsearch; \
fi; \
else \
useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" elasticsearch; \
fi; \
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/elasticsearch
# 设置构建参数
ARG USE_INTRANET=false
# 配置内网 apt 源 (如果指定了内网选项)
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "Configuring intranet apt sources..." && \
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
fi
# 安装 supervisor, net-tools, vim
RUN apt-get update && \
apt-get install -y supervisor net-tools inetutils-ping vim && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# 配置部署时使用的apt源
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
fi
# 创建 supervisor 日志目录
RUN mkdir -p /var/log/supervisor
# 复制 supervisor 配置文件
COPY src/log/elasticsearch/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
# 复制启动脚本
COPY src/log/elasticsearch/build/start-es-supervised.sh /usr/local/bin/start-es-supervised.sh
RUN chmod +x /usr/local/bin/start-es-supervised.sh
# 复制DNS监控脚本
COPY src/log/elasticsearch/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
RUN chmod +x /usr/local/bin/dns-monitor.sh
# 保持 root 用户,由 supervisor 管理用户切换
USER root
# 暴露端口
EXPOSE 9200 9300
# 使用 supervisor 作为入口点
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View File

@ -0,0 +1 @@
../../../bind/build/dns-monitor.sh

View File

@ -0,0 +1,32 @@
#!/bin/bash
# Supervisor-managed launcher for Elasticsearch: links the data directory
# to persistent storage, publishes this container's IP for the BIND DNS
# sync job, then execs the stock Elasticsearch entrypoint.
set -euo pipefail

echo "[INFO] Starting Elasticsearch under supervisor..."

# Ensure the persistent data directory exists (idempotent).
mkdir -p /private/argus/log/elasticsearch

# Replace whatever sits at the default data path with a symlink into the
# persistent volume; handles both a stale link and a real directory.
if [ -L /usr/share/elasticsearch/data ]; then
    rm /usr/share/elasticsearch/data
elif [ -d /usr/share/elasticsearch/data ]; then
    rm -rf /usr/share/elasticsearch/data
fi
ln -sf /private/argus/log/elasticsearch /usr/share/elasticsearch/data

# Record this container's IP under /private/argus/etc/<domain> so the DNS
# sync job can publish it. (Modernized: $() instead of backticks, quoted
# expansions; output is unchanged.)
# NOTE(review): 'grep inet' would also match inet6 lines — assumes eth0's
# first address line is IPv4; confirm in the target network setup.
DOMAIN=es.log.argus.com
IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')
echo "current IP: ${IP}"
echo "${IP}" > "/private/argus/etc/${DOMAIN}"

echo "[INFO] Data directory linked: /usr/share/elasticsearch/data -> /private/argus/log/elasticsearch"

# Default JVM heap unless docker-compose overrides ES_JAVA_OPTS.
export ES_JAVA_OPTS="${ES_JAVA_OPTS:-"-Xms512m -Xmx512m"}"

echo "[INFO] Starting Elasticsearch process..."
# Hand off to the original Elasticsearch entrypoint (PID is replaced).
exec /usr/local/bin/docker-entrypoint.sh elasticsearch

View File

@ -0,0 +1,39 @@
[supervisord]
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root
[program:elasticsearch]
command=/usr/local/bin/start-es-supervised.sh
user=elasticsearch
stdout_logfile=/var/log/supervisor/elasticsearch.log
stderr_logfile=/var/log/supervisor/elasticsearch_error.log
autorestart=true
startretries=3
startsecs=30
stopwaitsecs=30
killasgroup=true
stopasgroup=true
[program:dns-monitor]
command=/usr/local/bin/dns-monitor.sh
user=root
stdout_logfile=/var/log/supervisor/dns-monitor.log
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
autorestart=true
startretries=3
startsecs=5
stopwaitsecs=10
killasgroup=true
stopasgroup=true
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700
[supervisorctl]
serverurl=unix:///var/run/supervisor.sock
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

View File

@ -0,0 +1,37 @@
[SERVICE]
Daemon Off
Parsers_File parsers.conf
HTTP_Server On
HTTP_Listen 0.0.0.0
HTTP_Port 2020
storage.path /buffers
storage.sync normal
storage.checksum on
storage.backlog.mem_limit 128M
# 备注:该镜像默认未开启 Hot Reload修改配置后请重启容器。
@INCLUDE inputs.d/*.conf
[FILTER]
Name parser
Match app.*
Key_Name log
Parser timestamp_parser
Reserve_Data On
Preserve_Key On
Unescape_Key On
[FILTER]
Name record_modifier
Match *
Record cluster ${CLUSTER}
Record rack ${RACK}
Record host ${HOSTNAME}
[FILTER]
Name lua
Match app.*
script inject_labels.lua
call add_labels
@INCLUDE outputs.d/*.conf

View File

@ -0,0 +1,15 @@
-- Fluent Bit Lua filter: enrich each record with job metadata taken from
-- FB_* environment variables (falling back to the record's own fields,
-- then to "unknown"/"na"), and derive a role from the log file path.
-- Returns 1 (record modified), the unchanged timestamp, and the record.
function add_labels(tag, ts, record)
  local function pick(env_key, existing, default)
    return os.getenv(env_key) or existing or default
  end

  record["job_id"] = pick("FB_JOB_ID", record["job_id"], "unknown")
  record["user"]   = pick("FB_USER", record["user"], "unknown")
  record["model"]  = pick("FB_MODEL", record["model"], "unknown")
  record["gpu_id"] = pick("FB_GPU_ID", record["gpu_id"], "na")

  local path = record["log_path"] or ""
  if path:find("/logs/infer/") then
    record["role"] = "infer"
  elseif path:find("/logs/train/") then
    record["role"] = "train"
  else
    record["role"] = record["role"] or "app"
  end

  return 1, ts, record
end

View File

@ -0,0 +1,10 @@
[INPUT]
Name tail
Path /logs/train/*.log
Tag app.train
Path_Key log_path
Refresh_Interval 5
DB /buffers/train.db
Skip_Long_Lines On
storage.type filesystem
multiline.parser python,go,java

View File

@ -0,0 +1,10 @@
[INPUT]
Name tail
Path /logs/infer/*.log
Tag app.infer
Path_Key log_path
Refresh_Interval 5
DB /buffers/infer.db
Skip_Long_Lines On
storage.type filesystem
multiline.parser python,go,java

View File

@ -0,0 +1,24 @@
# 重要:使用 Logstash_Format + Logstash_Prefix生成 train-*/infer-* 索引
[OUTPUT]
Name es
Match app.train
Host ${ES_HOST}
Port ${ES_PORT}
Logstash_Format On
Logstash_Prefix train
Replace_Dots On
Generate_ID On
Retry_Limit False
Suppress_Type_Name On
[OUTPUT]
Name es
Match app.infer
Host ${ES_HOST}
Port ${ES_PORT}
Logstash_Format On
Logstash_Prefix infer
Replace_Dots On
Generate_ID On
Retry_Limit False
Suppress_Type_Name On

View File

@ -0,0 +1,29 @@
[MULTILINE_PARSER]
Name python
Type regex
Flush 2
Rule "start_state" "/^\d{4}-\d{2}-\d{2}[\sT]/" "cont"
Rule "cont" "/^\s+|^Traceback|^\tat\s+/" "cont"
[MULTILINE_PARSER]
Name go
Type regex
Flush 2
Rule "start_state" "/^[0-9]{4}\/[0-9]{2}\/[0-9]{2}/" "cont"
Rule "cont" "/^\s+|^\t/" "cont"
[MULTILINE_PARSER]
Name java
Type regex
Flush 2
Rule "start_state" "/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/" "cont"
Rule "cont" "/^\s+at\s+|^\t.../" "cont"
[PARSER]
Name timestamp_parser
Format regex
Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
Time_Key timestamp
Time_Format %Y-%m-%d %H:%M:%S
Time_Offset +0800
Time_Keep On

View File

@ -0,0 +1,47 @@
#!/bin/bash
# Bootstrap script: installs Fluent Bit from a bundled .deb inside a plain
# Ubuntu container, stages its configuration, waits for Elasticsearch to
# become reachable, then runs Fluent Bit in the foreground.
# Requires: ES_HOST / ES_PORT env vars; /private mounted with etc/ and packages/.
set -euo pipefail
echo "[INFO] Starting Fluent Bit setup in Ubuntu container..."
# Install required tooling (curl is needed for the ES readiness probe).
echo "[INFO] Installing required packages..."
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl
# Stage the bundle (config + packages) from the mounted volume into /tmp.
echo "[INFO] Extracting fluent-bit bundle..."
cp -r /private/etc /tmp
cp -r /private/packages /tmp
cd /tmp
# Install Fluent Bit from the bundled deb; dpkg may leave unmet
# dependencies, which the follow-up 'apt-get install -f' resolves.
echo "[INFO] Installing Fluent Bit from deb package..."
dpkg -i /tmp/packages/fluent-bit_3.1.9_amd64.deb || true
apt-get install -f -y -qq  # resolve dependency issues
# Sanity check: verify the installed binary actually runs.
echo "[INFO] Fluent Bit version:"
/opt/fluent-bit/bin/fluent-bit --version
# Stage configuration into the directory Fluent Bit is launched with.
mkdir -p /etc/fluent-bit
cp -r /tmp/etc/* /etc/fluent-bit/
# Create log and buffer directories used by the tail inputs and
# filesystem storage backend.
mkdir -p /logs/train /logs/infer /buffers
chmod 755 /logs/train /logs/infer /buffers
# Block until Elasticsearch answers; the ES outputs would otherwise fail
# immediately at startup.
echo "[INFO] Waiting for Elasticsearch to be ready..."
while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do
  echo "  Waiting for ES at ${ES_HOST}:${ES_PORT}..."
  sleep 5
done
echo "[INFO] Elasticsearch is ready"
# Run Fluent Bit in the foreground so the container tracks its lifetime.
echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
exec /opt/fluent-bit/bin/fluent-bit \
  --config=/etc/fluent-bit/fluent-bit.conf

View File

@ -0,0 +1,79 @@
FROM docker.elastic.co/kibana/kibana:8.13.4
# 切换到 root 用户进行系统级安装
USER root
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 调整 kibana 用户与用户组 ID 以匹配宿主机配置
RUN set -eux; \
current_gid="$(getent group kibana | awk -F: '{print $3}')"; \
if [ -z "$current_gid" ]; then \
groupadd -g "${ARGUS_BUILD_GID}" kibana; \
elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
groupmod -g "${ARGUS_BUILD_GID}" kibana; \
fi; \
if id kibana >/dev/null 2>&1; then \
current_uid="$(id -u kibana)"; \
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
usermod -u "${ARGUS_BUILD_UID}" kibana; \
fi; \
else \
useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" kibana; \
fi; \
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/kibana
# 设置构建参数
ARG USE_INTRANET=false
# 配置内网 apt 源 (如果指定了内网选项)
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "Configuring intranet apt sources..." && \
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
fi
# 安装 supervisor, net-tools, vim
RUN apt-get update && \
apt-get install -y supervisor net-tools inetutils-ping vim && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# 配置部署时使用的apt源
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
fi
# 创建 supervisor 日志目录
RUN mkdir -p /var/log/supervisor
# 复制 supervisor 配置文件
COPY src/log/kibana/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
# 复制启动脚本
COPY src/log/kibana/build/start-kibana-supervised.sh /usr/local/bin/start-kibana-supervised.sh
COPY src/log/kibana/build/kibana-post-start.sh /usr/local/bin/kibana-post-start.sh
RUN chmod +x /usr/local/bin/start-kibana-supervised.sh /usr/local/bin/kibana-post-start.sh
# 复制DNS监控脚本
COPY src/log/kibana/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
RUN chmod +x /usr/local/bin/dns-monitor.sh
# kibana需要用到 /root/.config/puppeteer 路径
RUN chmod 777 /root
# 保持 root 用户,由 supervisor 管理用户切换
USER root
# 暴露端口
EXPOSE 5601
# 使用 supervisor 作为入口点
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View File

@ -0,0 +1 @@
../../../bind/build/dns-monitor.sh

View File

@ -0,0 +1,146 @@
#!/bin/bash
set -euo pipefail
ES_HOST="${ELASTICSEARCH_HOSTS:-http://es:9200}"
KB_HOST="http://localhost:5601"
echo "[INFO] Starting Kibana post-start configuration..."
# 等待 Elasticsearch 可用
# Poll the Elasticsearch cluster-health endpoint (ES_HOST) until it
# answers, retrying every 5s for up to 60 attempts (~5 minutes).
# Returns 0 once ES responds, 1 on timeout.
wait_for_elasticsearch() {
    echo "[INFO] Waiting for Elasticsearch..."
    local max_attempts=60
    local try
    for (( try = 1; try <= max_attempts; try++ )); do
        if curl -fs "$ES_HOST/_cluster/health" >/dev/null 2>&1; then
            echo "[OK] Elasticsearch is available"
            return 0
        fi
        echo "  Waiting for ES... ($try/$max_attempts)"
        sleep 5
    done
    echo "[ERROR] Elasticsearch timeout"
    return 1
}
# 等待 Kibana 可用
# Poll the Kibana status API (KB_HOST) until it reports level "available",
# retrying every 5s for up to 120 attempts (~10 minutes). Distinguishes
# "connection failed" from "up but not yet available" in its progress
# output. Returns 0 when available, 1 on timeout.
wait_for_kibana() {
    echo "[INFO] Waiting for Kibana..."
    local max_attempts=120
    local try status
    for (( try = 1; try <= max_attempts; try++ )); do
        if curl -fs "$KB_HOST/api/status" >/dev/null 2>&1; then
            # Crude availability probe: look for the literal level marker
            # in the status JSON (empty string when absent).
            status=$(curl -s "$KB_HOST/api/status" | grep -o '"level":"available"' || echo "")
            if [ -n "$status" ]; then
                echo "[OK] Kibana is available"
                return 0
            fi
            echo "  Waiting for Kibana... ($try/$max_attempts, status: $status)"
        else
            echo "  Waiting for Kibana... ($try/$max_attempts, connection failed)"
        fi
        sleep 5
    done
    echo "[ERROR] Kibana timeout"
    return 1
}
# 幂等设置索引副本数为0
# Idempotently set number_of_replicas=0 on every train-*/infer-* index.
# The compose file runs ES with discovery.type=single-node, so replicas
# can never be allocated; forcing 0 keeps cluster health green.
# Safe to re-run: indices already at 0 replicas are skipped.
# NOTE(review): JSON is parsed with grep/cut rather than jq — this assumes
# the default _settings response shape; verify if the ES version changes.
fix_replicas_idempotent() {
    echo "[INFO] Checking and fixing index replicas..."
    # List all train-* and infer-* indices (empty string when none exist).
    local indices=$(curl -s "$ES_HOST/_cat/indices/train-*,infer-*?h=index" 2>/dev/null || echo "")
    if [ -z "$indices" ]; then
        echo "[INFO] No train-*/infer-* indices found, skipping replica adjustment"
        return 0
    fi
    for idx in $indices; do
        # Read the current replica count from the index settings.
        local current_replicas=$(curl -s "$ES_HOST/$idx/_settings" | grep -o '"number_of_replicas":"[^"]*"' | cut -d'"' -f4 || echo "")
        if [ "$current_replicas" != "0" ]; then
            echo "[INFO] Setting replicas to 0 for index: $idx (current: $current_replicas)"
            # Failure to update one index is logged but does not abort the loop.
            curl -fsS -X PUT "$ES_HOST/$idx/_settings" \
                -H 'Content-Type: application/json' \
                -d '{"index":{"number_of_replicas":0}}' >/dev/null || {
                echo "[WARN] Failed to set replicas for $idx"
                continue
            }
            echo "[OK] Updated replicas for $idx"
        else
            echo "[INFO] Index $idx already has 0 replicas, skipping"
        fi
    done
}
# 幂等创建数据视图
# Idempotently create Kibana data views for train-* and infer-* indices.
# A view is only created when matching indices exist AND no view with the
# same title is already registered, so repeated runs are no-ops.
create_data_views_idempotent() {
    echo "[INFO] Checking and creating data views..."
    # Count matching indices; _cat prints one index per line, so wc -l
    # yields 0 when none exist.
    local train_indices=$(curl -s "$ES_HOST/_cat/indices/train-*?h=index" 2>/dev/null | wc -l || echo "0")
    local infer_indices=$(curl -s "$ES_HOST/_cat/indices/infer-*?h=index" 2>/dev/null | wc -l || echo "0")
    # Create the train data view if needed.
    if [ "$train_indices" -gt 0 ]; then
        # Check whether a view titled "train-*" is already registered.
        local train_exists=$(curl -s "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null | grep '"title":"train-\*"' | wc -l )
        if [ "$train_exists" -eq 0 ]; then
            echo "[INFO] Creating data view for train-* indices"
            curl -fsS -X POST "$KB_HOST/api/data_views/data_view" \
                -H 'kbn-xsrf: true' \
                -H 'Content-Type: application/json' \
                -d '{"data_view":{"name":"train","title":"train-*","timeFieldName":"@timestamp"}}' \
                >/dev/null && echo "[OK] Created train data view" || echo "[WARN] Failed to create train data view"
        else
            echo "[INFO] Train data view already exists, skipping"
        fi
    else
        echo "[INFO] No train-* indices found, skipping train data view creation"
    fi
    # Create the infer data view if needed (mirror of the train branch).
    if [ "$infer_indices" -gt 0 ]; then
        # Check whether a view titled "infer-*" is already registered.
        local infer_exists=$(curl -s "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null | grep '"title":"infer-\*"' | wc -l )
        if [ "$infer_exists" -eq 0 ]; then
            echo "[INFO] Creating data view for infer-* indices"
            curl -fsS -X POST "$KB_HOST/api/data_views/data_view" \
                -H 'kbn-xsrf: true' \
                -H 'Content-Type: application/json' \
                -d '{"data_view":{"name":"infer","title":"infer-*","timeFieldName":"@timestamp"}}' \
                >/dev/null && echo "[OK] Created infer data view" || echo "[WARN] Failed to create infer data view"
        else
            echo "[INFO] Infer data view already exists, skipping"
        fi
    else
        echo "[INFO] No infer-* indices found, skipping infer data view creation"
    fi
}
# 主逻辑
# Entry point: wait for both services, then apply the idempotent
# post-start configuration steps (replica fix + data views).
main() {
    # Bail out of the whole script if either service never comes up.
    wait_for_elasticsearch || exit 1
    wait_for_kibana || exit 1
    # Both steps check current state before acting, so re-running is safe.
    fix_replicas_idempotent
    create_data_views_idempotent
    echo "[INFO] Kibana post-start configuration completed"
}
# Run the main logic.
main

View File

@ -0,0 +1,37 @@
#!/bin/bash
# Supervisor-managed launcher for Kibana: links the data directory to
# persistent storage, publishes this container's IP for the BIND DNS sync
# job, kicks off the post-start configuration in the background, then
# execs the stock Kibana entrypoint.
set -euo pipefail

echo "[INFO] Starting Kibana under supervisor..."

# Ensure the persistent data directory exists (idempotent).
mkdir -p /private/argus/log/kibana

# Replace whatever sits at the default data path with a symlink into the
# persistent volume; handles both a stale link and a real directory.
if [ -L /usr/share/kibana/data ]; then
    rm /usr/share/kibana/data
elif [ -d /usr/share/kibana/data ]; then
    rm -rf /usr/share/kibana/data
fi
ln -sf /private/argus/log/kibana /usr/share/kibana/data

echo "[INFO] Data directory linked: /usr/share/kibana/data -> /private/argus/log/kibana"

# Record this container's IP under /private/argus/etc/<domain> so the DNS
# sync job can publish it. (Modernized: $() instead of backticks, quoted
# expansions; output is unchanged.)
# NOTE(review): 'grep inet' would also match inet6 lines — assumes eth0's
# first address line is IPv4; confirm in the target network setup.
DOMAIN=kibana.log.argus.com
IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')
echo "current IP: ${IP}"
echo "${IP}" > "/private/argus/etc/${DOMAIN}"

# Default ES endpoint unless docker-compose overrides it.
export ELASTICSEARCH_HOSTS="${ELASTICSEARCH_HOSTS:-"http://es:9200"}"
echo "[INFO] Connecting to Elasticsearch at: $ELASTICSEARCH_HOSTS"

# Run the idempotent post-start configuration (replicas, data views)
# in the background; it waits for ES/Kibana availability on its own.
echo "[INFO] Starting background post-start configuration..."
/usr/local/bin/kibana-post-start.sh &

echo "[INFO] Starting Kibana process..."
# Hand off to the original Kibana entrypoint (PID is replaced).
exec /usr/local/bin/kibana-docker

View File

@ -0,0 +1,39 @@
; Supervisor runs in the foreground as PID 1 inside the container.
[supervisord]
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root

; Kibana itself, started via the wrapper script (data-dir link + DNS
; registration) and running as the unprivileged "kibana" user.
[program:kibana]
command=/usr/local/bin/start-kibana-supervised.sh
user=kibana
stdout_logfile=/var/log/supervisor/kibana.log
stderr_logfile=/var/log/supervisor/kibana_error.log
autorestart=true
startretries=3
; Kibana is slow to boot; only count it as "started" after 30s alive.
startsecs=30
stopwaitsecs=30
; Signal the whole process group so the wrapper's children stop too.
killasgroup=true
stopasgroup=true

; Watches /private/argus/etc/dns.conf and reapplies resolv.conf on change;
; needs root to rewrite /etc/resolv.conf.
[program:dns-monitor]
command=/usr/local/bin/dns-monitor.sh
user=root
stdout_logfile=/var/log/supervisor/dns-monitor.log
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
autorestart=true
startretries=3
startsecs=5
stopwaitsecs=10
killasgroup=true
stopasgroup=true

; Local control socket for supervisorctl inside the container.
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

View File

@ -0,0 +1,84 @@
version: "3.8"

services:
  # Single-node Elasticsearch; security disabled and small heap — test-only.
  es:
    build:
      context: ../elasticsearch/build
      dockerfile: Dockerfile
    image: argus-elasticsearch:latest
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
    volumes:
      # Shared state/DNS directory used by all ARGUS modules.
      - ./private/argus/:/private/argus/
    ports: ["9200:9200"]
    healthcheck:
      test: ["CMD-SHELL", "curl -fs http://localhost:9200 >/dev/null || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 30
    restart: always

  kibana:
    build:
      context: ../kibana/build
      dockerfile: Dockerfile
    image: argus-kibana:latest
    environment:
      # Reaches ES via the bind-managed domain, not the compose service name.
      - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
    volumes:
      - ./private/argus/:/private/argus/
    ports: ["5601:5601"]
    depends_on:
      es:
        condition: service_healthy

  # Simulated log-producing host #1 running fluent-bit from a mounted bundle.
  # NOTE(review): unlike host02, this service has no `restart: always` —
  # confirm whether that asymmetry is intentional.
  fluent-bit-host01:
    image: ubuntu:22.04
    environment:
      - CLUSTER=local
      - RACK=dev
      - HOSTNAME=host01
      - ES_HOST=es
      - ES_PORT=9200
    volumes:
      - ../fluent-bit/build:/private/
    ports: ["2020:2020"]
    depends_on:
      es:
        condition: service_healthy
    command: /private/start-fluent-bit.sh
    healthcheck:
      test: ["CMD-SHELL", "curl -fs http://localhost:2020/api/v2/metrics >/dev/null || exit 1"]
      interval: 15s
      timeout: 10s
      retries: 30

  # Simulated log-producing host #2 (same image; mapped to host port 2021).
  fluent-bit-host02:
    image: ubuntu:22.04
    environment:
      - CLUSTER=local
      - RACK=dev
      - HOSTNAME=host02
      - ES_HOST=es
      - ES_PORT=9200
    volumes:
      - ../fluent-bit/build:/private/
    ports: ["2021:2020"]
    depends_on:
      es:
        condition: service_healthy
    command: /private/start-fluent-bit.sh
    healthcheck:
      test: ["CMD-SHELL", "curl -fs http://localhost:2020/api/v2/metrics >/dev/null || exit 1"]
      interval: 15s
      timeout: 10s
      retries: 30
    restart: always

  # DNS server; image must be built beforehand (no build section here).
  bind9:
    image: argus-bind9:latest
    volumes:
      - ./private/argus:/private/argus/
    restart: always

View File

@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Prepare the tests/private directory tree and its ownership before compose up.
set -euo pipefail

tests_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
repo_root="$(cd "$tests_dir/../../.." && pwd)"

# Pull in ARGUS_BUILD_UID / ARGUS_BUILD_GID from the shared helper.
source "$repo_root/scripts/common/build_user.sh"
load_build_user

echo "[INFO] Creating private directory structure for supervisor-based containers..."
mkdir -p \
  "$tests_dir/private/argus/log/elasticsearch" \
  "$tests_dir/private/argus/log/kibana" \
  "$tests_dir/private/argus/etc/"

# ES and Kibana containers run with the build UID/GID; chown best-effort.
echo "[INFO] Setting permissions for data directories..."
for data_dir in log/elasticsearch log/kibana etc; do
  chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$tests_dir/private/argus/$data_dir" 2>/dev/null || true
done

echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"

# Warn (do not fail) when the fluent-bit artifacts are missing.
if [[ ! -f "$tests_dir/../fluent-bit/fluent-bit-bundle.tar.gz" ]]; then
  echo "[WARN] fluent-bit/fluent-bit-bundle.tar.gz 不存在,请确保已创建该文件"
fi
if [[ ! -f "$tests_dir/../fluent-bit/start-fluent-bit.sh" ]]; then
  echo "[WARN] fluent-bit/start-fluent-bit.sh 不存在,请确保已创建该启动脚本"
fi

echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}"
echo "[INFO] Fluent-bit files should be in fluent-bit/ directory"

10
src/log/tests/scripts/02_up.sh Executable file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Bring up the logging-mvp compose project from the tests/ directory.
set -euo pipefail

cd "$(dirname "$0")/.."

# Prefer the Docker Compose v2 plugin; fall back to the legacy binary.
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
  if command -v docker-compose >/dev/null 2>&1; then
    compose_cmd="docker-compose"
  else
    echo "需要 Docker Compose请安装后重试" >&2
    exit 1
  fi
fi

$compose_cmd -p logging-mvp up -d --remove-orphans
echo "[OK] 服务已启动ES http://localhost:9200 Kibana http://localhost:5601 Fluent-Bit host01 http://localhost:2020 Fluent-Bit host02 http://localhost:2021"

View File

@ -0,0 +1,45 @@
#!/usr/bin/env bash
# Inject sample train/infer log lines into the host01 fluent-bit container
# so the tail -> Elasticsearch pipeline has data to ship.
set -euo pipefail

# Compose-generated container name for fluent-bit-host01.
container_name="logging-mvp-fluent-bit-host01-1"

# Poll `docker ps` until the named container is running, or give up
# after attempts*delay seconds.
wait_for_container() {
    local name="$1"
    local attempts=30
    local delay=5
    local i
    for ((i = 1; i <= attempts; i++)); do
        if docker ps --format '{{.Names}}' | grep -Fx "$name" >/dev/null; then
            return 0
        fi
        echo "[INFO] 等待容器 $name 启动中... ($i/$attempts)"
        sleep "$delay"
    done
    return 1
}

if ! wait_for_container "$container_name"; then
    echo "[ERROR] Fluent Bit容器 $container_name 未运行"
    exit 1
fi

# Create the watched log directories inside the container.
docker exec "$container_name" mkdir -p /logs/train /logs/infer

# Append training log lines (host01). "\$(date …)" is escaped so the
# timestamp is evaluated by the container's shell, not on the host.
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"

# Append inference logs (host01), including a multi-line stack trace via a
# quoted heredoc to exercise multiline parsing downstream.
docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
docker exec "$container_name" sh -c "cat <<'STACK' >> /logs/infer/infer-demo.log
Traceback (most recent call last):
  File \"inference.py\", line 15, in <module>
    raise RuntimeError(\"CUDA out of memory on host01\")
RuntimeError: CUDA out of memory on host01
STACK"

echo "[OK] 已通过docker exec写入测试日志到 host01 容器内:"
echo "  - /logs/train/train-demo.log"
echo "  - /logs/infer/infer-demo.log"

View File

@ -0,0 +1,41 @@
#!/usr/bin/env bash
# Inject sample train/infer log lines into the host02 fluent-bit container
# (companion of 03_send_test_host01.sh).
set -euo pipefail

# Compose-generated container name for fluent-bit-host02.
container_name="logging-mvp-fluent-bit-host02-1"

# Poll `docker ps` until the named container is running, or give up
# after attempts*delay seconds.
wait_for_container() {
    local name="$1"
    local attempts=30
    local delay=5
    local i
    for ((i = 1; i <= attempts; i++)); do
        if docker ps --format '{{.Names}}' | grep -Fx "$name" >/dev/null; then
            return 0
        fi
        echo "[INFO] 等待容器 $name 启动中... ($i/$attempts)"
        sleep "$delay"
    done
    return 1
}

if ! wait_for_container "$container_name"; then
    echo "[ERROR] Fluent Bit容器 $container_name 未运行"
    exit 1
fi

# Create the watched log directories inside the container.
docker exec "$container_name" mkdir -p /logs/train /logs/infer

# Append training log lines (host02). "\$(date …)" is escaped so the
# timestamp is evaluated by the container's shell, not on the host.
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=1 loss=1.45 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=2 loss=1.38 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=3 loss=1.32 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"

# Append inference logs (host02): one WARN and one INFO line.
docker exec "$container_name" sh -c "printf '%s WARN [host02] inference slow on batch=5 latency=2.3s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host02] inference completed batch=6 latency=0.8s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"

echo "[OK] 已通过docker exec写入测试日志到 host02 容器内:"
echo "  - /logs/train/train-demo.log"
echo "  - /logs/infer/infer-demo.log"

View File

@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Verify that logs reached Elasticsearch: wait for the cluster, list the
# relevant indices, and print document counts for train-* and infer-*.
set -euo pipefail

# ES endpoint and wait strategy
ES="${ES:-http://localhost:9200}"
es_wait_attempts="${ES_WAIT_ATTEMPTS:-60}"   # total attempts to wait for ES
es_wait_interval="${ES_WAIT_INTERVAL:-2}"    # seconds between attempts

echo "[i] 查询 ES 端点:$ES"

# Poll cluster health until it reaches at least "yellow" status.
wait_for_es() {
  local attempt=1
  while (( attempt <= es_wait_attempts )); do
    if curl -fsS "$ES/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then
      echo "[ok] Elasticsearch 已就绪 (attempt=${attempt}/${es_wait_attempts})"
      return 0
    fi
    echo "[..] 等待 Elasticsearch 可用中 (${attempt}/${es_wait_attempts})"
    sleep "${es_wait_interval}"
    (( attempt++ ))
  done
  echo "[err] Elasticsearch 在 ${es_wait_attempts} 次尝试后仍不可用"
  return 1
}

# Print the document count for an index pattern; missing indices count as 0.
# Fix: the previous sed|awk pipeline echoed the raw JSON (e.g. "{}") whenever
# the "count" field was absent; extract with `sed -n … p` and default to 0.
safe_count() {
  local pattern="$1" json count
  json=$(curl -fsS "$ES/${pattern}/_count?ignore_unavailable=true&allow_no_indices=true" 2>/dev/null || echo '{}')
  count=$(printf '%s' "$json" | sed -nE 's/.*"count":([0-9]+).*/\1/p' | head -n 1)
  echo "${count:-0}"
}

wait_for_es

# List matching indices (may legitimately be empty).
curl -fsS "$ES/_cat/indices?v" | grep -E 'train-|infer-|logstash' || true

# Print counts; missing indices are reported as 0.
printf "train-* 计数:"; safe_count "train-*"; echo
printf "infer-* 计数:"; safe_count "infer-*"; echo

View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Stop the logging-mvp compose project and wipe its private/ state directory.
set -euo pipefail

# Resolve the project directory ONCE. The original script repeated
# `cd "$(dirname "$0")/.."` after the first cd; with a relative $0 that
# second dirname resolves against the already-changed cwd — fragile and
# redundant, so it is removed.
cd "$(dirname "$0")/.."

# Prefer the Docker Compose v2 plugin; fall back to the legacy binary.
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
  if command -v docker-compose >/dev/null 2>&1; then
    compose_cmd="docker-compose"
  else
    echo "需要 Docker Compose请安装后重试" >&2
    exit 1
  fi
fi

$compose_cmd -p logging-mvp down
echo "[OK] 已停止所有容器"

# Remove the private/ state directory so the next run starts clean.
echo "[INFO] 清理private目录内容..."
if [ -d "private" ]; then
  rm -rf private
  echo "[OK] 已清理private目录"
else
  echo "[INFO] private目录不存在无需清理"
fi

View File

@ -0,0 +1,208 @@
#!/usr/bin/env bash
# End-to-end test of the DNS monitor: boot the stack, write dns.conf, and
# verify containers rewrite /etc/resolv.conf automatically on change.
set -euo pipefail

echo "======================================="
echo "ARGUS DNS监控功能测试"
echo "======================================="
echo ""

# Wall-clock start, for the summary at the end.
test_start_time=$(date +%s)

# Print a numbered step banner.
show_step() {
    echo ""
    echo "🔄 Step $1: $2"
    echo "----------------------------------------"
}

# Report the result of the previous command; exit on failure.
# NOTE(review): with `set -e` most failing commands abort before this runs,
# so the FAILED branch is rarely reachable — confirm intended behavior.
verify_step() {
    if [ $? -eq 0 ]; then
        echo "✅ $1 - SUCCESS"
    else
        echo "❌ $1 - FAILED"
        exit 1
    fi
}

# Poll ES and Kibana HTTP endpoints until both answer (max ~5 minutes).
wait_for_services() {
    echo "[INFO] Waiting for services to be ready..."
    local max_attempts=60
    local attempt=1
    while [ $attempt -le $max_attempts ]; do
        if curl -fs http://localhost:9200/_cluster/health >/dev/null 2>&1 && \
           curl -fs http://localhost:5601/api/status >/dev/null 2>&1; then
            echo "[OK] Services are ready!"
            return 0
        fi
        echo "  Waiting for services... ($attempt/$max_attempts)"
        sleep 5
        ((attempt++))
    done
    echo "[ERROR] Services not ready after $max_attempts attempts"
    return 1
}

# Assert that a container's /etc/resolv.conf names the expected DNS server.
check_resolv_conf() {
    local service_name=$1
    local expected_dns=$2
    echo "[INFO] 检查 $service_name 容器的 /etc/resolv.conf..."
    # NOTE(review): `local x=$(...)` masks the docker exec exit code; the
    # `|| echo ""` fallback makes that intentional here.
    local resolv_content=$(docker exec "${service_name}" cat /etc/resolv.conf 2>/dev/null || echo "")
    if echo "$resolv_content" | grep -q "nameserver $expected_dns"; then
        echo "✅ $service_name resolv.conf contains nameserver $expected_dns"
        return 0
    else
        echo "❌ $service_name resolv.conf does not contain nameserver $expected_dns"
        echo "实际内容:"
        echo "$resolv_content"
        return 1
    fi
}

# Verify the dns-monitor supervisor program is producing log output.
check_dns_monitor_logs() {
    local service_name=$1
    echo "[INFO] 检查 $service_name 的DNS监控日志..."
    local dns_logs=$(docker exec "$service_name" tail -n 20 /var/log/supervisor/dns-monitor.log 2>/dev/null || echo "")
    if [ -n "$dns_logs" ]; then
        echo "✅ $service_name DNS监控日志存在"
        echo "最近的日志:"
        echo "$dns_logs"
        return 0
    else
        echo "❌ $service_name DNS监控日志为空或不存在"
        return 1
    fi
}

# Make sure the shared etc/ directory exists before writing dns.conf.
ensure_directories() {
    echo "[INFO] 确保目录结构存在..."
    mkdir -p ./private/argus/etc/
    echo "✅ 目录结构准备完成使用真实的update-dns.sh脚本"
}

# ---- Test sequence -------------------------------------------------------

show_step "1" "Bootstrap - Initialize environment"
./scripts/01_bootstrap.sh
verify_step "Bootstrap"

ensure_directories

show_step "2" "Startup - Start all services"
./scripts/02_up.sh
verify_step "Service startup"

wait_for_services || exit 1

show_step "3" "Create initial DNS configuration"
# Seed dns.conf with a single nameserver entry.
echo "[INFO] 创建初始的dns.conf文件 (8.8.8.8)..."
cat > ./private/argus/etc/dns.conf << 'EOF'
8.8.8.8
EOF
echo "✅ 初始dns.conf文件创建成功 (8.8.8.8)"
verify_step "Initial DNS configuration creation"

# Give the in-container dns-monitor time to pick up the new file.
echo "[INFO] 等待DNS监控检测并处理初始配置..."
sleep 15

show_step "4" "Verify initial DNS configuration processing"
check_dns_monitor_logs "logging-mvp-es-1"
verify_step "Elasticsearch DNS monitor logs"
check_dns_monitor_logs "logging-mvp-kibana-1"
verify_step "Kibana DNS monitor logs"

check_resolv_conf "logging-mvp-es-1" "8.8.8.8"
verify_step "Elasticsearch resolv.conf initial check"
check_resolv_conf "logging-mvp-kibana-1" "8.8.8.8"
verify_step "Kibana resolv.conf initial check"

show_step "5" "Modify DNS configuration and test auto-update"
# Switch dns.conf to a different nameserver to exercise change detection.
echo "[INFO] 修改dns.conf文件改为1.1.1.1..."
cat > ./private/argus/etc/dns.conf << 'EOF'
1.1.1.1
EOF
echo "✅ dns.conf文件更新成功改为1.1.1.1"

echo "[INFO] 等待DNS监控检测配置变化并执行更新..."
sleep 15

show_step "6" "Verify DNS configuration auto-update"
echo "[INFO] 检查DNS监控是否检测到配置变化..."
echo "[INFO] 检查elasticsearch容器的DNS监控日志最近30行..."
docker exec logging-mvp-es-1 tail -n 30 /var/log/supervisor/dns-monitor.log || true
echo "[INFO] 检查kibana容器的DNS监控日志最近30行..."
docker exec logging-mvp-kibana-1 tail -n 30 /var/log/supervisor/dns-monitor.log || true

check_resolv_conf "logging-mvp-es-1" "1.1.1.1"
verify_step "Elasticsearch resolv.conf after update"
check_resolv_conf "logging-mvp-kibana-1" "1.1.1.1"
verify_step "Kibana resolv.conf after update"

show_step "7" "Final verification - Check DNS configuration"
echo "[INFO] 最终验证elasticsearch容器的resolv.conf..."
docker exec logging-mvp-es-1 cat /etc/resolv.conf
echo "[INFO] 最终验证kibana容器的resolv.conf..."
docker exec logging-mvp-kibana-1 cat /etc/resolv.conf
echo "[INFO] 最终dns.conf内容:"
cat ./private/argus/etc/dns.conf
verify_step "Final DNS configuration verification"

show_step "8" "Cleanup - Stop all services"
./scripts/05_down.sh
verify_step "Service cleanup"

# Remove only the test-created dns.conf; update-dns.sh is a real script
# shared with other modules and must be left in place.
rm -f ./private/argus/etc/dns.conf

# Compute and print the total test duration.
test_end_time=$(date +%s)
total_time=$((test_end_time - test_start_time))

echo ""
echo "======================================="
echo "🎉 DNS监控功能测试完成!"
echo "======================================="
echo "📊 测试总结:"
echo "   • 总耗时: ${total_time}秒"
echo "   • 初始DNS配置: 8.8.8.8"
echo "   • 更新DNS配置: 1.1.1.1"
echo "   • DNS监控脚本正常工作"
echo "   • 容器resolv.conf自动覆盖更新成功"
echo ""
echo "✅ DNS自动更新功能测试通过!"
echo ""

169
src/log/tests/scripts/e2e_test.sh Executable file
View File

@ -0,0 +1,169 @@
#!/usr/bin/env bash
# End-to-end test of the log pipeline: boot the stack, inject logs on two
# simulated hosts, verify counts in Elasticsearch, check health, tear down.
set -euo pipefail

echo "======================================="
echo "ARGUS Log System End-to-End Test"
echo "======================================="
echo ""

# Wall-clock start, for the summary at the end.
test_start_time=$(date +%s)

# Sum the document counts of train-* and infer-* indices in local ES.
# An unreachable ES or missing index falls back to 0 via the `|| echo "0"`
# on each pipeline.
get_log_count() {
    local train_count=$(curl -s "http://localhost:9200/train-*/_count" 2>/dev/null | grep -o '"count":[0-9]*' | cut -d':' -f2 || echo "0")
    local infer_count=$(curl -s "http://localhost:9200/infer-*/_count" 2>/dev/null | grep -o '"count":[0-9]*' | cut -d':' -f2 || echo "0")
    echo "$((train_count + infer_count))"
}

# Poll ES, Kibana and both fluent-bit metric endpoints until all answer.
# Attempt budget is overridable via SERVICE_WAIT_ATTEMPTS (default 120).
wait_for_services() {
    echo "[INFO] Waiting for all services to be ready..."
    local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
    local attempt=1
    while [ $attempt -le $max_attempts ]; do
        if curl -fs http://localhost:9200/_cluster/health >/dev/null 2>&1 && \
           curl -fs http://localhost:5601/api/status >/dev/null 2>&1 && \
           curl -fs http://localhost:2020/api/v2/metrics >/dev/null 2>&1 && \
           curl -fs http://localhost:2021/api/v2/metrics >/dev/null 2>&1; then
            echo "[OK] All services are ready!"
            return 0
        fi
        echo "  Waiting for services... ($attempt/$max_attempts)"
        sleep 5
        ((attempt++))
    done
    echo "[ERROR] Services not ready after $max_attempts attempts"
    return 1
}

# Print a numbered step banner.
show_step() {
    echo ""
    echo "🔄 Step $1: $2"
    echo "----------------------------------------"
}

# Report the result of the previous command; exit on failure.
# NOTE(review): with `set -e` most failing commands abort before this runs.
verify_step() {
    if [ $? -eq 0 ]; then
        echo "✅ $1 - SUCCESS"
    else
        echo "❌ $1 - FAILED"
        exit 1
    fi
}

# ---- Test sequence -------------------------------------------------------

show_step "1" "Bootstrap - Initialize environment"
./scripts/01_bootstrap.sh
verify_step "Bootstrap"

show_step "2" "Startup - Start all services"
./scripts/02_up.sh
verify_step "Service startup"

wait_for_services || exit 1

# Baseline count before injecting any test data.
initial_count=$(get_log_count)
echo "[INFO] Initial log count: $initial_count"

show_step "3a" "Send test data - Host01"
./scripts/03_send_test_host01.sh
verify_step "Test data sending (host01)"

show_step "3b" "Send test data - Host02"
./scripts/03_send_test_host02.sh
verify_step "Test data sending (host02)"

# Let fluent-bit pick up and ship the new lines.
echo "[INFO] Waiting for data to be processed..."
sleep 10

show_step "4" "Verify data - Query Elasticsearch"
./scripts/04_query_es.sh
verify_step "Data verification"

final_count=$(get_log_count)
echo "[INFO] Final log count: $final_count"

# The count must have grown relative to the baseline.
if [ "$final_count" -gt "$initial_count" ]; then
    added_logs=$((final_count - initial_count))
    echo "✅ Log count verification - SUCCESS: Added $added_logs logs (from $initial_count to $final_count)"
else
    echo "❌ Log count verification - FAILED: Expected count to increase, but got $initial_count -> $final_count"
    exit 1
fi

# Each host writes several lines; require a minimal absolute total.
expected_min_logs=4  # at least a few log lines are expected
if [ "$final_count" -ge "$expected_min_logs" ]; then
    echo "✅ Minimum log threshold - SUCCESS: $final_count logs (>= $expected_min_logs expected)"
else
    echo "❌ Minimum log threshold - FAILED: Only $final_count logs (>= $expected_min_logs expected)"
    exit 1
fi

show_step "Health" "Check service health"
echo "[INFO] Checking service health..."

# Elasticsearch cluster status: green/yellow both count as healthy here.
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
if [ "$es_health" = "green" ] || [ "$es_health" = "yellow" ]; then
    echo "✅ Elasticsearch health: $es_health"
else
    echo "❌ Elasticsearch health: $es_health"
fi

# Kibana reachability (non-fatal warning when unavailable).
if curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
    kb_status="available"
    echo "✅ Kibana status: $kb_status"
else
    kb_status="unavailable"
    echo "⚠️ Kibana status: $kb_status"
fi

# Fluent-bit uptime metric per host (0 when the probe fails).
fb_host01_uptime=$(curl -s "http://localhost:2020/api/v2/metrics" | grep "fluentbit_uptime" | head -1 | grep -o "[0-9]\+$" || echo "0")
fb_host02_uptime=$(curl -s "http://localhost:2021/api/v2/metrics" | grep "fluentbit_uptime" | head -1 | grep -o "[0-9]\+$" || echo "0")
if [ "$fb_host01_uptime" -gt 0 ] && [ "$fb_host02_uptime" -gt 0 ]; then
    echo "✅ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
else
    echo "⚠️ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
fi
verify_step "Service health check"

show_step "5" "Cleanup - Stop all services"
./scripts/05_down.sh
verify_step "Service cleanup"

# Compute and print the total test duration.
test_end_time=$(date +%s)
total_time=$((test_end_time - test_start_time))

echo ""
echo "======================================="
echo "🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!"
echo "======================================="
echo "📊 Test Summary:"
echo "   • Initial logs: $initial_count"
echo "   • Final logs: $final_count"
echo "   • Added logs: $added_logs"
echo "   • Total time: ${total_time}s"
echo "   • ES health: $es_health"
echo "   • Kibana status: $kb_status"
echo "   • DNS resolv: ✅ Passed (ES domain verified)"
echo "   • All services started and stopped successfully"
echo ""
echo "✅ The ARGUS log system is working correctly!"
echo ""

81
src/master/Dockerfile Normal file
View File

@ -0,0 +1,81 @@
FROM python:3.11-slim

# Use bash so [[ ]] works inside RUN steps below.
SHELL ["/bin/bash", "-c"]

# Build switches: optional pip mirror, offline wheel install, intranet apt.
ARG PIP_INDEX_URL=
ARG USE_OFFLINE=0
ARG USE_INTRANET=false

# UID/GID for the runtime "argus" user (overridable via configs/build_user.conf).
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015

ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

ENV PIP_NO_CACHE_DIR=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app

USER root
WORKDIR /app

COPY ./src/master/requirements.txt ./requirements.txt
# Wheel cache used only when USE_OFFLINE=1 (see prepare_offline_wheels.sh).
COPY ./src/master/offline_wheels/ /opt/offline_wheels/

# Install Python dependencies: from the bundled wheels when offline,
# otherwise from PyPI (optionally through PIP_INDEX_URL).
RUN set -euxo pipefail \
    && if [[ "$USE_OFFLINE" == "1" ]]; then \
         python -m pip install --no-index --find-links /opt/offline_wheels pip && \
         python -m pip install --no-index --find-links /opt/offline_wheels -r requirements.txt; \
       else \
         python -m pip install --upgrade pip && \
         if [[ -n "$PIP_INDEX_URL" ]]; then \
           PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \
         else \
           python -m pip install -r requirements.txt; \
         fi; \
       fi

# Configure the intranet apt mirror (build-time) and install base tooling.
RUN if [[ "$USE_INTRANET" == "true" ]]; then \
      echo "Configuring intranet apt sources" && \
      if [[ -f /etc/apt/sources.list ]]; then cp /etc/apt/sources.list /etc/apt/sources.list.bak; fi && \
      mkdir -p /etc/apt && \
      echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
      rm -rf /etc/apt/sources.list.d && \
      echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
      echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi && \
    apt-get update && \
    apt-get install -y supervisor net-tools inetutils-ping && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Switch to the runtime apt mirror so the container can install packages later.
RUN if [[ "$USE_INTRANET" == "true" ]]; then \
      echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi

RUN mkdir -p /var/log/supervisor

# Create (or realign) the "argus" user/group to the requested UID/GID so
# files written to the shared /private volume keep consistent ownership.
RUN set -eux; \
    if getent group argus >/dev/null; then \
        groupmod -g "${ARGUS_BUILD_GID}" argus; \
    else \
        groupadd -g "${ARGUS_BUILD_GID}" argus; \
    fi; \
    if id argus >/dev/null 2>&1; then \
        usermod -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" argus; \
    else \
        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" -s /bin/bash argus; \
    fi

# Supervisor config plus the startup and DNS-monitor helper scripts.
COPY ./src/master/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY ./src/master/build/start-master.sh /usr/local/bin/start-master.sh
COPY ./src/master/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
RUN chmod +x /usr/local/bin/start-master.sh /usr/local/bin/dns-monitor.sh

COPY ./src/master/app ./app

EXPOSE 3000

# supervisord is PID 1 and manages the master + dns-monitor programs.
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

186
src/master/README.md Normal file
View File

@ -0,0 +1,186 @@
# Argus Master 模块
Argus Master 是基于 Flask + SQLite 的节点管理服务,负责:
- 接收 agent 的注册与重注册请求,分配/校验节点 ID。
- 存储节点元数据、配置、健康状态,并根据上报时间计算在线状态。
- 输出仅包含在线节点的 `nodes.json`,供其他模块(如 metric消费。
- 提供查询、配置更新、统计等 REST API。
## 构建与运行
```bash
cd src/master
./scripts/build_images.sh # 生成 argus-master:latest 镜像
```
如需离线构建,先在有网环境运行准备脚本:
```bash
cd src/master
./scripts/prepare_offline_wheels.sh --pip-version 25.2 # 可选 --clean
```
脚本会把 `requirements.txt` 及 pip 指定版本全部下载到 `offline_wheels/`。随后将源码目录(含该子目录)与基础镜像一并拷贝到内网,执行:
```bash
cd src/master
./scripts/build_images.sh --offline --tag argus-master:latest
```
若内网缺少 `python:3.11-slim`,请提前在外网 `docker save` 后通过离线介质 `docker load`
本仓库提供的端到端测试会使用 `src/master/tests/docker-compose.yml` 启动示例环境:
```bash
cd src/master/tests
./scripts/01_up_master.sh # 构建镜像并启动容器,监听 http://localhost:31300
```
服务日志与数据默认写入 `tests/private/argus/master/`(或自定义的挂载目录)。
## 运行时环境变量
| 变量 | 默认值 | 说明 |
| --- | --- | --- |
| `DB_PATH` | `/private/argus/master/db.sqlite3` | SQLite 数据库存放路径。目录会在启动时自动创建。 |
| `METRIC_NODES_JSON_PATH` | `/private/argus/metric/prometheus/nodes.json` | `nodes.json` 输出位置,仅包含在线节点。采用原子写入避免部分文件。 |
| `OFFLINE_THRESHOLD_SECONDS` | `180` | 若距离最近一次上报时间超过该值,调度器会将节点标记为 `offline`。 |
| `ONLINE_THRESHOLD_SECONDS` | `120` | 若最新上报时间距当前不超过该值,则标记为 `online`。范围处于两个阈值之间时保持原状态。 |
| `SCHEDULER_INTERVAL_SECONDS` | `30` | 调度器检查节点状态与刷新 `nodes.json` 的周期。 |
| `NODE_ID_PREFIX` | `A` | 新节点 ID 的前缀,实际 ID 形如 `A1``A2`。 |
| `AUTH_MODE` | `disabled` | 预留的认证开关,当前固定为禁用。 |
## 进程与监控
镜像内通过 `supervisord` 管理进程:
- `master`:执行 `/usr/local/bin/start-master.sh`,默认以 4 个 Gunicorn worker 监听 `0.0.0.0:3000`;可通过环境变量 `GUNICORN_WORKERS``GUNICORN_BIND``GUNICORN_EXTRA_ARGS` 调整。
- `dns-monitor`:轮询 `/private/argus/etc/dns.conf`,若发现变更则调用 `/private/argus/etc/update-dns.sh`,日志输出在 `/var/log/supervisor/dns-monitor.log`
镜像构建阶段会安装 `supervisor`/`net-tools`/`inetutils-ping`/`vim` 等基础工具,并在运行前把 apt 源切换到内网镜像,方便容器内进一步运维。
## 域名注册与 DNS 联动
- Master 容器启动时会主动执行 `/private/argus/etc/update-dns.sh`(若存在),把自身 `/etc/resolv.conf` 指向 bind 服务提供的 DNS随后解析 `eth0` 的 IPv4 地址并写入 `/private/argus/etc/master.argus.com`。该文件会被 bind 模块的 `argus_dns_sync.sh` 监控,用于生成 `master.argus.com` → 当前容器 IP 的 A 记录。
- 测试与生产都需要将 bind 下发的 `update-dns.sh``dns.conf` 等文件挂载到 `/private/argus/etc/`。在 E2E 场景中,`tests/private/argus/etc` 会由脚本自动准备。
- 其他模块(如 agent在启动脚本中只需执行同一份 `update-dns.sh`,即可使用域名访问 master若域名注册异常agent 将无法成功上报,可据此快速定位问题。
## REST API 详解
基础路径:`/api/v1/master`,全部返回 JSON。
### 1. `GET /nodes`
- **用途**:获取所有节点的简要信息。
- **响应示例**
```json
[
{"id": "A1", "name": "dev-user-inst-pod-0", "status": "online", "type": "agent", "version": "1.1.0"}
]
```
### 2. `GET /nodes/{id}`
- **用途**:获取节点详情(包含配置、健康、持久化时间戳等)。
- **错误**`404` 表示节点不存在。
### 3. `POST /nodes`
- **用途**:注册或重注册节点。
- **请求体**
```json
{
"id": "A1", // 可选,重注册时携带
"name": "dev-user-inst-pod-0",
"type": "agent",
"version": "1.1.0",
"meta_data": {
"hostname": "dev-user-inst-pod-0",
"ip": "10.0.0.10",
"env": "dev",
"user": "testuser",
"instance": "testinst",
"cpu_number": 4,
"memory_in_bytes": 2147483648,
"gpu_number": 0
}
}
```
- **成功返回**
- 新节点:`201 Created`,返回完整节点对象。
- 重注册:`200 OK`,返回更新后的节点对象。
- **错误情况**
- `404 Not Found`:携带的 ID 在 Master 中不存在。
- `500 Internal Server Error`:携带的 ID 与已有名称不匹配。
- `400 Bad Request`:请求体缺字段或类型不正确。
### 4. `PUT /nodes/{id}/status`
- **用途**Agent 上报状态。Master 记录 `last_report`(服务器时间)与 `agent_last_report`(上报内时间),并更新 `health` 字段。
- **请求体示例**
```json
{
"timestamp": "2025-09-24T03:24:59Z",
"health": {
"log-fluentbit": {"status": "healthy"},
"metric-node-exporter": {"status": "healthy"}
}
}
```
- **响应**`200 OK`,返回最新节点对象。`404` 表示节点不存在。
### 5. `PUT /nodes/{id}/config`
- **用途**:局部更新节点配置与标签。
- **请求体示例**
```json
{
"config": {"log_level": "debug"},
"label": ["gpu", "exp001"]
}
```
- **说明**:字段可任选其一;未提供的配置保持原值。更新标签会触发 `nodes.json` 重新生成。
- **错误**`404` 表示节点不存在;`400` 表示请求体不合法。
### 6. `GET /nodes/statistics`
- **用途**:统计节点总数及按状态分布。
- **响应示例**
```json
{
"total": 2,
"status_statistics": [
{"status": "online", "count": 1},
{"status": "offline", "count": 1}
]
}
```
### 7. 健康探针
- `GET /healthz`:进程存活检查。
- `GET /readyz`:数据库可用性检查(会尝试访问 `DB_PATH`)。
如需验证离线镜像,可使用自动化脚本:
```bash
cd src/master/tests
./scripts/00_e2e_test_offline.sh # 构建离线镜像并执行完整 E2E
```
## 端到端测试场景
执行 `src/master/tests/scripts/00_e2e_test.sh` 会串联以下用例(脚本 0110
1. **01_up_master**:构建镜像、启动容器、初始化目录与卷。
2. **02_verify_ready_and_nodes_json**:轮询 `/readyz`,校验初始 `nodes.json``[]`
3. **03_register_via_curl**:模拟 agent 注册,保存返回的节点 ID并确认节点出现在列表接口中。
4. **04_reregister_and_error_cases**:覆盖重注册成功、携带未知 ID 的 `404`、ID/名称不匹配触发 `500` 等场景。
5. **05_status_report_via_curl**:上报健康信息并验证状态自动从 `initialized``online``offline``online` 的转换。
6. **06_config_update_and_nodes_json**:更新配置/标签,检查 `nodes.json` 中的标签同步,并确保离线节点不会出现在文件里。
7. **07_stats_single_node**:等待节点掉线,验证统计接口与 `nodes.json` 为空列表。
8. **08_multi_node_stats**:注册第二节点,使一在线一离线,校验统计聚合和 `nodes.json` 仅包含在线节点。
9. **09_restart_persistence**:重启 master 容器,确认节点数据、统计结果与 `nodes.json` 在持久化目录中保持不变。
10. **10_down**:停止并清理容器、网络与临时目录。
## 相关持久化文件
- SQLite默认位于 `DB_PATH`,包含 `nodes``kv` 两张表。
- `nodes.json`:由调度器周期生成,仅保留状态为 `online` 的节点信息。
- 测试用例中的 `tests/private/``tests/tmp/` 会随脚本自动清理,避免污染后续运行。
如需在生产环境运行,可将镜像推送到私有仓库,或参考测试 Compose 配置自行部署;只需确保上述环境变量在容器内正确设置即可。

View File

@ -0,0 +1,41 @@
from __future__ import annotations
import atexit
import logging
from flask import Flask
from .config import AppConfig, load_config
from .routes import register_routes
from .scheduler import StatusScheduler
from .storage import Storage
def create_app(config: AppConfig | None = None) -> Flask:
    """Application factory: wire storage, scheduler and routes together.

    Falls back to environment-derived configuration when ``config`` is not
    supplied. Registers an atexit hook that stops the scheduler and closes
    the storage layer on interpreter shutdown.
    """
    cfg = config or load_config()
    store = Storage(cfg.db_path, cfg.node_id_prefix)
    sched = StatusScheduler(store, cfg)

    app = Flask(__name__)
    app.config["APP_CONFIG"] = cfg
    app.config["STORAGE"] = store
    app.config["SCHEDULER"] = sched

    register_routes(app, store, sched, cfg)
    sched.start()

    logger = logging.getLogger("argus.master")

    def _cleanup() -> None:
        logger.info("Shutting down master app")
        try:
            sched.stop()
        except Exception:  # pragma: no cover - defensive
            logger.exception("Failed to stop scheduler")
        try:
            store.close()
        except Exception:  # pragma: no cover - defensive
            logger.exception("Failed to close storage")

    atexit.register(_cleanup)
    return app

40
src/master/app/config.py Normal file
View File

@ -0,0 +1,40 @@
from __future__ import annotations
import os
from dataclasses import dataclass
@dataclass(frozen=True)
class AppConfig:
    """Immutable runtime configuration resolved from environment variables."""

    db_path: str  # SQLite database file location
    metric_nodes_json_path: str  # where the online-nodes JSON is written
    offline_threshold_seconds: int  # silence longer than this => offline
    online_threshold_seconds: int  # report within this window => online
    scheduler_interval_seconds: int  # status sweep / nodes.json refresh period
    node_id_prefix: str  # prefix for generated node IDs (e.g. "A1")
    auth_mode: str  # reserved auth switch; currently always "disabled"
def _get_int_env(name: str, default: int) -> int:
raw = os.environ.get(name)
if raw is None or raw.strip() == "":
return default
try:
return int(raw)
except ValueError as exc:
raise ValueError(f"Environment variable {name} must be an integer, got {raw!r}") from exc
def load_config() -> AppConfig:
    """Build an AppConfig from environment variables, applying defaults."""
    env = os.environ.get
    return AppConfig(
        db_path=env("DB_PATH", "/private/argus/master/db.sqlite3"),
        metric_nodes_json_path=env(
            "METRIC_NODES_JSON_PATH", "/private/argus/metric/prometheus/nodes.json"
        ),
        offline_threshold_seconds=_get_int_env("OFFLINE_THRESHOLD_SECONDS", 180),
        online_threshold_seconds=_get_int_env("ONLINE_THRESHOLD_SECONDS", 120),
        scheduler_interval_seconds=_get_int_env("SCHEDULER_INTERVAL_SECONDS", 30),
        node_id_prefix=env("NODE_ID_PREFIX", "A"),
        auth_mode=env("AUTH_MODE", "disabled"),
    )

171
src/master/app/models.py Normal file
View File

@ -0,0 +1,171 @@
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Mapping
from .util import parse_iso
class ValidationError(Exception):
    """Raised when a user-supplied payload fails validation."""
@dataclass
class Node:
    """In-memory representation of a registered node record."""

    id: str  # assigned node ID, e.g. "A1"
    name: str
    type: str  # node kind, e.g. "agent"
    version: str | None
    status: str  # e.g. "online" / "offline"
    config: Dict[str, Any]
    labels: Iterable[str]
    meta_data: Dict[str, Any]
    health: Dict[str, Any]  # per-component health as last reported
    # ISO-8601 timestamps (None until the corresponding event happens)
    register_time: str | None
    last_report: str | None  # server-side time of last status report
    agent_last_report: str | None  # timestamp carried in the report body
    last_updated: str | None
def serialize_node_row(row: Mapping[str, Any]) -> Dict[str, Any]:
def _json_or_default(value: str | None, default: Any) -> Any:
if value is None or value == "":
return default
try:
return json.loads(value)
except json.JSONDecodeError:
return default
config = _json_or_default(row["config_json"], {})
labels = _json_or_default(row["labels_json"], [])
meta = _json_or_default(row["meta_json"], {})
health = _json_or_default(row["health_json"], {})
return {
"id": row["id"],
"name": row["name"],
"type": row["type"],
"version": row["version"],
"status": row["status"],
"config": config if isinstance(config, dict) else {},
"label": list(labels) if isinstance(labels, list) else [],
"meta_data": meta if isinstance(meta, dict) else {},
"health": health if isinstance(health, dict) else {},
"register_time": row["register_time"],
"last_report": row["last_report"],
"agent_last_report": row["agent_last_report"],
"last_updated": row["last_updated"],
}
def serialize_node_summary(row: Mapping[str, Any]) -> Dict[str, Any]:
    """Project a DB row down to the summary fields used by list endpoints."""
    return {field: row[field] for field in ("id", "name", "status", "type", "version")}
def validate_registration_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
    """Validate a node registration request body.

    Returns a normalised dict with keys id/name/type/version/meta_data.

    Raises:
        ValidationError: on any missing or malformed field.
    """
    if not isinstance(payload, Mapping):
        raise ValidationError("Request body must be a JSON object")

    name = payload.get("name")
    if not isinstance(name, str) or not name.strip():
        raise ValidationError("Field 'name' is required and must be a non-empty string")

    node_type = payload.get("type", "agent")
    if not isinstance(node_type, str) or not node_type:
        raise ValidationError("Field 'type' must be a string")

    version = payload.get("version")
    if version is not None and not isinstance(version, str):
        raise ValidationError("Field 'version' must be a string if provided")

    meta = payload.get("meta_data")
    if not isinstance(meta, Mapping):
        raise ValidationError("Field 'meta_data' must be an object")

    required_meta = ["hostname", "ip", "env", "user", "instance", "cpu_number", "memory_in_bytes", "gpu_number"]
    for key in required_meta:
        if key not in meta:
            raise ValidationError(f"meta_data.{key} is required")

    # Bug fix: bool is a subclass of int, so isinstance(True, int) is True and
    # booleans previously slipped through the numeric checks below. Reject
    # them explicitly before the int check.
    for field in ("cpu_number", "memory_in_bytes", "gpu_number"):
        value = meta[field]
        if isinstance(value, bool) or not isinstance(value, int) or value < 0:
            raise ValidationError(f"meta_data.{field} must be a non-negative integer")

    node_id = payload.get("id")
    if node_id is not None and (not isinstance(node_id, str) or not node_id.strip()):
        raise ValidationError("Field 'id' must be a non-empty string when provided")

    return {
        "id": node_id,
        "name": name,
        "type": node_type,
        "version": version,
        "meta_data": dict(meta),
    }
def validate_status_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
    """Validate a status-report body.

    Returns the raw timestamp, its parsed UTC datetime, and the sanitized
    health mapping. Raises ValidationError on malformed input.
    """
    if not isinstance(payload, Mapping):
        raise ValidationError("Request body must be a JSON object")

    timestamp = payload.get("timestamp")
    if not isinstance(timestamp, str) or not timestamp:
        raise ValidationError("Field 'timestamp' is required and must be a string")
    parsed = parse_iso(timestamp)
    if parsed is None:
        raise ValidationError("Field 'timestamp' must be an ISO8601 datetime string")

    health = payload.get("health", {})
    if not isinstance(health, Mapping):
        raise ValidationError("Field 'health' must be an object if provided")

    json_compatible = (Mapping, list, str, int, float, bool)
    sanitized_health: Dict[str, Any] = {}
    for key, value in health.items():
        if not isinstance(key, str):
            raise ValidationError("Keys in 'health' must be strings")
        if value is not None and not isinstance(value, json_compatible):
            raise ValidationError("Values in 'health' must be JSON-compatible")
        sanitized_health[key] = value

    return {
        "timestamp": timestamp,
        "parsed_timestamp": parsed,
        "health": sanitized_health,
    }
def validate_config_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
    """Validate a partial config/label update; at least one field is required."""
    if not isinstance(payload, Mapping):
        raise ValidationError("Request body must be a JSON object")

    result: Dict[str, Any] = {}

    if "config" in payload:
        config = payload["config"]
        if not isinstance(config, Mapping):
            raise ValidationError("Field 'config' must be an object")
        result["config"] = dict(config)

    if "label" in payload:
        labels = payload["label"]
        labels_ok = isinstance(labels, list) and all(isinstance(item, str) for item in labels)
        if not labels_ok:
            raise ValidationError("Field 'label' must be an array of strings")
        result["label"] = list(labels)

    if not result:
        raise ValidationError("At least one of 'config' or 'label' must be provided")
    return result

155
src/master/app/nodes_api.py Normal file
View File

@ -0,0 +1,155 @@
from __future__ import annotations
import logging
from http import HTTPStatus
from typing import Any, Mapping
from flask import Blueprint, jsonify, request
from .models import (
ValidationError,
validate_config_payload,
validate_registration_payload,
validate_status_payload,
)
from .scheduler import StatusScheduler
from .storage import Storage
from .util import to_iso, utcnow
def create_nodes_blueprint(storage: Storage, scheduler: StatusScheduler) -> Blueprint:
    """Build the nodes API blueprint wired to the given storage and scheduler.

    Mounted by register_routes under /api/v1/master.
    """
    bp = Blueprint("nodes", __name__)
    # NOTE(review): currently unused; presumably kept for future request
    # logging — confirm before removing.
    logger = logging.getLogger("argus.master.api")

    def _json_error(message: str, status: HTTPStatus, code: str) -> Any:
        # Uniform error envelope: {"error": ..., "code": ...} plus HTTP status.
        response = jsonify({"error": message, "code": code})
        response.status_code = status
        return response

    @bp.errorhandler(ValidationError)
    def _handle_validation_error(err: ValidationError):
        # Any ValidationError raised by a handler becomes HTTP 400.
        return _json_error(str(err), HTTPStatus.BAD_REQUEST, "invalid_request")

    @bp.get("/nodes")
    def list_nodes():
        # Short summaries only (id/name/status/type/version).
        nodes = storage.list_nodes()
        return jsonify(nodes)

    @bp.get("/nodes/<node_id>")
    def get_node(node_id: str):
        node = storage.get_node(node_id)
        if node is None:
            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
        return jsonify(node)

    @bp.post("/nodes")
    def register_node():
        """Register a node, or re-register by id / by name."""
        payload = _get_json()
        data = validate_registration_payload(payload)
        now = utcnow()
        now_iso = to_iso(now)
        node_id = data["id"]
        name = data["name"]
        node_type = data["type"]
        version = data["version"]
        meta = data["meta_data"]
        if node_id:
            # An explicit id means re-registration; the stored name must match.
            existing_row = storage.get_node_raw(node_id)
            if existing_row is None:
                return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
            if existing_row["name"] != name:
                # NOTE(review): a 500 for an id/name mismatch reads like a
                # client error; 409 Conflict may fit better — confirm the
                # agent-side contract before changing.
                return _json_error(
                    "Node id and name mismatch during re-registration",
                    HTTPStatus.INTERNAL_SERVER_ERROR,
                    "id_name_mismatch",
                )
            updated = storage.update_node_meta(
                node_id,
                node_type=node_type,
                version=version,
                meta_data=meta,
                last_updated_iso=now_iso,
            )
            scheduler.trigger_nodes_json_refresh()
            return jsonify(updated), HTTPStatus.OK
        # No id provided → search by name
        existing_by_name = storage.get_node_by_name(name)
        if existing_by_name:
            # A node with this name already exists; treat as re-registration
            # without an id and refresh its static metadata.
            updated = storage.update_node_meta(
                existing_by_name["id"],
                node_type=node_type,
                version=version,
                meta_data=meta,
                last_updated_iso=now_iso,
            )
            scheduler.trigger_nodes_json_refresh()
            return jsonify(updated), HTTPStatus.OK
        # Brand-new node: allocate an id and create the row as "initialized".
        new_id = storage.allocate_node_id()
        created = storage.create_node(
            new_id,
            name,
            node_type,
            version,
            meta,
            status="initialized",
            register_time_iso=now_iso,
            last_updated_iso=now_iso,
        )
        scheduler.trigger_nodes_json_refresh()
        return jsonify(created), HTTPStatus.CREATED

    @bp.put("/nodes/<node_id>/config")
    def update_node_config(node_id: str):
        payload = _get_json()
        updates = validate_config_payload(payload)
        try:
            updated = storage.update_config_and_labels(
                node_id,
                config=updates.get("config"),
                labels=updates.get("label"),
            )
        except KeyError:
            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
        # Labels feed nodes.json, so only a label change needs a refresh.
        if "label" in updates:
            scheduler.trigger_nodes_json_refresh()
        return jsonify(updated)

    @bp.get("/nodes/statistics")
    def node_statistics():
        # Registered after /nodes/<node_id>; Werkzeug should still prefer the
        # static rule over the converter — presumably safe, confirm routing.
        stats = storage.get_statistics()
        return jsonify(stats)

    @bp.put("/nodes/<node_id>/status")
    def update_status(node_id: str):
        payload = _get_json()
        data = validate_status_payload(payload)
        try:
            # The master records last_report; node status itself is derived
            # later by the scheduler from report age.
            updated = storage.update_last_report(
                node_id,
                server_timestamp_iso=to_iso(utcnow()),
                agent_timestamp_iso=data["timestamp"],
                health=data["health"],
            )
        except KeyError:
            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
        scheduler.trigger_nodes_json_refresh()
        return jsonify(updated)

    return bp
def _get_json() -> Mapping[str, Any]:
    """Return the current request body as a JSON object.

    Raises ValidationError (mapped to HTTP 400 by the blueprint handler)
    when the body is missing, unparseable, or not an object.
    """
    body = request.get_json(silent=True)
    if body is None:
        raise ValidationError("Request body must be valid JSON")
    if not isinstance(body, Mapping):
        raise ValidationError("Request body must be a JSON object")
    return body

24
src/master/app/routes.py Normal file
View File

@ -0,0 +1,24 @@
from __future__ import annotations
from flask import Flask, jsonify
from .config import AppConfig
from .nodes_api import create_nodes_blueprint
from .scheduler import StatusScheduler
from .storage import Storage
def register_routes(app: Flask, storage: Storage, scheduler: StatusScheduler, config: AppConfig) -> None:
    """Mount the nodes API under /api/v1/master and add health probes.

    NOTE(review): `config` is unused here — presumably kept for signature
    stability with the app factory; confirm before dropping it.
    """
    app.register_blueprint(create_nodes_blueprint(storage, scheduler), url_prefix="/api/v1/master")

    @app.get("/healthz")
    def healthz():
        # Liveness probe: the process is up and serving.
        return jsonify({"status": "ok"})

    @app.get("/readyz")
    def readyz():
        # Readiness probe: storage must answer a trivial query.
        try:
            storage.list_nodes()  # simple readiness probe
        except Exception as exc:  # pragma: no cover - defensive
            return jsonify({"status": "error", "error": str(exc)}), 500
        return jsonify({"status": "ok"})

View File

@ -0,0 +1,90 @@
from __future__ import annotations
import logging
import threading
from typing import Optional
from .config import AppConfig
from .storage import Storage
from .util import atomic_write_json, parse_iso, to_iso, utcnow
class StatusScheduler:
    """Background reconciler for node status and the Prometheus nodes.json.

    A single daemon thread wakes every `scheduler_interval_seconds`, flips
    node status based on the age of last_report, and regenerates nodes.json
    whenever any status changed or a refresh was requested explicitly via
    trigger_nodes_json_refresh().
    """

    def __init__(self, storage: Storage, config: AppConfig, logger: Optional[logging.Logger] = None) -> None:
        self._storage = storage
        self._config = config
        self._logger = logger or logging.getLogger("argus.master.scheduler")
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._run, name="status-scheduler", daemon=True)
        # Lock serializes nodes.json generation; the event marks a pending
        # out-of-band refresh request from the API handlers.
        self._nodes_json_lock = threading.Lock()
        self._pending_nodes_json = threading.Event()

    def start(self) -> None:
        """Start the background thread that refreshes statuses and nodes.json."""
        if not self._thread.is_alive():
            self._logger.info("Starting scheduler thread")
            self._thread.start()

    def stop(self) -> None:
        # Set both events so a sleeping loop wakes immediately, then wait
        # (bounded) for the thread to exit.
        self._stop_event.set()
        self._pending_nodes_json.set()
        self._thread.join(timeout=5)

    def trigger_nodes_json_refresh(self) -> None:
        """Request a nodes.json rewrite on the next loop iteration."""
        self._pending_nodes_json.set()

    def generate_nodes_json(self) -> None:
        """Atomically rewrite nodes.json with the currently online nodes."""
        with self._nodes_json_lock:
            online_nodes = self._storage.get_online_nodes()
            atomic_write_json(self._config.metric_nodes_json_path, online_nodes)
            self._logger.info("nodes.json updated", extra={"count": len(online_nodes)})

    # ------------------------------------------------------------------
    # internal loop
    # ------------------------------------------------------------------
    def _run(self) -> None:
        # Ensure nodes.json is generated immediately at startup.
        self._pending_nodes_json.set()
        # NOTE(review): an uncaught exception from _reconcile_statuses() would
        # terminate this daemon thread silently — confirm whether a guarded
        # try/except with logging is wanted here.
        while not self._stop_event.is_set():
            changed = self._reconcile_statuses()
            if changed or self._pending_nodes_json.is_set():
                try:
                    self.generate_nodes_json()
                finally:
                    self._pending_nodes_json.clear()
            self._stop_event.wait(self._config.scheduler_interval_seconds)

    def _reconcile_statuses(self) -> bool:
        """Flip node status by comparing last_report to now; True if any changed.

        delta > offline threshold → "offline"; delta <= online threshold →
        "online"; between the two thresholds the current status is kept.
        """
        any_status_changed = False
        now = utcnow()
        rows = self._storage.fetch_nodes_for_scheduler()
        for row in rows:
            node_id = row["id"]
            last_report_iso = row["last_report"]
            current_status = row["status"]
            last_report_dt = parse_iso(last_report_iso)
            if last_report_dt is None:
                # No report yet; treat as initialized until report arrives
                continue
            delta_seconds = (now - last_report_dt).total_seconds()
            new_status = current_status
            if delta_seconds > self._config.offline_threshold_seconds:
                new_status = "offline"
            elif delta_seconds <= self._config.online_threshold_seconds:
                new_status = "online"
            # Between thresholds: keep current status (sticky)
            if new_status != current_status:
                any_status_changed = True
                self._logger.info(
                    "Updating node status",
                    extra={
                        "node_id": node_id,
                        "previous": current_status,
                        "new": new_status,
                        "delta_seconds": delta_seconds,
                    },
                )
                self._storage.update_status(node_id, new_status, last_updated_iso=to_iso(now))
        return any_status_changed

332
src/master/app/storage.py Normal file
View File

@ -0,0 +1,332 @@
from __future__ import annotations
import json
import sqlite3
import threading
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
from .models import serialize_node_row, serialize_node_summary
from .util import ensure_parent, to_iso, utcnow
class Storage:
    """SQLite-backed persistence layer for node records.

    One connection is shared by all threads (check_same_thread=False) and
    every operation is serialized through a single lock, giving simple
    coarse-grained thread safety.
    """

    def __init__(self, db_path: str, node_id_prefix: str) -> None:
        self._db_path = db_path
        self._node_id_prefix = node_id_prefix
        ensure_parent(db_path)
        self._lock = threading.Lock()
        self._conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
        self._conn.row_factory = sqlite3.Row
        with self._lock:
            self._conn.execute("PRAGMA foreign_keys = ON;")
        self._ensure_schema()

    # ------------------------------------------------------------------
    # schema & helpers
    # ------------------------------------------------------------------
    def _ensure_schema(self) -> None:
        """Create tables and indexes so the schema is ready at startup (idempotent)."""
        with self._lock:
            self._conn.executescript(
                """
                CREATE TABLE IF NOT EXISTS nodes (
                    id TEXT PRIMARY KEY,
                    name TEXT NOT NULL UNIQUE,
                    type TEXT NOT NULL,
                    version TEXT,
                    status TEXT NOT NULL,
                    config_json TEXT,
                    labels_json TEXT,
                    meta_json TEXT,
                    health_json TEXT,
                    register_time TEXT,
                    last_report TEXT,
                    agent_last_report TEXT,
                    last_updated TEXT
                );
                CREATE TABLE IF NOT EXISTS kv (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL
                );
                CREATE INDEX IF NOT EXISTS idx_nodes_status ON nodes(status);
                CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);
                """
            )
            self._conn.commit()

    def close(self) -> None:
        with self._lock:
            self._conn.close()

    # ------------------------------------------------------------------
    # Node ID allocation
    # ------------------------------------------------------------------
    def allocate_node_id(self) -> str:
        """Maintain a counter in the kv table and mint ids like "A1", "A2", ..."""
        with self._lock:
            cur = self._conn.execute("SELECT value FROM kv WHERE key = ?", ("node_id_seq",))
            row = cur.fetchone()
            if row is None:
                next_id = 1
                self._conn.execute("INSERT INTO kv(key, value) VALUES(?, ?)", ("node_id_seq", str(next_id)))
            else:
                next_id = int(row["value"]) + 1
                self._conn.execute("UPDATE kv SET value = ? WHERE key = ?", (str(next_id), "node_id_seq"))
            self._conn.commit()
        return f"{self._node_id_prefix}{next_id}"

    # ------------------------------------------------------------------
    # Query helpers
    # ------------------------------------------------------------------
    def list_nodes(self) -> List[Dict[str, Any]]:
        """Return short summaries of all nodes, ordered by id."""
        with self._lock:
            cur = self._conn.execute(
                "SELECT id, name, status, type, version FROM nodes ORDER BY id ASC"
            )
            rows = cur.fetchall()
        return [serialize_node_summary(row) for row in rows]

    def get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
        """Return the full serialized node, or None when it does not exist."""
        with self._lock:
            cur = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,))
            row = cur.fetchone()
        if row is None:
            return None
        return serialize_node_row(row)

    def get_node_raw(self, node_id: str) -> Optional[sqlite3.Row]:
        """Return the raw sqlite row for a node (used for re-registration checks)."""
        with self._lock:
            cur = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,))
            row = cur.fetchone()
        return row

    def get_node_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """Return the full serialized node matching the unique name, or None."""
        with self._lock:
            cur = self._conn.execute("SELECT * FROM nodes WHERE name = ?", (name,))
            row = cur.fetchone()
        if row is None:
            return None
        return serialize_node_row(row)

    # ------------------------------------------------------------------
    # Mutation helpers
    # ------------------------------------------------------------------
    def create_node(
        self,
        node_id: str,
        name: str,
        node_type: str,
        version: str | None,
        meta_data: Mapping[str, Any],
        status: str,
        register_time_iso: str,
        last_updated_iso: str,
    ) -> Dict[str, Any]:
        """Insert the initial node record; config/labels/health start empty."""
        now_iso = last_updated_iso
        with self._lock:
            self._conn.execute(
                """
                INSERT INTO nodes (
                    id, name, type, version, status, config_json, labels_json, meta_json,
                    health_json, register_time, last_report, agent_last_report, last_updated
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    node_id,
                    name,
                    node_type,
                    version,
                    status,
                    json.dumps({}),
                    json.dumps([]),
                    json.dumps(dict(meta_data)),
                    json.dumps({}),
                    register_time_iso,
                    None,
                    None,
                    now_iso,
                ),
            )
            self._conn.commit()
        created = self.get_node(node_id)
        if created is None:
            raise RuntimeError("Failed to read back created node")
        return created

    def update_node_meta(
        self,
        node_id: str,
        *,
        name: Optional[str] = None,
        node_type: Optional[str] = None,
        version: Optional[str | None] = None,
        meta_data: Optional[Mapping[str, Any]] = None,
        last_updated_iso: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Update static node info on re-registration; omitted fields keep their value.

        Raises KeyError if the node does not exist.
        """
        updates: List[str] = []
        params: List[Any] = []
        if name is not None:
            updates.append("name = ?")
            params.append(name)
        if node_type is not None:
            updates.append("type = ?")
            params.append(node_type)
        if version is not None:
            updates.append("version = ?")
            params.append(version)
        if meta_data is not None:
            updates.append("meta_json = ?")
            params.append(json.dumps(dict(meta_data)))
        if last_updated_iso is not None:
            updates.append("last_updated = ?")
            params.append(last_updated_iso)
        if not updates:
            # Nothing to change: still verify existence.
            result = self.get_node(node_id)
            if result is None:
                raise KeyError(node_id)
            return result
        params.append(node_id)
        with self._lock:
            self._conn.execute(
                f"UPDATE nodes SET {', '.join(updates)} WHERE id = ?",
                tuple(params),
            )
            self._conn.commit()
        updated = self.get_node(node_id)
        if updated is None:
            raise KeyError(node_id)
        return updated

    def update_config_and_labels(
        self, node_id: str, *, config: Optional[Mapping[str, Any]] = None, labels: Optional[Iterable[str]] = None
    ) -> Dict[str, Any]:
        """Partially update config and/or labels and refresh last_updated.

        Raises KeyError if the node does not exist.
        """
        updates: List[str] = []
        params: List[Any] = []
        if config is not None:
            updates.append("config_json = ?")
            params.append(json.dumps(dict(config)))
        if labels is not None:
            updates.append("labels_json = ?")
            params.append(json.dumps(list(labels)))
        updates.append("last_updated = ?")
        params.append(to_iso(utcnow()))
        params.append(node_id)
        with self._lock:
            cur = self._conn.execute(
                f"UPDATE nodes SET {', '.join(updates)} WHERE id = ?",
                tuple(params),
            )
            # Bug fix: Connection.total_changes is cumulative for the whole
            # connection, so comparing it to 0 only detects a missing node on
            # a pristine connection. Cursor.rowcount reflects this statement.
            if cur.rowcount == 0:
                self._conn.rollback()
                raise KeyError(node_id)
            self._conn.commit()
        updated = self.get_node(node_id)
        if updated is None:
            raise KeyError(node_id)
        return updated

    def update_last_report(
        self,
        node_id: str,
        *,
        server_timestamp_iso: str,
        agent_timestamp_iso: str,
        health: Mapping[str, Any],
    ) -> Dict[str, Any]:
        """Record the latest report timestamps and health payload.

        Status itself is derived later by the scheduler. Raises KeyError if
        the node does not exist.
        """
        with self._lock:
            cur = self._conn.execute(
                """
                UPDATE nodes
                SET last_report = ?,
                    agent_last_report = ?,
                    health_json = ?,
                    last_updated = ?
                WHERE id = ?
                """,
                (
                    server_timestamp_iso,
                    agent_timestamp_iso,
                    json.dumps(health),
                    server_timestamp_iso,
                    node_id,
                ),
            )
            # Same total_changes bug fix as update_config_and_labels.
            if cur.rowcount == 0:
                self._conn.rollback()
                raise KeyError(node_id)
            self._conn.commit()
        updated = self.get_node(node_id)
        if updated is None:
            raise KeyError(node_id)
        return updated

    def update_status(self, node_id: str, status: str, *, last_updated_iso: str) -> None:
        """Set a node's status (called by the scheduler); unknown ids are no-ops."""
        with self._lock:
            self._conn.execute(
                "UPDATE nodes SET status = ?, last_updated = ? WHERE id = ?",
                (status, last_updated_iso, node_id),
            )
            self._conn.commit()

    # ------------------------------------------------------------------
    # Reporting helpers
    # ------------------------------------------------------------------
    def get_statistics(self) -> Dict[str, Any]:
        """Return the total node count and per-status counts."""
        with self._lock:
            cur = self._conn.execute("SELECT COUNT(*) AS total FROM nodes")
            total_row = cur.fetchone()
            cur = self._conn.execute("SELECT status, COUNT(*) AS count FROM nodes GROUP BY status")
            status_rows = cur.fetchall()
        return {
            "total": total_row["total"] if total_row else 0,
            "status_statistics": [
                {"status": row["status"], "count": row["count"]}
                for row in status_rows
            ],
        }

    def fetch_nodes_for_scheduler(self) -> List[sqlite3.Row]:
        """Return the minimal columns the scheduler needs for reconciliation."""
        with self._lock:
            cur = self._conn.execute(
                "SELECT id, last_report, status FROM nodes"
            )
            return cur.fetchall()

    def get_online_nodes(self) -> List[Dict[str, Any]]:
        """Return online nodes in the shape consumed by nodes.json."""
        with self._lock:
            cur = self._conn.execute(
                "SELECT id, meta_json, labels_json, name FROM nodes WHERE status = ? ORDER BY id ASC",
                ("online",),
            )
            rows = cur.fetchall()
        result: List[Dict[str, Any]] = []
        for row in rows:
            meta = json.loads(row["meta_json"]) if row["meta_json"] else {}
            labels = json.loads(row["labels_json"]) if row["labels_json"] else []
            result.append(
                {
                    "node_id": row["id"],
                    "user_id": meta.get("user"),
                    "ip": meta.get("ip"),
                    "hostname": meta.get("hostname", row["name"]),
                    "labels": labels if isinstance(labels, list) else [],
                }
            )
        return result

51
src/master/app/util.py Normal file
View File

@ -0,0 +1,51 @@
from __future__ import annotations
import json
import os
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable
ISO_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
def utcnow() -> datetime:
"""获取当前 UTC 时间,统一时间基准。"""
return datetime.now(timezone.utc)
def to_iso(dt: datetime | None) -> str | None:
if dt is None:
return None
return dt.astimezone(timezone.utc).replace(microsecond=0).strftime(ISO_FORMAT)
def parse_iso(value: str | None) -> datetime | None:
if not value:
return None
try:
if value.endswith("Z"):
return datetime.strptime(value, ISO_FORMAT).replace(tzinfo=timezone.utc)
# Fallback for ISO strings with offset
return datetime.fromisoformat(value).astimezone(timezone.utc)
except ValueError:
return None
def ensure_parent(path: str) -> None:
"""确保目标文件所在目录存在。"""
Path(path).parent.mkdir(parents=True, exist_ok=True)
def atomic_write_json(path: str, data: Iterable[Any] | Any) -> None:
"""原子化写 JSON避免被其它进程读到半成品。"""
ensure_parent(path)
directory = Path(path).parent
with tempfile.NamedTemporaryFile("w", dir=directory, delete=False) as tmp:
json.dump(data, tmp, separators=(",", ":"))
tmp.flush()
os.fsync(tmp.fileno())
temp_path = tmp.name
os.replace(temp_path, path)

View File

@ -0,0 +1 @@
../../bind/build/dns-monitor.sh

View File

@ -0,0 +1,59 @@
#!/usr/bin/env bash
# Entrypoint for the argus-master container:
#   1. prepare shared directories and ownership
#   2. run the bind DNS update hook when present
#   3. publish this container's IP for the bind service to sync
#   4. exec gunicorn as the unprivileged runtime user
set -euo pipefail

readonly DNS_DIR="/private/argus/etc"
readonly DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
readonly MASTER_DOMAIN_FILE="${DNS_DIR}/master.argus.com"
RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}"
RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"
readonly MASTER_DATA_DIR="/private/argus/master"
readonly METRIC_DIR="/private/argus/metric/prometheus"

# Detect the container's IPv4 address on eth0. Prefer iproute2 and fall back
# to ifconfig. Bug fix: the previous `grep -A 1 eth0 | grep inet` also matched
# inet6 lines and could emit multiple addresses; matching the "inet" field
# exactly and exiting after the first hit yields a single IPv4 address.
detect_master_ip() {
  if command -v ip >/dev/null 2>&1; then
    ip -4 -o addr show dev eth0 2>/dev/null | awk '{print $4}' | cut -d/ -f1 | head -n1
  elif command -v ifconfig >/dev/null 2>&1; then
    ifconfig eth0 2>/dev/null | awk '$1 == "inet" {print $2; exit}'
  fi
}

mkdir -p "$DNS_DIR"
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true

mkdir -p "$MASTER_DATA_DIR" "$METRIC_DIR"
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DATA_DIR" "$METRIC_DIR" 2>/dev/null || true

if [[ -x "$DNS_SCRIPT" ]]; then
  echo "[INFO] Running update-dns.sh before master starts"
  # Run the hook so the container uses bind as its DNS resolver.
  "$DNS_SCRIPT" || echo "[WARN] update-dns.sh execution failed"
else
  echo "[WARN] DNS update script not found or not executable: $DNS_SCRIPT"
fi

# Record the master's current IP so the bind service can sync it.
MASTER_IP="$(detect_master_ip || true)"
if [[ -n "${MASTER_IP}" ]]; then
  echo "current IP: ${MASTER_IP}"
  echo "${MASTER_IP}" > "$MASTER_DOMAIN_FILE"
  chown "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DOMAIN_FILE" 2>/dev/null || true
else
  echo "[WARN] Failed to detect master IP on eth0"
fi

WORKERS=${GUNICORN_WORKERS:-4}
BIND_ADDR=${GUNICORN_BIND:-0.0.0.0:3000}
EXTRA_OPTS=${GUNICORN_EXTRA_ARGS:-}
if [[ -n "$EXTRA_OPTS" ]]; then
  read -r -a EXTRA_ARRAY <<< "$EXTRA_OPTS"
else
  EXTRA_ARRAY=()
fi

# Named 'launch_cmd' (not 'command') to avoid confusion with the
# `command -v` builtin used below.
launch_cmd=(gunicorn --bind "$BIND_ADDR" --workers "$WORKERS")
if [[ ${#EXTRA_ARRAY[@]} -gt 0 ]]; then
  launch_cmd+=("${EXTRA_ARRAY[@]}")
fi
launch_cmd+=("app:create_app()")

# Drop privileges: prefer runuser; fall back to su with a safely quoted command.
if command -v runuser >/dev/null 2>&1; then
  exec runuser -u "$RUNTIME_USER" -- "${launch_cmd[@]}"
else
  printf -v _cmd_str '%q ' "${launch_cmd[@]}"
  exec su -s /bin/bash -m "$RUNTIME_USER" -c "exec ${_cmd_str}"
fi

View File

@ -0,0 +1,39 @@
[supervisord]
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root
[program:master]
command=/usr/local/bin/start-master.sh
user=root
stdout_logfile=/var/log/supervisor/master.log
stderr_logfile=/var/log/supervisor/master_error.log
autostart=true
autorestart=true
startsecs=5
stopwaitsecs=30
killasgroup=true
stopasgroup=true
[program:dns-monitor]
command=/usr/local/bin/dns-monitor.sh
user=root
stdout_logfile=/var/log/supervisor/dns-monitor.log
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
autostart=true
autorestart=true
startsecs=5
stopwaitsecs=10
killasgroup=true
stopasgroup=true
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700
[supervisorctl]
serverurl=unix:///var/run/supervisor.sock
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

View File

Binary file not shown.

View File

Some files were not shown because too many files have changed in this diff Show More