Compare commits
7 commits: abc739b1be ... fb4630c3f6

- fb4630c3f6
- c67dcb48a7
- f47d6560f5
- 41bd3ca1f6
- 5b461ece66
- 45b34cfe2c
- cc014a8a4d
.gitignore (vendored, 1 line changed)
@@ -1 +0,0 @@
-.idea/
README.md
@@ -5,10 +5,3 @@
 Project documentation: [Tencent Docs] GPU cluster operations system
 https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk
-
-## Build account configuration
-
-The UID/GID of the image build and runtime account can be set in `configs/build_user.conf`; see `doc/build-user-config.md` for details.
-
-## Local port conflict note
-
-To run the BIND module end-to-end test while port 53 is already taken on the host, set the environment variable `HOST_DNS_PORT` (default 1053) to choose the externally mapped port, e.g. `HOST_DNS_PORT=12053 ./scripts/00_e2e_test.sh`.
build/build_images.sh
@@ -1,205 +1,138 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
+# Help message
 show_help() {
-    cat <<'EOF'
+    cat << EOF
 ARGUS Unified Build System - Image Build Tool
 
 Usage: $0 [OPTIONS]
 
 Options:
-  --intranet          Use intranet mirror for log/bind builds
-  --master-offline    Build master offline image (requires src/master/offline_wheels.tar.gz)
+  --intranet          Use intranet mirror for Ubuntu 22.04 packages
   -h, --help          Show this help message
 
 Examples:
   $0                    # Build with default sources
   $0 --intranet         # Build with intranet mirror
-  $0 --master-offline   # Additionally build argus-master:offline
-  $0 --intranet --master-offline
 EOF
 }
 
+# Parse command-line arguments
 use_intranet=false
-build_master=true
-build_master_offline=false
 
 while [[ $# -gt 0 ]]; do
     case $1 in
         --intranet)
             use_intranet=true
             shift
             ;;
-        --master)
-            build_master=true
-            shift
-            ;;
-        --master-offline)
-            build_master=true
-            build_master_offline=true
-            shift
-            ;;
         -h|--help)
             show_help
             exit 0
             ;;
         *)
-            echo "Unknown option: $1" >&2
+            echo "Unknown option: $1"
             show_help
             exit 1
            ;;
    esac
done
 
+# Resolve the project root directory
 root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-. "$root/scripts/common/build_user.sh"
-
-declare -a build_args=()
-
-if [[ "$use_intranet" == true ]]; then
-    build_args+=("--build-arg" "USE_INTRANET=true")
-fi
 
 cd "$root"
 
-load_build_user
-build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
-
-master_root="$root/src/master"
-master_offline_tar="$master_root/offline_wheels.tar.gz"
-master_offline_dir="$master_root/offline_wheels"
-
-if [[ "$build_master_offline" == true ]]; then
-    if [[ ! -f "$master_offline_tar" ]]; then
-        echo "❌ offline wheels tar not found: $master_offline_tar" >&2
-        echo "   Prepare offline_wheels.tar.gz before running --master-offline" >&2
-        exit 1
-    fi
-    echo "📦 Preparing offline wheels for master (extracting $master_offline_tar)"
-    rm -rf "$master_offline_dir"
-    mkdir -p "$master_offline_dir"
-    tar -xzf "$master_offline_tar" -C "$master_root"
-    has_wheel=$(find "$master_offline_dir" -maxdepth 1 -type f -name '*.whl' -print -quit)
-    if [[ -z "$has_wheel" ]]; then
-        echo "❌ offline_wheels extraction failed or contains no wheels: $master_offline_dir" >&2
-        exit 1
-    fi
-fi
 
 echo "======================================="
 echo "ARGUS Unified Build System"
 echo "======================================="
 
 if [[ "$use_intranet" == true ]]; then
     echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)"
+    build_args="--build-arg USE_INTRANET=true"
 else
     echo "🌐 Mode: Public (Using default package sources)"
+    build_args=""
 fi
 
-echo "👤 Build user UID:GID -> ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"
-
 echo "📁 Build context: $root"
 echo ""
 
+# Function to build an image
 build_image() {
     local image_name=$1
     local dockerfile_path=$2
     local tag=$3
-    shift 3
-    local extra_args=("$@")
 
     echo "🔄 Building $image_name image..."
     echo "   Dockerfile: $dockerfile_path"
     echo "   Tag: $tag"
 
-    if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" .; then
+    if docker build $build_args -f "$dockerfile_path" -t "$tag" .; then
         echo "✅ $image_name image built successfully"
         return 0
     else
         echo "❌ Failed to build $image_name image"
         return 1
     fi
 }
 
+# Build all images
 images_built=()
 build_failed=false
 
+# Build the Elasticsearch image
 if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
     images_built+=("argus-elasticsearch:latest")
 else
     build_failed=true
 fi
 
 echo ""
 
+# Build the Kibana image
 if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
     images_built+=("argus-kibana:latest")
 else
     build_failed=true
 fi
 
 echo ""
 
+# Build the BIND9 image
 if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
     images_built+=("argus-bind9:latest")
 else
     build_failed=true
 fi
 
-echo ""
-
-if [[ "$build_master" == true ]]; then
-    echo ""
-    echo "🔄 Building Master image..."
-    pushd "$master_root" >/dev/null
-    master_args=("--tag" "argus-master:latest")
-    if [[ "$use_intranet" == true ]]; then
-        master_args+=("--intranet")
-    fi
-    if [[ "$build_master_offline" == true ]]; then
-        master_args+=("--offline")
-    fi
-    if ./scripts/build_images.sh "${master_args[@]}"; then
-        if [[ "$build_master_offline" == true ]]; then
-            images_built+=("argus-master:offline")
-        else
-            images_built+=("argus-master:latest")
-        fi
-    else
-        build_failed=true
-    fi
-    popd >/dev/null
-fi
-
 echo ""
 echo "======================================="
 echo "📦 Build Summary"
 echo "======================================="
 
 if [[ ${#images_built[@]} -gt 0 ]]; then
     echo "✅ Successfully built images:"
     for image in "${images_built[@]}"; do
         echo "   • $image"
     done
 fi
 
 if [[ "$build_failed" == true ]]; then
     echo ""
     echo "❌ Some images failed to build. Please check the errors above."
     exit 1
 fi
 
 if [[ "$use_intranet" == true ]]; then
     echo ""
     echo "🌐 Built with intranet mirror configuration"
-fi
-
-if [[ "$build_master_offline" == true ]]; then
-    echo ""
-    echo "🧳 Master offline wheels extracted to $master_offline_dir"
 fi
 
 echo ""
 echo "🚀 Next steps:"
-echo "   ./build/save_images.sh --compress   # Export images"
-echo "   cd src/master/tests && MASTER_IMAGE_TAG=argus-master:offline ./scripts/00_e2e_test.sh"
+echo "   cd src/log && ./scripts/save_images.sh     # Export log images"
+echo "   cd src/bind && ./scripts/save_images.sh    # Export bind images"
+echo "   cd src/log/tests && ./scripts/02_up.sh     # Start log services"
 echo ""
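Note on the change above: the removed version accumulated docker build flags in a bash array, while the surviving version passes a whitespace-split string. A minimal standalone sketch of why the array form is the safer pattern (not code from the repo):

    #!/usr/bin/env bash
    set -euo pipefail

    # Array form: every flag/value pair survives as its own word,
    # even if a value ever contains spaces.
    declare -a build_args=()
    build_args+=("--build-arg" "USE_INTRANET=true")
    build_args+=("--build-arg" "ARGUS_BUILD_UID=2133")
    printf '<%s>\n' "${build_args[@]}"

    # String form: correct only while no value contains whitespace,
    # and it must be expanded unquoted to split into flags at all.
    build_args_str="--build-arg USE_INTRANET=true"
    printf '<%s>\n' $build_args_str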
build/save_images.sh
@@ -67,7 +67,6 @@ declare -A images=(
     ["argus-elasticsearch:latest"]="argus-elasticsearch-latest.tar"
     ["argus-kibana:latest"]="argus-kibana-latest.tar"
     ["argus-bind9:latest"]="argus-bind9-latest.tar"
-    ["argus-master:offline"]="argus-master-offline.tar"
 )
 
 # Function: check whether an image exists
configs/.gitignore (vendored, 2 lines changed)
@@ -1,2 +0,0 @@
-# Local overrides for build user/group settings
-build_user.local.conf
configs/build_user.conf
@@ -1,6 +0,0 @@
-# Default build-time UID/GID for Argus images
-# Override by creating configs/build_user.local.conf with the same format.
-# Syntax: KEY=VALUE, supports UID/GID only. Whitespace and lines starting with # are ignored.
-
-UID=2133
-GID=2015
doc/build-user-config.md
@@ -1,38 +0,0 @@
-# Argus image build UID/GID configuration
-
-A single configuration file assigns the runtime account for the Kibana, Elasticsearch, Bind, and Master containers, avoiding the permission problems caused by mismatched UID/GID across deployment machines.
-
-## Configuration entry points
-
-- The defaults live in `configs/build_user.conf`, for example:
-
-```bash
-UID=2133
-GID=2015
-```
-
-- For a local override, create `build_user.local.conf` under `configs/` with the same fields as the default file. It is listed in `.gitignore`, so it cannot be committed by accident.
-- The environment variables `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID` can also force values before a script runs; they take the highest priority.
-
-## Scope
-
-- `build/build_images.sh` reads the configuration when building the log/bind/master images and passes `--build-arg ARGUS_BUILD_UID/GID`; the console prints the UID/GID in use.
-- `src/master/scripts/build_images.sh` uses the same configuration, so building the master image on its own behaves identically.
-- Each image's Dockerfile adjusts the in-container account (such as `elasticsearch`, `kibana`, `bind`, `argus`) according to the UID/GID passed in, and exposes the values as environment variables at runtime.
-- The master startup script drops privileges to the configured account to run `gunicorn` after its DNS logic, so files written under `/private/argus/**` get the correct owner.
-- The log module test script `01_bootstrap.sh` fixes the owner of mounted directories according to the configuration, so the end-to-end tests run under any user.
-
-## Recommendations
-
-1. No changes are needed after a fresh clone; the default UID/GID stays backward compatible.
-2. To use a new account in the target environment (for example `uid=4001,gid=4001`):
-   - edit `configs/build_user.local.conf` with the new values;
-   - log in with the new account and make sure it is in the host's `docker` group;
-   - re-run `build/build_images.sh` or the module-specific build scripts.
-3. After switching the configuration, re-run the end-to-end scripts of the affected modules (such as `src/log/tests/scripts/01_bootstrap.sh`, `src/master/tests/scripts/00_e2e_test.sh`, `src/agent/tests/scripts/00_e2e_test.sh`) and verify that files under `/private/argus` are owned by the expected account.
-
-## Troubleshooting
-
-- **Image build fails with `groupmod: GID already in use`**: the chosen GID already exists in the base image; pick an unused value, or remove the conflict in a custom base image first.
-- **Write permission errors at container runtime**: check whether the mounted host directories were created by the target UID/GID; if necessary, re-run the module's preparation script such as `01_bootstrap.sh`.
-- **The old UID/GID still shows up**: make sure the script run does not inherit a stale cache; run `ARGUS_BUILD_UID=... ARGUS_BUILD_GID=... ./build/build_images.sh` to force an override.
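Putting the precedence rules above together, two hypothetical override styles (the values are the doc's own example):

    # Highest priority: one-off environment override
    ARGUS_BUILD_UID=4001 ARGUS_BUILD_GID=4001 ./build/build_images.sh

    # Persistent local override, ignored by git
    printf 'UID=4001\nGID=4001\n' > configs/build_user.local.conf
    ./build/build_images.sh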
scripts/common/build_user.sh
@@ -1,115 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Shared helper to load Argus build user/group configuration.
-# Usage:
-#   source "${PROJECT_ROOT}/scripts/common/build_user.sh"
-#   load_build_user
-#   echo "$ARGUS_BUILD_UID:$ARGUS_BUILD_GID"
-
-ARGUS_BUILD_UID_DEFAULT=2133
-ARGUS_BUILD_GID_DEFAULT=2015
-
-shopt -s extglob
-
-_ARGUS_BUILD_USER_LOADED="${_ARGUS_BUILD_USER_LOADED:-0}"
-
-_argus_build_user_script_dir() {
-    local dir
-    dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-    echo "$dir"
-}
-
-argus_project_root() {
-    local script_dir
-    script_dir="$(_argus_build_user_script_dir)"
-    (cd "$script_dir/../.." >/dev/null && pwd)
-}
-
-_argus_trim() {
-    local value="$1"
-    value="${value##+([[:space:]])}"
-    value="${value%%+([[:space:]])}"
-    printf '%s' "$value"
-}
-
-_argus_is_number() {
-    [[ "$1" =~ ^[0-9]+$ ]]
-}
-
-load_build_user() {
-    if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then
-        return 0
-    fi
-
-    local project_root config_files config uid gid
-    project_root="$(argus_project_root)"
-    config_files=(
-        "$project_root/configs/build_user.local.conf"
-        "$project_root/configs/build_user.conf"
-    )
-
-    uid="$ARGUS_BUILD_UID_DEFAULT"
-    gid="$ARGUS_BUILD_GID_DEFAULT"
-
-    for config in "${config_files[@]}"; do
-        if [[ -f "$config" ]]; then
-            while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do
-                local line key value
-                line="${raw_line%%#*}"
-                line="$(_argus_trim "${line}")"
-                [[ -z "$line" ]] && continue
-                if [[ "$line" != *=* ]]; then
-                    echo "[ARGUS build_user] Ignoring malformed line in $config: $raw_line" >&2
-                    continue
-                fi
-                key="${line%%=*}"
-                value="${line#*=}"
-                key="$(_argus_trim "$key")"
-                value="$(_argus_trim "$value")"
-                case "$key" in
-                    UID)
-                        uid="$value"
-                        ;;
-                    GID)
-                        gid="$value"
-                        ;;
-                    *)
-                        echo "[ARGUS build_user] Unknown key '$key' in $config" >&2
-                        ;;
-                esac
-            done < "$config"
-            break
-        fi
-    done
-
-    if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then
-        uid="$ARGUS_BUILD_UID"
-    fi
-    if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then
-        gid="$ARGUS_BUILD_GID"
-    fi
-
-    if ! _argus_is_number "$uid"; then
-        echo "[ARGUS build_user] Invalid UID '$uid'" >&2
-        return 1
-    fi
-    if ! _argus_is_number "$gid"; then
-        echo "[ARGUS build_user] Invalid GID '$gid'" >&2
-        return 1
-    fi
-
-    export ARGUS_BUILD_UID="$uid"
-    export ARGUS_BUILD_GID="$gid"
-    _ARGUS_BUILD_USER_LOADED=1
-}
-
-argus_build_user_args() {
-    load_build_user
-    printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}"
-}
-
-print_build_user() {
-    load_build_user
-    echo "ARGUS build user: UID=${ARGUS_BUILD_UID} GID=${ARGUS_BUILD_GID}"
-}
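A consumer of the helper above, following its own usage comment (a sketch: the relative path assumes a caller one directory below the repo root):

    #!/usr/bin/env bash
    set -euo pipefail

    root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
    source "$root/scripts/common/build_user.sh"

    load_build_user
    echo "$ARGUS_BUILD_UID:$ARGUS_BUILD_GID"

    # argus_build_user_args prints ready-made docker flags; expand unquoted
    # so they split into separate arguments (UID/GID are numeric, no spaces).
    docker build $(argus_build_user_args) -t example/image:latest .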
src/.gitignore (vendored, 2 lines changed)
@@ -1,2 +0,0 @@
-
-__pycache__/
src/agent/.gitignore (vendored, 5 lines changed)
@@ -1,5 +0,0 @@
-build/
-*.egg-info/
-__pycache__/
-
-.env
src/agent/README.md
@@ -1,66 +0,0 @@
-# Argus Agent module
-
-The Argus Agent is a lightweight Python process that registers the node with the Argus Master, reports health data, and maintains locally persisted state. The module is now packaged with PyInstaller into a standalone executable, so it can run directly in a plain container or VM.
-
-## Building the executable
-
-```bash
-cd src/agent
-./scripts/build_binary.sh   # produces dist/argus-agent
-```
-
-By default the script runs PyInstaller inside a Docker container (`python:3.11-slim-bullseye`) so the artifact is compatible with glibc 2.31+ (covering 2.35 environments). Notes on the build process:
-
-- Before every build, `build/` and `dist/` are cleaned and a fresh virtual environment is created inside the container.
-- To use an internal Python mirror, pass it through environment variables such as `PIP_INDEX_URL`, `PIP_EXTRA_INDEX_URL`, and `PIP_TRUSTED_HOST`; the script forwards them to the container.
-- If the host cannot run Docker, set `AGENT_BUILD_USE_DOCKER=0` to fall back to a local build; in that case the build must run on a machine with glibc ≤ 2.35.
-
-After the build, the script unpacks the key shared libraries under `build/compat_check/` and prints the highest `GLIBC_x.y` version for a quick compatibility check. If the result lacks `libssl.so.3` / `libcrypto.so.3`, the binary will use the local OpenSSL libraries on the target host and no further action is needed.
-
-For example:
-
-```bash
-strings build/compat_check/libpython*.so.1.0 | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n1
-```
-
-If the build fails, the usual causes are Docker being unavailable (switch to `AGENT_BUILD_USE_DOCKER=0`) or the Python package mirror being unreachable (set the mirror variables above and retry).
-
-## Runtime configuration
-
-The agent no longer relies on a configuration file; every parameter is derived from environment variables and the hostname:
-
-| Variable | Required | Default | Description |
-| --- | --- | --- | --- |
-| `MASTER_ENDPOINT` | yes | N/A | Master base address; `http://host:3000` or `host:3000` (the `http://` prefix is added automatically). |
-| `REPORT_INTERVAL_SECONDS` | no | `60` | Status report interval in seconds. Must be a positive integer. |
-| `AGENT_HOSTNAME` | no | `$(hostname)` | Overrides the in-container hostname, for tests or special naming needs. |
-
-Derived paths:
-
-- Node state: `/private/argus/agent/<hostname>/node.json`
-- Sub-module health directory: `/private/argus/agent/<hostname>/health/`
-
-Files in the health directory must follow the `<module prefix>-*.json` naming convention (for example `log-fluentbit.json`, `metric-node-exporter.json`); their content is merged verbatim into the `health` field of the report.
-
-## Logging and persistence
-
-- The agent emits structured logs at key points (successful registration, status reports, retries after errors) for easier aggregation.
-- `node.json` stores the latest node object returned by the master, so the existing node ID is reused after a restart.
-
-## End-to-end tests
-
-The repository provides a Docker Compose test stack (master + ubuntu containers):
-
-```bash
-cd src/agent/tests
-./scripts/00_e2e_test.sh
-```
-
-The test script:
-
-1. Builds the master image and the agent executable.
-2. Starts the agent container from `ubuntu:24.04` and injects `MASTER_ENDPOINT` and `REPORT_INTERVAL_SECONDS` via environment variables.
-3. Verifies registration, health reporting, nodes.json generation, the statistics endpoint, and the "container restart + IP change" re-registration flow.
-4. Cleans up `tests/private/` and the temporary container network.
-
-For a real deployment, mount `dist/argus-agent` together with the health directory onto the target host and set the environment variables from the table above.
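To make the health-file convention above concrete, a hypothetical fluent-bit health file the agent would merge into its report (the JSON fields are illustrative, not a schema from the repo):

    mkdir -p "/private/argus/agent/$(hostname)/health"
    printf '{"status": "healthy", "checked_at": "2024-01-01T00:00:00Z"}\n' \
      > "/private/argus/agent/$(hostname)/health/log-fluentbit.json"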
src/agent/app/client.py
@@ -1,60 +0,0 @@
-from __future__ import annotations
-
-import json
-from typing import Any, Dict, Optional
-
-import requests
-
-from .log import get_logger
-
-LOGGER = get_logger("argus.agent.client")
-
-
-class MasterAPIError(Exception):
-    def __init__(self, message: str, status_code: int, payload: Optional[Dict[str, Any]] = None) -> None:
-        super().__init__(message)
-        self.status_code = status_code
-        self.payload = payload or {}
-
-
-class AgentClient:
-    def __init__(self, base_url: str, *, timeout: int = 10) -> None:
-        self._base_url = base_url.rstrip("/")
-        self._timeout = timeout
-        self._session = requests.Session()
-
-    def register_node(self, body: Dict[str, Any]) -> Dict[str, Any]:
-        """Call the master registration endpoint and return the node object."""
-        url = f"{self._base_url}/api/v1/master/nodes"
-        response = self._session.post(url, json=body, timeout=self._timeout)
-        return self._parse_response(response, "Failed to register node")
-
-    def update_status(self, node_id: str, body: Dict[str, Any]) -> Dict[str, Any]:
-        """Report health data; the master updates last_report."""
-        url = f"{self._base_url}/api/v1/master/nodes/{node_id}/status"
-        response = self._session.put(url, json=body, timeout=self._timeout)
-        return self._parse_response(response, "Failed to update node status")
-
-    def _parse_response(self, response: requests.Response, error_prefix: str) -> Dict[str, Any]:
-        content_type = response.headers.get("Content-Type", "")
-        payload: Dict[str, Any] | None = None
-        if "application/json" in content_type:
-            try:
-                payload = response.json()
-            except json.JSONDecodeError:
-                LOGGER.warning("Response contained invalid JSON", extra={"status": response.status_code})
-
-        if response.status_code >= 400:
-            message = payload.get("error") if isinstance(payload, dict) else response.text
-            raise MasterAPIError(
-                f"{error_prefix}: {message}",
-                status_code=response.status_code,
-                payload=payload if isinstance(payload, dict) else None,
-            )
-
-        if payload is None:
-            try:
-                payload = response.json()
-            except json.JSONDecodeError as exc:
-                raise MasterAPIError("Master returned non-JSON payload", response.status_code) from exc
-        return payload
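The two endpoints AgentClient wraps, exercised directly with curl (host, port, and payload fields below are illustrative):

    # POST /api/v1/master/nodes — register (or re-register) a node
    curl -X POST http://master:3000/api/v1/master/nodes \
      -H 'Content-Type: application/json' \
      -d '{"name": "dev-alice-001-host", "type": "agent", "version": "1.1.0", "meta_data": {}}'

    # PUT /api/v1/master/nodes/<node-id>/status — report health
    curl -X PUT http://master:3000/api/v1/master/nodes/<node-id>/status \
      -H 'Content-Type: application/json' \
      -d '{"timestamp": "2024-01-01T00:00:00Z", "health": {}}'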
src/agent/app/collector.py
@@ -1,111 +0,0 @@
-from __future__ import annotations
-
-import os
-import re
-import socket
-import subprocess
-from pathlib import Path
-from typing import Any, Dict
-
-from .config import AgentConfig
-from .log import get_logger
-
-LOGGER = get_logger("argus.agent.collector")
-
-_HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
-
-
-def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
-    """Gather the static information needed for node registration."""
-    hostname = config.hostname
-    env, user, instance = _parse_hostname(hostname)
-    meta = {
-        "hostname": hostname,
-        "ip": _detect_ip_address(),
-        "env": env,
-        "user": user,
-        "instance": instance,
-        "cpu_number": _detect_cpu_count(),
-        "memory_in_bytes": _detect_memory_bytes(),
-        "gpu_number": _detect_gpu_count(),
-    }
-    return meta
-
-
-def _parse_hostname(hostname: str) -> tuple[str, str, str]:
-    """Split the hostname by the agreed env-user-instance prefix."""
-    match = _HOSTNAME_PATTERN.match(hostname)
-    if not match:
-        LOGGER.warning("Hostname does not match expected pattern", extra={"hostname": hostname})
-        return "", "", ""
-    return match.group(1), match.group(2), match.group(3)
-
-
-def _detect_cpu_count() -> int:
-    count = os.cpu_count()
-    return count if count is not None else 0
-
-
-def _detect_memory_bytes() -> int:
-    """Prefer the cgroup limit; fall back to /proc/meminfo on failure."""
-    cgroup_path = Path("/sys/fs/cgroup/memory.max")
-    try:
-        raw = cgroup_path.read_text(encoding="utf-8").strip()
-        if raw and raw != "max":
-            return int(raw)
-    except FileNotFoundError:
-        LOGGER.debug("cgroup memory.max not found, falling back to /proc/meminfo")
-    except ValueError:
-        LOGGER.warning("Failed to parse memory.max, falling back", extra={"value": raw})
-
-    try:
-        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
-            for line in handle:
-                if line.startswith("MemTotal:"):
-                    parts = line.split()
-                    if len(parts) >= 2:
-                        return int(parts[1]) * 1024
-    except FileNotFoundError:
-        LOGGER.error("/proc/meminfo not found; defaulting memory to 0")
-    return 0
-
-
-def _detect_gpu_count() -> int:
-    """Detect the number of GPUs; default to 0 when detection fails."""
-    try:
-        proc = subprocess.run(
-            ["nvidia-smi", "-L"],
-            check=False,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            timeout=5,
-        )
-    except FileNotFoundError:
-        LOGGER.debug("nvidia-smi not available; assuming 0 GPUs")
-        return 0
-    except subprocess.SubprocessError as exc:
-        LOGGER.warning("nvidia-smi invocation failed", extra={"error": str(exc)})
-        return 0
-
-    if proc.returncode != 0:
-        LOGGER.debug("nvidia-smi returned non-zero", extra={"stderr": proc.stderr.strip()})
-        return 0
-
-    count = sum(1 for line in proc.stdout.splitlines() if line.strip())
-    return count
-
-
-def _detect_ip_address() -> str:
-    """Try to get the container's egress IP via a UDP socket; fall back to resolving the hostname."""
-    try:
-        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
-            sock.connect(("8.8.8.8", 80))
-            return sock.getsockname()[0]
-    except OSError:
-        LOGGER.debug("UDP socket trick failed; falling back to hostname lookup")
-    try:
-        return socket.gethostbyname(socket.gethostname())
-    except OSError:
-        LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
-        return "127.0.0.1"
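The env-user-instance hostname convention from _HOSTNAME_PATTERN, checked with the equivalent bash regex (the hostname value is illustrative):

    hostname="dev-alice-001-node7"
    if [[ "$hostname" =~ ^([^-]+)-([^-]+)-([^-]+)-.*$ ]]; then
      echo "env=${BASH_REMATCH[1]} user=${BASH_REMATCH[2]} instance=${BASH_REMATCH[3]}"
    fi
    # prints: env=dev user=alice instance=001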
src/agent/app/config.py
@@ -1,74 +0,0 @@
-from __future__ import annotations
-
-import os
-import socket
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Final
-
-from .version import VERSION
-
-DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
-
-
-@dataclass(frozen=True)
-class AgentConfig:
-    hostname: str
-    node_file: str
-    version: str
-    master_endpoint: str
-    report_interval_seconds: int
-    health_dir: str
-    request_timeout_seconds: int = 10
-
-
-def _normalise_master_endpoint(value: str) -> str:
-    value = value.strip()
-    if not value:
-        raise ValueError("MASTER_ENDPOINT environment variable is required")
-    if not value.startswith("http://") and not value.startswith("https://"):
-        value = f"http://{value}"
-    return value.rstrip("/")
-
-
-def _read_report_interval(raw_value: str | None) -> int:
-    if raw_value is None or raw_value.strip() == "":
-        return DEFAULT_REPORT_INTERVAL_SECONDS
-    try:
-        interval = int(raw_value)
-    except ValueError as exc:
-        raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
-    if interval <= 0:
-        raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
-    return interval
-
-
-def _resolve_hostname() -> str:
-    return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
-
-
-def load_config() -> AgentConfig:
-    """Derive the configuration from environment variables; the external config file dependency is gone."""
-    hostname = _resolve_hostname()
-    node_file = f"/private/argus/agent/{hostname}/node.json"
-    health_dir = f"/private/argus/agent/{hostname}/health/"
-
-    master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
-    if master_endpoint_env is None:
-        raise ValueError("MASTER_ENDPOINT environment variable is not set")
-    master_endpoint = _normalise_master_endpoint(master_endpoint_env)
-
-    report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))
-
-    Path(node_file).parent.mkdir(parents=True, exist_ok=True)
-    Path(health_dir).mkdir(parents=True, exist_ok=True)
-
-    return AgentConfig(
-        hostname=hostname,
-        node_file=node_file,
-        version=VERSION,
-        master_endpoint=master_endpoint,
-        report_interval_seconds=report_interval_seconds,
-        health_dir=health_dir,
-    )
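A minimal environment for load_config above, per the variables it reads (the endpoint and hostname values are illustrative):

    export MASTER_ENDPOINT=master:3000        # http:// is prepended automatically
    export REPORT_INTERVAL_SECONDS=30         # optional; defaults to 60
    export AGENT_HOSTNAME=dev-alice-001-demo  # optional; defaults to $(hostname)
    ./dist/argus-agent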
src/agent/app/health_reader.py
@@ -1,32 +0,0 @@
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any, Dict
-
-from .log import get_logger
-
-LOGGER = get_logger("argus.agent.health")
-
-
-def read_health_directory(path: str) -> Dict[str, Any]:
-    """Read every <prefix>-*.json file in the directory and return a JSON mapping."""
-    result: Dict[str, Any] = {}
-    directory = Path(path)
-    if not directory.exists():
-        LOGGER.debug("Health directory does not exist", extra={"path": str(directory)})
-        return result
-
-    for health_file in sorted(directory.glob("*.json")):
-        if "-" not in health_file.stem:
-            LOGGER.debug("Skipping non-prefixed health file", extra={"file": health_file.name})
-            continue
-        try:
-            with health_file.open("r", encoding="utf-8") as handle:
-                content = json.load(handle)
-            result[health_file.stem] = content
-        except json.JSONDecodeError as exc:
-            LOGGER.warning("Failed to parse health file", extra={"file": health_file.name, "error": str(exc)})
-        except OSError as exc:
-            LOGGER.warning("Failed to read health file", extra={"file": health_file.name, "error": str(exc)})
-    return result
src/agent/app/log.py
@@ -1,18 +0,0 @@
-from __future__ import annotations
-
-import logging
-import os
-
-_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s - %(message)s"
-
-
-def setup_logging() -> None:
-    level_name = os.environ.get("AGENT_LOG_LEVEL", "INFO").upper()
-    level = getattr(logging, level_name, logging.INFO)
-    logging.basicConfig(level=level, format=_LOG_FORMAT)
-
-
-def get_logger(name: str) -> logging.Logger:
-    setup_logging()
-    return logging.getLogger(name)
src/agent/app/main.py
@@ -1,163 +0,0 @@
-from __future__ import annotations
-
-import signal
-import sys  # used by the __main__ block below; missing from the original import list
-import time
-from datetime import datetime, timezone
-from typing import Optional
-
-from .client import AgentClient, MasterAPIError
-from .collector import collect_metadata
-from .config import AgentConfig, load_config
-from .health_reader import read_health_directory
-from .log import get_logger, setup_logging
-from .state import clear_node_state, load_node_state, save_node_state
-
-LOGGER = get_logger("argus.agent")
-
-
-def _current_timestamp() -> str:
-    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
-
-
-class StopSignal:
-    def __init__(self) -> None:
-        self._stop = False
-
-    def set(self, *_args) -> None:  # type: ignore[override]
-        self._stop = True
-
-    def is_set(self) -> bool:
-        return self._stop
-
-
-def main(argv: Optional[list[str]] = None) -> int:  # noqa: ARG001 - keep the signature for entry-point compatibility
-    setup_logging()
-
-    stop_signal = StopSignal()
-    signal.signal(signal.SIGTERM, stop_signal.set)
-    signal.signal(signal.SIGINT, stop_signal.set)
-
-    try:
-        config = load_config()
-    except Exception as exc:
-        LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
-        return 1
-
-    LOGGER.info(
-        "Agent starting",
-        extra={
-            "hostname": config.hostname,
-            "master_endpoint": config.master_endpoint,
-            "node_file": config.node_file,
-        },
-    )
-
-    client = AgentClient(config.master_endpoint, timeout=config.request_timeout_seconds)
-
-    node_state = load_node_state(config.node_file) or {}
-    node_id = node_state.get("id")
-
-    # Establish registration with the master (re-registration supported); retry on failure
-    register_response = _register_with_retry(client, config, node_id, stop_signal)
-    if register_response is None:
-        LOGGER.info("Registration aborted due to shutdown signal")
-        return 0
-
-    node_id = register_response.get("id")
-    if not node_id:
-        LOGGER.error("Master did not return node id; aborting")
-        return 1
-    save_node_state(config.node_file, register_response)
-
-    LOGGER.info("Entering status report loop", extra={"node_id": node_id})
-    _status_loop(client, config, node_id, stop_signal)
-    return 0
-
-
-def _register_with_retry(
-    client: AgentClient,
-    config: AgentConfig,
-    node_id: Optional[str],
-    stop_signal: StopSignal,
-):
-    backoff = 5
-    while not stop_signal.is_set():
-        payload = {
-            "name": config.hostname,
-            "type": "agent",
-            "meta_data": collect_metadata(config),
-            "version": config.version,
-        }
-        if node_id:
-            payload["id"] = node_id
-
-        try:
-            response = client.register_node(payload)
-            LOGGER.info("Registration successful", extra={"node_id": response.get("id")})
-            save_node_state(config.node_file, response)
-            return response
-        except MasterAPIError as exc:
-            if exc.status_code == 404 and node_id:
-                LOGGER.warning(
-                    "Master does not recognise node id; clearing local node state",
-                    extra={"node_id": node_id},
-                )
-                clear_node_state(config.node_file)
-                node_id = None
-            elif exc.status_code == 500 and node_id:
-                # An id/name mismatch usually means a configuration problem; log it and keep retrying
-                LOGGER.error(
-                    "Master rejected node due to id/name mismatch; will retry",
-                    extra={"node_id": node_id},
-                )
-            else:
-                LOGGER.error("Registration failed", extra={"status_code": exc.status_code, "error": str(exc)})
-            time.sleep(min(backoff, 60))
-            backoff = min(backoff * 2, 60)
-        except Exception as exc:  # pragma: no cover - defensive
-            LOGGER.exception("Unexpected error during registration", extra={"error": str(exc)})
-            time.sleep(min(backoff, 60))
-            backoff = min(backoff * 2, 60)
-    return None
-
-
-def _status_loop(
-    client: AgentClient,
-    config: AgentConfig,
-    node_id: str,
-    stop_signal: StopSignal,
-) -> None:
-    interval = config.report_interval_seconds
-    while not stop_signal.is_set():
-        timestamp = _current_timestamp()
-        health_payload = read_health_directory(config.health_dir)
-        body = {
-            "timestamp": timestamp,
-            "health": health_payload,
-        }
-        try:
-            response = client.update_status(node_id, body)
-            LOGGER.info(
-                "Status report succeeded",
-                extra={"node_id": node_id, "health_keys": list(health_payload.keys())},
-            )
-            save_node_state(config.node_file, response)
-        except MasterAPIError as exc:
-            # Keep the loop running and wait for the next retry
-            LOGGER.error(
-                "Failed to report status",
-                extra={"status_code": exc.status_code, "error": str(exc)},
-            )
-        except Exception as exc:  # pragma: no cover - defensive
-            LOGGER.exception("Unexpected error during status report", extra={"error": str(exc)})
-
-        for _ in range(interval):
-            if stop_signal.is_set():
-                break
-            time.sleep(1)
-
-    LOGGER.info("Stop signal received; exiting status loop")
-
-
-if __name__ == "__main__":
-    sys.exit(main())
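Because the loop above polls the stop flag every second, a plain SIGTERM is enough for a clean shutdown; a sketch of a supervised run (the sleep duration is arbitrary):

    ./dist/argus-agent &
    agent_pid=$!
    sleep 120
    kill -TERM "$agent_pid"   # the status loop notices the flag within about a second
    wait "$agent_pid"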
src/agent/app/state.py
@@ -1,44 +0,0 @@
-from __future__ import annotations
-
-import json
-import os
-import tempfile
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-from .log import get_logger
-
-LOGGER = get_logger("argus.agent.state")
-
-
-def load_node_state(path: str) -> Optional[Dict[str, Any]]:
-    """Read the local node.json so the previous ID is reused after a container restart."""
-    try:
-        with open(path, "r", encoding="utf-8") as handle:
-            return json.load(handle)
-    except FileNotFoundError:
-        return None
-    except json.JSONDecodeError as exc:
-        LOGGER.warning("node.json is invalid JSON; ignoring", extra={"error": str(exc)})
-        return None
-
-
-def save_node_state(path: str, data: Dict[str, Any]) -> None:
-    """Write node.json atomically so concurrent readers never see partial data."""
-    directory = Path(path).parent
-    directory.mkdir(parents=True, exist_ok=True)
-    with tempfile.NamedTemporaryFile("w", dir=directory, delete=False, encoding="utf-8") as tmp:
-        json.dump(data, tmp, separators=(",", ":"))
-        tmp.flush()
-        os.fsync(tmp.fileno())
-        temp_path = tmp.name
-    os.replace(temp_path, path)
-
-
-def clear_node_state(path: str) -> None:
-    try:
-        os.remove(path)
-    except FileNotFoundError:
-        return
-    except OSError as exc:
-        LOGGER.warning("Failed to remove node state file", extra={"error": str(exc), "path": path})
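save_node_state's write-temp-then-rename trick, shown as the equivalent shell idiom; mv (like os.replace) is atomic within one filesystem, so a reader of node.json never sees a half-written file:

    node_dir="/private/argus/agent/$(hostname)"
    tmp="$(mktemp "$node_dir/node.json.XXXXXX")"
    printf '{"id":"example"}' > "$tmp"   # illustrative payload
    mv -f "$tmp" "$node_dir/node.json"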
src/agent/app/version.py
@@ -1,69 +0,0 @@
-from __future__ import annotations
-
-import os
-import sys
-from pathlib import Path
-from typing import Optional
-
-import importlib.metadata
-
-try:
-    import tomllib
-except ModuleNotFoundError:  # pragma: no cover
-    import tomli as tomllib  # type: ignore[no-redef]
-
-
-def _candidate_paths() -> list[Path]:
-    paths = []
-    bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None)
-    if bundle_dir:
-        paths.append(Path(bundle_dir) / "pyproject.toml")
-    paths.append(Path(__file__).resolve().parent.parent / "pyproject.toml")
-    paths.append(Path(__file__).resolve().parent / "pyproject.toml")
-    paths.append(Path.cwd() / "pyproject.toml")
-    return paths
-
-
-def _read_from_pyproject() -> Optional[str]:
-    for path in _candidate_paths():
-        if not path.exists():
-            continue
-        try:
-            with path.open("rb") as handle:
-                data = tomllib.load(handle)
-        except (OSError, tomllib.TOMLDecodeError):
-            continue
-        project = data.get("project")
-        if isinstance(project, dict):
-            version = project.get("version")
-            if isinstance(version, str):
-                return version
-        tool = data.get("tool")
-        if isinstance(tool, dict):
-            argus_cfg = tool.get("argus")
-            if isinstance(argus_cfg, dict):
-                version = argus_cfg.get("version")
-                if isinstance(version, str):
-                    return version
-    return None
-
-
-def _detect_version() -> str:
-    try:
-        return importlib.metadata.version("argus-agent")
-    except importlib.metadata.PackageNotFoundError:
-        pass
-    override = os.environ.get("AGENT_VERSION_OVERRIDE")
-    if override:
-        return override
-    fallback = _read_from_pyproject()
-    if fallback:
-        return fallback
-    return "0.0.0"
-
-
-VERSION: str = _detect_version()
-
-
-def get_version() -> str:
-    return VERSION
src/agent/dist/argus-agent (vendored binary)
Binary file not shown.

src/agent entry script
@@ -1,10 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import sys
-
-from app.main import main as agent_main
-
-
-if __name__ == "__main__":
-    sys.exit(agent_main())
src/agent/pyproject.toml
@@ -1,19 +0,0 @@
-[project]
-name = "argus-agent"
-version = "1.1.0"
-description = "Argus agent binary"
-readme = "README.md"
-requires-python = ">=3.11"
-dependencies = [
-    "requests==2.31.0"
-]
-
-[build-system]
-requires = ["setuptools>=69", "wheel"]
-build-backend = "setuptools.build_meta"
-
-[tool.argus]
-entry = "app.main:main"
-
-[tool.setuptools]
-packages = ["app"]
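The agent README earlier in this diff says scripts/build_binary.sh runs PyInstaller inside python:3.11-slim-bullseye; that script itself is not shown, so the following is only a hypothetical reconstruction (the entry-script name is a placeholder, and the PyInstaller flags are assumptions):

    docker run --rm -v "$PWD":/src -w /src python:3.11-slim-bullseye bash -c '
      pip install pyinstaller "requests==2.31.0" &&
      pyinstaller --onefile --name argus-agent <entry-script>.py
    '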
agent_deployment_verify.sh
@@ -1,690 +0,0 @@ (truncated)
-#!/usr/bin/env bash
-set -euo pipefail
-
-LOG_PREFIX="[AGENT-VERIFY]"
-MASTER_ENDPOINT_DEFAULT=""
-AGENT_DATA_ROOT_DEFAULT="/private/argus/agent"
-AGENT_ETC_ROOT_DEFAULT="/private/argus/etc"
-REPORT_INTERVAL_DEFAULT="2"
-
-ALLOW_CONFIG_TOUCH="false"
-KEEP_TEST_HEALTH="false"
-
-log_info() {
-    echo "${LOG_PREFIX} INFO $*"
-}
-
-log_warn() {
-    echo "${LOG_PREFIX} WARN $*" >&2
-}
-
-log_error() {
-    echo "${LOG_PREFIX} ERROR $*" >&2
-}
-
-usage() {
-    cat <<'USAGE'
-Usage: agent_deployment_verify.sh [options]
-
-Options:
-  --allow-config-touch   Enable optional config PUT dry-run check.
-  --keep-test-health     Keep the temporary verify health file after checks.
-  -h, --help             Show this help message.
-
-Environment variables:
-  MASTER_ENDPOINT (required)  Master API base endpoint, e.g. http://master:3000
-  AGENT_DATA_ROOT (default: /private/argus/agent)
-  AGENT_ETC_ROOT  (default: /private/argus/etc)
-  VERIFY_HOSTNAME (default: output of hostname)
-  REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds
-USAGE
-}
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --allow-config-touch)
-            ALLOW_CONFIG_TOUCH="true"
-            shift
-            ;;
-        --keep-test-health)
-            KEEP_TEST_HEALTH="true"
-            shift
-            ;;
-        -h|--help)
-            usage
-            exit 0
-            ;;
-        *)
-            log_error "Unknown option: $1"
-            usage >&2
-            exit 2
-            ;;
-    esac
-done
-
-MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}"
-AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}"
-AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}"
-VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}"
-REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}"
-
-if [[ -z "$MASTER_ENDPOINT" ]]; then
-    log_error "MASTER_ENDPOINT is required"
-    exit 2
-fi
-
-if ! [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] || [[ "$REPORT_INTERVAL_SECONDS" -le 0 ]]; then
-    log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
-    REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
-fi
-
-normalize_endpoint() {
-    local endpoint="$1"
-    if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then
-        endpoint="http://$endpoint"
-    fi
-    endpoint="${endpoint%/}"
-    echo "$endpoint"
-}
-
-MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"
-
-NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME"
-NODE_JSON="$NODE_DIR/node.json"
-HEALTH_DIR="$NODE_DIR/health"
-DNS_CONF="$AGENT_ETC_ROOT/dns.conf"
-UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh"
-
-declare -a RESULTS_PASS=()
-declare -a RESULTS_WARN=()
-declare -a RESULTS_FAIL=()
-
-add_result() {
-    local level="$1" message="$2"
-    case "$level" in
-        PASS)
-            RESULTS_PASS+=("$message")
-            log_info "$message"
-            ;;
-        WARN)
-            RESULTS_WARN+=("$message")
-            log_warn "$message"
-            ;;
-        FAIL)
-            RESULTS_FAIL+=("$message")
-            log_error "$message"
-            ;;
-    esac
-}
-
-HAS_JQ="0"
-if command -v jq >/dev/null 2>&1; then
-    HAS_JQ="1"
-fi
-
-if ! command -v curl >/dev/null 2>&1; then
-    log_error "curl command not found; please install curl (e.g. apt-get install -y curl)"
-    exit 2
-fi
-
-if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then
-    log_error "Neither jq nor python3 is available for JSON processing"
-    exit 2
-fi
-
-CURL_OPTS=(--fail --show-error --silent --max-time 10)
-
-curl_json() {
-    local url="$1"
-    if ! curl "${CURL_OPTS[@]}" "$url"; then
-        return 1
-    fi
-}
-
-json_query() {
-    local json="$1" jq_expr="$2" py_expr="$3"
-    if [[ "$HAS_JQ" == "1" ]]; then
-        if ! output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null); then
-            return 1
-        fi
-        printf '%s' "$output"
-        return 0
-    fi
-
-    # Script passed via -c so stdin stays free for the JSON payload
-    printf '%s' "$json" | python3 -c '
-import json
-import sys
-
-expr = sys.argv[1]
-try:
-    data = json.load(sys.stdin)
-    value = eval(expr, {}, {"data": data})
-except Exception:
-    sys.exit(1)
-if value is None:
-    sys.exit(1)
-if isinstance(value, (dict, list)):
-    print(json.dumps(value))
-else:
-    print(value)
-' "$py_expr"
-}
-
-json_length() {
-    local json="$1" jq_expr="$2" py_expr="$3"
-    if [[ "$HAS_JQ" == "1" ]]; then
-        if ! output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null); then
-            return 1
-        fi
-        printf '%s' "$output"
-        return 0
-    fi
-
-    printf '%s' "$json" | python3 -c '
-import json
-import sys
-
-expr = sys.argv[1]
-try:
-    data = json.load(sys.stdin)
-    value = eval(expr, {}, {"data": data})
-except Exception:
-    sys.exit(1)
-try:
-    print(len(value))
-except Exception:
-    sys.exit(1)
-' "$py_expr"
-}
-
-json_has_key() {
-    local json="$1" jq_expr="$2" py_expr="$3"
-    if [[ "$HAS_JQ" == "1" ]]; then
-        if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then
-            return 0
-        fi
-        return 1
-    fi
-
-    printf '%s' "$json" | python3 -c '
-import json
-import sys
-
-expr = sys.argv[1]
-try:
-    data = json.load(sys.stdin)
-    value = eval(expr, {}, {"data": data})
-except Exception:
-    sys.exit(1)
-if value:
-    sys.exit(0)
-sys.exit(1)
-' "$py_expr"
-}
-
-iso_to_epoch() {
-    local value="$1"
-    if command -v date >/dev/null 2>&1; then
-        date -d "$value" +%s 2>/dev/null && return 0
-    fi
-    if command -v python3 >/dev/null 2>&1; then
-        python3 - "$value" <<'PY'
-import sys
-from datetime import datetime
-
-value = sys.argv[1]
-if value is None or value == "":
-    sys.exit(1)
-if value.endswith('Z'):
-    value = value[:-1] + '+00:00'
-try:
-    dt = datetime.fromisoformat(value)
-except ValueError:
-    sys.exit(1)
-print(int(dt.timestamp()))
-PY
-        return $?
-    fi
-    return 1
-}
-
-validate_json_file() {
-    local path="$1"
-    if [[ "$HAS_JQ" == "1" ]]; then
-        jq empty "$path" >/dev/null 2>&1 && return 0
-        return 1
-    fi
-    if command -v python3 >/dev/null 2>&1; then
-        python3 - "$path" <<'PY'
-import json
-import sys
-
-path = sys.argv[1]
-with open(path, 'r', encoding='utf-8') as handle:
-    json.load(handle)
-PY
-        return $?
-    fi
-    return 0
-}
-
-ensure_directory() {
-    local dir="$1"
-    if [[ ! -d "$dir" ]]; then
-        log_warn "Creating missing directory $dir"
-        mkdir -p "$dir"
-    fi
-}
-
-TEST_HEALTH_FILE=""
-TEST_HEALTH_BACKUP=""
-TEST_HEALTH_EXISTED="false"
-
-cleanup() {
-    if [[ -n "$TEST_HEALTH_FILE" ]]; then
-        if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
-            printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
-        elif [[ "$KEEP_TEST_HEALTH" == "true" ]]; then
-            :
-        else
-            rm -f "$TEST_HEALTH_FILE"
-        fi
-    fi
-}
-
-trap cleanup EXIT
-
-log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"
-
-# 4.2 Master health checks
-health_resp=""
-if ! health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>/tmp/agent_verify_healthz.err); then
-    error_detail=$(cat /tmp/agent_verify_healthz.err || true)
-    add_result FAIL "GET /healthz failed: $error_detail"
-else
-    http_meta=$(tail -n1 <<<"$health_resp")
-    payload=$(head -n -1 <<<"$health_resp" || true)
-    status_code=${http_meta%% *}
-    elapsed=${http_meta##* }
-    add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload"
-fi
-rm -f /tmp/agent_verify_healthz.err
-
-if ! readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>/tmp/agent_verify_readyz.err); then
-    error_detail=$(cat /tmp/agent_verify_readyz.err || true)
-    add_result FAIL "GET /readyz failed: $error_detail"
-    readyz_payload=""
-else
-    readyz_meta=$(tail -n1 <<<"$readyz_resp")
-    readyz_payload=$(head -n -1 <<<"$readyz_resp" || true)
-    readyz_status=${readyz_meta%% *}
-    readyz_elapsed=${readyz_meta##* }
-    add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s"
-fi
-rm -f /tmp/agent_verify_readyz.err
-
-# 4.3 Nodes list and detail
-if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>/tmp/agent_verify_nodes.err); then
-    error_detail=$(cat /tmp/agent_verify_nodes.err || true)
-    add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
-    nodes_json=""
-fi
-rm -f /tmp/agent_verify_nodes.err
-
-NODE_ENTRY=""
-NODE_ID=""
-NODE_IP=""
-if [[ -n "$nodes_json" ]]; then
-    if [[ "$HAS_JQ" == "1" ]]; then
-        NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY=""
-    else
-        # Nodes JSON on stdin, hostname as argv[1]
-        NODE_ENTRY=$(printf '%s' "$nodes_json" | python3 -c '
-import json
-import sys
-
-hostname = sys.argv[1]
-nodes = json.load(sys.stdin)
-for node in nodes:
-    if node.get("name") == hostname:
-        print(json.dumps(node))
-        sys.exit(0)
-sys.exit(1)
-' "$VERIFY_HOSTNAME") || NODE_ENTRY=""
-    fi
-
-    if [[ -z "$NODE_ENTRY" ]]; then
-        add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
-    else
-        if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
-            add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
-        else
-            add_result FAIL "Failed to extract node id from master response"
-        fi
-    fi
-
-    if [[ -n "$NODE_ENTRY" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_node_detail.err); then
-        NODE_DETAIL_JSON="$NODE_DETAIL"
-        add_result PASS "Fetched node detail for $NODE_ID"
-        if NODE_IP=$(json_query "$NODE_DETAIL_JSON" '.meta_data.ip // .meta_data.host_ip // empty' 'data.get("meta_data", {}).get("ip") or data.get("meta_data", {}).get("host_ip") or ""'); then
-            if [[ -n "$NODE_IP" ]]; then
-                add_result PASS "Registered node IP=$NODE_IP"
-            else
-                add_result WARN "Node detail does not expose IP fields"
-            fi
-        fi
-    else
-        error_detail=$(cat /tmp/agent_verify_node_detail.err 2>/dev/null || true)
-        add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
-        NODE_DETAIL_JSON=""
-    fi
-    rm -f /tmp/agent_verify_node_detail.err
-
-    if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>/tmp/agent_verify_stats.err); then
-        if total_nodes=$(json_query "$stats_json" '.total // .total_nodes' 'data.get("total") or data.get("total_nodes")'); then
-            if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
-                add_result PASS "Statistics total=$total_nodes"
-            else
-                add_result WARN "Statistics total field not numeric: $total_nodes"
-            fi
-        else
-            add_result WARN "Unable to read total field from statistics"
-        fi
-
-        active_nodes=""
-        if [[ "$HAS_JQ" == "1" ]]; then
-            active_nodes=$(printf '%s' "$stats_json" | jq -e 'if .status_statistics then (.status_statistics[] | select(.status == "online") | .count) else empty end' 2>/dev/null | head -n1 || true)
-        elif command -v python3 >/dev/null 2>&1; then
-            active_nodes=$(printf '%s' "$stats_json" | python3 -c 'import json,sys; data=json.load(sys.stdin); print(next((row.get("count") for row in data.get("status_statistics", []) if row.get("status")=="online"), ""))' 2>/dev/null)
-        fi
-        if [[ -n "$active_nodes" ]]; then
-            add_result PASS "Online nodes reported by master: $active_nodes"
-        fi
-
-        if [[ "$HAS_JQ" == "1" ]]; then
-            node_count=$(printf '%s' "$nodes_json" | jq 'length')
-        else
-            node_count=$(json_length "$nodes_json" 'length' 'len(data)')
-        fi
-        if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -lt "$node_count" ]]; then
-            add_result WARN "Statistics total=$total_nodes less than nodes list count=$node_count"
-        fi
-    else
-        error_detail=$(cat /tmp/agent_verify_stats.err 2>/dev/null || true)
-        add_result FAIL "Failed to fetch node statistics: $error_detail"
-    fi
-    rm -f /tmp/agent_verify_stats.err
-else
-    NODE_DETAIL_JSON=""
-fi
-
-# 4.4 Agent persistence checks
-if [[ -f "$NODE_JSON" ]]; then
-    node_file_content="$(cat "$NODE_JSON")"
-    if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
-        if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then
-            add_result PASS "node.json id matches master ($NODE_ID)"
-        else
-            add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
-        fi
-    else
-        add_result FAIL "Unable to extract id from node.json"
-    fi
-    if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
-        if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
-            add_result PASS "node.json name matches $VERIFY_HOSTNAME"
-        else
-            add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
-        fi
-    else
-        add_result FAIL "Unable to extract name from node.json"
-    fi
-
-    if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then
-        if iso_to_epoch "$register_time" >/dev/null 2>&1; then
-            add_result PASS "node.json register_time valid ISO timestamp"
-        else
-            add_result WARN "node.json register_time invalid: $register_time"
-        fi
-    else
-        add_result WARN "node.json missing register_time"
-    fi
-
-    if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
-        if iso_to_epoch "$last_updated" >/dev/null 2>&1; then
-            add_result PASS "node.json last_updated valid ISO timestamp"
-        else
-            add_result WARN "node.json last_updated invalid: $last_updated"
-        fi
-    else
-        add_result WARN "node.json missing last_updated"
-    fi
-else
-    add_result FAIL "node.json not found at $NODE_JSON"
-    node_file_content=""
-fi
-
-ensure_directory "$HEALTH_DIR"
-
-if [[ -d "$HEALTH_DIR" ]]; then
-    shopt -s nullglob
-    health_files=("$HEALTH_DIR"/*.json)
-    shopt -u nullglob
-    if [[ ${#health_files[@]} -eq 0 ]]; then
-        add_result WARN "Health directory $HEALTH_DIR is empty"
-    else
-        for hf in "${health_files[@]}"; do
-            base=$(basename "$hf")
-            if [[ "$base" != *-* ]]; then
-                add_result WARN "Health file $base does not follow <module>-*.json"
continue
|
|
||||||
fi
|
|
||||||
if ! validate_json_file "$hf" >/dev/null 2>&1; then
|
|
||||||
add_result WARN "Health file $base is not valid JSON"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
add_result WARN "Health directory $HEALTH_DIR missing"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if getent hosts master.argus.com >/dev/null 2>&1; then
|
|
||||||
resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs)
|
|
||||||
add_result PASS "master.argus.com resolves to $resolved_ips"
|
|
||||||
else
|
|
||||||
add_result FAIL "Failed to resolve master.argus.com"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 4.5 Master-Node status consistency
|
|
||||||
sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))
|
|
||||||
|
|
||||||
if [[ -n "$NODE_DETAIL_JSON" ]]; then
|
|
||||||
detail_pre="$NODE_DETAIL_JSON"
|
|
||||||
else
|
|
||||||
detail_pre=""
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
|
|
||||||
if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_pre.err); then
|
|
||||||
add_result PASS "Fetched node detail pre-check"
|
|
||||||
else
|
|
||||||
error_detail=$(cat /tmp/agent_verify_detail_pre.err 2>/dev/null || true)
|
|
||||||
add_result FAIL "Unable to fetch node detail for status check: $error_detail"
|
|
||||||
fi
|
|
||||||
rm -f /tmp/agent_verify_detail_pre.err
|
|
||||||
fi
|
|
||||||
|
|
||||||
server_ts_pre=""
|
|
||||||
agent_ts_pre=""
|
|
||||||
server_ts_post=""
|
|
||||||
agent_ts_post=""
|
|
||||||
|
|
||||||
if [[ -n "$detail_pre" ]]; then
|
|
||||||
server_ts_pre=$(json_query "$detail_pre" '.last_report' 'data.get("last_report")' || echo "")
|
|
||||||
agent_ts_pre=$(json_query "$detail_pre" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
|
|
||||||
log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"
|
|
||||||
|
|
||||||
sleep "$sleep_interval"
|
|
||||||
|
|
||||||
if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_post.err); then
|
|
||||||
server_ts_post=$(json_query "$detail_post" '.last_report' 'data.get("last_report")' || echo "")
|
|
||||||
agent_ts_post=$(json_query "$detail_post" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
|
|
||||||
if [[ "$server_ts_post" != "$server_ts_pre" ]]; then
|
|
||||||
add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
|
|
||||||
else
|
|
||||||
add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
|
|
||||||
fi
|
|
||||||
if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
|
|
||||||
add_result PASS "last_report.agent_timestamp advanced"
|
|
||||||
else
|
|
||||||
add_result FAIL "last_report.agent_timestamp did not change"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -n "$node_file_content" ]]; then
|
|
||||||
if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
|
|
||||||
if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then
|
|
||||||
if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
|
|
||||||
diff=$((epoch_post - node_epoch))
|
|
||||||
[[ $diff -lt 0 ]] && diff=$((-diff))
|
|
||||||
tolerance=$((REPORT_INTERVAL_SECONDS * 2))
|
|
||||||
if [[ $diff -le $tolerance ]]; then
|
|
||||||
add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
|
|
||||||
else
|
|
||||||
add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
NODE_DETAIL_JSON="$detail_post"
|
|
||||||
else
|
|
||||||
error_detail=$(cat /tmp/agent_verify_detail_post.err 2>/dev/null || true)
|
|
||||||
add_result FAIL "Failed to fetch node detail post-check: $error_detail"
|
|
||||||
fi
|
|
||||||
rm -f /tmp/agent_verify_detail_post.err
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 4.6 Health simulation
|
|
||||||
TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json"
|
|
||||||
ensure_directory "$HEALTH_DIR"
|
|
||||||
|
|
||||||
if [[ -f "$TEST_HEALTH_FILE" ]]; then
|
|
||||||
TEST_HEALTH_EXISTED="true"
|
|
||||||
TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")"
|
|
||||||
else
|
|
||||||
TEST_HEALTH_EXISTED="false"
|
|
||||||
fi
|
|
||||||
|
|
||||||
create_health_file() {
|
|
||||||
local message="$1"
|
|
||||||
cat > "$TEST_HEALTH_FILE" <<HEALTHJSON
|
|
||||||
{"status":"ok","message":"$message"}
|
|
||||||
HEALTHJSON
|
|
||||||
}
|
|
||||||
|
|
||||||
validate_health_in_master() {
|
|
||||||
local expected_message="$1"
|
|
||||||
local detail_json="$2"
|
|
||||||
local message
|
|
||||||
if message=$(json_query "$detail_json" '.health["verify-master"].message' 'data.get("health", {}).get("verify-master", {}).get("message")'); then
|
|
||||||
if [[ "$message" == "$expected_message" ]]; then
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
remove_health_from_master() {
|
|
||||||
local detail_json="$1"
|
|
||||||
if json_has_key "$detail_json" '(.health | has("verify-master"))' '"verify-master" in data.get("health", {})'; then
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
health_message_one="verify $(date +%s)"
|
|
||||||
create_health_file "$health_message_one"
|
|
||||||
add_result PASS "Created test health file $TEST_HEALTH_FILE"
|
|
||||||
|
|
||||||
sleep "$sleep_interval"
|
|
||||||
if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health1.err); then
|
|
||||||
if validate_health_in_master "$health_message_one" "$detail_health_one"; then
|
|
||||||
add_result PASS "Master reflects verify-master health message"
|
|
||||||
else
|
|
||||||
add_result FAIL "Master health payload does not match test message"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
error_detail=$(cat /tmp/agent_verify_health1.err 2>/dev/null || true)
|
|
||||||
add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
|
|
||||||
detail_health_one=""
|
|
||||||
fi
|
|
||||||
rm -f /tmp/agent_verify_health1.err
|
|
||||||
|
|
||||||
health_message_two="verify $(date +%s)-update"
|
|
||||||
create_health_file "$health_message_two"
|
|
||||||
sleep "$sleep_interval"
|
|
||||||
if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health2.err); then
|
|
||||||
if validate_health_in_master "$health_message_two" "$detail_health_two"; then
|
|
||||||
add_result PASS "Master health updated to new message"
|
|
||||||
else
|
|
||||||
add_result FAIL "Master health message did not update"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
error_detail=$(cat /tmp/agent_verify_health2.err 2>/dev/null || true)
|
|
||||||
add_result FAIL "Failed to fetch node detail after health update: $error_detail"
|
|
||||||
detail_health_two=""
|
|
||||||
fi
|
|
||||||
rm -f /tmp/agent_verify_health2.err
|
|
||||||
|
|
||||||
rm -f "$TEST_HEALTH_FILE"
|
|
||||||
sleep "$sleep_interval"
|
|
||||||
if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health3.err); then
|
|
||||||
if remove_health_from_master "$detail_health_three"; then
|
|
||||||
add_result PASS "Master health no longer lists verify-master after removal"
|
|
||||||
else
|
|
||||||
add_result FAIL "Master health still contains verify-master after file deletion"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
error_detail=$(cat /tmp/agent_verify_health3.err 2>/dev/null || true)
|
|
||||||
add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
|
|
||||||
fi
|
|
||||||
rm -f /tmp/agent_verify_health3.err
|
|
||||||
|
|
||||||
if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
|
|
||||||
printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Optional config touch
|
|
||||||
if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then
|
|
||||||
if [[ -n "$NODE_ID" ]]; then
|
|
||||||
payload='{"label": {"verify": "true"}}'
|
|
||||||
if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >/tmp/agent_verify_config.log 2>&1; then
|
|
||||||
add_result PASS "Config PUT dry-run succeeded"
|
|
||||||
else
|
|
||||||
add_result WARN "Config PUT dry-run failed: $(cat /tmp/agent_verify_config.log)"
|
|
||||||
fi
|
|
||||||
rm -f /tmp/agent_verify_config.log
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Result summary
|
|
||||||
echo
|
|
||||||
echo "==== Verification Summary ===="
|
|
||||||
for entry in "${RESULTS_PASS[@]}"; do
|
|
||||||
printf 'PASS: %s\n' "$entry"
|
|
||||||
done
|
|
||||||
for entry in "${RESULTS_WARN[@]}"; do
|
|
||||||
printf 'WARN: %s\n' "$entry"
|
|
||||||
done
|
|
||||||
for entry in "${RESULTS_FAIL[@]}"; do
|
|
||||||
printf 'FAIL: %s\n' "$entry"
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
|
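The summary above is the script's contract: any FAIL entry makes it exit non-zero. A minimal sketch of a wrapper that consumes that contract (the wrapper and its log handling are illustrative, not part of this repo; the container name matches the e2e compose file further down):

    #!/usr/bin/env bash
    # Hypothetical CI wrapper: run the verifier inside the agent container
    # and surface only WARN/FAIL lines when it fails.
    set -uo pipefail
    report="$(mktemp)"
    if docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh | tee "$report"; then
        echo "verification passed"
    else
        grep -E '^(WARN|FAIL):' "$report" >&2
        exit 1
    fi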
@ -1,269 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BUILD_ROOT="$MODULE_ROOT/build"
DIST_DIR="$MODULE_ROOT/dist"
PYINSTALLER_BUILD="$BUILD_ROOT/pyinstaller"
PYINSTALLER_SPEC="$PYINSTALLER_BUILD/spec"
PYINSTALLER_WORK="$PYINSTALLER_BUILD/work"
VENV_DIR="$BUILD_ROOT/venv"

AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}"
AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}"
USED_DOCKER=0

run_host_build() {
    echo "[INFO] Using host Python environment for build" >&2
    rm -rf "$BUILD_ROOT" "$DIST_DIR"
    mkdir -p "$PYINSTALLER_BUILD" "$DIST_DIR"
    python3 -m venv --copies "$VENV_DIR"
    # shellcheck disable=SC1091
    source "$VENV_DIR/bin/activate"

    pip install --upgrade pip
    pip install .
    pip install "pyinstaller==6.6.0"

    pyinstaller \
        --clean \
        --onefile \
        --name argus-agent \
        --distpath "$DIST_DIR" \
        --workpath "$PYINSTALLER_WORK" \
        --specpath "$PYINSTALLER_SPEC" \
        --add-data "$MODULE_ROOT/pyproject.toml:." \
        "$MODULE_ROOT/entry.py"

    chmod +x "$DIST_DIR/argus-agent"
    deactivate
}

run_docker_build() {
    if ! command -v docker >/dev/null 2>&1; then
        echo "[ERROR] docker command not found; cannot build inside a container. Install Docker or set AGENT_BUILD_USE_DOCKER=0" >&2
        exit 1
    fi

    USED_DOCKER=1
    echo "[INFO] Building agent binary inside $AGENT_BUILD_IMAGE" >&2

    local host_uid host_gid
    host_uid="$(id -u)"
    host_gid="$(id -g)"
    docker_env=("--rm" "-v" "$MODULE_ROOT:/workspace" "-w" "/workspace" "--env" "TARGET_UID=${host_uid}" "--env" "TARGET_GID=${host_gid}")

    pass_env_if_set() {
        local var="$1"
        local value="${!var:-}"
        if [[ -n "$value" ]]; then
            docker_env+=("--env" "$var=$value")
        fi
    }

    pass_env_if_set PIP_INDEX_URL
    pass_env_if_set PIP_EXTRA_INDEX_URL
    pass_env_if_set PIP_TRUSTED_HOST
    pass_env_if_set HTTP_PROXY
    pass_env_if_set HTTPS_PROXY
    pass_env_if_set NO_PROXY
    pass_env_if_set http_proxy
    pass_env_if_set https_proxy
    pass_env_if_set no_proxy

    build_script=$(cat <<'INNER'
set -euo pipefail
cd /workspace
apt-get update >/dev/null
apt-get install -y --no-install-recommends binutils >/dev/null
rm -rf /var/lib/apt/lists/*
rm -rf build dist
mkdir -p build/pyinstaller dist
python3 -m venv --copies build/venv
source build/venv/bin/activate
pip install --upgrade pip
pip install .
pip install pyinstaller==6.6.0
pyinstaller \
    --clean \
    --onefile \
    --name argus-agent \
    --distpath dist \
    --workpath build/pyinstaller/work \
    --specpath build/pyinstaller/spec \
    --add-data /workspace/pyproject.toml:. \
    entry.py
chmod +x dist/argus-agent

TARGET_UID="${TARGET_UID:-0}"
TARGET_GID="${TARGET_GID:-0}"
chown -R "$TARGET_UID:$TARGET_GID" dist build 2>/dev/null || true

python3 - <<'PY'
from pathlib import Path
from PyInstaller.archive.readers import CArchiveReader
import sys

archive = Path('dist/argus-agent')
out_dir = Path('build/compat_check')
out_dir.mkdir(parents=True, exist_ok=True)

major, minor = sys.version_info[:2]
libpython = f'libpython{major}.{minor}.so.1.0'
expected_libs = [
    libpython,
    'libssl.so.3',
    'libcrypto.so.3',
]
reader = CArchiveReader(str(archive))
extracted = []
missing = []
for name in expected_libs:
    try:
        data = reader.extract(name)
    except KeyError:
        missing.append(name)
        continue
    (out_dir / name).write_bytes(data)
    extracted.append(name)
(out_dir / 'manifest').write_text('\n'.join(extracted))
if extracted:
    print('[INFO] Extracted libraries: ' + ', '.join(extracted))
if missing:
    print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
PY

compat_check() {
    local lib_path="$1"
    if [[ ! -f "$lib_path" ]]; then
        echo "[WARN] Missing $lib_path for GLIBC check"
        return
    fi
    local max_glibc
    max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
    if [[ -n "$max_glibc" ]]; then
        echo "[INFO] $lib_path references up to $max_glibc"
    else
        echo "[INFO] $lib_path does not expose GLIBC version strings"
    fi
}

compat_libs=()
if [[ -f build/compat_check/manifest ]]; then
    mapfile -t compat_libs < build/compat_check/manifest
fi

if [[ ${#compat_libs[@]} -eq 0 ]]; then
    echo "[WARN] No libraries captured for GLIBC inspection"
else
    for lib in "${compat_libs[@]}"; do
        compat_check "build/compat_check/$lib"
    done
fi

deactivate
INNER
)

    if ! docker run "${docker_env[@]}" "$AGENT_BUILD_IMAGE" bash -lc "$build_script"; then
        echo "[ERROR] Docker build failed; check Docker permissions or set AGENT_BUILD_USE_DOCKER=0 to build on a compatible host" >&2
        exit 1
    fi
}

if [[ "$AGENT_BUILD_USE_DOCKER" == "1" ]]; then
    run_docker_build
else
    run_host_build
fi

if [[ ! -f "$DIST_DIR/argus-agent" ]]; then
    echo "[ERROR] Agent binary was not produced" >&2
    exit 1
fi

if [[ "$USED_DOCKER" != "1" ]]; then
    if [[ ! -x "$VENV_DIR/bin/python" ]]; then
        echo "[WARN] PyInstaller virtualenv missing at $VENV_DIR; skipping compatibility check" >&2
    else
        COMPAT_DIR="$BUILD_ROOT/compat_check"
        rm -rf "$COMPAT_DIR"
        mkdir -p "$COMPAT_DIR"

        EXTRACT_SCRIPT=$(cat <<'PY'
from pathlib import Path
from PyInstaller.archive.readers import CArchiveReader
import sys

archive = Path('dist/argus-agent')
out_dir = Path('build/compat_check')
out_dir.mkdir(parents=True, exist_ok=True)

major, minor = sys.version_info[:2]
libpython = f'libpython{major}.{minor}.so.1.0'
expected_libs = [
    libpython,
    'libssl.so.3',
    'libcrypto.so.3',
]
reader = CArchiveReader(str(archive))
extracted = []
missing = []
for name in expected_libs:
    try:
        data = reader.extract(name)
    except KeyError:
        missing.append(name)
        continue
    (out_dir / name).write_bytes(data)
    extracted.append(name)
(out_dir / 'manifest').write_text('\n'.join(extracted))
if extracted:
    print('[INFO] Extracted libraries: ' + ', '.join(extracted))
if missing:
    print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
PY
)

        "$VENV_DIR/bin/python" - <<PY
$EXTRACT_SCRIPT
PY

        compat_libs=()
        if [[ -f "$COMPAT_DIR/manifest" ]]; then
            mapfile -t compat_libs < "$COMPAT_DIR/manifest"
        fi

        check_glibc_version() {
            local lib_path="$1"
            if [[ ! -f "$lib_path" ]]; then
                echo "[WARN] Skipping GLIBC check; file not found: $lib_path" >&2
                return
            fi
            if command -v strings >/dev/null 2>&1; then
                local max_glibc
                max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
                if [[ -n "$max_glibc" ]]; then
                    echo "[INFO] $lib_path references up to $max_glibc"
                else
                    echo "[INFO] $lib_path does not expose GLIBC version strings"
                fi
            else
                echo "[WARN] strings command unavailable; cannot inspect $lib_path" >&2
            fi
        }

        if [[ ${#compat_libs[@]} -eq 0 ]]; then
            echo "[WARN] No libraries captured for GLIBC inspection" >&2
        else
            for lib in "${compat_libs[@]}"; do
                check_glibc_version "$COMPAT_DIR/$lib"
            done
        fi
    fi
else
    echo "[INFO] Compatibility check executed inside container"
fi

echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent"
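Typical invocations of the build script above, for reference (the intranet index URL is a placeholder, not a real mirror):

    # Default: build inside python:3.11-slim-bullseye for a stable GLIBC baseline.
    ./scripts/build_binary.sh

    # Build with the host Python instead; the compatibility check then runs
    # from the local PyInstaller virtualenv.
    AGENT_BUILD_USE_DOCKER=0 ./scripts/build_binary.sh

    # Route pip through an internal mirror inside the build container.
    PIP_INDEX_URL=https://pypi.example.internal/simple ./scripts/build_binary.sh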
2
src/agent/tests/.gitignore
vendored
@ -1,2 +0,0 @@
private/
tmp/
@ -1,69 +0,0 @@
services:
  bind:
    image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
    container_name: argus-bind-agent-e2e
    volumes:
      - ./private:/private
    networks:
      default:
        ipv4_address: 172.28.0.2
    environment:
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    restart: always

  master:
    image: argus-master:latest
    container_name: argus-master-agent-e2e
    depends_on:
      - bind
    environment:
      - OFFLINE_THRESHOLD_SECONDS=6
      - ONLINE_THRESHOLD_SECONDS=2
      - SCHEDULER_INTERVAL_SECONDS=1
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    ports:
      - "32300:3000"
    volumes:
      - ./private/argus/master:/private/argus/master
      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
      - ./private/argus/etc:/private/argus/etc
    networks:
      default:
        ipv4_address: 172.28.0.10
    restart: always

  agent:
    image: ubuntu:22.04
    container_name: argus-agent-e2e
    hostname: dev-e2euser-e2einst-pod-0
    depends_on:
      - master
      - bind
    environment:
      - MASTER_ENDPOINT=http://master.argus.com:3000
      - REPORT_INTERVAL_SECONDS=2
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    volumes:
      - ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
      - ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
      - ./private/argus/etc:/private/argus/etc
      - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
      - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
      - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
    entrypoint:
      - /usr/local/bin/agent-entrypoint.sh
    networks:
      default:
        ipv4_address: 172.28.0.20
    restart: always

networks:
  default:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 172.28.0.0/16
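Every service above gets a pinned address in 172.28.0.0/16, which is what lets the restart test later move the agent to the known-free 172.28.0.200. One way to spot-check the addressing once the stack is up (a debugging sketch, not part of the test suite):

    docker inspect \
      -f '{{.Name}} {{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' \
      argus-bind-agent-e2e argus-master-agent-e2e argus-agent-e2e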
@ -1,23 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS=(
    "01_bootstrap.sh"
    "02_up.sh"
    "03_wait_and_assert_registration.sh"
    "04_write_health_files.sh"
    "08_verify_agent.sh"
    "05_assert_status_on_master.sh"
    "06_restart_agent_and_reregister.sh"
    "07_down.sh"
)

for script in "${SCRIPTS[@]}"; do
    echo "[TEST] Running $script"
    "$SCRIPT_DIR/$script"
    echo "[TEST] $script completed"
    echo
done

echo "[TEST] Agent module E2E tests completed"
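Note the deliberate ordering: 08_verify_agent.sh runs before the status assertions so the in-container verifier exercises a freshly registered node. When debugging, a single stage can be re-run against an already-running stack (the tests directory path is inferred from the mounts above):

    cd src/agent/tests/scripts && ./08_verify_agent.sh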
@ -1,63 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
AGENT_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
MASTER_ROOT="$(cd "$AGENT_ROOT/../master" && pwd)"
REPO_ROOT="$(cd "$AGENT_ROOT/../.." && pwd)"
PRIVATE_ROOT="$TEST_ROOT/private"
TMP_ROOT="$TEST_ROOT/tmp"

AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
AGENT_CONFIG_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME"
AGENT_HEALTH_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME/health"
MASTER_PRIVATE_DIR="$PRIVATE_ROOT/argus/master"
METRIC_PRIVATE_DIR="$PRIVATE_ROOT/argus/metric/prometheus"
DNS_DIR="$PRIVATE_ROOT/argus/etc"
BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:latest}"
BIND_ROOT="$(cd "$MASTER_ROOT/../bind" && pwd)"

ensure_image() {
    local image="$1"
    if ! docker image inspect "$image" >/dev/null 2>&1; then
        echo "[ERROR] Docker image '$image' not found; run the unified build script first (e.g. ./build/build_images.sh) to produce the required images" >&2
        exit 1
    fi
}

mkdir -p "$AGENT_CONFIG_DIR"
mkdir -p "$AGENT_HEALTH_DIR"
mkdir -p "$MASTER_PRIVATE_DIR"
mkdir -p "$METRIC_PRIVATE_DIR"
mkdir -p "$TMP_ROOT"
mkdir -p "$DNS_DIR"

touch "$AGENT_HEALTH_DIR/.keep"

# Stage the update-dns.sh provided by the bind module to mimic production distribution
if [[ -f "$BIND_ROOT/build/update-dns.sh" ]]; then
    cp "$BIND_ROOT/build/update-dns.sh" "$DNS_DIR/update-dns.sh"
    chmod +x "$DNS_DIR/update-dns.sh"
else
    echo "[WARN] bind update script missing at $BIND_ROOT/build/update-dns.sh"
fi

ensure_image "argus-master:latest"
ensure_image "$BIND_IMAGE_TAG"

AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"

pushd "$AGENT_ROOT" >/dev/null
./scripts/build_binary.sh
popd >/dev/null

if [[ ! -x "$AGENT_BINARY" ]]; then
    echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
    exit 1
fi

echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag"

echo "[INFO] Agent E2E bootstrap complete"
@ -1,53 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"

TMP_ROOT="$TEST_ROOT/tmp"
ENV_FILE="$TEST_ROOT/.env"

source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user
export ARGUS_BUILD_UID ARGUS_BUILD_GID

cat > "$ENV_FILE" <<EOF
ARGUS_BUILD_UID=$ARGUS_BUILD_UID
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
EOF

if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
    echo "[ERROR] Agent binary path missing; run 01_bootstrap.sh first" >&2
    exit 1
fi

AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
    echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
    exit 1
fi

BIND_IMAGE_TAG_VALUE="argus-bind9:latest"
if [[ -f "$TMP_ROOT/bind_image_tag" ]]; then
    BIND_IMAGE_TAG_VALUE="$(cat "$TMP_ROOT/bind_image_tag")"
fi

compose() {
    if docker compose version >/dev/null 2>&1; then
        docker compose "$@"
    else
        docker-compose "$@"
    fi
}

docker container rm -f argus-agent-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true

docker network rm tests_default >/dev/null 2>&1 || true

pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans || true
BIND_IMAGE_TAG="$BIND_IMAGE_TAG_VALUE" compose up -d
popd >/dev/null

echo "[INFO] Master+Agent stack started"
@ -1,65 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json"

mkdir -p "$TMP_ROOT"

node_id=""
for _ in {1..30}; do
    sleep 2
    response=$(curl -sS "$API_BASE/nodes" || true)
    if [[ -z "$response" ]]; then
        continue
    fi
    list_file="$TMP_ROOT/nodes_list.json"
    echo "$response" > "$list_file"
    node_id=$(python3 - "$list_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    nodes = json.load(handle)
print(nodes[0]["id"] if nodes else "")
PY
)
    if [[ -n "$node_id" ]]; then
        break
    fi
done

if [[ -z "$node_id" ]]; then
    echo "[ERROR] Agent did not register within timeout" >&2
    exit 1
fi

echo "$node_id" > "$TMP_ROOT/node_id"

if [[ ! -f "$NODE_FILE" ]]; then
    echo "[ERROR] node.json not created at $NODE_FILE" >&2
    exit 1
fi

python3 - "$NODE_FILE" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert "id" in node and node["id"], "node.json missing id"
PY

detail_file="$TMP_ROOT/initial_detail.json"
curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file"
python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY'
import json, sys, pathlib
with open(sys.argv[1]) as handle:
    node = json.load(handle)
ip = node["meta_data"].get("ip")
if not ip:
    raise SystemExit("meta_data.ip missing")
pathlib.Path(sys.argv[2]).write_text(ip)
PY

echo "[INFO] Agent registered with node id $node_id"
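The same registration poll can be done by hand when a run gets stuck, using the master port mapped in the compose file (assumes watch and jq are available on the host):

    watch -n 2 'curl -sS http://localhost:32300/api/v1/master/nodes | jq "[.[] | {id, name}]"'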
@ -1,22 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health"

cat > "$HEALTH_DIR/log-fluentbit.json" <<JSON
{
  "status": "healthy",
  "timestamp": "2023-10-05T12:05:00Z"
}
JSON

cat > "$HEALTH_DIR/metric-node-exporter.json" <<JSON
{
  "status": "healthy",
  "timestamp": "2023-10-05T12:05:00Z"
}
JSON

echo "[INFO] Health files written"
@ -1,53 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"

success=false
detail_file="$TMP_ROOT/agent_status_detail.json"
for _ in {1..20}; do
    sleep 2
    if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
        continue
    fi
    if python3 - "$detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
if node["status"] != "online":
    raise SystemExit(1)
health = node.get("health", {})
if "log-fluentbit" not in health or "metric-node-exporter" not in health:
    raise SystemExit(1)
PY
    then
        success=true
        break
    fi
done

if [[ "$success" != true ]]; then
    echo "[ERROR] Node did not report health data in time" >&2
    exit 1
fi

if [[ ! -f "$NODES_JSON" ]]; then
    echo "[ERROR] nodes.json missing at $NODES_JSON" >&2
    exit 1
fi

python3 - "$NODES_JSON" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    nodes = json.load(handle)
assert len(nodes) == 1, nodes
entry = nodes[0]
assert entry["node_id"], entry
PY

echo "[INFO] Master reflects agent health and nodes.json entries"
@ -1,143 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
ENV_FILE="$TEST_ROOT/.env"

# The restart scenario needs the same entrypoint script so the DNS registration logic stays consistent
if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
    echo "[ERROR] agent entrypoint script missing at $ENTRYPOINT_SCRIPT" >&2
    exit 1
fi

if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
    echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
    exit 1
fi

AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
    echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
    exit 1
fi

if [[ -f "$ENV_FILE" ]]; then
    set -a
    # shellcheck disable=SC1090
    source "$ENV_FILE"
    set +a
else
    REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
    # shellcheck disable=SC1090
    source "$REPO_ROOT/scripts/common/build_user.sh"
    load_build_user
fi

AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"

compose() {
    if docker compose version >/dev/null 2>&1; then
        docker compose "$@"
    else
        docker-compose "$@"
    fi
}

before_file="$TMP_ROOT/before_restart.json"
curl -sS "$API_BASE/nodes/$NODE_ID" -o "$before_file"
prev_last_updated=$(python3 - "$before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node.get("last_updated", ""))
PY
)
prev_ip=$(python3 - "$before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node["meta_data"].get("ip", ""))
PY
)
initial_ip=$(cat "$TMP_ROOT/initial_ip")
if [[ "$prev_ip" != "$initial_ip" ]]; then
    echo "[ERROR] Expected initial IP $initial_ip, got $prev_ip" >&2
    exit 1
fi

pushd "$TEST_ROOT" >/dev/null
compose rm -sf agent
popd >/dev/null

docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true

AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"

# Start the container ourselves so we control the network state it registers with
if ! docker run -d \
    --name argus-agent-e2e \
    --hostname "$AGENT_HOSTNAME" \
    --network "$NETWORK_NAME" \
    --ip "$NEW_AGENT_IP" \
    -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
    -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \
    -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
    -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
    -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
    -e MASTER_ENDPOINT=http://master.argus.com:3000 \
    -e REPORT_INTERVAL_SECONDS=2 \
    -e ARGUS_BUILD_UID="$AGENT_UID" \
    -e ARGUS_BUILD_GID="$AGENT_GID" \
    --entrypoint /usr/local/bin/agent-entrypoint.sh \
    ubuntu:22.04 >/dev/null; then
    echo "[ERROR] Failed to start agent container with custom IP" >&2
    exit 1
fi

success=false
detail_file="$TMP_ROOT/post_restart.json"
for _ in {1..20}; do
    sleep 3
    if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
        continue
    fi
    if python3 - "$detail_file" "$prev_last_updated" "$NODE_ID" "$prev_ip" "$NEW_AGENT_IP" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
prev_last_updated = sys.argv[2]
expected_id = sys.argv[3]
old_ip = sys.argv[4]
expected_ip = sys.argv[5]
last_updated = node.get("last_updated")
current_ip = node["meta_data"].get("ip")
assert node["id"] == expected_id
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
PY
    then
        success=true
        break
    fi
done

if [[ "$success" != true ]]; then
    echo "[ERROR] Agent did not report expected new IP $NEW_AGENT_IP after restart" >&2
    exit 1
fi

echo "[INFO] Agent restart produced successful re-registration with IP change"
@ -1,36 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$TEST_ROOT/.env"

compose() {
    if docker compose version >/dev/null 2>&1; then
        docker compose "$@"
    else
        docker-compose "$@"
    fi
}

docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true

pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans
popd >/dev/null

if [[ -d "$TEST_ROOT/private" ]]; then
    docker run --rm \
        -v "$TEST_ROOT/private:/target" \
        ubuntu:24.04 \
        chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
    rm -rf "$TEST_ROOT/private"
fi

rm -rf "$TEST_ROOT/tmp"

if [[ -f "$ENV_FILE" ]]; then
    rm -f "$ENV_FILE"
fi

echo "[INFO] Agent E2E environment cleaned up"
@ -1,26 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
VERIFY_SCRIPT="$(cd "$TEST_ROOT/.." && pwd)/scripts/agent_deployment_verify.sh"

if ! docker ps --format '{{.Names}}' | grep -q '^argus-agent-e2e$'; then
    echo "[WARN] agent container not running; skip verification"
    exit 0
fi

if docker exec -i argus-agent-e2e bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
    echo "[INFO] curl/jq already installed in agent container"
else
    echo "[INFO] Installing curl/jq in agent container"
    docker exec -i argus-agent-e2e bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
fi

if docker exec -i argus-agent-e2e bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then
    docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh
elif [[ -x "$VERIFY_SCRIPT" ]]; then
    docker exec -i argus-agent-e2e "$VERIFY_SCRIPT"
else
    echo "[WARN] agent_deployment_verify.sh not found"
fi
@ -1,79 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

LOG_PREFIX="[AGENT-ENTRYPOINT]"
DNS_SCRIPT="/private/argus/etc/update-dns.sh"
DNS_CONF="/private/argus/etc/dns.conf"
TARGET_DOMAIN="master.argus.com"
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
AGENT_HOSTNAME="${HOSTNAME:-unknown}"
AGENT_DATA_DIR="/private/argus/agent/${AGENT_HOSTNAME}"
AGENT_HEALTH_DIR="${AGENT_DATA_DIR}/health"
RUNTIME_GROUP="argusagent"
RUNTIME_USER="argusagent"

log() {
    echo "${LOG_PREFIX} $*"
}

mkdir -p "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR"
chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" 2>/dev/null || true
chown -R "$AGENT_UID:$AGENT_GID" "/private/argus/etc" 2>/dev/null || true

if ! getent group "$AGENT_GID" >/dev/null 2>&1; then
    groupadd -g "$AGENT_GID" "$RUNTIME_GROUP"
else
    RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)"
fi

if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then
    useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER"
else
    RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)"
fi

log "Runtime user: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)"

# Wait for the update-dns.sh script distributed by bind
for _ in {1..30}; do
    if [[ -x "$DNS_SCRIPT" ]]; then
        break
    fi
    log "Waiting for update-dns.sh to become ready..."
    sleep 1
done

if [[ -x "$DNS_SCRIPT" ]]; then
    log "Running update-dns.sh to update container DNS"
    while true; do
        if "$DNS_SCRIPT"; then
            log "update-dns.sh succeeded"
            break
        fi
        log "update-dns.sh failed; retrying in 3 seconds"
        sleep 3
    done
else
    log "update-dns.sh not available; falling back to the image's default DNS"
fi

# Record the current dns.conf contents to ease troubleshooting
if [[ -f "$DNS_CONF" ]]; then
    log "dns.conf contents: $(tr '\n' ' ' < "$DNS_CONF")"
else
    log "dns.conf not generated yet"
fi

# Try to resolve the master domain; failures are logged but do not block startup
for _ in {1..30}; do
    if getent hosts "$TARGET_DOMAIN" >/dev/null 2>&1; then
        MASTER_IP=$(getent hosts "$TARGET_DOMAIN" | awk '{print $1}' | head -n 1)
        log "master.argus.com resolved to: $MASTER_IP"
        break
    fi
    sleep 1
done

log "Starting argus-agent"
exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
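Because the entrypoint ends with exec su, the agent process should be running as the non-root runtime user. A quick spot-check from the host (a debugging sketch; assumes procps is installed in the container):

    docker exec argus-agent-e2e ps -o user,pid,cmd -C argus-agent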
@ -1,19 +0,0 @@
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'instance']  # Grouping: alerts with the same alertname + instance are merged
  group_wait: 30s                      # After the first alert, wait 30s for others in the same group
  group_interval: 5m                   # After a group changes, wait at least 5 minutes before resending
  repeat_interval: 3h                  # Repeat the same alert every 3 hours
  receiver: 'null'

receivers:
  - name: 'null'

inhibit_rules:
  - source_match:
      severity: 'critical'   # While a critical alert exists,
    target_match:
      severity: 'warning'    # suppress warning alerts for the same instance
    equal: ['instance']
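With this rule in place, while the critical NodeDown alert for node-1 is firing, the warning HighCPU alert on the same instance should be inhibited. That is observable through the same v2 API the test scripts below use: an inhibited alert reports status.state as "suppressed" (filter syntax per the Alertmanager v2 API; jq assumed on the host):

    curl -fsS 'http://localhost:9093/api/v2/alerts?filter=alertname%3D%22HighCPU%22' | jq '.[].status'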
@ -1 +0,0 @@
172.18.0.2
@ -1,37 +0,0 @@
version: '3.8'
services:
  alertmanager:
    build:
      context: ../../../
      dockerfile: src/alert/alertmanager/build/Dockerfile
      args:
        ARGUS_UID: ${ARGUS_UID:-2133}
        ARGUS_GID: ${ARGUS_GID:-2015}
        USE_INTRANET: ${USE_INTRANET:-false}
    image: argus-alertmanager:latest
    container_name: argus-alertmanager
    environment:
      - ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
      - ARGUS_UID=${ARGUS_UID:-2133}
      - ARGUS_GID=${ARGUS_GID:-2015}
    ports:
      - "${ARGUS_PORT:-9093}:9093"
    volumes:
      - ${DATA_ROOT:-./data}/alertmanager:/private/argus/alert/alertmanager
      - ${DATA_ROOT:-./data}/etc:/private/argus/etc
    networks:
      - argus-network
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"

networks:
  argus-network:
    driver: bridge
    name: argus-network

volumes:
  alertmanager_data:
    driver: local
@ -1,19 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
project_root="$(cd "$root/../../.." && pwd)"

source "$project_root/scripts/common/build_user.sh"
load_build_user

# Create the new private directory structure (based on the argus layout)
echo "[INFO] Creating private directory structure for supervisor-based containers..."
mkdir -p "$root/private/argus/alert/alertmanager"
mkdir -p "$root/private/argus/etc/"

# Set permissions on the data directories
echo "[INFO] Setting permissions for data directories..."
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/alert/alertmanager" 2>/dev/null || true
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true

echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
@ -1,10 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.."
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
    if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
        echo "Docker Compose is required; install it and retry" >&2; exit 1; fi
fi
$compose_cmd -p alert-mvp up -d --remove-orphans
echo "[OK] Services started: Alertmanager http://localhost:9093"
@ -1,106 +0,0 @@
#!/bin/bash
set -euo pipefail

# ==========================================================
# Alertmanager test script
# ==========================================================

ALERTMANAGER_URL="http://localhost:9093"
TEST_ALERT_NAME_CRITICAL="NodeDown"
TEST_ALERT_NAME_WARNING="HighCPU"
TMP_LOG="/tmp/test-alertmanager.log"

# Wait parameters
am_wait_attempts=30
am_wait_interval=2

GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
RESET="\033[0m"

# ==========================================================
# Function definitions
# ==========================================================

wait_for_alertmanager() {
    local attempt=1
    echo "[INFO] Waiting for Alertmanager to start..."
    while (( attempt <= am_wait_attempts )); do
        if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
            echo -e "${GREEN}[OK] Alertmanager is ready (attempt=${attempt}/${am_wait_attempts})${RESET}"
            return 0
        fi
        echo "[..] Alertmanager not ready yet (${attempt}/${am_wait_attempts})"
        sleep "${am_wait_interval}"
        (( attempt++ ))
    done
    echo -e "${RED}[ERROR] Alertmanager still not ready after ${am_wait_attempts} attempts${RESET}"
    return 1
}

log_step() {
    echo -e "${YELLOW}==== $1 ====${RESET}"
}

# ==========================================================
# Main flow
# ==========================================================

log_step "Alertmanager test started"
echo "[INFO] Alertmanager URL: $ALERTMANAGER_URL"

# Step 1: wait for Alertmanager to come up
wait_for_alertmanager

# Step 2: fire a critical test alert
# (curl is tested directly in the `if`: under `set -e`, a separate
# `[ $? -eq 0 ]` check after curl would never reach its failure branch)
echo "[INFO] Sending critical test alert..."
if curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
    -H "Content-Type: application/json" \
    -d '[
      {
        "labels": {
          "alertname": "'"${TEST_ALERT_NAME_CRITICAL}"'",
          "instance": "node-1",
          "severity": "critical"
        },
        "annotations": {
          "summary": "Node node-1 is down"
        }
      }
    ]' \
    -o "$TMP_LOG"; then
    echo -e "${GREEN}[OK] Critical test alert sent${RESET}"
else
    echo -e "${RED}[ERROR] Failed to send critical alert!${RESET}"
    cat "$TMP_LOG"
    exit 1
fi

# Step 3: fire a warning test alert
echo "[INFO] Sending warning test alert..."
if curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
    -H "Content-Type: application/json" \
    -d '[
      {
        "labels": {
          "alertname": "'"${TEST_ALERT_NAME_WARNING}"'",
          "instance": "node-1",
          "severity": "warning"
        },
        "annotations": {
          "summary": "Node node-1 CPU usage is too high"
        }
      }
    ]' \
    -o "$TMP_LOG"; then
    echo -e "${GREEN}[OK] Warning test alert sent${RESET}"
else
    echo -e "${RED}[ERROR] Failed to send warning alert!${RESET}"
    cat "$TMP_LOG"
    exit 1
fi
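The script only fires alerts and never resolves them. For manual cleanup, re-posting the same labels with an endsAt at or before now marks the alert resolved through the same v2 endpoint (a sketch; assumes a date binary that supports -u with this format):

    now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    curl -fsS -X POST "http://localhost:9093/api/v2/alerts" \
      -H "Content-Type: application/json" \
      -d '[{"labels":{"alertname":"NodeDown","instance":"node-1","severity":"critical"},"endsAt":"'"$now"'"}]'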
@ -1,71 +0,0 @@
#!/bin/bash
set -euo pipefail

# ==========================================================
# Alertmanager query script (waits for startup)
# ==========================================================

ALERTMANAGER_URL="http://localhost:9093"
TEST_ALERT_NAME_CRITICAL="NodeDown"
TEST_ALERT_NAME_WARNING="HighCPU"
TMP_LOG="/tmp/test-alertmanager.log"

# Wait parameters
am_wait_attempts=30
am_wait_interval=2

GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
RESET="\033[0m"

# ==========================================================
# Function definitions
# ==========================================================

wait_for_alertmanager() {
    local attempt=1
    echo "[INFO] Waiting for Alertmanager to start..."
    while (( attempt <= am_wait_attempts )); do
        if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
            echo -e "${GREEN}[OK] Alertmanager is ready (attempt=${attempt}/${am_wait_attempts})${RESET}"
            return 0
        fi
        echo "[..] Alertmanager not ready yet (${attempt}/${am_wait_attempts})"
        sleep "${am_wait_interval}"
        (( attempt++ ))
    done
    echo -e "${RED}[ERROR] Alertmanager still not ready after ${am_wait_attempts} attempts${RESET}"
    return 1
}

log_step() {
    echo -e "${YELLOW}==== $1 ====${RESET}"
}

# ==========================================================
# Main flow
# ==========================================================

log_step "Querying the current Alertmanager alert list"
echo "[INFO] Alertmanager URL: $ALERTMANAGER_URL"

# Step 1: wait for Alertmanager to come up
wait_for_alertmanager

# Step 2: query the current alert list
echo "[INFO] Querying current alerts..."
sleep 1
curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts" | jq '.' || {
    echo -e "${RED}[WARN] Could not parse the returned JSON; check that jq is installed${RESET}"
    curl -s "${ALERTMANAGER_URL}/api/v2/alerts"
}

# Step 3: check whether the alerts include NodeDown
if curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts" | grep -q "${TEST_ALERT_NAME_CRITICAL}"; then
    echo -e "${GREEN}✅ Test passed: Alertmanager received alert ${TEST_ALERT_NAME_CRITICAL}${RESET}"
else
    echo -e "${RED}❌ Test failed: alert ${TEST_ALERT_NAME_CRITICAL} was not detected${RESET}"
fi

log_step "Test finished"
@ -1,21 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.."
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
    if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
        echo "Docker Compose is required; install it and retry" >&2; exit 1; fi
fi
$compose_cmd -p alert-mvp down
echo "[OK] All containers stopped"

# Clean up the contents of the private directory
echo "[INFO] Cleaning private directory contents..."
cd "$(dirname "$0")/.."
if [ -d "private" ]; then
    # Remove the private directory and everything in it
    rm -rf private
    echo "[OK] private directory cleaned"
else
    echo "[INFO] private directory does not exist; nothing to clean"
fi
@@ -1,105 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-echo "======================================="
-echo "ARGUS Alert System End-to-End Test"
-echo "======================================="
-echo ""
-
-# Record the test start time
-test_start_time=$(date +%s)
-
-# Function: wait until the services are ready
-wait_for_services() {
-    echo "[INFO] Waiting for all services to be ready..."
-    local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
-    local attempt=1
-
-    while [ $attempt -le $max_attempts ]; do
-        if curl -fs http://localhost:9093/api/v2/status >/dev/null 2>&1; then
-            echo "[OK] All services are ready!"
-            return 0
-        fi
-        echo "  Waiting for services... ($attempt/$max_attempts)"
-        sleep 5
-        ((attempt++))
-    done
-
-    echo "[ERROR] Services not ready after $max_attempts attempts"
-    return 1
-}
-
-# Function: show a test step
-show_step() {
-    echo ""
-    echo "🔄 Step $1: $2"
-    echo "----------------------------------------"
-}
-
-# Function: verify the result of a step
-verify_step() {
-    if [ $? -eq 0 ]; then
-        echo "✅ $1 - SUCCESS"
-    else
-        echo "❌ $1 - FAILED"
-        exit 1
-    fi
-}
-
-# Start the end-to-end test
-show_step "1" "Bootstrap - Initialize environment"
-./scripts/01_bootstrap.sh
-verify_step "Bootstrap"
-
-show_step "2" "Startup - Start all services"
-./scripts/02_up.sh
-verify_step "Service startup"
-
-# Wait until the services are fully ready
-wait_for_services || exit 1
-
-# Send alert data
-show_step "3" "Add alerts - Send test alerts to Alertmanager"
-./scripts/03_alertmanager_add_alert.sh
-verify_step "Send test alerts"
-
-# Query alert data
-show_step "4" "Verify data - Query Alertmanager"
-./scripts/04_query_alerts.sh
-verify_step "Data verification"
-
-# Check service health
-show_step "Health" "Check service health"
-echo "[INFO] Checking service health..."
-
-# Check Alertmanager status
-if curl -fs "http://localhost:9093/api/v2/status" >/dev/null 2>&1; then
-    am_status="available"
-    echo "✅ Alertmanager status: $am_status"
-else
-    am_status="unavailable"
-    echo "⚠️ Alertmanager status: $am_status"
-fi
-verify_step "Service health check"
-
-# Clean up the environment
-show_step "5" "Cleanup - Stop all services"
-./scripts/05_down.sh
-verify_step "Service cleanup"
-
-# Compute the total test time
-test_end_time=$(date +%s)
-total_time=$((test_end_time - test_start_time))
-
-echo ""
-echo "======================================="
-echo "🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!"
-echo "======================================="
-echo "📊 Test Summary:"
-echo "  • Total time: ${total_time}s"
-echo "  • Alertmanager status: $am_status"
-echo "  • All services started and stopped successfully"
-echo ""
-echo "✅ The ARGUS Alert system is working correctly!"
-echo ""
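One caveat in the deleted driver above: because it runs under `set -euo pipefail`, a failing step script aborts the run before `verify_step` ever inspects `$?`, so the `FAILED` branch is effectively dead code. A sketch of an equivalent check whose failure branch stays reachable:

```bash
# Sketch: step verification that survives `set -e` (an `if` context suspends errexit).
run_step() {
    local label="$1"; shift
    if "$@"; then
        echo "✅ ${label} - SUCCESS"
    else
        echo "❌ ${label} - FAILED"
        exit 1
    fi
}

run_step "Bootstrap" ./scripts/01_bootstrap.sh
```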
@@ -6,11 +6,6 @@ ENV TZ=Asia/Shanghai
 # Build arguments
 ARG USE_INTRANET=false
-ARG ARGUS_BUILD_UID=2133
-ARG ARGUS_BUILD_GID=2015
-
-ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
-    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
 
 # Configure intranet apt sources (if the intranet option is set)
 RUN if [ "$USE_INTRANET" = "true" ]; then \
@@ -34,24 +29,6 @@ RUN apt-get update && \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-# Align the bind user and group IDs with the host configuration
-RUN set -eux; \
-    current_gid="$(getent group bind | awk -F: '{print $3}')"; \
-    if [ -z "$current_gid" ]; then \
-        groupadd -g "${ARGUS_BUILD_GID}" bind; \
-    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
-        groupmod -g "${ARGUS_BUILD_GID}" bind; \
-    fi; \
-    if id bind >/dev/null 2>&1; then \
-        current_uid="$(id -u bind)"; \
-        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
-            usermod -u "${ARGUS_BUILD_UID}" bind; \
-        fi; \
-    else \
-        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" bind; \
-    fi; \
-    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /var/cache/bind /var/lib/bind
-
 # Configure the apt sources used at deploy time
 RUN if [ "$USE_INTRANET" = "true" ]; then \
     echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
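Before this revert, the bind image accepted host-matching IDs at build time via the two `ARG`s removed above. A sketch of how such a build would have been invoked (the tag matches the `argus-bind9:latest` image name used by the compose files; the Dockerfile is assumed to sit at the build context root):

```bash
# Sketch: build the bind image with IDs matching the invoking user.
docker build \
  --build-arg ARGUS_BUILD_UID="$(id -u)" \
  --build-arg ARGUS_BUILD_GID="$(id -g)" \
  -t argus-bind9:latest .
```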
@@ -9,9 +9,6 @@ SLEEP_SECONDS=10
 RELOAD_SCRIPT="/usr/local/bin/reload-bind9.sh"   # path of the existing reload script
 
 mkdir -p "$(dirname "$LOCKFILE")" "$BACKUP_DIR"
-BACKUP_UID="${ARGUS_BUILD_UID:-2133}"
-BACKUP_GID="${ARGUS_BUILD_GID:-2015}"
-chown -R "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR" 2>/dev/null || true
 
 is_ipv4() {
   local ip="$1"
@@ -36,7 +33,6 @@ upsert_record() {
   local changed=0
 
   cp -a "$ZONE_DB" "$BACKUP_DIR/db.argus.com.$ts.bak"
-  chown "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR/db.argus.com.$ts.bak" 2>/dev/null || true
 
   local cur_ip
   cur_ip="$(get_current_ip "$name" || true)"
@@ -65,10 +61,7 @@ upsert_record() {
     echo "[SKIP] ${name} unchanged (${new_ip})"
   fi
 
-  if [[ $changed -eq 1 ]]; then
-    return 0
-  fi
-  return 1
+  return $changed
 }
 
 while true; do
@@ -77,7 +70,7 @@ while true; do
   shopt -s nullglob
   NEED_RELOAD=0
 
   for f in "$WATCH_DIR"/*.argus.com; do
     base="$(basename "$f")"
     name="${base%.argus.com}"
     ip="$(grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' "$f" | tail -n1 || true)"
@@ -104,3 +97,4 @@ while true; do
 
   sleep "$SLEEP_SECONDS"
 done
+
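The sync loop above treats every file `"$WATCH_DIR"/<host>.argus.com` as a registration request, taking the last IPv4 it can grep out of the file as the desired A record. A sketch of driving it by hand, assuming the watch directory follows the `/private/argus/etc` convention used by the startup scripts in this diff:

```bash
# Sketch: publish an A record for web.argus.com via the watch directory.
echo "10.0.0.42" > /private/argus/etc/web.argus.com
# Within SLEEP_SECONDS (10s) the loop upserts the zone record and triggers
# reload-bind9.sh; the result can then be checked with:
dig @localhost web.argus.com A +short
```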
@@ -6,8 +6,6 @@ chmod 777 /private 2>/dev/null || true
 # Create persistent directories for BIND9 configs and DNS sync
 mkdir -p /private/argus/bind
 mkdir -p /private/argus/etc
-chown bind:bind /private/argus 2>/dev/null || true
-chown -R bind:bind /private/argus/bind /private/argus/etc
 
 # Copy configuration files to persistent storage if they don't exist
 if [ ! -f /private/argus/bind/named.conf.local ]; then
@@ -3,8 +3,8 @@ services:
     image: argus-bind9:latest
     container_name: argus-bind9-test
     ports:
-      - "${HOST_DNS_PORT:-1053}:53/tcp"
-      - "${HOST_DNS_PORT:-1053}:53/udp"
+      - "53:53/tcp"
+      - "53:53/udp"
     volumes:
       - ./private:/private
     restart: unless-stopped
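The removed `HOST_DNS_PORT` mapping existed so the test stack could run on hosts where port 53 is already bound. A sketch of how that knob was used before the revert (default 1053, per the test scripts below):

```bash
# Sketch: run the bind9 test stack on an alternate host port and query it.
HOST_DNS_PORT=12053 docker compose up -d
dig @localhost -p 12053 web.argus.com A +short
```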
@@ -7,9 +7,6 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
-
-export HOST_DNS_PORT
 
 echo "=========================================="
 echo "BIND9 DNS Server End-to-End Test Suite"
@@ -7,17 +7,13 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"
-HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
-
-export HOST_DNS_PORT
 
 cd "$TEST_DIR"
 
 echo "Starting BIND9 test container..."
 
 # Ensure private directory exists with proper permissions
-mkdir -p private/argus/bind
-mkdir -p private/argus/etc
+mkdir -p private
 chmod 777 private
 
 # Start the container
@@ -39,4 +35,4 @@ fi
 
 echo ""
 echo "BIND9 test environment is ready!"
-echo "DNS server listening on localhost:${HOST_DNS_PORT}"
+echo "DNS server listening on localhost:53"
@@ -5,10 +5,7 @@
 
 set -e
 
-HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
-
 echo "Testing DNS resolution with dig..."
-echo "Using DNS server localhost:${HOST_DNS_PORT}"
 
 # Function to test DNS query
 test_dns_query() {
@@ -22,7 +19,7 @@ test_dns_query() {
     echo "Expected IP: $expected_ip"
 
     # Perform dig query
-    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
 
     if [ "$result" = "QUERY_FAILED" ]; then
         echo "✗ DNS query failed"
@@ -6,13 +6,10 @@
 
 set -e
 
-HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
-
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"
 
 echo "=== DNS Auto-Sync Functionality Test ==="
-echo "Using DNS server localhost:${HOST_DNS_PORT}"
 
 # Check if container is running
 if ! docker compose ps | grep -q "Up"; then
@@ -39,7 +36,7 @@ test_dns_query() {
     # Wait a moment for DNS cache
     sleep 2
 
-    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
 
     if [ "$result" = "$expected_ip" ]; then
         echo "✓ $result"
@@ -93,7 +90,7 @@ echo ""
 echo "Step 2: Testing initial DNS configuration..."
 
 # Get current IP for web.argus.com (may have been changed by previous tests)
-current_web_ip=$(dig @localhost -p "$HOST_DNS_PORT" web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
+current_web_ip=$(dig @localhost web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
 echo "Current web.argus.com IP: $current_web_ip"
 
 # Test that DNS is working (regardless of specific IP)
@@ -188,7 +185,7 @@ docker compose exec bind9 bash -c 'echo "this is not an IP address" > /private/a
 wait_for_sync
 
 # Verify invalid record was not added (should fail to resolve)
-result=$(dig @localhost -p "$HOST_DNS_PORT" invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
+result=$(dig @localhost invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
 if [ "$result" = "NO_RESULT" ] || [ -z "$result" ]; then
     echo "✓ Invalid IP correctly ignored"
 else
@@ -5,13 +5,10 @@
 
 set -e
 
-HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
-
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"
 
 echo "=== DNS Configuration Reload Test ==="
-echo "Using DNS server localhost:${HOST_DNS_PORT}"
 
 # Check if container is running
 if ! docker compose ps | grep -q "Up"; then
@@ -35,7 +32,7 @@ test_dns_query() {
     echo "Testing: $description"
     echo "Query: $hostname.argus.com -> Expected: $expected_ip"
 
-    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
 
     if [ "$result" = "$expected_ip" ]; then
         echo "✓ $result"
@@ -5,13 +5,10 @@
 
 set -e
 
-HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
-
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"
 
 echo "=== Configuration Persistence Test ==="
-echo "Using DNS server localhost:${HOST_DNS_PORT}"
 
 # Check if dig is available
 if ! command -v dig &> /dev/null; then
@@ -28,7 +25,7 @@ test_dns_query() {
     echo "Testing: $description"
     echo "Query: $hostname.argus.com -> Expected: $expected_ip"
 
-    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
+    result=$(dig @localhost $hostname.argus.com A +short 2>/dev/null || echo "QUERY_FAILED")
 
     if [ "$result" = "$expected_ip" ]; then
         echo "✓ $result"
@@ -7,9 +7,6 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 TEST_DIR="$(dirname "$SCRIPT_DIR")"
-HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
-
-export HOST_DNS_PORT
 
 # Parse command line arguments
 FULL_CLEANUP=true
@@ -3,29 +3,10 @@ FROM docker.elastic.co/elasticsearch/elasticsearch:8.13.4
 # Switch to root for system-level installation
 USER root
 
-ARG ARGUS_BUILD_UID=2133
-ARG ARGUS_BUILD_GID=2015
-
-ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
-    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
-
-# Align the elasticsearch user and group IDs with the host configuration
-RUN set -eux; \
-    current_gid="$(getent group elasticsearch | awk -F: '{print $3}')"; \
-    if [ -z "$current_gid" ]; then \
-        groupadd -g "${ARGUS_BUILD_GID}" elasticsearch; \
-    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
-        groupmod -g "${ARGUS_BUILD_GID}" elasticsearch; \
-    fi; \
-    if id elasticsearch >/dev/null 2>&1; then \
-        current_uid="$(id -u elasticsearch)"; \
-        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
-            usermod -u "${ARGUS_BUILD_UID}" elasticsearch; \
-        fi; \
-    else \
-        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" elasticsearch; \
-    fi; \
-    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/elasticsearch
+# Change the UID and GID of the elasticsearch user
+RUN usermod -u 2133 elasticsearch && \
+    groupmod -g 2015 elasticsearch && \
+    chown -R elasticsearch:elasticsearch /usr/share/elasticsearch
 
 # Build arguments
 ARG USE_INTRANET=false
@@ -25,5 +25,3 @@
     Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
     Time_Key timestamp
     Time_Format %Y-%m-%d %H:%M:%S
-    Time_Offset +0800
-    Time_Keep On
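For context on the two keys dropped here: `Time_Offset +0800` told Fluent Bit to interpret the timezone-less timestamps as UTC+8, and `Time_Keep On` preserved the original `timestamp` field in the record after parsing. A sketch of the full pre-revert stanza, with the parser name assumed since it falls outside this hunk:

```bash
# Sketch: regenerate the pre-revert [PARSER] stanza (Name is an assumption).
cat >> parsers.conf <<'EOF'
[PARSER]
    Name        argus_log
    Format      regex
    Regex       ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
    Time_Key    timestamp
    Time_Format %Y-%m-%d %H:%M:%S
    Time_Offset +0800
    Time_Keep   On
EOF
```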
@@ -3,29 +3,10 @@ FROM docker.elastic.co/kibana/kibana:8.13.4
 # Switch to root for system-level installation
 USER root
 
-ARG ARGUS_BUILD_UID=2133
-ARG ARGUS_BUILD_GID=2015
-
-ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
-    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
-
-# Align the kibana user and group IDs with the host configuration
-RUN set -eux; \
-    current_gid="$(getent group kibana | awk -F: '{print $3}')"; \
-    if [ -z "$current_gid" ]; then \
-        groupadd -g "${ARGUS_BUILD_GID}" kibana; \
-    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
-        groupmod -g "${ARGUS_BUILD_GID}" kibana; \
-    fi; \
-    if id kibana >/dev/null 2>&1; then \
-        current_uid="$(id -u kibana)"; \
-        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
-            usermod -u "${ARGUS_BUILD_UID}" kibana; \
-        fi; \
-    else \
-        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" kibana; \
-    fi; \
-    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/kibana
+# Change the UID and GID of the kibana user
+RUN usermod -u 2133 kibana && \
+    groupmod -g 2015 kibana && \
+    chown -R kibana:kibana /usr/share/kibana
 
 # Build arguments
 ARG USE_INTRANET=false
@@ -17,7 +17,6 @@ services:
       interval: 10s
       timeout: 5s
       retries: 30
-    restart: always
 
   kibana:
     build:
@@ -74,11 +73,13 @@ services:
       interval: 15s
       timeout: 10s
       retries: 30
-    restart: always
 
   bind9:
     image: argus-bind9:latest
+    ports:
+      - "53:53/tcp"
+      - "53:53/udp"
     volumes:
      - ./private/argus:/private/argus/
-    restart: always
+    restart: unless-stopped
 
@@ -1,10 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
 root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
-project_root="$(cd "$root/../../.." && pwd)"
-
-source "$project_root/scripts/common/build_user.sh"
-load_build_user
 
 # Create the new private directory structure (based on the argus directory layout)
 echo "[INFO] Creating private directory structure for supervisor-based containers..."
@@ -15,9 +11,9 @@ mkdir -p "$root/private/argus/etc/"
 
 # Set data directory permissions (the ES and Kibana containers both use UID 1000)
 echo "[INFO] Setting permissions for data directories..."
-chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/elasticsearch" 2>/dev/null || true
-chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/log/kibana" 2>/dev/null || true
-chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true
+sudo chown -R 2133:2015 "$root/private/argus/log/elasticsearch" 2>/dev/null || true
+sudo chown -R 2133:2015 "$root/private/argus/log/kibana" 2>/dev/null || true
+sudo chown -R 2133:2015 "$root/private/argus/etc" 2>/dev/null || true
 
 echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
@@ -4,22 +4,8 @@ set -euo pipefail
 # Resolve the fluent-bit-host01 container name
 container_name="logging-mvp-fluent-bit-host01-1"
 
-wait_for_container() {
-    local name="$1"
-    local attempts=30
-    local delay=5
-    local i
-    for ((i = 1; i <= attempts; i++)); do
-        if docker ps --format '{{.Names}}' | grep -Fx "$name" >/dev/null; then
-            return 0
-        fi
-        echo "[INFO] Waiting for container $name to start... ($i/$attempts)"
-        sleep "$delay"
-    done
-    return 1
-}
-
-if ! wait_for_container "$container_name"; then
+# Check that the container exists and is running
+if ! docker ps | grep -q "$container_name"; then
     echo "[ERROR] Fluent Bit container $container_name is not running"
     exit 1
 fi
@@ -4,22 +4,8 @@ set -euo pipefail
 # Resolve the fluent-bit-host02 container name
 container_name="logging-mvp-fluent-bit-host02-1"
 
-wait_for_container() {
-    local name="$1"
-    local attempts=30
-    local delay=5
-    local i
-    for ((i = 1; i <= attempts; i++)); do
-        if docker ps --format '{{.Names}}' | grep -Fx "$name" >/dev/null; then
-            return 0
-        fi
-        echo "[INFO] Waiting for container $name to start... ($i/$attempts)"
-        sleep "$delay"
-    done
-    return 1
-}
-
-if ! wait_for_container "$container_name"; then
+# Check that the container exists and is running
+if ! docker ps | grep -q "$container_name"; then
     echo "[ERROR] Fluent Bit container $container_name is not running"
     exit 1
 fi
@@ -1,42 +1,7 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-# ES endpoint and wait strategy
 ES="${ES:-http://localhost:9200}"
-es_wait_attempts="${ES_WAIT_ATTEMPTS:-60}"   # total attempts to wait for ES
-es_wait_interval="${ES_WAIT_INTERVAL:-2}"    # seconds between attempts
-
 echo "[i] Querying ES endpoint: $ES"
-
-wait_for_es() {
-    local attempt=1
-    while (( attempt <= es_wait_attempts )); do
-        # Wait for the cluster to reach at least yellow status; retry on request failure
-        if curl -fsS "$ES/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then
-            echo "[ok] Elasticsearch is ready (attempt=${attempt}/${es_wait_attempts})"
-            return 0
-        fi
-        echo "[..] Waiting for Elasticsearch to become available (${attempt}/${es_wait_attempts})"
-        sleep "${es_wait_interval}"
-        (( attempt++ ))
-    done
-    echo "[err] Elasticsearch still unavailable after ${es_wait_attempts} attempts"
-    return 1
-}
-
-safe_count() {
-    # Return 0 for missing indices so a 404 does not fail the script
-    local pattern="$1"
-    local json
-    json=$(curl -fsS "$ES/${pattern}/_count?ignore_unavailable=true&allow_no_indices=true" 2>/dev/null || echo '{}')
-    echo "$json" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
-}
-
-wait_for_es
-
-# List the relevant indices (may be empty, which is allowed)
 curl -fsS "$ES/_cat/indices?v" | egrep 'train-|infer-|logstash' || true
-
-# Print the counts, treating missing indices as 0
-printf "train-* count: "; safe_count "train-*"; echo
-printf "infer-* count: "; safe_count "infer-*"; echo
+printf "train-* count: "; curl -fsS "$ES/train-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
+printf "infer-* count: "; curl -fsS "$ES/infer-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
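The removed `safe_count` helper extracted counts with sed while tolerating missing indices. Where jq is available (the alert query script at the top of this section already depends on it), the same missing-index-safe count can be expressed more directly; a sketch:

```bash
# Sketch: count documents for an index pattern, treating missing indices as 0.
ES="${ES:-http://localhost:9200}"
curl -fsS "$ES/train-*/_count?ignore_unavailable=true&allow_no_indices=true" \
  | jq -r '.count // 0'
```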
@@ -19,7 +19,7 @@ get_log_count() {
 # Function: wait until the services are ready
 wait_for_services() {
     echo "[INFO] Waiting for all services to be ready..."
-    local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
+    local max_attempts=60
     local attempt=1
 
     while [ $attempt -le $max_attempts ]; do
@@ -1,81 +0,0 @@
-FROM python:3.11-slim
-
-SHELL ["/bin/bash", "-c"]
-
-ARG PIP_INDEX_URL=
-ARG USE_OFFLINE=0
-ARG USE_INTRANET=false
-ARG ARGUS_BUILD_UID=2133
-ARG ARGUS_BUILD_GID=2015
-
-ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
-    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
-
-ENV PIP_NO_CACHE_DIR=1 \
-    PYTHONUNBUFFERED=1 \
-    PYTHONPATH=/app
-
-USER root
-
-WORKDIR /app
-
-COPY ./src/master/requirements.txt ./requirements.txt
-COPY ./src/master/offline_wheels/ /opt/offline_wheels/
-
-RUN set -euxo pipefail \
-    && if [[ "$USE_OFFLINE" == "1" ]]; then \
-        python -m pip install --no-index --find-links /opt/offline_wheels pip && \
-        python -m pip install --no-index --find-links /opt/offline_wheels -r requirements.txt; \
-    else \
-        python -m pip install --upgrade pip && \
-        if [[ -n "$PIP_INDEX_URL" ]]; then \
-            PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \
-        else \
-            python -m pip install -r requirements.txt; \
-        fi; \
-    fi
-
-# Configure intranet apt sources and install common tools
-RUN if [[ "$USE_INTRANET" == "true" ]]; then \
-        echo "Configuring intranet apt sources" && \
-        if [[ -f /etc/apt/sources.list ]]; then cp /etc/apt/sources.list /etc/apt/sources.list.bak; fi && \
-        mkdir -p /etc/apt && \
-        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
-        rm -rf /etc/apt/sources.list.d && \
-        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
-        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
-    fi && \
-    apt-get update && \
-    apt-get install -y supervisor net-tools inetutils-ping && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Switch to the apt sources used at runtime
-RUN if [[ "$USE_INTRANET" == "true" ]]; then \
-        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
-    fi
-
-RUN mkdir -p /var/log/supervisor
-
-RUN set -eux; \
-    if getent group argus >/dev/null; then \
-        groupmod -g "${ARGUS_BUILD_GID}" argus; \
-    else \
-        groupadd -g "${ARGUS_BUILD_GID}" argus; \
-    fi; \
-    if id argus >/dev/null 2>&1; then \
-        usermod -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" argus; \
-    else \
-        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" -s /bin/bash argus; \
-    fi
-
-COPY ./src/master/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
-COPY ./src/master/build/start-master.sh /usr/local/bin/start-master.sh
-COPY ./src/master/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
-RUN chmod +x /usr/local/bin/start-master.sh /usr/local/bin/dns-monitor.sh
-
-COPY ./src/master/app ./app
-
-EXPOSE 3000
-
-CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
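Given the `ARG`s above and the `COPY ./src/master/...` paths, the build context for this now-deleted Dockerfile was evidently the repository root. A sketch of a direct offline build, bypassing the `build_images.sh` wrapper documented below (the `-f` path is an assumption, since the file's location is not shown in this hunk):

```bash
# Sketch: direct offline build of the master image (Dockerfile path assumed).
docker build \
  --build-arg USE_OFFLINE=1 \
  -f src/master/Dockerfile \
  -t argus-master:latest .
```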
@@ -1,186 +0,0 @@
-# Argus Master Module
-
-Argus Master is a Flask + SQLite node management service responsible for:
-
-- Accepting agent registration and re-registration requests, and allocating/validating node IDs.
-- Storing node metadata, configuration, and health status, and deriving online status from report times.
-- Emitting a `nodes.json` that contains only online nodes, for other modules (such as metric) to consume.
-- Providing REST APIs for queries, configuration updates, and statistics.
-
-## Build and Run
-
-```bash
-cd src/master
-./scripts/build_images.sh                  # produces the argus-master:latest image
-```
-
-For an offline build, first run the preparation script in a networked environment:
-
-```bash
-cd src/master
-./scripts/prepare_offline_wheels.sh --pip-version 25.2   # optional --clean
-```
-
-The script downloads everything in `requirements.txt`, plus the pinned pip version, into `offline_wheels/`. Then copy the source tree (including that subdirectory) and the base image to the intranet, and run:
-
-```bash
-cd src/master
-./scripts/build_images.sh --offline --tag argus-master:latest
-```
-
-If the intranet lacks `python:3.11-slim`, `docker save` it on a connected host beforehand and `docker load` it from offline media.
-
-The end-to-end tests in this repository use `src/master/tests/docker-compose.yml` to start a sample environment:
-
-```bash
-cd src/master/tests
-./scripts/01_up_master.sh                  # builds the image and starts the container on http://localhost:31300
-```
-
-Service logs and data are written to `tests/private/argus/master/` by default (or to a custom mount directory).
-## Runtime Environment Variables
-
-| Variable | Default | Description |
-| --- | --- | --- |
-| `DB_PATH` | `/private/argus/master/db.sqlite3` | Where the SQLite database lives. The directory is created automatically at startup. |
-| `METRIC_NODES_JSON_PATH` | `/private/argus/metric/prometheus/nodes.json` | Output path of `nodes.json`, which contains only online nodes. Written atomically to avoid partial files. |
-| `OFFLINE_THRESHOLD_SECONDS` | `180` | If the time since the last report exceeds this value, the scheduler marks the node `offline`. |
-| `ONLINE_THRESHOLD_SECONDS` | `120` | If the latest report is no older than this value, the node is marked `online`. Between the two thresholds the previous status is kept. |
-| `SCHEDULER_INTERVAL_SECONDS` | `30` | How often the scheduler checks node status and refreshes `nodes.json`. |
-| `NODE_ID_PREFIX` | `A` | Prefix for new node IDs; actual IDs look like `A1`, `A2`. |
-| `AUTH_MODE` | `disabled` | Reserved authentication switch; currently fixed to disabled. |
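Since these are ordinary environment variables, the thresholds can be tuned when starting a container manually. A hedged sketch, with the port mapping taken from the test setup (host 31300 → container 3000) and the volume path assumed:

```bash
# Sketch: run the master with relaxed offline detection (names from the table).
docker run -d --name argus-master -p 31300:3000 \
  -e OFFLINE_THRESHOLD_SECONDS=300 \
  -e SCHEDULER_INTERVAL_SECONDS=10 \
  -v "$(pwd)/private:/private" \
  argus-master:latest
```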
-## Processes and Monitoring
-
-Inside the image, processes are managed by `supervisord`:
-
-- `master`: runs `/usr/local/bin/start-master.sh`, by default serving `0.0.0.0:3000` with 4 Gunicorn workers; tunable via the `GUNICORN_WORKERS`, `GUNICORN_BIND`, and `GUNICORN_EXTRA_ARGS` environment variables.
-- `dns-monitor`: polls `/private/argus/etc/dns.conf` and, when it changes, invokes `/private/argus/etc/update-dns.sh`; its log goes to `/var/log/supervisor/dns-monitor.log`.
-
-The build stage installs basic tools such as `supervisor`/`net-tools`/`inetutils-ping`/`vim` and switches the apt sources to the intranet mirror before runtime, to ease further maintenance inside the container.
-
-## Domain Registration and DNS Integration
-
-- On startup, the master container runs `/private/argus/etc/update-dns.sh` (if present) to point its own `/etc/resolv.conf` at the DNS served by the bind module; it then resolves the IPv4 address of `eth0` and writes it to `/private/argus/etc/master.argus.com`. That file is watched by the bind module's `argus_dns_sync.sh`, which generates the `master.argus.com` → current-container-IP A record.
-- Both test and production deployments need the bind-provided `update-dns.sh`, `dns.conf`, and related files mounted under `/private/argus/etc/`. In the E2E scenario, `tests/private/argus/etc` is prepared automatically by the scripts.
-- Other modules (such as the agent) only need to run the same `update-dns.sh` in their startup scripts to reach the master by domain name; if domain registration misbehaves, the agent cannot report successfully, which makes such problems quick to localize.
-## REST API Reference
-
-Base path: `/api/v1/master`; all endpoints return JSON.
-
-### 1. `GET /nodes`
-- **Purpose**: fetch a summary of all nodes.
-- **Response example**:
-```json
-[
-  {"id": "A1", "name": "dev-user-inst-pod-0", "status": "online", "type": "agent", "version": "1.1.0"}
-]
-```
-
-### 2. `GET /nodes/{id}`
-- **Purpose**: fetch node details (including config, health, persisted timestamps, and so on).
-- **Errors**: `404` means the node does not exist.
-
-### 3. `POST /nodes`
-- **Purpose**: register or re-register a node.
-- **Request body**:
-```json
-{
-  "id": "A1",                     // optional; supplied on re-registration
-  "name": "dev-user-inst-pod-0",
-  "type": "agent",
-  "version": "1.1.0",
-  "meta_data": {
-    "hostname": "dev-user-inst-pod-0",
-    "ip": "10.0.0.10",
-    "env": "dev",
-    "user": "testuser",
-    "instance": "testinst",
-    "cpu_number": 4,
-    "memory_in_bytes": 2147483648,
-    "gpu_number": 0
-  }
-}
-```
-- **On success**:
-  - New node: `201 Created`, returns the full node object.
-  - Re-registration: `200 OK`, returns the updated node object.
-- **Error cases**:
-  - `404 Not Found`: the supplied ID does not exist on the master.
-  - `500 Internal Server Error`: the supplied ID does not match the existing name.
-  - `400 Bad Request`: missing fields or wrong types in the request body.
-
-### 4. `PUT /nodes/{id}/status`
-- **Purpose**: agent status reporting. The master records `last_report` (server time) and `agent_last_report` (the time inside the report), and updates the `health` field.
-- **Request body example**:
-```json
-{
-  "timestamp": "2025-09-24T03:24:59Z",
-  "health": {
-    "log-fluentbit": {"status": "healthy"},
-    "metric-node-exporter": {"status": "healthy"}
-  }
-}
-```
-- **Response**: `200 OK` with the latest node object. `404` means the node does not exist.
-
-### 5. `PUT /nodes/{id}/config`
-- **Purpose**: partially update a node's configuration and labels.
-- **Request body example**:
-```json
-{
-  "config": {"log_level": "debug"},
-  "label": ["gpu", "exp001"]
-}
-```
-- **Notes**: either field may be supplied on its own; omitted configuration keeps its previous value. Updating labels triggers regeneration of `nodes.json`.
-- **Errors**: `404` means the node does not exist; `400` means the request body is invalid.
-
-### 6. `GET /nodes/statistics`
-- **Purpose**: total node count and per-status breakdown.
-- **Response example**:
-```json
-{
-  "total": 2,
-  "status_statistics": [
-    {"status": "online", "count": 1},
-    {"status": "offline", "count": 1}
-  ]
-}
-```
-
-### 7. Health Probes
-- `GET /healthz`: process liveness check.
-- `GET /readyz`: database availability check (attempts to access `DB_PATH`).
-
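To make the request/response shapes above concrete, here is a hedged sketch that registers a node and then reports a heartbeat against a locally running instance (port 31300 as in the test setup; jq is assumed for capturing the allocated ID):

```bash
# Sketch: register a node, then report status for it (endpoints per the README).
BASE="http://localhost:31300/api/v1/master"

node_id=$(curl -fsS -X POST "$BASE/nodes" \
  -H 'Content-Type: application/json' \
  -d '{"name":"dev-user-inst-pod-0","type":"agent","version":"1.1.0",
       "meta_data":{"hostname":"dev-user-inst-pod-0","ip":"10.0.0.10","env":"dev",
                    "user":"testuser","instance":"testinst",
                    "cpu_number":4,"memory_in_bytes":2147483648,"gpu_number":0}}' \
  | jq -r '.id')

curl -fsS -X PUT "$BASE/nodes/${node_id}/status" \
  -H 'Content-Type: application/json' \
  -d '{"timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'",
       "health":{"log-fluentbit":{"status":"healthy"}}}'
```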
-To verify the offline image, use the automated script:
-```bash
-cd src/master/tests
-./scripts/00_e2e_test_offline.sh   # builds the offline image and runs the full E2E
-```
-
-## End-to-End Test Scenarios
-
-Running `src/master/tests/scripts/00_e2e_test.sh` chains the following cases (scripts 01–10):
-
-1. **01_up_master**: builds the image, starts the container, and initializes directories and volumes.
-2. **02_verify_ready_and_nodes_json**: polls `/readyz` and checks that the initial `nodes.json` is `[]`.
-3. **03_register_via_curl**: simulates an agent registration, saves the returned node ID, and confirms the node appears in the list endpoint.
-4. **04_reregister_and_error_cases**: covers successful re-registration, `404` for an unknown ID, and `500` for an ID/name mismatch.
-5. **05_status_report_via_curl**: reports health and verifies the automatic status transitions `initialized` → `online` → `offline` → `online`.
-6. **06_config_update_and_nodes_json**: updates config/labels, checks that labels are synchronized into `nodes.json`, and ensures offline nodes do not appear in the file.
-7. **07_stats_single_node**: waits for the node to go offline and verifies the statistics endpoint and that `nodes.json` is an empty list.
-8. **08_multi_node_stats**: registers a second node so that one is online and one offline, and validates the aggregated statistics and that `nodes.json` contains only the online node.
-9. **09_restart_persistence**: restarts the master container and confirms node data, statistics, and `nodes.json` survive in the persistence directory.
-10. **10_down**: stops and cleans up containers, networks, and temporary directories.
-
-## Related Persistent Files
-
-- SQLite: located at `DB_PATH` by default, containing the `nodes` and `kv` tables.
-- `nodes.json`: generated periodically by the scheduler; keeps only nodes whose status is `online`.
-- `tests/private/` and `tests/tmp/` used by the test cases are cleaned up automatically by the scripts to avoid polluting later runs.
-
-To run in production, push the image to a private registry or adapt the test Compose configuration; just make sure the environment variables above are set correctly inside the container.
@@ -1,41 +0,0 @@
-from __future__ import annotations
-
-import atexit
-import logging
-
-from flask import Flask
-
-from .config import AppConfig, load_config
-from .routes import register_routes
-from .scheduler import StatusScheduler
-from .storage import Storage
-
-
-def create_app(config: AppConfig | None = None) -> Flask:
-    app_config = config or load_config()
-    storage = Storage(app_config.db_path, app_config.node_id_prefix)
-    scheduler = StatusScheduler(storage, app_config)
-
-    app = Flask(__name__)
-    app.config["APP_CONFIG"] = app_config
-    app.config["STORAGE"] = storage
-    app.config["SCHEDULER"] = scheduler
-
-    register_routes(app, storage, scheduler, app_config)
-
-    scheduler.start()
-
-    def _cleanup() -> None:
-        logging.getLogger("argus.master").info("Shutting down master app")
-        try:
-            scheduler.stop()
-        except Exception:  # pragma: no cover - defensive
-            logging.getLogger("argus.master").exception("Failed to stop scheduler")
-        try:
-            storage.close()
-        except Exception:  # pragma: no cover - defensive
-            logging.getLogger("argus.master").exception("Failed to close storage")
-
-    atexit.register(_cleanup)
-
-    return app
@@ -1,40 +0,0 @@
-from __future__ import annotations
-
-import os
-from dataclasses import dataclass
-
-
-@dataclass(frozen=True)
-class AppConfig:
-    db_path: str
-    metric_nodes_json_path: str
-    offline_threshold_seconds: int
-    online_threshold_seconds: int
-    scheduler_interval_seconds: int
-    node_id_prefix: str
-    auth_mode: str
-
-
-def _get_int_env(name: str, default: int) -> int:
-    raw = os.environ.get(name)
-    if raw is None or raw.strip() == "":
-        return default
-    try:
-        return int(raw)
-    except ValueError as exc:
-        raise ValueError(f"Environment variable {name} must be an integer, got {raw!r}") from exc
-
-
-def load_config() -> AppConfig:
-    """Build the config object from environment variables, centralizing runtime parameters."""
-    return AppConfig(
-        db_path=os.environ.get("DB_PATH", "/private/argus/master/db.sqlite3"),
-        metric_nodes_json_path=os.environ.get(
-            "METRIC_NODES_JSON_PATH", "/private/argus/metric/prometheus/nodes.json"
-        ),
-        offline_threshold_seconds=_get_int_env("OFFLINE_THRESHOLD_SECONDS", 180),
-        online_threshold_seconds=_get_int_env("ONLINE_THRESHOLD_SECONDS", 120),
-        scheduler_interval_seconds=_get_int_env("SCHEDULER_INTERVAL_SECONDS", 30),
-        node_id_prefix=os.environ.get("NODE_ID_PREFIX", "A"),
-        auth_mode=os.environ.get("AUTH_MODE", "disabled"),
-    )
@@ -1,171 +0,0 @@
-from __future__ import annotations
-
-import json
-from dataclasses import dataclass
-from typing import Any, Dict, Iterable, Mapping
-
-from .util import parse_iso
-
-
-class ValidationError(Exception):
-    """Raised when user payload fails validation."""
-
-
-@dataclass
-class Node:
-    id: str
-    name: str
-    type: str
-    version: str | None
-    status: str
-    config: Dict[str, Any]
-    labels: Iterable[str]
-    meta_data: Dict[str, Any]
-    health: Dict[str, Any]
-    register_time: str | None
-    last_report: str | None
-    agent_last_report: str | None
-    last_updated: str | None
-
-
-def serialize_node_row(row: Mapping[str, Any]) -> Dict[str, Any]:
-    def _json_or_default(value: str | None, default: Any) -> Any:
-        if value is None or value == "":
-            return default
-        try:
-            return json.loads(value)
-        except json.JSONDecodeError:
-            return default
-
-    config = _json_or_default(row["config_json"], {})
-    labels = _json_or_default(row["labels_json"], [])
-    meta = _json_or_default(row["meta_json"], {})
-    health = _json_or_default(row["health_json"], {})
-    return {
-        "id": row["id"],
-        "name": row["name"],
-        "type": row["type"],
-        "version": row["version"],
-        "status": row["status"],
-        "config": config if isinstance(config, dict) else {},
-        "label": list(labels) if isinstance(labels, list) else [],
-        "meta_data": meta if isinstance(meta, dict) else {},
-        "health": health if isinstance(health, dict) else {},
-        "register_time": row["register_time"],
-        "last_report": row["last_report"],
-        "agent_last_report": row["agent_last_report"],
-        "last_updated": row["last_updated"],
-    }
-
-
-def serialize_node_summary(row: Mapping[str, Any]) -> Dict[str, Any]:
-    return {
-        "id": row["id"],
-        "name": row["name"],
-        "status": row["status"],
-        "type": row["type"],
-        "version": row["version"],
-    }
-
-
-def validate_registration_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
-    if not isinstance(payload, Mapping):
-        raise ValidationError("Request body must be a JSON object")
-
-    name = payload.get("name")
-    if not isinstance(name, str) or not name.strip():
-        raise ValidationError("Field 'name' is required and must be a non-empty string")
-
-    node_type = payload.get("type", "agent")
-    if not isinstance(node_type, str) or not node_type:
-        raise ValidationError("Field 'type' must be a string")
-
-    version = payload.get("version")
-    if version is not None and not isinstance(version, str):
-        raise ValidationError("Field 'version' must be a string if provided")
-
-    meta = payload.get("meta_data")
-    if not isinstance(meta, Mapping):
-        raise ValidationError("Field 'meta_data' must be an object")
-
-    required_meta = ["hostname", "ip", "env", "user", "instance", "cpu_number", "memory_in_bytes", "gpu_number"]
-    for key in required_meta:
-        if key not in meta:
-            raise ValidationError(f"meta_data.{key} is required")
-
-    cpu_number = meta["cpu_number"]
-    memory_in_bytes = meta["memory_in_bytes"]
-    gpu_number = meta["gpu_number"]
-    if not isinstance(cpu_number, int) or cpu_number < 0:
-        raise ValidationError("meta_data.cpu_number must be a non-negative integer")
-    if not isinstance(memory_in_bytes, int) or memory_in_bytes < 0:
-        raise ValidationError("meta_data.memory_in_bytes must be a non-negative integer")
-    if not isinstance(gpu_number, int) or gpu_number < 0:
-        raise ValidationError("meta_data.gpu_number must be a non-negative integer")
-
-    node_id = payload.get("id")
-    if node_id is not None and (not isinstance(node_id, str) or not node_id.strip()):
-        raise ValidationError("Field 'id' must be a non-empty string when provided")
-
-    return {
-        "id": node_id,
-        "name": name,
-        "type": node_type,
-        "version": version,
-        "meta_data": dict(meta),
-    }
-
-
-def validate_status_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
-    if not isinstance(payload, Mapping):
-        raise ValidationError("Request body must be a JSON object")
-
-    timestamp = payload.get("timestamp")
-    if not isinstance(timestamp, str) or not timestamp:
-        raise ValidationError("Field 'timestamp' is required and must be a string")
-
-    parsed = parse_iso(timestamp)
-    if parsed is None:
-        raise ValidationError("Field 'timestamp' must be an ISO8601 datetime string")
-
-    health = payload.get("health", {})
-    if not isinstance(health, Mapping):
-        raise ValidationError("Field 'health' must be an object if provided")
-
-    sanitized_health: Dict[str, Any] = {}
-    for key, value in health.items():
-        if not isinstance(key, str):
-            raise ValidationError("Keys in 'health' must be strings")
-        if not isinstance(value, (Mapping, list, str, int, float, bool)) and value is not None:
-            raise ValidationError("Values in 'health' must be JSON-compatible")
-        sanitized_health[key] = value
-
-    return {
-        "timestamp": timestamp,
-        "parsed_timestamp": parsed,
-        "health": sanitized_health,
-    }
-
-
-def validate_config_payload(payload: Mapping[str, Any]) -> Dict[str, Any]:
-    if not isinstance(payload, Mapping):
-        raise ValidationError("Request body must be a JSON object")
-
-    result: Dict[str, Any] = {}
-    if "config" in payload:
-        config = payload["config"]
-        if not isinstance(config, Mapping):
-            raise ValidationError("Field 'config' must be an object")
-        result["config"] = dict(config)
-
-    if "label" in payload:
-        labels = payload["label"]
-        if not isinstance(labels, list) or not all(isinstance(item, str) for item in labels):
-            raise ValidationError("Field 'label' must be an array of strings")
-        result["label"] = list(labels)
-
-    if not result:
-        raise ValidationError("At least one of 'config' or 'label' must be provided")
-
-    return result
@@ -1,155 +0,0 @@
-from __future__ import annotations
-
-import logging
-from http import HTTPStatus
-from typing import Any, Mapping
-
-from flask import Blueprint, jsonify, request
-
-from .models import (
-    ValidationError,
-    validate_config_payload,
-    validate_registration_payload,
-    validate_status_payload,
-)
-from .scheduler import StatusScheduler
-from .storage import Storage
-from .util import to_iso, utcnow
-
-
-def create_nodes_blueprint(storage: Storage, scheduler: StatusScheduler) -> Blueprint:
-    bp = Blueprint("nodes", __name__)
-    logger = logging.getLogger("argus.master.api")
-
-    def _json_error(message: str, status: HTTPStatus, code: str) -> Any:
-        response = jsonify({"error": message, "code": code})
-        response.status_code = status
-        return response
-
-    @bp.errorhandler(ValidationError)
-    def _handle_validation_error(err: ValidationError):
-        return _json_error(str(err), HTTPStatus.BAD_REQUEST, "invalid_request")
-
-    @bp.get("/nodes")
-    def list_nodes():
-        nodes = storage.list_nodes()
-        return jsonify(nodes)
-
-    @bp.get("/nodes/<node_id>")
-    def get_node(node_id: str):
-        node = storage.get_node(node_id)
-        if node is None:
-            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
-        return jsonify(node)
-
-    @bp.post("/nodes")
-    def register_node():
-        payload = _get_json()
-        data = validate_registration_payload(payload)
-        now = utcnow()
-        now_iso = to_iso(now)
-        node_id = data["id"]
-        name = data["name"]
-        node_type = data["type"]
-        version = data["version"]
-        meta = data["meta_data"]
-
-        if node_id:
-            # A supplied id means re-registration; the name must match the stored record
-            existing_row = storage.get_node_raw(node_id)
-            if existing_row is None:
-                return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
-            if existing_row["name"] != name:
-                return _json_error(
-                    "Node id and name mismatch during re-registration",
-                    HTTPStatus.INTERNAL_SERVER_ERROR,
-                    "id_name_mismatch",
-                )
-            updated = storage.update_node_meta(
-                node_id,
-                node_type=node_type,
-                version=version,
-                meta_data=meta,
-                last_updated_iso=now_iso,
-            )
-            scheduler.trigger_nodes_json_refresh()
-            return jsonify(updated), HTTPStatus.OK
-
-        # No id provided → search by name
-        existing_by_name = storage.get_node_by_name(name)
-        if existing_by_name:
-            # A node with the same name exists; treat this as an id-less re-registration
-            updated = storage.update_node_meta(
-                existing_by_name["id"],
-                node_type=node_type,
-                version=version,
-                meta_data=meta,
-                last_updated_iso=now_iso,
-            )
-            scheduler.trigger_nodes_json_refresh()
-            return jsonify(updated), HTTPStatus.OK
-
-        new_id = storage.allocate_node_id()
-        created = storage.create_node(
-            new_id,
-            name,
-            node_type,
-            version,
-            meta,
-            status="initialized",
-            register_time_iso=now_iso,
-            last_updated_iso=now_iso,
-        )
-        scheduler.trigger_nodes_json_refresh()
-        return jsonify(created), HTTPStatus.CREATED
-
-    @bp.put("/nodes/<node_id>/config")
-    def update_node_config(node_id: str):
-        payload = _get_json()
-        updates = validate_config_payload(payload)
-        try:
-            updated = storage.update_config_and_labels(
-                node_id,
-                config=updates.get("config"),
-                labels=updates.get("label"),
-            )
-        except KeyError:
-            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
-
-        if "label" in updates:
-            scheduler.trigger_nodes_json_refresh()
-        return jsonify(updated)
-
-    @bp.get("/nodes/statistics")
-    def node_statistics():
-        stats = storage.get_statistics()
-        return jsonify(stats)
-
-    @bp.put("/nodes/<node_id>/status")
-    def update_status(node_id: str):
-        payload = _get_json()
-        data = validate_status_payload(payload)
-        try:
-            # The master writes last_report; the status itself is computed by the scheduler
-            updated = storage.update_last_report(
-                node_id,
-                server_timestamp_iso=to_iso(utcnow()),
-                agent_timestamp_iso=data["timestamp"],
-                health=data["health"],
-            )
-        except KeyError:
-            return _json_error("Node not found", HTTPStatus.NOT_FOUND, "not_found")
-
-        scheduler.trigger_nodes_json_refresh()
-        return jsonify(updated)
-
-    return bp
-
-
-def _get_json() -> Mapping[str, Any]:
-    data = request.get_json(silent=True)
-    if data is None:
-        raise ValidationError("Request body must be valid JSON")
-    if not isinstance(data, Mapping):
-        raise ValidationError("Request body must be a JSON object")
-    return data
@@ -1,24 +0,0 @@
-from __future__ import annotations
-
-from flask import Flask, jsonify
-
-from .config import AppConfig
-from .nodes_api import create_nodes_blueprint
-from .scheduler import StatusScheduler
-from .storage import Storage
-
-
-def register_routes(app: Flask, storage: Storage, scheduler: StatusScheduler, config: AppConfig) -> None:
-    app.register_blueprint(create_nodes_blueprint(storage, scheduler), url_prefix="/api/v1/master")
-
-    @app.get("/healthz")
-    def healthz():
-        return jsonify({"status": "ok"})
-
-    @app.get("/readyz")
-    def readyz():
-        try:
-            storage.list_nodes()  # simple readiness probe
-        except Exception as exc:  # pragma: no cover - defensive
-            return jsonify({"status": "error", "error": str(exc)}), 500
-        return jsonify({"status": "ok"})
@ -1,90 +0,0 @@
from __future__ import annotations

import logging
import threading
from typing import Optional

from .config import AppConfig
from .storage import Storage
from .util import atomic_write_json, parse_iso, to_iso, utcnow


class StatusScheduler:
    def __init__(self, storage: Storage, config: AppConfig, logger: Optional[logging.Logger] = None) -> None:
        self._storage = storage
        self._config = config
        self._logger = logger or logging.getLogger("argus.master.scheduler")
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._run, name="status-scheduler", daemon=True)
        self._nodes_json_lock = threading.Lock()
        self._pending_nodes_json = threading.Event()

    def start(self) -> None:
        """Start the background thread that periodically refreshes node status and nodes.json."""
        if not self._thread.is_alive():
            self._logger.info("Starting scheduler thread")
            self._thread.start()

    def stop(self) -> None:
        self._stop_event.set()
        self._pending_nodes_json.set()
        self._thread.join(timeout=5)

    def trigger_nodes_json_refresh(self) -> None:
        self._pending_nodes_json.set()

    def generate_nodes_json(self) -> None:
        with self._nodes_json_lock:
            online_nodes = self._storage.get_online_nodes()
            atomic_write_json(self._config.metric_nodes_json_path, online_nodes)
            self._logger.info("nodes.json updated", extra={"count": len(online_nodes)})

    # ------------------------------------------------------------------
    # internal loop
    # ------------------------------------------------------------------

    def _run(self) -> None:
        # Make sure nodes.json is generated immediately on startup
        self._pending_nodes_json.set()
        while not self._stop_event.is_set():
            changed = self._reconcile_statuses()
            if changed or self._pending_nodes_json.is_set():
                try:
                    self.generate_nodes_json()
                finally:
                    self._pending_nodes_json.clear()
            self._stop_event.wait(self._config.scheduler_interval_seconds)

    def _reconcile_statuses(self) -> bool:
        """Compare last_report with the current time and decide whether to flip each node's status."""
        any_status_changed = False
        now = utcnow()
        rows = self._storage.fetch_nodes_for_scheduler()
        for row in rows:
            node_id = row["id"]
            last_report_iso = row["last_report"]
            current_status = row["status"]
            last_report_dt = parse_iso(last_report_iso)
            if last_report_dt is None:
                # No report yet; treat as initialized until a report arrives
                continue
            delta_seconds = (now - last_report_dt).total_seconds()
            new_status = current_status
            if delta_seconds > self._config.offline_threshold_seconds:
                new_status = "offline"
            elif delta_seconds <= self._config.online_threshold_seconds:
                new_status = "online"
            # Between thresholds: keep current status (sticky)
            if new_status != current_status:
                any_status_changed = True
                self._logger.info(
                    "Updating node status",
                    extra={
                        "node_id": node_id,
                        "previous": current_status,
                        "new": new_status,
                        "delta_seconds": delta_seconds,
                    },
                )
                self._storage.update_status(node_id, new_status, last_updated_iso=to_iso(now))
        return any_status_changed
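The sticky zone between the two thresholds is easy to misread, so here is a self-contained sketch of the rule, using the threshold values the e2e compose file in this diff sets (ONLINE_THRESHOLD_SECONDS=2, OFFLINE_THRESHOLD_SECONDS=6):

# Sketch of the sticky threshold rule in _reconcile_statuses, with e2e values.
def next_status(current: str, delta_seconds: float) -> str:
    if delta_seconds > 6:        # silent longer than the offline threshold
        return "offline"
    if delta_seconds <= 2:       # reported within the online threshold
        return "online"
    return current               # between thresholds: status is sticky

assert next_status("online", 1.0) == "online"
assert next_status("online", 4.0) == "online"    # sticky zone keeps it online
assert next_status("online", 7.0) == "offline"
assert next_status("offline", 4.0) == "offline"  # sticky zone keeps it offline
assert next_status("offline", 1.5) == "online"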
@ -1,332 +0,0 @@
from __future__ import annotations

import json
import sqlite3
import threading
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple

from .models import serialize_node_row, serialize_node_summary
from .util import ensure_parent, to_iso, utcnow


class Storage:
    def __init__(self, db_path: str, node_id_prefix: str) -> None:
        self._db_path = db_path
        self._node_id_prefix = node_id_prefix
        ensure_parent(db_path)
        self._lock = threading.Lock()
        self._conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
        self._conn.row_factory = sqlite3.Row
        with self._lock:
            self._conn.execute("PRAGMA foreign_keys = ON;")
        self._ensure_schema()

    # ------------------------------------------------------------------
    # schema & helpers
    # ------------------------------------------------------------------

    def _ensure_schema(self) -> None:
        """Initialize the table schema so the database is ready when the service starts."""
        with self._lock:
            self._conn.executescript(
                """
                CREATE TABLE IF NOT EXISTS nodes (
                    id TEXT PRIMARY KEY,
                    name TEXT NOT NULL UNIQUE,
                    type TEXT NOT NULL,
                    version TEXT,
                    status TEXT NOT NULL,
                    config_json TEXT,
                    labels_json TEXT,
                    meta_json TEXT,
                    health_json TEXT,
                    register_time TEXT,
                    last_report TEXT,
                    agent_last_report TEXT,
                    last_updated TEXT
                );

                CREATE TABLE IF NOT EXISTS kv (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL
                );

                CREATE INDEX IF NOT EXISTS idx_nodes_status ON nodes(status);
                CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);
                """
            )
            self._conn.commit()

    def close(self) -> None:
        with self._lock:
            self._conn.close()

    # ------------------------------------------------------------------
    # Node ID allocation
    # ------------------------------------------------------------------

    def allocate_node_id(self) -> str:
        """Maintain an auto-increment sequence in the kv table and produce IDs like A1 for new nodes."""
        with self._lock:
            cur = self._conn.execute("SELECT value FROM kv WHERE key = ?", ("node_id_seq",))
            row = cur.fetchone()
            if row is None:
                next_id = 1
                self._conn.execute("INSERT INTO kv(key, value) VALUES(?, ?)", ("node_id_seq", str(next_id)))
            else:
                next_id = int(row["value"]) + 1
                self._conn.execute("UPDATE kv SET value = ? WHERE key = ?", (str(next_id), "node_id_seq"))
            self._conn.commit()
            return f"{self._node_id_prefix}{next_id}"

    # ------------------------------------------------------------------
    # Query helpers
    # ------------------------------------------------------------------

    def list_nodes(self) -> List[Dict[str, Any]]:
        with self._lock:
            cur = self._conn.execute(
                "SELECT id, name, status, type, version FROM nodes ORDER BY id ASC"
            )
            rows = cur.fetchall()
        return [serialize_node_summary(row) for row in rows]

    def get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
        with self._lock:
            cur = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,))
            row = cur.fetchone()
        if row is None:
            return None
        return serialize_node_row(row)

    def get_node_raw(self, node_id: str) -> Optional[sqlite3.Row]:
        with self._lock:
            cur = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,))
            row = cur.fetchone()
        return row

    def get_node_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        with self._lock:
            cur = self._conn.execute("SELECT * FROM nodes WHERE name = ?", (name,))
            row = cur.fetchone()
        if row is None:
            return None
        return serialize_node_row(row)

    # ------------------------------------------------------------------
    # Mutation helpers
    # ------------------------------------------------------------------

    def create_node(
        self,
        node_id: str,
        name: str,
        node_type: str,
        version: str | None,
        meta_data: Mapping[str, Any],
        status: str,
        register_time_iso: str,
        last_updated_iso: str,
    ) -> Dict[str, Any]:
        """Insert the initial node record; config/labels/health default to empty."""
        now_iso = last_updated_iso
        with self._lock:
            self._conn.execute(
                """
                INSERT INTO nodes (
                    id, name, type, version, status, config_json, labels_json, meta_json,
                    health_json, register_time, last_report, agent_last_report, last_updated
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    node_id,
                    name,
                    node_type,
                    version,
                    status,
                    json.dumps({}),
                    json.dumps([]),
                    json.dumps(dict(meta_data)),
                    json.dumps({}),
                    register_time_iso,
                    None,
                    None,
                    now_iso,
                ),
            )
            self._conn.commit()

        created = self.get_node(node_id)
        if created is None:
            raise RuntimeError("Failed to read back created node")
        return created

    def update_node_meta(
        self,
        node_id: str,
        *,
        name: Optional[str] = None,
        node_type: Optional[str] = None,
        version: Optional[str | None] = None,
        meta_data: Optional[Mapping[str, Any]] = None,
        last_updated_iso: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Update a node's static information on re-registration; omitted fields stay unchanged."""
        updates: List[str] = []
        params: List[Any] = []
        if name is not None:
            updates.append("name = ?")
            params.append(name)
        if node_type is not None:
            updates.append("type = ?")
            params.append(node_type)
        if version is not None:
            updates.append("version = ?")
            params.append(version)
        if meta_data is not None:
            updates.append("meta_json = ?")
            params.append(json.dumps(dict(meta_data)))
        if last_updated_iso is not None:
            updates.append("last_updated = ?")
            params.append(last_updated_iso)

        if not updates:
            result = self.get_node(node_id)
            if result is None:
                raise KeyError(node_id)
            return result

        params.append(node_id)
        with self._lock:
            self._conn.execute(
                f"UPDATE nodes SET {', '.join(updates)} WHERE id = ?",
                tuple(params),
            )
            self._conn.commit()
        updated = self.get_node(node_id)
        if updated is None:
            raise KeyError(node_id)
        return updated

    def update_config_and_labels(
        self, node_id: str, *, config: Optional[Mapping[str, Any]] = None, labels: Optional[Iterable[str]] = None
    ) -> Dict[str, Any]:
        """Partially update config/labels and refresh the last_updated timestamp."""
        updates: List[str] = []
        params: List[Any] = []
        if config is not None:
            updates.append("config_json = ?")
            params.append(json.dumps(dict(config)))
        if labels is not None:
            updates.append("labels_json = ?")
            params.append(json.dumps(list(labels)))
        updates.append("last_updated = ?")
        params.append(to_iso(utcnow()))
        params.append(node_id)
        with self._lock:
            cur = self._conn.execute(
                f"UPDATE nodes SET {', '.join(updates)} WHERE id = ?",
                tuple(params),
            )
            # rowcount reflects the UPDATE just executed; no row matched the given id
            if cur.rowcount == 0:
                self._conn.rollback()
                raise KeyError(node_id)
            self._conn.commit()
        updated = self.get_node(node_id)
        if updated is None:
            raise KeyError(node_id)
        return updated

    def update_last_report(
        self,
        node_id: str,
        *,
        server_timestamp_iso: str,
        agent_timestamp_iso: str,
        health: Mapping[str, Any],
    ) -> Dict[str, Any]:
        """Record the latest report timestamps and health payload for later status reconciliation."""
        with self._lock:
            cur = self._conn.execute(
                """
                UPDATE nodes
                SET last_report = ?,
                    agent_last_report = ?,
                    health_json = ?,
                    last_updated = ?
                WHERE id = ?
                """,
                (
                    server_timestamp_iso,
                    agent_timestamp_iso,
                    json.dumps(health),
                    server_timestamp_iso,
                    node_id,
                ),
            )
            # rowcount reflects the UPDATE just executed; no row matched the given id
            if cur.rowcount == 0:
                self._conn.rollback()
                raise KeyError(node_id)
            self._conn.commit()
        updated = self.get_node(node_id)
        if updated is None:
            raise KeyError(node_id)
        return updated

    def update_status(self, node_id: str, status: str, *, last_updated_iso: str) -> None:
        with self._lock:
            self._conn.execute(
                "UPDATE nodes SET status = ?, last_updated = ? WHERE id = ?",
                (status, last_updated_iso, node_id),
            )
            self._conn.commit()

    # ------------------------------------------------------------------
    # Reporting helpers
    # ------------------------------------------------------------------

    def get_statistics(self) -> Dict[str, Any]:
        """Return the total node count plus per-status aggregates."""
        with self._lock:
            cur = self._conn.execute("SELECT COUNT(*) AS total FROM nodes")
            total_row = cur.fetchone()
            cur = self._conn.execute("SELECT status, COUNT(*) AS count FROM nodes GROUP BY status")
            status_rows = cur.fetchall()
        return {
            "total": total_row["total"] if total_row else 0,
            "status_statistics": [
                {"status": row["status"], "count": row["count"]}
                for row in status_rows
            ],
        }

    def fetch_nodes_for_scheduler(self) -> List[sqlite3.Row]:
        with self._lock:
            cur = self._conn.execute(
                "SELECT id, last_report, status FROM nodes"
            )
            return cur.fetchall()

    def get_online_nodes(self) -> List[Dict[str, Any]]:
        """Return the list of online nodes used to generate nodes.json."""
        with self._lock:
            cur = self._conn.execute(
                "SELECT id, meta_json, labels_json, name FROM nodes WHERE status = ? ORDER BY id ASC",
                ("online",),
            )
            rows = cur.fetchall()

        result: List[Dict[str, Any]] = []
        for row in rows:
            meta = json.loads(row["meta_json"]) if row["meta_json"] else {}
            labels = json.loads(row["labels_json"]) if row["labels_json"] else []
            result.append(
                {
                    "node_id": row["id"],
                    "user_id": meta.get("user"),
                    "ip": meta.get("ip"),
                    "hostname": meta.get("hostname", row["name"]),
                    "labels": labels if isinstance(labels, list) else [],
                }
            )
        return result
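For reference, the kv-backed sequence produces IDs such as A1, A2, and so on; the "A" prefix matches the node IDs exercised by the e2e scripts later in this diff (a registered A1 and the deliberately unknown A999). A throwaway usage sketch, with an illustrative database path:

# Usage sketch; the db path is illustrative.
storage = Storage("/tmp/argus-demo.db", node_id_prefix="A")
print(storage.allocate_node_id())   # -> A1
print(storage.allocate_node_id())   # -> A2 (sequence persists in the kv table)
storage.close()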
@ -1,51 +0,0 @@
from __future__ import annotations

import json
import os
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable


ISO_FORMAT = "%Y-%m-%dT%H:%M:%SZ"


def utcnow() -> datetime:
    """Return the current UTC time so every timestamp shares one time base."""
    return datetime.now(timezone.utc)


def to_iso(dt: datetime | None) -> str | None:
    if dt is None:
        return None
    return dt.astimezone(timezone.utc).replace(microsecond=0).strftime(ISO_FORMAT)


def parse_iso(value: str | None) -> datetime | None:
    if not value:
        return None
    try:
        if value.endswith("Z"):
            return datetime.strptime(value, ISO_FORMAT).replace(tzinfo=timezone.utc)
        # Fallback for ISO strings with offset
        return datetime.fromisoformat(value).astimezone(timezone.utc)
    except ValueError:
        return None


def ensure_parent(path: str) -> None:
    """Make sure the target file's parent directory exists."""
    Path(path).parent.mkdir(parents=True, exist_ok=True)


def atomic_write_json(path: str, data: Iterable[Any] | Any) -> None:
    """Write JSON atomically so other processes never read a half-written file."""
    ensure_parent(path)
    directory = Path(path).parent
    with tempfile.NamedTemporaryFile("w", dir=directory, delete=False) as tmp:
        json.dump(data, tmp, separators=(",", ":"))
        tmp.flush()
        os.fsync(tmp.fileno())
        temp_path = tmp.name
    os.replace(temp_path, path)
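Two properties of these helpers are worth calling out: os.replace is only atomic when the temporary file lives on the same filesystem as the target, which is why the temp file is created in the target's own directory, and parse_iso normalizes offset-bearing strings to UTC. A quick round-trip sketch, assuming the module above is importable; the path is illustrative:

atomic_write_json("/tmp/nodes-demo.json", [{"node_id": "A1"}])
with open("/tmp/nodes-demo.json") as fh:
    assert fh.read() == '[{"node_id":"A1"}]'   # compact separators, no whitespace

assert to_iso(parse_iso("2023-10-05T12:05:00+02:00")) == "2023-10-05T10:05:00Z"
assert parse_iso("not-a-timestamp") is None    # malformed input degrades to None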
@ -1 +0,0 @@
../../bind/build/dns-monitor.sh
@ -1,59 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

# Make sure the shared directory and DNS-related scripts exist
DNS_DIR="/private/argus/etc"
DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
MASTER_DOMAIN_FILE="${DNS_DIR}/master.argus.com"
RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}"
RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"
MASTER_DATA_DIR="/private/argus/master"
METRIC_DIR="/private/argus/metric/prometheus"

mkdir -p "$DNS_DIR"
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true
mkdir -p "$MASTER_DATA_DIR"
mkdir -p "$METRIC_DIR"
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DATA_DIR" "$METRIC_DIR" 2>/dev/null || true

if [[ -x "$DNS_SCRIPT" ]]; then
  echo "[INFO] Running update-dns.sh before master starts"
  # Run the script when present so the container uses bind as its DNS
  "$DNS_SCRIPT" || echo "[WARN] update-dns.sh execution failed"
else
  echo "[WARN] DNS update script not found or not executable: $DNS_SCRIPT"
fi

# Record the master's current IP so the bind service can sync it
MASTER_IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}' || true)
if [[ -n "${MASTER_IP}" ]]; then
  echo "current IP: ${MASTER_IP}"
  echo "${MASTER_IP}" > "$MASTER_DOMAIN_FILE"
  chown "$RUNTIME_UID:$RUNTIME_GID" "$MASTER_DOMAIN_FILE" 2>/dev/null || true
else
  echo "[WARN] Failed to detect master IP via ifconfig"
fi

WORKERS=${GUNICORN_WORKERS:-4}
BIND_ADDR=${GUNICORN_BIND:-0.0.0.0:3000}
EXTRA_OPTS=${GUNICORN_EXTRA_ARGS:-}

if [[ -n "$EXTRA_OPTS" ]]; then
  read -r -a EXTRA_ARRAY <<< "$EXTRA_OPTS"
else
  EXTRA_ARRAY=()
fi

command=(gunicorn --bind "$BIND_ADDR" --workers "$WORKERS")
if [[ ${#EXTRA_ARRAY[@]} -gt 0 ]]; then
  command+=("${EXTRA_ARRAY[@]}")
fi
command+=("app:create_app()")

if command -v runuser >/dev/null 2>&1; then
  exec runuser -u "$RUNTIME_USER" -- "${command[@]}"
else
  printf -v _cmd_str '%q ' "${command[@]}"
  exec su -s /bin/bash -m "$RUNTIME_USER" -c "exec ${_cmd_str}"
fi
@ -1,39 +0,0 @@
[supervisord]
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root

[program:master]
command=/usr/local/bin/start-master.sh
user=root
stdout_logfile=/var/log/supervisor/master.log
stderr_logfile=/var/log/supervisor/master_error.log
autostart=true
autorestart=true
startsecs=5
stopwaitsecs=30
killasgroup=true
stopasgroup=true

[program:dns-monitor]
command=/usr/local/bin/dns-monitor.sh
user=root
stdout_logfile=/var/log/supervisor/dns-monitor.log
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
autostart=true
autorestart=true
startsecs=5
stopwaitsecs=10
killasgroup=true
stopasgroup=true

[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
Binary file not shown.
@ -1,2 +0,0 @@
Flask==2.3.3
gunicorn==21.2.0
@ -1,76 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

usage() {
  cat >&2 <<'USAGE'
Usage: $0 [--intranet] [--offline] [--tag <image_tag>]

Options:
  --intranet         Use the configured PyPI mirror (defaults to the Tsinghua mirror).
  --offline          Fully offline build; depends on the wheels under offline_wheels/.
  --tag <image_tag>  Custom image tag; defaults to argus-master:latest.
USAGE
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
MODULE_ROOT="$PROJECT_ROOT/src/master"
IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
DOCKERFILE="src/master/Dockerfile"
BUILD_ARGS=()
OFFLINE_MODE=0

source "$PROJECT_ROOT/scripts/common/build_user.sh"
load_build_user
BUILD_ARGS+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")

cd "$PROJECT_ROOT"

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --intranet)
      INTRANET_INDEX="${INTRANET_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple}"
      BUILD_ARGS+=("--build-arg" "PIP_INDEX_URL=${INTRANET_INDEX}")
      BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
      shift
      ;;
    --offline)
      OFFLINE_MODE=1
      BUILD_ARGS+=("--build-arg" "USE_OFFLINE=1")
      BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
      shift
      ;;
    --tag)
      [[ $# -ge 2 ]] || { usage; exit 1; }
      IMAGE_TAG="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ "$OFFLINE_MODE" -eq 1 ]]; then
  WHEELS_DIR="$MODULE_ROOT/offline_wheels"
  if [[ ! -d "$WHEELS_DIR" ]]; then
    echo "[ERROR] offline_wheels directory not found: $WHEELS_DIR" >&2
    exit 1
  fi
  # find exits 0 whether or not it matches, so test its output instead
  if [[ -z "$(find "$WHEELS_DIR" -maxdepth 1 -type f -name '*.whl' -print -quit)" ]]; then
    echo "[ERROR] offline_wheels is empty; run scripts/prepare_offline_wheels.sh on a networked host first" >&2
    exit 1
  fi
fi

echo "[INFO] Building image $IMAGE_TAG"
docker build -f "$DOCKERFILE" "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT"
echo "[OK] Image $IMAGE_TAG built"
@ -1,39 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

usage() {
  echo "Usage: $0 [--file <tar_path>]" >&2
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DEFAULT_INPUT="$PROJECT_ROOT/images/argus-master-dev.tar"
IMAGE_TAR="$DEFAULT_INPUT"

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --file)
      [[ $# -ge 2 ]] || { usage; exit 1; }
      IMAGE_TAR="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ ! -f "$IMAGE_TAR" ]]; then
  echo "[ERROR] Image tarball not found: $IMAGE_TAR" >&2
  exit 1
fi

echo "[INFO] Loading image from $IMAGE_TAR"
docker image load -i "$IMAGE_TAR"
echo "[OK] Image loaded"
@ -1,97 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

usage() {
  cat >&2 <<'USAGE'
Usage: $0 [--pip-version <version>] [--clean] [--local]

Options:
  --pip-version <version>  Additionally download the given pip wheel version (e.g. 25.2).
  --clean                  Remove offline_wheels/*.whl before downloading again.
  --local                  Download with the local python (default runs inside docker python:3.11-slim).
USAGE
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REQUIREMENTS_FILE="$PROJECT_ROOT/requirements.txt"
WHEEL_DIR="$PROJECT_ROOT/offline_wheels"
PIP_VERSION=""
CLEAN=0
USE_LOCAL=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --pip-version)
      [[ $# -ge 2 ]] || { usage; exit 1; }
      PIP_VERSION="$2"
      shift 2
      ;;
    --clean)
      CLEAN=1
      shift
      ;;
    --local)
      USE_LOCAL=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ ! -f "$REQUIREMENTS_FILE" ]]; then
  echo "[ERROR] requirements.txt not found at $REQUIREMENTS_FILE" >&2
  exit 1
fi

mkdir -p "$WHEEL_DIR"

if [[ "$CLEAN" -eq 1 ]]; then
  echo "[INFO] Cleaning existing wheels in $WHEEL_DIR"
  find "$WHEEL_DIR" -maxdepth 1 -type f -name '*.whl' -delete
fi

run_with_python() {
  local cmd=("python" "-m" "pip" "$@")
  "${cmd[@]}"
}

if [[ "$USE_LOCAL" -eq 1 ]]; then
  PYTHON_BIN=${PYTHON_BIN:-python3}
  if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
    echo "[ERROR] $PYTHON_BIN not found" >&2
    exit 1
  fi
  echo "[INFO] Using local python ($PYTHON_BIN) to download wheels"
  "$PYTHON_BIN" -m pip download -r "$REQUIREMENTS_FILE" -d "$WHEEL_DIR"
  if [[ -n "$PIP_VERSION" ]]; then
    "$PYTHON_BIN" -m pip download "pip==${PIP_VERSION}" -d "$WHEEL_DIR"
  fi
else
  if ! command -v docker >/dev/null 2>&1; then
    echo "[ERROR] docker not found; rerun with --local or install docker" >&2
    exit 1
  fi
  echo "[INFO] Using docker image python:3.11-slim to download wheels"
  docker run --rm \
    -v "$WHEEL_DIR":/wheels \
    -v "$REQUIREMENTS_FILE":/tmp/requirements.txt \
    python:3.11-slim \
    bash -c "set -euo pipefail && python -m pip install --upgrade pip && python -m pip download -r /tmp/requirements.txt -d /wheels"
  if [[ -n "$PIP_VERSION" ]]; then
    docker run --rm \
      -v "$WHEEL_DIR":/wheels \
      python:3.11-slim \
      bash -c "set -euo pipefail && python -m pip download pip==${PIP_VERSION} -d /wheels"
  fi
fi

echo "[INFO] Offline wheels prepared at $WHEEL_DIR"
@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

usage() {
  echo "Usage: $0 [--tag <image_tag>] [--output <tar_path>]" >&2
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DEFAULT_OUTPUT="$PROJECT_ROOT/images/argus-master-dev.tar"
IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
OUTPUT_PATH="$DEFAULT_OUTPUT"

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --tag)
      [[ $# -ge 2 ]] || { usage; exit 1; }
      IMAGE_TAG="$2"
      shift 2
      ;;
    --output)
      [[ $# -ge 2 ]] || { usage; exit 1; }
      OUTPUT_PATH="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

mkdir -p "$(dirname "$OUTPUT_PATH")"
echo "[INFO] Saving image $IMAGE_TAG to $OUTPUT_PATH"
docker image save "$IMAGE_TAG" -o "$OUTPUT_PATH"
echo "[OK] Image saved"
2
src/master/tests/.gitignore
vendored
@ -1,2 +0,0 @@
private/
tmp/
@ -1,19 +0,0 @@
services:
  master:
    image: ${MASTER_IMAGE_TAG:-argus-master:latest}
    container_name: argus-master-e2e
    environment:
      - OFFLINE_THRESHOLD_SECONDS=6
      - ONLINE_THRESHOLD_SECONDS=2
      - SCHEDULER_INTERVAL_SECONDS=1
    ports:
      - "31300:3000"
    volumes:
      - ./private/argus/master:/private/argus/master
      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
      - ./private/argus/etc:/private/argus/etc
    restart: unless-stopped

networks:
  default:
    driver: bridge
@ -1,25 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS=(
  "01_up_master.sh"
  "02_verify_ready_and_nodes_json.sh"
  "03_register_via_curl.sh"
  "04_reregister_and_error_cases.sh"
  "05_status_report_via_curl.sh"
  "06_config_update_and_nodes_json.sh"
  "07_stats_single_node.sh"
  "08_multi_node_stats.sh"
  "09_restart_persistence.sh"
  "10_down.sh"
)

for script in "${SCRIPTS[@]}"; do
  echo "[TEST] Running $script"
  MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:latest}" "$SCRIPT_DIR/$script"
  echo "[TEST] $script completed"
  echo
done

echo "[TEST] Master module E2E tests completed"
@ -1,16 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
MASTER_ROOT="$(cd "$MODULE_ROOT/.." && pwd)"

# Prepare offline dependencies and build the image
pushd "$MASTER_ROOT" >/dev/null
./scripts/prepare_offline_wheels.sh --clean --pip-version 25.2
./scripts/build_images.sh --offline --tag argus-master:offline
popd >/dev/null

# Run the existing end-to-end suite against the offline image
MASTER_IMAGE_TAG="argus-master:offline" ./scripts/00_e2e_test.sh
@ -1,50 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
MODULE_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
PRIVATE_ROOT="$TEST_ROOT/private"
TMP_ROOT="$TEST_ROOT/tmp"
DNS_ROOT="$PRIVATE_ROOT/argus/etc"
BIND_UPDATE_SCRIPT_SRC="$(cd "$MODULE_ROOT/../bind" && pwd)/build/update-dns.sh"
BIND_UPDATE_SCRIPT_DEST="$DNS_ROOT/update-dns.sh"

mkdir -p "$PRIVATE_ROOT/argus/master"
mkdir -p "$PRIVATE_ROOT/argus/metric/prometheus"
mkdir -p "$TMP_ROOT"
mkdir -p "$DNS_ROOT"

# Make sure containers/data left over from a previous run are cleaned up
compose() {
  if docker compose version >/dev/null 2>&1; then
    docker compose "$@"
  else
    docker-compose "$@"
  fi
}

pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans || true
popd >/dev/null

rm -rf "$TMP_ROOT" "$PRIVATE_ROOT"
mkdir -p "$PRIVATE_ROOT/argus/master"
mkdir -p "$PRIVATE_ROOT/argus/metric/prometheus"
mkdir -p "$TMP_ROOT"
mkdir -p "$DNS_ROOT"

# Copy the bind module's update-dns.sh into the shared directory to mimic the real environment
if [[ -f "$BIND_UPDATE_SCRIPT_SRC" ]]; then
  cp "$BIND_UPDATE_SCRIPT_SRC" "$BIND_UPDATE_SCRIPT_DEST"
  chmod +x "$BIND_UPDATE_SCRIPT_DEST"
else
  echo "[WARN] bind update script missing at $BIND_UPDATE_SCRIPT_SRC"
fi

pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans || true
MASTER_IMAGE_TAG="${MASTER_IMAGE_TAG:-argus-master:latest}" compose up -d
popd >/dev/null

echo "[INFO] Master container is up on http://localhost:31300"
@ -1,60 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PRIVATE_ROOT="$TEST_ROOT/private"
API_BASE="http://localhost:31300"
NODES_JSON_PATH="$PRIVATE_ROOT/argus/metric/prometheus/nodes.json"
MASTER_DOMAIN_FILE="$PRIVATE_ROOT/argus/etc/master.argus.com"

# Wait for readyz to return 200, confirming database initialization finished
for _ in {1..30}; do
  status=$(curl -s -o /dev/null -w '%{http_code}' "$API_BASE/readyz" || true)
  if [[ "$status" == "200" ]]; then
    break
  fi
  sleep 1
done

if [[ "${status:-}" != "200" ]]; then
  echo "[ERROR] /readyz did not return 200 within the expected time; got $status" >&2
  exit 1
fi

echo "[INFO] /readyz passed; readiness check succeeded"

# The scheduler writes an empty nodes.json at startup; wait for the file and validate it
for _ in {1..30}; do
  if [[ -f "$NODES_JSON_PATH" ]]; then
    break
  fi
  sleep 1
done

if [[ ! -f "$NODES_JSON_PATH" ]]; then
  echo "[ERROR] $NODES_JSON_PATH was not generated within the expected time" >&2
  exit 1
fi

if ! python3 - "$NODES_JSON_PATH" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    data = json.load(handle)
if data != []:
    raise SystemExit(f"nodes.json initial content should be [], got {data}")
PY
then
  echo "[ERROR] nodes.json initial content is not an empty array" >&2
  exit 1
fi

echo "[INFO] nodes.json initial state verified"

# Print the domain file written by master; absence does not fail the test
if [[ -f "$MASTER_DOMAIN_FILE" ]]; then
  MASTER_IP=$(<"$MASTER_DOMAIN_FILE")
  echo "[INFO] master.argus.com record: $MASTER_IP"
else
  echo "[WARN] master.argus.com record file not found at $MASTER_DOMAIN_FILE"
fi
@ -1,68 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:31300/api/v1/master"

mkdir -p "$TMP_ROOT"

for _ in {1..30}; do
  if curl -sf "$API_BASE/healthz" >/dev/null; then
    break
  fi
  sleep 1
done

payload=$(cat <<'JSON'
{
  "name": "dev-testuser-testinst-pod-0",
  "type": "agent",
  "meta_data": {
    "hostname": "dev-testuser-testinst-pod-0",
    "ip": "10.0.0.10",
    "env": "dev",
    "user": "testuser",
    "instance": "testinst",
    "cpu_number": 4,
    "memory_in_bytes": 2147483648,
    "gpu_number": 0
  },
  "version": "1.1.0"
}
JSON
)

body_file="$TMP_ROOT/register_body.json"
status=$(curl -sS -o "$body_file" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$payload")
body="$(cat "$body_file")"

if [[ "$status" != "201" ]]; then
  echo "[ERROR] Unexpected status code: $status" >&2
  echo "$body" >&2
  exit 1
fi

node_id=$(python3 - "$body_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    body = json.load(handle)
print(body["id"])
PY
)

echo "$body" > "$TMP_ROOT/last_response.json"
echo "$node_id" > "$TMP_ROOT/node_id"

list_file="$TMP_ROOT/nodes_list.json"
curl -sS "$API_BASE/nodes" -o "$list_file"
python3 - "$list_file" "$node_id" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    data = json.load(handle)
node_id = sys.argv[2]
assert any(item.get("id") == node_id for item in data), "node not in list"
PY

echo "[INFO] Registered node with id $node_id"
@ -1,116 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:31300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"

# Re-register with the same ID while changing some meta/version fields
payload=$(cat <<JSON
{
  "id": "$NODE_ID",
  "name": "dev-testuser-testinst-pod-0",
  "type": "agent",
  "meta_data": {
    "hostname": "dev-testuser-testinst-pod-0",
    "ip": "10.0.0.11",
    "env": "dev",
    "user": "testuser",
    "instance": "testinst",
    "cpu_number": 8,
    "memory_in_bytes": 2147483648,
    "gpu_number": 0
  },
  "version": "1.2.0"
}
JSON
)

status=$(curl -sS -o "$TMP_ROOT/reregister_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$payload")
if [[ "$status" != "200" ]]; then
  echo "[ERROR] Re-registration returned non-200: $status" >&2
  cat "$TMP_ROOT/reregister_response.json" >&2
  exit 1
fi

python3 - "$TMP_ROOT/reregister_response.json" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert node["meta_data"]["ip"] == "10.0.0.11", node["meta_data"]
assert node["meta_data"]["cpu_number"] == 8, node["meta_data"]
assert node["version"] == "1.2.0", node
PY

echo "[INFO] Re-registration succeeded and metadata was updated"

# Unknown ID => 404
unknown_payload=$(cat <<'JSON'
{
  "id": "A999",
  "name": "dev-testuser-testinst-pod-0",
  "type": "agent",
  "meta_data": {
    "hostname": "dev-testuser-testinst-pod-0",
    "ip": "10.0.0.12",
    "env": "dev",
    "user": "testuser",
    "instance": "testinst",
    "cpu_number": 4,
    "memory_in_bytes": 2147483648,
    "gpu_number": 0
  },
  "version": "1.2.0"
}
JSON
)

status=$(curl -sS -o "$TMP_ROOT/unknown_id_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$unknown_payload")
if [[ "$status" != "404" ]]; then
  echo "[ERROR] Unknown ID should return 404; got $status" >&2
  cat "$TMP_ROOT/unknown_id_response.json" >&2
  exit 1
fi

echo "[INFO] Unknown ID returning 404 verified"

# Mismatched id and name => 500; the node keeps its original name
mismatch_payload=$(cat <<JSON
{
  "id": "$NODE_ID",
  "name": "dev-testuser-testinst-pod-0-mismatch",
  "type": "agent",
  "meta_data": {
    "hostname": "dev-testuser-testinst-pod-0-mismatch",
    "ip": "10.0.0.13",
    "env": "dev",
    "user": "testuser",
    "instance": "testinst",
    "cpu_number": 4,
    "memory_in_bytes": 2147483648,
    "gpu_number": 0
  },
  "version": "1.2.0"
}
JSON
)

status=$(curl -sS -o "$TMP_ROOT/mismatch_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X POST "$API_BASE/nodes" -d "$mismatch_payload")
if [[ "$status" != "500" ]]; then
  echo "[ERROR] Name mismatch should return 500; got $status" >&2
  cat "$TMP_ROOT/mismatch_response.json" >&2
  exit 1
fi

# Verify the node name is still intact
curl -sS "$API_BASE/nodes/$NODE_ID" -o "$TMP_ROOT/post_mismatch_detail.json"
python3 - "$TMP_ROOT/post_mismatch_detail.json" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert node["name"] == "dev-testuser-testinst-pod-0", node["name"]
PY

echo "[INFO] Name mismatch returned 500 and the original node was not modified"
@ -1,98 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:31300/api/v1/master"

node_id="$(cat "$TMP_ROOT/node_id")"

payload=$(python3 - <<'PY'
import json
from datetime import datetime, timezone
body = {
    "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"),
    "health": {
        "log-fluentbit": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"},
        "metric-node-exporter": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"}
    }
}
print(json.dumps(body))
PY
)

response=$(curl -sS -w '\n%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$node_id/status" -d "$payload")
body="$(echo "$response" | head -n -1)"
status="$(echo "$response" | tail -n1)"

if [[ "$status" != "200" ]]; then
  echo "[ERROR] Status update failed with code $status" >&2
  echo "$body" >&2
  exit 1
fi

echo "$body" > "$TMP_ROOT/last_response.json"

sleep 3

detail_file="$TMP_ROOT/status_detail.json"
curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file"
python3 - "$detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert node["status"] == "online", f"Expected online, got {node['status']}"
assert "log-fluentbit" in node["health"], node["health"].keys()
PY

echo "[INFO] Status report successful and node is online"

# Wait past the offline threshold and verify the node flips to offline automatically
sleep 7

offline_detail_file="$TMP_ROOT/status_offline.json"
curl -sS "$API_BASE/nodes/$node_id" -o "$offline_detail_file"
python3 - "$offline_detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert node["status"] == "offline", f"Expected offline, got {node['status']}"
PY

echo "[INFO] Node transitioned to offline as expected"

# Report health again to bring the status back to online
payload=$(python3 - <<'PY'
import json
from datetime import datetime, timezone
body = {
    "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"),
    "health": {
        "log-fluentbit": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"},
        "metric-node-exporter": {"status": "healthy", "timestamp": "2023-10-05T12:05:00Z"}
    }
}
print(json.dumps(body))
PY
)

curl -sS -o "$TMP_ROOT/second_status_response.json" -w '%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$node_id/status" -d "$payload" > "$TMP_ROOT/second_status_code"
if [[ $(cat "$TMP_ROOT/second_status_code") != "200" ]]; then
  echo "[ERROR] Second status update failed" >&2
  cat "$TMP_ROOT/second_status_response.json" >&2
  exit 1
fi

sleep 3

final_detail_file="$TMP_ROOT/status_back_online.json"
curl -sS "$API_BASE/nodes/$node_id" -o "$final_detail_file"
python3 - "$final_detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert node["status"] == "online", f"Expected online after second report, got {node['status']}"
PY

echo "[INFO] Node transitioned back to online after new status report"
@ -1,56 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
PRIVATE_ROOT="$TEST_ROOT/private"
API_BASE="http://localhost:31300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"

payload='{"config":{"log_level":"debug"},"label":["gpu","exp001"]}'

response=$(curl -sS -w '\n%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$NODE_ID/config" -d "$payload")
body="$(echo "$response" | head -n -1)"
status="$(echo "$response" | tail -n1)"

if [[ "$status" != "200" ]]; then
  echo "[ERROR] Config update failed: $status" >&2
  echo "$body" >&2
  exit 1
fi

sleep 2

nodes_json_path="$PRIVATE_ROOT/argus/metric/prometheus/nodes.json"
if [[ ! -f "$nodes_json_path" ]]; then
  echo "[ERROR] nodes.json not generated at $nodes_json_path" >&2
  exit 1
fi

# Make sure the node is online so the earlier waits do not leave nodes.json empty
curl -sS "$API_BASE/nodes/$NODE_ID" -o "$TMP_ROOT/config_detail.json"
if ! python3 - "$TMP_ROOT/config_detail.json" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
if node["status"] != "online":
    raise SystemExit(1)
PY
then
  payload='{"timestamp":"2025-09-24T00:00:00Z","health":{"log-fluentbit":{"status":"healthy"}}}'
  curl -sS -o "$TMP_ROOT/config_second_report.json" -w '%{http_code}' -H 'Content-Type: application/json' -X PUT "$API_BASE/nodes/$NODE_ID/status" -d "$payload" > "$TMP_ROOT/config_second_code"
  sleep 2
fi

python3 - "$nodes_json_path" <<'PY'
import json, sys
from pathlib import Path
path = Path(sys.argv[1])
content = json.loads(path.read_text())
assert isinstance(content, list) and len(content) == 1
entry = content[0]
assert entry["labels"] == ["gpu", "exp001"], entry
PY

echo "[INFO] Config updated and nodes.json verified"
@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PRIVATE_ROOT="$TEST_ROOT/private"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:31300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"

sleep 7

detail_file="$TMP_ROOT/offline_detail.json"
curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"
python3 - "$detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert node["status"] == "offline", f"Expected offline, got {node['status']}"
PY

stats_file="$TMP_ROOT/stats.json"
curl -sS "$API_BASE/nodes/statistics" -o "$stats_file"
python3 - "$stats_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    stats = json.load(handle)
assert stats["total"] == 1
found = {item["status"]: item["count"] for item in stats["status_statistics"]}
assert found.get("offline") == 1
PY

nodes_json_path="$PRIVATE_ROOT/argus/metric/prometheus/nodes.json"
python3 - "$nodes_json_path" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    content = json.load(handle)
assert content == [], content
PY

echo "[INFO] Offline transition and statistics validated"
Some files were not shown because too many files have changed in this diff.