Compare commits
26 commits: main...dev_1.0.0_
| Author | SHA1 | Date |
|---|---|---|
| | 9b9fade833 | |
| | 824eddde67 | |
| | d036da2d5e | |
| | 8e01264e3f | |
| | 26c39604d5 | |
| | bd082866d8 | |
| | cc0f9e5fed | |
| | 23f0f4fca4 | |
| | 2c799f2c1e | |
| | 1d4208ed3c | |
| | a1cdd05950 | |
| | 0b1ccbd87f | |
| | 1d9a8ec695 | |
| | b0d451cbe7 | |
| | 835e81282f | |
| | c4582c99bc | |
| | 299765ed40 | |
| | a8bbf2d6e9 | |
| | d1b89c0cf6 | |
| | 1a768bc837 | |
| | 31ccb0b1b8 | |
| | 8fbe107ac9 | |
| | c098f1d3ce | |
| | 1e5e91b193 | |
| | 8a38d3d0b2 | |
| | 26e1c964ed | |
.gitattributes (vendored, new file, 1 line)
@@ -0,0 +1 @@
src/metric/client-plugins/all-in-one-full/plugins/*/bin/* filter=lfs diff=lfs merge=lfs -text
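This attribute routes the plugin binaries through Git LFS; it is the kind of entry written by a track command such as:

```bash
git lfs track "src/metric/client-plugins/all-in-one-full/plugins/*/bin/*"
```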
.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
.idea/
@@ -5,3 +5,10 @@
Project documentation: GPU Cluster Operations System (Tencent Docs)
https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk

## Build Account Configuration

The UID/GID of the account used to build and run the images can be configured in `configs/build_user.conf`; see `doc/build-user-config.md` for details.

## Local Port Conflict Note

To run the BIND module end-to-end test while port 53 is already taken on the host, set the environment variable `HOST_DNS_PORT` (default 1053) to choose the externally mapped port, e.g. `HOST_DNS_PORT=12053 ./scripts/00_e2e_test.sh`.
build/build_images.sh (new executable file, 409 lines)
@@ -0,0 +1,409 @@
#!/usr/bin/env bash
set -euo pipefail

show_help() {
  cat <<EOF
ARGUS Unified Build System - Image Build Tool

Usage: $0 [OPTIONS]

Options:
  --intranet          Use intranet mirror for log/bind builds
  --master-offline    Build master offline image (requires src/master/offline_wheels.tar.gz)
  --metric            Build metric module images (ftp, prometheus, grafana, test nodes)
  --no-cache          Build all images without using Docker layer cache
  --only LIST         Comma-separated targets to build: core,master,metric,web,alert,sys,all
  -h, --help          Show this help message

Examples:
  $0                    # Build with default sources
  $0 --intranet         # Build with intranet mirror
  $0 --master-offline   # Additionally build argus-master:offline
  $0 --metric           # Additionally build metric module images
  $0 --intranet --master-offline --metric

EOF
}

use_intranet=false
build_core=true
build_master=true
build_master_offline=false
build_metric=true
build_web=true
build_alert=true
build_sys=true
no_cache=false

while [[ $# -gt 0 ]]; do
  case $1 in
    --intranet)
      use_intranet=true
      shift
      ;;
    --master)
      build_master=true
      shift
      ;;
    --master-offline)
      build_master=true
      build_master_offline=true
      shift
      ;;
    --metric)
      build_metric=true
      shift
      ;;
    --no-cache)
      no_cache=true
      shift
      ;;
    --only)
      if [[ -z ${2:-} ]]; then
        echo "--only requires a target list" >&2; exit 1
      fi
      sel="$2"; shift 2
      # reset all, then enable selected
      build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false
      IFS=',' read -ra parts <<< "$sel"
      for p in "${parts[@]}"; do
        case "$p" in
          core) build_core=true ;;
          master) build_master=true ;;
          metric) build_metric=true ;;
          web) build_web=true ;;
          alert) build_alert=true ;;
          sys) build_sys=true ;;
          all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
          *) echo "Unknown --only target: $p" >&2; exit 1 ;;
        esac
      done
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      show_help
      exit 1
      ;;
  esac
done

root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
. "$root/scripts/common/build_user.sh"

declare -a build_args=()

if [[ "$use_intranet" == true ]]; then
  build_args+=("--build-arg" "USE_INTRANET=true")
fi

cd "$root"

load_build_user
build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")

if [[ "$no_cache" == true ]]; then
  build_args+=("--no-cache")
fi

master_root="$root/src/master"
master_offline_tar="$master_root/offline_wheels.tar.gz"
master_offline_dir="$master_root/offline_wheels"

if [[ "$build_master_offline" == true ]]; then
  if [[ ! -f "$master_offline_tar" ]]; then
    echo "❌ offline wheels tar not found: $master_offline_tar" >&2
    echo "   Prepare offline_wheels.tar.gz before running with --master-offline" >&2
    exit 1
  fi
  echo "📦 Preparing offline wheels for master (extracting $master_offline_tar)"
  rm -rf "$master_offline_dir"
  mkdir -p "$master_offline_dir"
  tar -xzf "$master_offline_tar" -C "$master_root"
  has_wheel=$(find "$master_offline_dir" -maxdepth 1 -type f -name '*.whl' -print -quit)
  if [[ -z "$has_wheel" ]]; then
    echo "❌ offline_wheels extraction failed or produced no wheels: $master_offline_dir" >&2
    exit 1
  fi
fi

echo "======================================="
echo "ARGUS Unified Build System"
echo "======================================="

if [[ "$use_intranet" == true ]]; then
  echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)"
else
  echo "🌐 Mode: Public (Using default package sources)"
fi

echo "👤 Build user UID:GID -> ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"

echo "📁 Build context: $root"
echo ""

build_image() {
  local image_name=$1
  local dockerfile_path=$2
  local tag=$3
  local context="."
  shift 3

  if [[ $# -gt 0 ]]; then
    context=$1
    shift
  fi

  local extra_args=("$@")

  echo "🔄 Building $image_name image..."
  echo "   Dockerfile: $dockerfile_path"
  echo "   Tag: $tag"
  echo "   Context: $context"

  if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then
    echo "✅ $image_name image built successfully"
    return 0
  else
    echo "❌ Failed to build $image_name image"
    return 1
  fi
}

pull_base_image() {
  local image_ref=$1
  local attempts=${2:-3}
  local delay=${3:-5}

  # If the image already exists locally, skip pulling.
  if docker image inspect "$image_ref" >/dev/null 2>&1; then
    echo "   Local image present; skip pull: $image_ref"
    return 0
  fi

  for ((i=1; i<=attempts; i++)); do
    echo "   Pulling base image ($i/$attempts): $image_ref"
    if docker pull "$image_ref" >/dev/null; then
      echo "   Base image ready: $image_ref"
      return 0
    fi
    echo "   Pull failed: $image_ref"
    if (( i < attempts )); then
      echo "   Retrying in ${delay}s..."
      sleep "$delay"
    fi
  done

  echo "❌ Unable to pull base image after ${attempts} attempts: $image_ref"
  return 1
}

images_built=()
build_failed=false

if [[ "$build_core" == true ]]; then
  if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
    images_built+=("argus-elasticsearch:latest")
  else
    build_failed=true
  fi

  echo ""

  if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
    images_built+=("argus-kibana:latest")
  else
    build_failed=true
  fi

  echo ""

  if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
    images_built+=("argus-bind9:latest")
  else
    build_failed=true
  fi
fi

echo ""

if [[ "$build_master" == true ]]; then
  echo ""
  echo "🔄 Building Master image..."
  pushd "$master_root" >/dev/null
  master_args=("--tag" "argus-master:latest")
  if [[ "$use_intranet" == true ]]; then
    master_args+=("--intranet")
  fi
  if [[ "$build_master_offline" == true ]]; then
    master_args+=("--offline")
  fi
  if [[ "$no_cache" == true ]]; then
    master_args+=("--no-cache")
  fi
  if ./scripts/build_images.sh "${master_args[@]}"; then
    if [[ "$build_master_offline" == true ]]; then
      images_built+=("argus-master:offline")
    else
      images_built+=("argus-master:latest")
    fi
  else
    build_failed=true
  fi
  popd >/dev/null
fi

if [[ "$build_metric" == true ]]; then
  echo ""
  echo "Building Metric module images..."

  metric_base_images=(
    "ubuntu:22.04"
    "ubuntu/prometheus:3-24.04_stable"
    "grafana/grafana:11.1.0"
  )

  for base_image in "${metric_base_images[@]}"; do
    if ! pull_base_image "$base_image"; then
      build_failed=true
    fi
  done

  metric_builds=(
    "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:latest|src/metric/ftp/build"
    "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:latest|src/metric/prometheus/build"
    "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:latest|src/metric/grafana/build"
  )

  for build_spec in "${metric_builds[@]}"; do
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      images_built+=("$image_tag")
    else
      build_failed=true
    fi
    echo ""
  done
fi

# =======================================
# Sys (system tests) node images
# =======================================

if [[ "$build_sys" == true ]]; then
  echo ""
  echo "Building Sys node images..."

  sys_base_images=(
    "ubuntu:22.04"
    "nvidia/cuda:12.2.2-runtime-ubuntu22.04"
  )

  for base_image in "${sys_base_images[@]}"; do
    if ! pull_base_image "$base_image"; then
      build_failed=true
    fi
  done

  sys_builds=(
    "Sys Node|src/sys/build/node/Dockerfile|argus-sys-node:latest|."
    "Sys Metric Test Node|src/sys/build/test-node/Dockerfile|argus-sys-metric-test-node:latest|."
    "Sys Metric Test GPU Node|src/sys/build/test-gpu-node/Dockerfile|argus-sys-metric-test-gpu-node:latest|."
  )

  for build_spec in "${sys_builds[@]}"; do
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      images_built+=("$image_tag")
    else
      build_failed=true
    fi
    echo ""
  done
fi

# =======================================
# Web & Alert module images
# =======================================

if [[ "$build_web" == true || "$build_alert" == true ]]; then
  echo ""
  echo "Building Web and Alert module images..."

  # Pre-pull commonly used base images for stability
  web_alert_base_images=(
    "node:20"
    "ubuntu:24.04"
  )

  for base_image in "${web_alert_base_images[@]}"; do
    if ! pull_base_image "$base_image"; then
      build_failed=true
    fi
  done

  if [[ "$build_web" == true ]]; then
    web_builds=(
      "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|."
      "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|."
    )
    for build_spec in "${web_builds[@]}"; do
      IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
      if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
        images_built+=("$image_tag")
      else
        build_failed=true
      fi
      echo ""
    done
  fi

  if [[ "$build_alert" == true ]]; then
    alert_builds=(
      "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|."
    )
    for build_spec in "${alert_builds[@]}"; do
      IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
      if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
        images_built+=("$image_tag")
      else
        build_failed=true
      fi
      echo ""
    done
  fi
fi

echo "======================================="
echo "📦 Build Summary"
echo "======================================="

if [[ ${#images_built[@]} -gt 0 ]]; then
  echo "✅ Successfully built images:"
  for image in "${images_built[@]}"; do
    echo "   • $image"
  done
fi

if [[ "$build_failed" == true ]]; then
  echo ""
  echo "❌ Some images failed to build. Please check the errors above."
  exit 1
fi

if [[ "$use_intranet" == true ]]; then
  echo ""
  echo "🌐 Built with intranet mirror configuration"
fi

if [[ "$build_master_offline" == true ]]; then
  echo ""
  echo "🧳 Master offline wheels extracted to $master_offline_dir"
fi
echo ""
echo "🚀 Next steps:"
echo "   ./build/save_images.sh --compress   # export images"
echo "   cd src/master/tests && MASTER_IMAGE_TAG=argus-master:offline ./scripts/00_e2e_test.sh"
echo ""
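Based on the `--only` parsing above, targets can be combined with the other flags in a single invocation; an illustrative selective rebuild (not taken from the repository docs):

```bash
# Rebuild only the metric and web images, skipping the Docker layer cache
./build/build_images.sh --only metric,web --no-cache
```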
build/save_images.sh (new executable file, 229 lines)
@@ -0,0 +1,229 @@
#!/usr/bin/env bash
set -euo pipefail

# Help message
show_help() {
  cat << EOF
ARGUS Unified Build System - Image Export Tool

Usage: $0 [OPTIONS]

Options:
  --compress    Compress exported images with gzip
  -h, --help    Show this help message

Examples:
  $0               # Export all images without compression
  $0 --compress    # Export all images with gzip compression

EOF
}

# Parse command-line arguments
use_compression=false

while [[ $# -gt 0 ]]; do
  case $1 in
    --compress)
      use_compression=true
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      show_help
      exit 1
      ;;
  esac
done

# Resolve the project root
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$root"

# Create the image output directory
images_dir="$root/images"
mkdir -p "$images_dir"

echo "======================================="
echo "ARGUS Unified Build System - Image Export"
echo "======================================="
echo ""

if [[ "$use_compression" == true ]]; then
  echo "🗜️  Mode: With gzip compression"
else
  echo "📦 Mode: No compression"
fi

echo "📁 Output directory: $images_dir"
echo ""

# Image list: maps image tag to output file name
declare -A images=(
  ["argus-elasticsearch:latest"]="argus-elasticsearch-latest.tar"
  ["argus-kibana:latest"]="argus-kibana-latest.tar"
  ["argus-bind9:latest"]="argus-bind9-latest.tar"
  ["argus-master:offline"]="argus-master-offline.tar"
  ["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar"
  ["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar"
  ["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar"
  ["argus-web-frontend:latest"]="argus-web-frontend-latest.tar"
  ["argus-web-proxy:latest"]="argus-web-proxy-latest.tar"
  ["argus-alertmanager:latest"]="argus-alertmanager-latest.tar"
)

# Check whether an image exists locally
check_image() {
  local image_name="$1"
  if docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^$image_name$"; then
    echo "✅ Image found: $image_name"
    return 0
  else
    echo "❌ Image not found: $image_name"
    return 1
  fi
}

# Show image info
show_image_info() {
  local image_name="$1"
  echo "📋 Image info for $image_name:"
  docker images "$image_name" --format "   Size: {{.Size}}, Created: {{.CreatedSince}}, ID: {{.ID}}"
}

# Save an image to the output directory
save_image() {
  local image_name="$1"
  local output_file="$2"
  local output_path="$images_dir/$output_file"

  echo "🔄 Saving $image_name to $output_file..."

  # Remove a stale image file if present
  if [[ -f "$output_path" ]]; then
    echo "   Removing existing file: $output_file"
    rm "$output_path"
  fi

  if [[ "$use_compression" == true && -f "$output_path.gz" ]]; then
    echo "   Removing existing compressed file: $output_file.gz"
    rm "$output_path.gz"
  fi

  # Save the image
  docker save "$image_name" -o "$output_path"

  if [[ "$use_compression" == true ]]; then
    echo "   Compressing with gzip..."
    gzip "$output_path"
    output_path="$output_path.gz"
    output_file="$output_file.gz"
  fi

  # Report the file size
  local file_size=$(du -h "$output_path" | cut -f1)
  echo "✅ Saved successfully: $output_file ($file_size)"
}

echo "🔍 Checking for ARGUS images..."
echo ""

# Check all images
available_images=()
missing_images=()

for image_name in "${!images[@]}"; do
  if check_image "$image_name"; then
    show_image_info "$image_name"
    available_images+=("$image_name")
  else
    missing_images+=("$image_name")
  fi
  echo ""
done

# If no images exist, prompt the user to build first
if [[ ${#available_images[@]} -eq 0 ]]; then
  echo "❌ No ARGUS images found to export."
  echo ""
  echo "🔧 Please build the images first with:"
  echo "   ./build/build_images.sh"
  exit 1
fi

# List missing images
if [[ ${#missing_images[@]} -gt 0 ]]; then
  echo "⚠️  Missing images (will be skipped):"
  for image_name in "${missing_images[@]}"; do
    echo "   • $image_name"
  done
  echo ""
fi

echo "💾 Starting image export process..."
echo ""

# Save every available image
exported_files=()
for image_name in "${available_images[@]}"; do
  output_file="${images[$image_name]}"
  save_image "$image_name" "$output_file"

  if [[ "$use_compression" == true ]]; then
    exported_files+=("$output_file.gz")
  else
    exported_files+=("$output_file")
  fi
  echo ""
done

echo "======================================="
echo "📦 Export Summary"
echo "======================================="

# List exported files
echo "📁 Exported files in $images_dir:"
total_size=0
for file in "${exported_files[@]}"; do
  full_path="$images_dir/$file"
  if [[ -f "$full_path" ]]; then
    size=$(du -h "$full_path" | cut -f1)
    size_bytes=$(du -b "$full_path" | cut -f1)
    total_size=$((total_size + size_bytes))
    echo "   ✅ $file ($size)"
  fi
done

# Show the total size
if [[ $total_size -gt 0 ]]; then
  total_size_human=$(numfmt --to=iec --suffix=B $total_size)
  echo ""
  echo "📊 Total size: $total_size_human"
fi

echo ""
echo "🚀 Usage instructions:"
echo "   To load these images on another system:"

if [[ "$use_compression" == true ]]; then
  for file in "${exported_files[@]}"; do
    if [[ -f "$images_dir/$file" ]]; then
      base_name="${file%.gz}"
      echo "   gunzip $file && docker load -i $base_name"
    fi
  done
else
  for file in "${exported_files[@]}"; do
    if [[ -f "$images_dir/$file" ]]; then
      echo "   docker load -i $file"
    fi
  done
fi

echo ""
echo "✅ Image export completed successfully!"
echo ""
configs/.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
# Local overrides for build user/group settings
build_user.local.conf
configs/build_user.conf (new file, 6 lines)
@@ -0,0 +1,6 @@
# Default build-time UID/GID for Argus images
# Override by creating configs/build_user.local.conf with the same format.
# Syntax: KEY=VALUE, supports UID/GID only. Whitespace and lines starting with # are ignored.

UID=2133
GID=2015
doc/build-user-config.md (new file, 38 lines)
@@ -0,0 +1,38 @@
# Argus Image Build UID/GID Configuration

A single configuration file assigns the runtime account for the Kibana, Elasticsearch, Bind, Master, and other containers, avoiding the permission problems caused by mismatched UID/GID values when deploying across machines.

## Configuration Entry Points

- The defaults live in `configs/build_user.conf`, for example:

```bash
UID=2133
GID=2015
```

- For a local override, create `build_user.local.conf` under `configs/` with the same fields as the default file. It is listed in `.gitignore`, so it cannot be committed by accident.
- You can also force values through the environment variables `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID` before running a script; these take the highest precedence.
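A minimal sketch of the three precedence levels in action (the UID/GID values below are examples, not project defaults):

```bash
# 1. Defaults from configs/build_user.conf
./build/build_images.sh

# 2. Untracked local override file
printf 'UID=4001\nGID=4001\n' > configs/build_user.local.conf
./build/build_images.sh

# 3. Environment variables take precedence over both files
ARGUS_BUILD_UID=5000 ARGUS_BUILD_GID=5000 ./build/build_images.sh
```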
## Scope

- `build/build_images.sh` reads the configuration when building the log/bind/master images and passes `--build-arg ARGUS_BUILD_UID/GID`; the console prints the UID/GID in use.
- `src/master/scripts/build_images.sh` uses the same configuration, so building the master image on its own behaves consistently.
- Each image's Dockerfile adjusts the in-container account (e.g. `elasticsearch`, `kibana`, `bind`, `argus`) according to the passed UID/GID and exposes the effective values as environment variables at runtime.
- The Master startup script drops privileges to the configured account to run `gunicorn` after the DNS logic completes, so files written under `/private/argus/**` get the correct owner.
- The log module test script `01_bootstrap.sh` fixes the ownership of mounted directories according to the configuration, letting the end-to-end tests run under any user.
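Concretely, the build scripts end up invoking `docker build` along these lines (a sketch: the Dockerfile path is one of the real targets, and the flags mirror what `build/build_images.sh` assembles):

```bash
docker build \
  --build-arg ARGUS_BUILD_UID=2133 \
  --build-arg ARGUS_BUILD_GID=2015 \
  -f src/log/elasticsearch/build/Dockerfile \
  -t argus-elasticsearch:latest .
```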
## Usage Recommendations

1. After a fresh clone nothing needs to change; the default UID/GID stays backward compatible.
2. To use a new account in the target environment (for example `uid=4001,gid=4001`):
   - edit `configs/build_user.local.conf` with the new values;
   - log in as the new account and make sure it belongs to the host's `docker` group;
   - rerun `build/build_images.sh` or the relevant module build script.
3. After switching configuration, rerun the end-to-end scripts of the affected modules (such as `src/log/tests/scripts/01_bootstrap.sh`, `src/master/tests/scripts/00_e2e_test.sh`, `src/agent/tests/scripts/00_e2e_test.sh`) to verify that the files under `/private/argus` are owned by the expected account.

## Troubleshooting

- **Image build fails with `groupmod: GID already in use`**: the chosen GID already exists in the base image; pick an unused value, or remove the conflicting group in a custom base image first.
- **Write permission errors at container runtime**: check whether the mounted host directories were created by the target UID/GID; if necessary, rerun the module's preparation script, such as `01_bootstrap.sh`.
- **Old UID/GID still showing**: make sure the script run is not inheriting a stale cache; run `ARGUS_BUILD_UID=... ARGUS_BUILD_GID=... ./build/build_images.sh` to force an override.
scripts/common/build_user.sh (new file, 115 lines)
@@ -0,0 +1,115 @@
#!/usr/bin/env bash
set -euo pipefail

# Shared helper to load Argus build user/group configuration.
# Usage:
#   source "${PROJECT_ROOT}/scripts/common/build_user.sh"
#   load_build_user
#   echo "$ARGUS_BUILD_UID:$ARGUS_BUILD_GID"

ARGUS_BUILD_UID_DEFAULT=2133
ARGUS_BUILD_GID_DEFAULT=2015

shopt -s extglob

_ARGUS_BUILD_USER_LOADED="${_ARGUS_BUILD_USER_LOADED:-0}"

_argus_build_user_script_dir() {
  local dir
  dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  echo "$dir"
}

argus_project_root() {
  local script_dir
  script_dir="$(_argus_build_user_script_dir)"
  (cd "$script_dir/../.." >/dev/null && pwd)
}

_argus_trim() {
  local value="$1"
  value="${value##+([[:space:]])}"
  value="${value%%+([[:space:]])}"
  printf '%s' "$value"
}

_argus_is_number() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

load_build_user() {
  if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then
    return 0
  fi

  local project_root config_files config uid gid
  project_root="$(argus_project_root)"
  config_files=(
    "$project_root/configs/build_user.local.conf"
    "$project_root/configs/build_user.conf"
  )

  uid="$ARGUS_BUILD_UID_DEFAULT"
  gid="$ARGUS_BUILD_GID_DEFAULT"

  for config in "${config_files[@]}"; do
    if [[ -f "$config" ]]; then
      while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do
        local line key value
        line="${raw_line%%#*}"
        line="$(_argus_trim "${line}")"
        [[ -z "$line" ]] && continue
        if [[ "$line" != *=* ]]; then
          echo "[ARGUS build_user] Ignoring malformed line in $config: $raw_line" >&2
          continue
        fi
        key="${line%%=*}"
        value="${line#*=}"
        key="$(_argus_trim "$key")"
        value="$(_argus_trim "$value")"
        case "$key" in
          UID)
            uid="$value"
            ;;
          GID)
            gid="$value"
            ;;
          *)
            echo "[ARGUS build_user] Unknown key '$key' in $config" >&2
            ;;
        esac
      done < "$config"
      break
    fi
  done

  if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then
    uid="$ARGUS_BUILD_UID"
  fi
  if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then
    gid="$ARGUS_BUILD_GID"
  fi

  if ! _argus_is_number "$uid"; then
    echo "[ARGUS build_user] Invalid UID '$uid'" >&2
    return 1
  fi
  if ! _argus_is_number "$gid"; then
    echo "[ARGUS build_user] Invalid GID '$gid'" >&2
    return 1
  fi

  export ARGUS_BUILD_UID="$uid"
  export ARGUS_BUILD_GID="$gid"
  _ARGUS_BUILD_USER_LOADED=1
}

argus_build_user_args() {
  load_build_user
  printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}"
}

print_build_user() {
  load_build_user
  echo "ARGUS build user: UID=${ARGUS_BUILD_UID} GID=${ARGUS_BUILD_GID}"
}
src/.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@

__pycache__/
src/agent/.gitignore (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
build/
*.egg-info/
__pycache__/

.env
src/agent/README.md (new file, 78 lines)
@@ -0,0 +1,78 @@
# Argus Agent Module

The Argus agent is a lightweight Python process that registers the node with the Argus master, reports health data, and maintains locally persisted state. The module is packaged with PyInstaller into a standalone executable, so it runs directly in an ordinary container or VM.

## Building the Executable

```bash
cd src/agent
./scripts/build_binary.sh  # produces dist/argus-agent
```

By default the script runs PyInstaller inside a Docker container (`python:3.11-slim-bullseye`) so the artifact stays runtime-compatible with glibc 2.31+ (covering 2.35 environments). Build notes:

- Every build wipes `build/` and `dist/` and recreates the virtual environment inside the container.
- To use an internal Python mirror, pass environment variables such as `PIP_INDEX_URL`, `PIP_EXTRA_INDEX_URL`, and `PIP_TRUSTED_HOST`; the script forwards them into the container.
- If the host cannot run Docker, set `AGENT_BUILD_USE_DOCKER=0` to fall back to a local build; in that case the build must run on a machine with glibc ≤ 2.35.

After the build, the script unpacks the key shared libraries under `build/compat_check/` and prints the highest required `GLIBC_x.y` version for a quick compatibility check. If `libssl.so.3` / `libcrypto.so.3` are missing from the result, the binary will use the target host's own OpenSSL libraries, and nothing further is needed.

For example:

```bash
strings build/compat_check/libpython*.so.1.0 | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n1
```

If the build fails, the usual causes are Docker being unavailable (switch to `AGENT_BUILD_USE_DOCKER=0`) or an unreachable Python package mirror (set the mirror variables above and retry).
## Runtime Configuration

The agent no longer depends on a configuration file; every parameter is derived from environment variables and the hostname:

| Variable | Required | Default | Description |
| --- | --- | --- | --- |
| `MASTER_ENDPOINT` | Yes | N/A | Master base address; accepts `http://host:3000` or `host:3000` (the `http://` prefix is added automatically). |
| `REPORT_INTERVAL_SECONDS` | No | `60` | Status report interval in seconds; must be a positive integer. |
| `AGENT_HOSTNAME` | No | `$(hostname)` | Overrides the in-container hostname, for tests or special naming needs. |
| `AGENT_ENV` | No | derived from hostname | Environment label (e.g. `dev`, `prod`); must be set together with `AGENT_USER` and `AGENT_INSTANCE`. |
| `AGENT_USER` | No | derived from hostname | Owning user or team label; must be set together with `AGENT_ENV` and `AGENT_INSTANCE`. |
| `AGENT_INSTANCE` | No | derived from hostname | Instance number or alias; must be set together with `AGENT_ENV` and `AGENT_USER`. |

Hostname and metadata resolution precedence (a launch sketch follows the list):

1. If `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` are all set, use them directly.
2. Otherwise check the historical `node.json` (the data returned by the master after a successful registration); if it contains `env` / `user` / `instance`, reuse those values.
3. Otherwise fall back to the historical convention of parsing an `env-user-instance` prefix from the hostname.
4. If no complete result can be obtained, the agent fails to start and asks for the environment variables above.
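A minimal launch sketch under these rules (the endpoint and labels are illustrative values):

```bash
MASTER_ENDPOINT=master.example.internal:3000 \
AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=001 \
REPORT_INTERVAL_SECONDS=30 \
./dist/argus-agent
```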
> Tip: on first deployment, make sure the environment variables or the hostname supply the complete metadata. After registration the agent writes the metadata returned by the master into `node.json`, so later restarts stay consistent without the environment variables.

Derived paths:

- Node state: `/private/argus/agent/<hostname>/node.json`
- Submodule health directory: `/private/argus/agent/<hostname>/health/`

Files in the health directory must follow the `<module prefix>-*.json` naming convention (for example `log-fluentbit.json`, `metric-node-exporter.json`); their contents are merged verbatim into the reported `health` field.
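For instance, a submodule could publish its health like this (hostname and payload are illustrative; the agent only requires valid JSON and the prefixed file name):

```bash
health_dir=/private/argus/agent/dev-alice-001-node/health
mkdir -p "$health_dir"
cat > "$health_dir/log-fluentbit.json" <<'EOF'
{"status": "healthy", "checked_at": "2024-01-01T00:00:00Z"}
EOF
```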
## Logging and Persistence

- The agent emits structured logs at the key points (successful registration, status reports, retries on error) for easy aggregation and analysis.
- `node.json` stores the latest node object returned by the master, so the existing node ID is reused after a restart.

## End-to-End Tests

The repository ships a Docker Compose test stack (master + ubuntu containers):

```bash
cd src/agent/tests
./scripts/00_e2e_test.sh
```

The test script:

1. Builds the master image and the agent executable.
2. Starts the agent container from `ubuntu:24.04`, injecting `MASTER_ENDPOINT` and `REPORT_INTERVAL_SECONDS` through environment variables.
3. Verifies registration, health reporting, nodes.json generation, the statistics endpoint, and the "container restart + IP change" re-registration flow.
4. Cleans up `tests/private/` and the temporary container network.

For a real deployment, copy `dist/argus-agent` together with the health directory onto the target host and set the environment variables per the table above.
src/agent/app/__init__.py (new file, empty)
src/agent/app/client.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from __future__ import annotations

import json
from typing import Any, Dict, Optional

import requests

from .log import get_logger

LOGGER = get_logger("argus.agent.client")


class MasterAPIError(Exception):
    def __init__(self, message: str, status_code: int, payload: Optional[Dict[str, Any]] = None) -> None:
        super().__init__(message)
        self.status_code = status_code
        self.payload = payload or {}


class AgentClient:
    def __init__(self, base_url: str, *, timeout: int = 10) -> None:
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._session = requests.Session()

    def register_node(self, body: Dict[str, Any]) -> Dict[str, Any]:
        """Call the master registration endpoint and return the node object."""
        url = f"{self._base_url}/api/v1/master/nodes"
        response = self._session.post(url, json=body, timeout=self._timeout)
        return self._parse_response(response, "Failed to register node")

    def update_status(self, node_id: str, body: Dict[str, Any]) -> Dict[str, Any]:
        """Report health data; the master updates last_report."""
        url = f"{self._base_url}/api/v1/master/nodes/{node_id}/status"
        response = self._session.put(url, json=body, timeout=self._timeout)
        return self._parse_response(response, "Failed to update node status")

    def _parse_response(self, response: requests.Response, error_prefix: str) -> Dict[str, Any]:
        content_type = response.headers.get("Content-Type", "")
        payload: Dict[str, Any] | None = None
        if "application/json" in content_type:
            try:
                payload = response.json()
            except json.JSONDecodeError:
                LOGGER.warning("Response contained invalid JSON", extra={"status": response.status_code})

        if response.status_code >= 400:
            message = payload.get("error") if isinstance(payload, dict) else response.text
            raise MasterAPIError(
                f"{error_prefix}: {message}",
                status_code=response.status_code,
                payload=payload if isinstance(payload, dict) else None,
            )

        if payload is None:
            try:
                payload = response.json()
            except json.JSONDecodeError as exc:
                raise MasterAPIError("Master returned non-JSON payload", response.status_code) from exc
        return payload
src/agent/app/collector.py (new file, 110 lines)
@@ -0,0 +1,110 @@
from __future__ import annotations

import os
import re
import socket
import subprocess
from pathlib import Path
from typing import Any, Dict

from .config import AgentConfig
from .log import get_logger

LOGGER = get_logger("argus.agent.collector")

_HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")


def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
    """Gather the static information needed for node registration."""
    hostname = config.hostname
    meta = {
        "hostname": hostname,
        "ip": _detect_ip_address(),
        "env": config.environment,
        "user": config.user,
        "instance": config.instance,
        "cpu_number": _detect_cpu_count(),
        "memory_in_bytes": _detect_memory_bytes(),
        "gpu_number": _detect_gpu_count(),
    }
    return meta


def _parse_hostname(hostname: str) -> tuple[str, str, str]:
    """Split the hostname by the agreed env-user-instance prefix."""
    match = _HOSTNAME_PATTERN.match(hostname)
    if not match:
        LOGGER.warning("Hostname does not match expected pattern", extra={"hostname": hostname})
        return "", "", ""
    return match.group(1), match.group(2), match.group(3)


def _detect_cpu_count() -> int:
    count = os.cpu_count()
    return count if count is not None else 0


def _detect_memory_bytes() -> int:
    """Prefer the cgroup limit; fall back to /proc/meminfo on failure."""
    cgroup_path = Path("/sys/fs/cgroup/memory.max")
    try:
        raw = cgroup_path.read_text(encoding="utf-8").strip()
        if raw and raw != "max":
            return int(raw)
    except FileNotFoundError:
        LOGGER.debug("cgroup memory.max not found, falling back to /proc/meminfo")
    except ValueError:
        LOGGER.warning("Failed to parse memory.max, falling back", extra={"value": raw})

    try:
        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("MemTotal:"):
                    parts = line.split()
                    if len(parts) >= 2:
                        return int(parts[1]) * 1024
    except FileNotFoundError:
        LOGGER.error("/proc/meminfo not found; defaulting memory to 0")
    return 0


def _detect_gpu_count() -> int:
    """Count GPUs; default to 0 when detection is not possible."""
    try:
        proc = subprocess.run(
            ["nvidia-smi", "-L"],
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=5,
        )
    except FileNotFoundError:
        LOGGER.debug("nvidia-smi not available; assuming 0 GPUs")
        return 0
    except subprocess.SubprocessError as exc:
        LOGGER.warning("nvidia-smi invocation failed", extra={"error": str(exc)})
        return 0

    if proc.returncode != 0:
        LOGGER.debug("nvidia-smi returned non-zero", extra={"stderr": proc.stderr.strip()})
        return 0

    count = sum(1 for line in proc.stdout.splitlines() if line.strip())
    return count


def _detect_ip_address() -> str:
    """Try to learn the container's egress IP via a UDP socket; fall back to hostname resolution."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.connect(("8.8.8.8", 80))
            return sock.getsockname()[0]
    except OSError:
        LOGGER.debug("UDP socket trick failed; falling back to hostname lookup")
    try:
        return socket.gethostbyname(socket.gethostname())
    except OSError:
        LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
        return "127.0.0.1"
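The `env-user-instance` prefix convention that `_parse_hostname` implements can be illustrated in shell (the hostname is hypothetical):

```bash
hostname="dev-alice-001-pod0"
IFS='-' read -r env user instance _ <<< "$hostname"
echo "env=$env user=$user instance=$instance"   # env=dev user=alice instance=001
```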
src/agent/app/config.py (new file, 141 lines)
@@ -0,0 +1,141 @@
from __future__ import annotations

import os
import socket
from dataclasses import dataclass
from pathlib import Path
from typing import Final

from .state import load_node_state
from .version import VERSION
from .log import get_logger

DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60

LOGGER = get_logger("argus.agent.config")


@dataclass(frozen=True)
class AgentConfig:
    hostname: str
    environment: str
    user: str
    instance: str
    node_file: str
    version: str
    master_endpoint: str
    report_interval_seconds: int
    health_dir: str
    request_timeout_seconds: int = 10


def _normalise_master_endpoint(value: str) -> str:
    value = value.strip()
    if not value:
        raise ValueError("MASTER_ENDPOINT environment variable is required")
    if not value.startswith("http://") and not value.startswith("https://"):
        value = f"http://{value}"
    return value.rstrip("/")


def _read_report_interval(raw_value: str | None) -> int:
    if raw_value is None or raw_value.strip() == "":
        return DEFAULT_REPORT_INTERVAL_SECONDS
    try:
        interval = int(raw_value)
    except ValueError as exc:
        raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
    if interval <= 0:
        raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
    return interval


def _resolve_hostname() -> str:
    return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()


def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None:
    state = load_node_state(node_file)
    if not state:
        return None

    meta = state.get("meta_data") or {}
    env = meta.get("env") or state.get("env")
    user = meta.get("user") or state.get("user")
    instance = meta.get("instance") or state.get("instance")

    if env and user and instance:
        LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file})
        return env, user, instance

    LOGGER.warning(
        "node.json missing metadata fields; ignoring",
        extra={"node_file": node_file, "meta_data": meta},
    )
    return None


def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]:
    env = os.environ.get("AGENT_ENV")
    user = os.environ.get("AGENT_USER")
    instance = os.environ.get("AGENT_INSTANCE")

    if env and user and instance:
        return env, user, instance

    if any([env, user, instance]):
        LOGGER.warning(
            "Incomplete metadata environment variables; falling back to persisted metadata",
            extra={
                "has_env": bool(env),
                "has_user": bool(user),
                "has_instance": bool(instance),
            },
        )

    state_metadata = _load_metadata_from_state(node_file)
    if state_metadata is not None:
        return state_metadata

    from .collector import _parse_hostname  # Local import to avoid circular dependency

    env, user, instance = _parse_hostname(hostname)

    if not all([env, user, instance]):
        raise ValueError(
            "Failed to determine metadata fields; set AGENT_ENV/USER/INSTANCE or use supported hostname pattern"
        )

    return env, user, instance


def load_config() -> AgentConfig:
    """Derive the configuration from environment variables; the external config file dependency is gone."""

    hostname = _resolve_hostname()
    node_file = f"/private/argus/agent/{hostname}/node.json"
    environment, user, instance = _resolve_metadata_fields(hostname, node_file)

    health_dir = f"/private/argus/agent/{hostname}/health/"

    master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
    if master_endpoint_env is None:
        raise ValueError("MASTER_ENDPOINT environment variable is not set")
    master_endpoint = _normalise_master_endpoint(master_endpoint_env)

    report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))

    Path(node_file).parent.mkdir(parents=True, exist_ok=True)
    Path(health_dir).mkdir(parents=True, exist_ok=True)

    return AgentConfig(
        hostname=hostname,
        environment=environment,
        user=user,
        instance=instance,
        node_file=node_file,
        version=VERSION,
        master_endpoint=master_endpoint,
        report_interval_seconds=report_interval_seconds,
        health_dir=health_dir,
    )
src/agent/app/health_reader.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict

from .log import get_logger

LOGGER = get_logger("argus.agent.health")


def read_health_directory(path: str) -> Dict[str, Any]:
    """Read every <prefix>-*.json file in the directory and return a JSON mapping."""
    result: Dict[str, Any] = {}
    directory = Path(path)
    if not directory.exists():
        LOGGER.debug("Health directory does not exist", extra={"path": str(directory)})
        return result

    for health_file in sorted(directory.glob("*.json")):
        if "-" not in health_file.stem:
            LOGGER.debug("Skipping non-prefixed health file", extra={"file": health_file.name})
            continue
        try:
            with health_file.open("r", encoding="utf-8") as handle:
                content = json.load(handle)
            result[health_file.stem] = content
        except json.JSONDecodeError as exc:
            LOGGER.warning("Failed to parse health file", extra={"file": health_file.name, "error": str(exc)})
        except OSError as exc:
            LOGGER.warning("Failed to read health file", extra={"file": health_file.name, "error": str(exc)})
    return result
src/agent/app/log.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from __future__ import annotations

import logging
import os

_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s - %(message)s"


def setup_logging() -> None:
    level_name = os.environ.get("AGENT_LOG_LEVEL", "INFO").upper()
    level = getattr(logging, level_name, logging.INFO)
    logging.basicConfig(level=level, format=_LOG_FORMAT)


def get_logger(name: str) -> logging.Logger:
    setup_logging()
    return logging.getLogger(name)
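`AGENT_LOG_LEVEL` selects the verbosity, with unknown values falling back to `INFO`; for example (endpoint illustrative):

```bash
AGENT_LOG_LEVEL=DEBUG MASTER_ENDPOINT=master.example.internal:3000 ./dist/argus-agent
```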
src/agent/app/main.py (new file, 163 lines)
@@ -0,0 +1,163 @@
from __future__ import annotations

import signal
import sys  # needed for the __main__ exit below
import time
from datetime import datetime, timezone
from typing import Optional

from .client import AgentClient, MasterAPIError
from .collector import collect_metadata
from .config import AgentConfig, load_config
from .health_reader import read_health_directory
from .log import get_logger, setup_logging
from .state import clear_node_state, load_node_state, save_node_state

LOGGER = get_logger("argus.agent")


def _current_timestamp() -> str:
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


class StopSignal:
    def __init__(self) -> None:
        self._stop = False

    def set(self, *_args) -> None:  # type: ignore[override]
        self._stop = True

    def is_set(self) -> bool:
        return self._stop


def main(argv: Optional[list[str]] = None) -> int:  # noqa: ARG001 - keep the signature for entry-point compatibility
    setup_logging()

    stop_signal = StopSignal()
    signal.signal(signal.SIGTERM, stop_signal.set)
    signal.signal(signal.SIGINT, stop_signal.set)

    try:
        config = load_config()
    except Exception as exc:
        LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
        return 1

    LOGGER.info(
        "Agent starting",
        extra={
            "hostname": config.hostname,
            "master_endpoint": config.master_endpoint,
            "node_file": config.node_file,
        },
    )

    client = AgentClient(config.master_endpoint, timeout=config.request_timeout_seconds)

    node_state = load_node_state(config.node_file) or {}
    node_id = node_state.get("id")

    # Establish the registration with the master (re-registration supported); retry on failure
    register_response = _register_with_retry(client, config, node_id, stop_signal)
    if register_response is None:
        LOGGER.info("Registration aborted due to shutdown signal")
        return 0

    node_id = register_response.get("id")
    if not node_id:
        LOGGER.error("Master did not return node id; aborting")
        return 1
    save_node_state(config.node_file, register_response)

    LOGGER.info("Entering status report loop", extra={"node_id": node_id})
    _status_loop(client, config, node_id, stop_signal)
    return 0


def _register_with_retry(
    client: AgentClient,
    config: AgentConfig,
    node_id: Optional[str],
    stop_signal: StopSignal,
):
    backoff = 5
    while not stop_signal.is_set():
        payload = {
            "name": config.hostname,
            "type": "agent",
            "meta_data": collect_metadata(config),
            "version": config.version,
        }
        if node_id:
            payload["id"] = node_id

        try:
            response = client.register_node(payload)
            LOGGER.info("Registration successful", extra={"node_id": response.get("id")})
            save_node_state(config.node_file, response)
            return response
        except MasterAPIError as exc:
            if exc.status_code == 404 and node_id:
                LOGGER.warning(
                    "Master does not recognise node id; clearing local node state",
                    extra={"node_id": node_id},
                )
                clear_node_state(config.node_file)
                node_id = None
            elif exc.status_code == 500 and node_id:
                # An id/name mismatch usually indicates a configuration problem; log it but keep retrying
                LOGGER.error(
                    "Master rejected node due to id/name mismatch; will retry",
                    extra={"node_id": node_id},
                )
            else:
                LOGGER.error("Registration failed", extra={"status_code": exc.status_code, "error": str(exc)})
            time.sleep(min(backoff, 60))
            backoff = min(backoff * 2, 60)
        except Exception as exc:  # pragma: no cover - defensive
            LOGGER.exception("Unexpected error during registration", extra={"error": str(exc)})
            time.sleep(min(backoff, 60))
            backoff = min(backoff * 2, 60)
    return None


def _status_loop(
    client: AgentClient,
    config: AgentConfig,
    node_id: str,
    stop_signal: StopSignal,
) -> None:
    interval = config.report_interval_seconds
    while not stop_signal.is_set():
        timestamp = _current_timestamp()
        health_payload = read_health_directory(config.health_dir)
        body = {
            "timestamp": timestamp,
            "health": health_payload,
        }
        try:
            response = client.update_status(node_id, body)
            LOGGER.info(
                "Status report succeeded",
                extra={"node_id": node_id, "health_keys": list(health_payload.keys())},
            )
            save_node_state(config.node_file, response)
        except MasterAPIError as exc:
            # Keep the loop running and wait for the next retry
            LOGGER.error(
                "Failed to report status",
                extra={"status_code": exc.status_code, "error": str(exc)},
            )
        except Exception as exc:  # pragma: no cover - defensive
            LOGGER.exception("Unexpected error during status report", extra={"error": str(exc)})

        for _ in range(interval):
            if stop_signal.is_set():
                break
            time.sleep(1)

    LOGGER.info("Stop signal received; exiting status loop")


if __name__ == "__main__":
    sys.exit(main())
src/agent/app/state.py (new file, 44 lines)
@@ -0,0 +1,44 @@
from __future__ import annotations

import json
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional

from .log import get_logger

LOGGER = get_logger("argus.agent.state")


def load_node_state(path: str) -> Optional[Dict[str, Any]]:
    """Read the local node.json so the previous ID survives container restarts."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except FileNotFoundError:
        return None
    except json.JSONDecodeError as exc:
        LOGGER.warning("node.json is invalid JSON; ignoring", extra={"error": str(exc)})
        return None


def save_node_state(path: str, data: Dict[str, Any]) -> None:
    """Write node.json atomically so concurrent readers never see partial data."""
    directory = Path(path).parent
    directory.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile("w", dir=directory, delete=False, encoding="utf-8") as tmp:
        json.dump(data, tmp, separators=(",", ":"))
        tmp.flush()
        os.fsync(tmp.fileno())
        temp_path = tmp.name
    os.replace(temp_path, path)


def clear_node_state(path: str) -> None:
    try:
        os.remove(path)
    except FileNotFoundError:
        return
    except OSError as exc:
        LOGGER.warning("Failed to remove node state file", extra={"error": str(exc), "path": path})
69 src/agent/app/version.py Normal file
@@ -0,0 +1,69 @@
from __future__ import annotations

import os
import sys
from pathlib import Path
from typing import Optional

import importlib.metadata

try:
    import tomllib
except ModuleNotFoundError:  # pragma: no cover
    import tomli as tomllib  # type: ignore[no-redef]


def _candidate_paths() -> list[Path]:
    paths = []
    bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None)
    if bundle_dir:
        paths.append(Path(bundle_dir) / "pyproject.toml")
    paths.append(Path(__file__).resolve().parent.parent / "pyproject.toml")
    paths.append(Path(__file__).resolve().parent / "pyproject.toml")
    paths.append(Path.cwd() / "pyproject.toml")
    return paths


def _read_from_pyproject() -> Optional[str]:
    for path in _candidate_paths():
        if not path.exists():
            continue
        try:
            with path.open("rb") as handle:
                data = tomllib.load(handle)
        except (OSError, tomllib.TOMLDecodeError):
            continue
        project = data.get("project")
        if isinstance(project, dict):
            version = project.get("version")
            if isinstance(version, str):
                return version
        tool = data.get("tool")
        if isinstance(tool, dict):
            argus_cfg = tool.get("argus")
            if isinstance(argus_cfg, dict):
                version = argus_cfg.get("version")
                if isinstance(version, str):
                    return version
    return None


def _detect_version() -> str:
    try:
        return importlib.metadata.version("argus-agent")
    except importlib.metadata.PackageNotFoundError:
        pass
    override = os.environ.get("AGENT_VERSION_OVERRIDE")
    if override:
        return override
    fallback = _read_from_pyproject()
    if fallback:
        return fallback
    return "0.0.0"


VERSION: str = _detect_version()


def get_version() -> str:
    return VERSION
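The resolution order in `_detect_version()` is: installed package metadata, then the `AGENT_VERSION_OVERRIDE` environment variable, then any bundled `pyproject.toml`, then `"0.0.0"`. A sketch of exercising the override path, assuming `src/agent` is on `sys.path` and the `argus-agent` distribution is not installed:

```python
import os

# Must be set before the module is imported: VERSION is computed at import time.
os.environ["AGENT_VERSION_OVERRIDE"] = "9.9.9-test"

from app.version import get_version

# Prints "9.9.9-test" only when importlib.metadata cannot find "argus-agent";
# an installed wheel always wins over the override.
print(get_version())
```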
BIN src/agent/dist/argus-agent vendored Executable file
Binary file not shown.
10 src/agent/entry.py Normal file
@@ -0,0 +1,10 @@
#!/usr/bin/env python3
from __future__ import annotations

import sys

from app.main import main as agent_main


if __name__ == "__main__":
    sys.exit(agent_main())
19 src/agent/pyproject.toml Normal file
@@ -0,0 +1,19 @@
[project]
name = "argus-agent"
version = "1.1.0"
description = "Argus agent binary"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "requests==2.31.0"
]

[build-system]
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"

[tool.argus]
entry = "app.main:main"

[tool.setuptools]
packages = ["app"]
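The diff does not show who consumes the `[tool.argus]` table; a plausible resolver for its `entry` string (hypothetical, mirroring the common `module:function` convention) could look like:

```python
import importlib
import tomllib  # Python 3.11+, matching requires-python above

with open("src/agent/pyproject.toml", "rb") as fh:
    entry = tomllib.load(fh)["tool"]["argus"]["entry"]  # "app.main:main"

module_name, func_name = entry.split(":", 1)
main = getattr(importlib.import_module(module_name), func_name)
```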
690 src/agent/scripts/agent_deployment_verify.sh Executable file
@@ -0,0 +1,690 @@
#!/usr/bin/env bash
set -euo pipefail

LOG_PREFIX="[AGENT-VERIFY]"
MASTER_ENDPOINT_DEFAULT=""
AGENT_DATA_ROOT_DEFAULT="/private/argus/agent"
AGENT_ETC_ROOT_DEFAULT="/private/argus/etc"
REPORT_INTERVAL_DEFAULT="2"

ALLOW_CONFIG_TOUCH="false"
KEEP_TEST_HEALTH="false"

log_info() {
  echo "${LOG_PREFIX} INFO $*"
}

log_warn() {
  echo "${LOG_PREFIX} WARN $*" >&2
}

log_error() {
  echo "${LOG_PREFIX} ERROR $*" >&2
}

usage() {
  cat <<'USAGE'
Usage: agent_deployment_verify.sh [options]

Options:
  --allow-config-touch   Enable optional config PUT dry-run check.
  --keep-test-health     Keep the temporary verify health file after checks.
  -h, --help             Show this help message.

Environment variables:
  MASTER_ENDPOINT          (required) Master API base endpoint, e.g. http://master:3000
  AGENT_DATA_ROOT          (default: /private/argus/agent)
  AGENT_ETC_ROOT           (default: /private/argus/etc)
  VERIFY_HOSTNAME          (default: output of hostname)
  REPORT_INTERVAL_SECONDS  (default: 2) Agent report interval in seconds
USAGE
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --allow-config-touch)
      ALLOW_CONFIG_TOUCH="true"
      shift
      ;;
    --keep-test-health)
      KEEP_TEST_HEALTH="true"
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      log_error "Unknown option: $1"
      usage >&2
      exit 2
      ;;
  esac
done

MASTER_ENDPOINT="${MASTER_ENDPOINT:-$MASTER_ENDPOINT_DEFAULT}"
AGENT_DATA_ROOT="${AGENT_DATA_ROOT:-$AGENT_DATA_ROOT_DEFAULT}"
AGENT_ETC_ROOT="${AGENT_ETC_ROOT:-$AGENT_ETC_ROOT_DEFAULT}"
VERIFY_HOSTNAME="${VERIFY_HOSTNAME:-$(hostname)}"
REPORT_INTERVAL_SECONDS="${REPORT_INTERVAL_SECONDS:-$REPORT_INTERVAL_DEFAULT}"

if [[ -z "$MASTER_ENDPOINT" ]]; then
  log_error "MASTER_ENDPOINT is required"
  exit 2
fi

if ! [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] || [[ "$REPORT_INTERVAL_SECONDS" -le 0 ]]; then
  log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
  REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
fi

normalize_endpoint() {
  local endpoint="$1"
  if [[ "$endpoint" != http://* && "$endpoint" != https://* ]]; then
    endpoint="http://$endpoint"
  fi
  endpoint="${endpoint%/}"
  echo "$endpoint"
}

MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"

NODE_DIR="$AGENT_DATA_ROOT/$VERIFY_HOSTNAME"
NODE_JSON="$NODE_DIR/node.json"
HEALTH_DIR="$NODE_DIR/health"
DNS_CONF="$AGENT_ETC_ROOT/dns.conf"
UPDATE_SCRIPT="$AGENT_ETC_ROOT/update-dns.sh"

declare -a RESULTS_PASS=()
declare -a RESULTS_WARN=()
declare -a RESULTS_FAIL=()

add_result() {
  local level="$1" message="$2"
  case "$level" in
    PASS)
      RESULTS_PASS+=("$message")
      log_info "$message"
      ;;
    WARN)
      RESULTS_WARN+=("$message")
      log_warn "$message"
      ;;
    FAIL)
      RESULTS_FAIL+=("$message")
      log_error "$message"
      ;;
  esac
}

HAS_JQ="0"
if command -v jq >/dev/null 2>&1; then
  HAS_JQ="1"
fi

if ! command -v curl >/dev/null 2>&1; then
  log_error "curl command not found; please install curl (e.g. apt-get install -y curl)"
  exit 2
fi

if [[ "$HAS_JQ" == "0" ]] && ! command -v python3 >/dev/null 2>&1; then
  log_error "Neither jq nor python3 is available for JSON processing"
  exit 2
fi

CURL_OPTS=(--fail --show-error --silent --max-time 10)

curl_json() {
  local url="$1"
  if ! curl "${CURL_OPTS[@]}" "$url"; then
    return 1
  fi
}
json_query() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    if ! output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null); then
      return 1
    fi
    printf '%s' "$output"
    return 0
  fi

  # Pass the document as an argument: with `python3 -` the heredoc is consumed
  # as the script itself, so stdin is already exhausted inside the program.
  python3 - "$py_expr" "$json" <<'PY'
import json
import sys

expr = sys.argv[1]
try:
    data = json.loads(sys.argv[2])
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
if value is None:
    sys.exit(1)
if isinstance(value, (dict, list)):
    print(json.dumps(value))
else:
    print(value)
PY
}

json_length() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    if ! output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null); then
      return 1
    fi
    printf '%s' "$output"
    return 0
  fi

  python3 - "$py_expr" "$json" <<'PY'
import json
import sys

expr = sys.argv[1]
try:
    data = json.loads(sys.argv[2])
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
try:
    print(len(value))
except Exception:
    sys.exit(1)
PY
}

json_has_key() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    if printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1; then
      return 0
    fi
    return 1
  fi

  python3 - "$py_expr" "$json" <<'PY'
import json
import sys

expr = sys.argv[1]
try:
    data = json.loads(sys.argv[2])
    value = eval(expr, {}, {"data": data})
except Exception:
    sys.exit(1)
if value:
    sys.exit(0)
sys.exit(1)
PY
}
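Each helper takes a jq expression and a Python expression for the same query; the Python string is evaluated with the parsed document bound to `data`, which is why later call sites pass things like `data["id"]`. A standalone sketch of that convention:

```python
import json

doc = json.loads('{"id": "node-0001", "meta_data": {"ip": "172.28.0.20"}}')

# Same shape as the py_expr arguments used throughout the script.
for expr in ('data["id"]', 'data.get("meta_data", {}).get("ip")'):
    print(eval(expr, {}, {"data": doc}))  # eval is acceptable here: the expressions are script-controlled
```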
iso_to_epoch() {
  local value="$1"
  if command -v date >/dev/null 2>&1; then
    date -d "$value" +%s 2>/dev/null && return 0
  fi
  if command -v python3 >/dev/null 2>&1; then
    python3 - "$value" <<'PY'
import sys
from datetime import datetime

value = sys.argv[1]
if value is None or value == "":
    sys.exit(1)
if value.endswith('Z'):
    value = value[:-1] + '+00:00'
try:
    dt = datetime.fromisoformat(value)
except ValueError:
    sys.exit(1)
print(int(dt.timestamp()))
PY
    return $?
  fi
  return 1
}

validate_json_file() {
  local path="$1"
  if [[ "$HAS_JQ" == "1" ]]; then
    jq empty "$path" >/dev/null 2>&1 && return 0
    return 1
  fi
  if command -v python3 >/dev/null 2>&1; then
    python3 - "$path" <<'PY'
import json
import sys

path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as handle:
    json.load(handle)
PY
    return $?
  fi
  return 0
}

ensure_directory() {
  local dir="$1"
  if [[ ! -d "$dir" ]]; then
    log_warn "Creating missing directory $dir"
    mkdir -p "$dir"
  fi
}

TEST_HEALTH_FILE=""
TEST_HEALTH_BACKUP=""
TEST_HEALTH_EXISTED="false"

cleanup() {
  if [[ -n "$TEST_HEALTH_FILE" ]]; then
    if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
      printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
    elif [[ "$KEEP_TEST_HEALTH" == "true" ]]; then
      :
    else
      rm -f "$TEST_HEALTH_FILE"
    fi
  fi
}

trap cleanup EXIT

log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"

# 4.2 Master health checks
health_resp=""
if ! health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>/tmp/agent_verify_healthz.err); then
  error_detail=$(cat /tmp/agent_verify_healthz.err || true)
  add_result FAIL "GET /healthz failed: $error_detail"
else
  http_meta=$(tail -n1 <<<"$health_resp")
  payload=$(head -n -1 <<<"$health_resp" || true)
  status_code=${http_meta%% *}
  elapsed=${http_meta##* }
  add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload"
fi
rm -f /tmp/agent_verify_healthz.err

if ! readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>/tmp/agent_verify_readyz.err); then
  error_detail=$(cat /tmp/agent_verify_readyz.err || true)
  add_result FAIL "GET /readyz failed: $error_detail"
  readyz_payload=""
else
  readyz_meta=$(tail -n1 <<<"$readyz_resp")
  readyz_payload=$(head -n -1 <<<"$readyz_resp" || true)
  readyz_status=${readyz_meta%% *}
  readyz_elapsed=${readyz_meta##* }
  add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s"
fi
rm -f /tmp/agent_verify_readyz.err
# 4.3 Nodes list and detail
if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>/tmp/agent_verify_nodes.err); then
  error_detail=$(cat /tmp/agent_verify_nodes.err || true)
  add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
  nodes_json=""
fi
rm -f /tmp/agent_verify_nodes.err

NODE_ENTRY=""
NODE_ID=""
NODE_IP=""
if [[ -n "$nodes_json" ]]; then
  if [[ "$HAS_JQ" == "1" ]]; then
    NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY=""
  else
    # The nodes payload is passed as argv for the same stdin reason as json_query.
    NODE_ENTRY=$(python3 - "$VERIFY_HOSTNAME" "$nodes_json" <<'PY'
import json
import sys

hostname = sys.argv[1]
nodes = json.loads(sys.argv[2])
for node in nodes:
    if node.get("name") == hostname:
        print(json.dumps(node))
        sys.exit(0)
sys.exit(1)
PY
    ) || NODE_ENTRY=""
  fi

  if [[ -z "$NODE_ENTRY" ]]; then
    add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
  else
    if NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
      add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
    else
      add_result FAIL "Failed to extract node id from master response"
    fi
  fi

  if [[ -n "$NODE_ENTRY" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_node_detail.err); then
    NODE_DETAIL_JSON="$NODE_DETAIL"
    add_result PASS "Fetched node detail for $NODE_ID"
    if NODE_IP=$(json_query "$NODE_DETAIL_JSON" '.meta_data.ip // .meta_data.host_ip // empty' 'data.get("meta_data", {}).get("ip") or data.get("meta_data", {}).get("host_ip") or ""'); then
      if [[ -n "$NODE_IP" ]]; then
        add_result PASS "Registered node IP=$NODE_IP"
      else
        # add_result only knows PASS/WARN/FAIL, so report the absence as WARN.
        add_result WARN "Node detail does not expose IP fields"
      fi
    fi
  else
    error_detail=$(cat /tmp/agent_verify_node_detail.err 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
    NODE_DETAIL_JSON=""
  fi
  rm -f /tmp/agent_verify_node_detail.err

  if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>/tmp/agent_verify_stats.err); then
    if total_nodes=$(json_query "$stats_json" '.total // .total_nodes' 'data.get("total") or data.get("total_nodes")'); then
      if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
        add_result PASS "Statistics total=$total_nodes"
      else
        add_result WARN "Statistics total field not numeric: $total_nodes"
      fi
    else
      add_result WARN "Unable to read total field from statistics"
    fi

    active_nodes=""
    if [[ "$HAS_JQ" == "1" ]]; then
      active_nodes=$(printf '%s' "$stats_json" | jq -e 'if .status_statistics then (.status_statistics[] | select(.status == "online") | .count) else empty end' 2>/dev/null | head -n1 || true)
    elif command -v python3 >/dev/null 2>&1; then
      active_nodes=$(printf '%s' "$stats_json" | python3 -c 'import json,sys; data=json.load(sys.stdin); print(next((row.get("count") for row in data.get("status_statistics", []) if row.get("status")=="online"), ""))' 2>/dev/null)
    fi
    if [[ -n "$active_nodes" ]]; then
      add_result PASS "Online nodes reported by master: $active_nodes"
    fi

    if [[ "$HAS_JQ" == "1" ]]; then
      node_count=$(printf '%s' "$nodes_json" | jq 'length')
    else
      node_count=$(json_length "$nodes_json" 'length' 'len(data)')
    fi
    if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -lt "$node_count" ]]; then
      add_result WARN "Statistics total=$total_nodes less than nodes list count=$node_count"
    fi
  else
    error_detail=$(cat /tmp/agent_verify_stats.err 2>/dev/null || true)
    add_result FAIL "Failed to fetch node statistics: $error_detail"
  fi
  rm -f /tmp/agent_verify_stats.err
else
  NODE_DETAIL_JSON=""
fi
# 4.4 Agent persistence checks
if [[ -f "$NODE_JSON" ]]; then
  node_file_content="$(cat "$NODE_JSON")"
  if node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
    if [[ "$NODE_ID" != "" && "$node_id_local" == "$NODE_ID" ]]; then
      add_result PASS "node.json id matches master ($NODE_ID)"
    else
      add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
    fi
  else
    add_result FAIL "Unable to extract id from node.json"
  fi
  if node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
    if [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
      add_result PASS "node.json name matches $VERIFY_HOSTNAME"
    else
      add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
    fi
  else
    add_result FAIL "Unable to extract name from node.json"
  fi

  if register_time=$(json_query "$node_file_content" '.register_time' 'data.get("register_time")'); then
    if iso_to_epoch "$register_time" >/dev/null 2>&1; then
      add_result PASS "node.json register_time valid ISO timestamp"
    else
      add_result WARN "node.json register_time invalid: $register_time"
    fi
  else
    add_result WARN "node.json missing register_time"
  fi

  if last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
    if iso_to_epoch "$last_updated" >/dev/null 2>&1; then
      add_result PASS "node.json last_updated valid ISO timestamp"
    else
      add_result WARN "node.json last_updated invalid: $last_updated"
    fi
  else
    add_result WARN "node.json missing last_updated"
  fi
else
  add_result FAIL "node.json not found at $NODE_JSON"
  node_file_content=""
fi

ensure_directory "$HEALTH_DIR"

if [[ -d "$HEALTH_DIR" ]]; then
  shopt -s nullglob
  health_files=("$HEALTH_DIR"/*.json)
  shopt -u nullglob
  if [[ ${#health_files[@]} -eq 0 ]]; then
    add_result WARN "Health directory $HEALTH_DIR is empty"
  else
    for hf in "${health_files[@]}"; do
      base=$(basename "$hf")
      if [[ "$base" != *-* ]]; then
        add_result WARN "Health file $base does not follow <module>-*.json"
        continue
      fi
      if ! validate_json_file "$hf" >/dev/null 2>&1; then
        add_result WARN "Health file $base is not valid JSON"
      fi
    done
  fi
else
  add_result WARN "Health directory $HEALTH_DIR missing"
fi

if getent hosts master.argus.com >/dev/null 2>&1; then
  resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs)
  add_result PASS "master.argus.com resolves to $resolved_ips"
else
  add_result FAIL "Failed to resolve master.argus.com"
fi

# 4.5 Master-Node status consistency
sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))

if [[ -n "$NODE_DETAIL_JSON" ]]; then
  detail_pre="$NODE_DETAIL_JSON"
else
  detail_pre=""
fi

if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
  if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_pre.err); then
    add_result PASS "Fetched node detail pre-check"
  else
    error_detail=$(cat /tmp/agent_verify_detail_pre.err 2>/dev/null || true)
    add_result FAIL "Unable to fetch node detail for status check: $error_detail"
  fi
  rm -f /tmp/agent_verify_detail_pre.err
fi

server_ts_pre=""
agent_ts_pre=""
server_ts_post=""
agent_ts_post=""

if [[ -n "$detail_pre" ]]; then
  server_ts_pre=$(json_query "$detail_pre" '.last_report' 'data.get("last_report")' || echo "")
  agent_ts_pre=$(json_query "$detail_pre" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
  log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"

  sleep "$sleep_interval"

  if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_detail_post.err); then
    server_ts_post=$(json_query "$detail_post" '.last_report' 'data.get("last_report")' || echo "")
    agent_ts_post=$(json_query "$detail_post" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
    if [[ "$server_ts_post" != "$server_ts_pre" ]]; then
      add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
    else
      add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
    fi
    if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
      add_result PASS "last_report.agent_timestamp advanced"
    else
      add_result FAIL "last_report.agent_timestamp did not change"
    fi

    if [[ -n "$node_file_content" ]]; then
      if node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")'); then
        if epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null); then
          if node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
            diff=$((epoch_post - node_epoch))
            [[ $diff -lt 0 ]] && diff=$((-diff))
            tolerance=$((REPORT_INTERVAL_SECONDS * 2))
            if [[ $diff -le $tolerance ]]; then
              add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
            else
              add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
            fi
          fi
        fi
      fi
    fi

    NODE_DETAIL_JSON="$detail_post"
  else
    error_detail=$(cat /tmp/agent_verify_detail_post.err 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail post-check: $error_detail"
  fi
  rm -f /tmp/agent_verify_detail_post.err
fi
# 4.6 Health simulation
TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json"
ensure_directory "$HEALTH_DIR"

if [[ -f "$TEST_HEALTH_FILE" ]]; then
  TEST_HEALTH_EXISTED="true"
  TEST_HEALTH_BACKUP="$(cat "$TEST_HEALTH_FILE")"
else
  TEST_HEALTH_EXISTED="false"
fi

create_health_file() {
  local message="$1"
  cat > "$TEST_HEALTH_FILE" <<HEALTHJSON
{"status":"ok","message":"$message"}
HEALTHJSON
}

validate_health_in_master() {
  local expected_message="$1"
  local detail_json="$2"
  local message
  if message=$(json_query "$detail_json" '.health["verify-master"].message' 'data.get("health", {}).get("verify-master", {}).get("message")'); then
    if [[ "$message" == "$expected_message" ]]; then
      return 0
    fi
  fi
  return 1
}

remove_health_from_master() {
  local detail_json="$1"
  if json_has_key "$detail_json" '(.health | has("verify-master"))' '"verify-master" in data.get("health", {})'; then
    return 1
  fi
  return 0
}

health_message_one="verify $(date +%s)"
create_health_file "$health_message_one"
add_result PASS "Created test health file $TEST_HEALTH_FILE"

sleep "$sleep_interval"
if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health1.err); then
  if validate_health_in_master "$health_message_one" "$detail_health_one"; then
    add_result PASS "Master reflects verify-master health message"
  else
    add_result FAIL "Master health payload does not match test message"
  fi
else
  error_detail=$(cat /tmp/agent_verify_health1.err 2>/dev/null || true)
  add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
  detail_health_one=""
fi
rm -f /tmp/agent_verify_health1.err

health_message_two="verify $(date +%s)-update"
create_health_file "$health_message_two"
sleep "$sleep_interval"
if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health2.err); then
  if validate_health_in_master "$health_message_two" "$detail_health_two"; then
    add_result PASS "Master health updated to new message"
  else
    add_result FAIL "Master health message did not update"
  fi
else
  error_detail=$(cat /tmp/agent_verify_health2.err 2>/dev/null || true)
  add_result FAIL "Failed to fetch node detail after health update: $error_detail"
  detail_health_two=""
fi
rm -f /tmp/agent_verify_health2.err

rm -f "$TEST_HEALTH_FILE"
sleep "$sleep_interval"
if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>/tmp/agent_verify_health3.err); then
  if remove_health_from_master "$detail_health_three"; then
    add_result PASS "Master health no longer lists verify-master after removal"
  else
    add_result FAIL "Master health still contains verify-master after file deletion"
  fi
else
  error_detail=$(cat /tmp/agent_verify_health3.err 2>/dev/null || true)
  add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
fi
rm -f /tmp/agent_verify_health3.err

if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
  printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
fi

# Optional config touch
if [[ "$ALLOW_CONFIG_TOUCH" == "true" ]]; then
  if [[ -n "$NODE_ID" ]]; then
    payload='{"label": {"verify": "true"}}'
    if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >/tmp/agent_verify_config.log 2>&1; then
      add_result PASS "Config PUT dry-run succeeded"
    else
      add_result WARN "Config PUT dry-run failed: $(cat /tmp/agent_verify_config.log)"
    fi
    rm -f /tmp/agent_verify_config.log
  fi
else
  add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
fi

# Result summary
echo
echo "==== Verification Summary ===="
for entry in "${RESULTS_PASS[@]}"; do
  printf 'PASS: %s\n' "$entry"
done
for entry in "${RESULTS_WARN[@]}"; do
  printf 'WARN: %s\n' "$entry"
done
for entry in "${RESULTS_FAIL[@]}"; do
  printf 'FAIL: %s\n' "$entry"
done

if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
  exit 1
fi

exit 0
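For readers who want the 4.6 round trip without the shell plumbing, here is a condensed Python sketch of the same idea; the endpoint, node id, and health path are assumptions taken from the test compose file, not values this sketch discovers:

```python
import json
import time
import urllib.request

MASTER_BASE = "http://master.argus.com:3000"                              # assumed endpoint
NODE_ID = "node-0001"                                                     # hypothetical node id
HEALTH_FILE = "/private/argus/agent/demo-host/health/verify-master.json"  # illustrative path
SLEEP = 2 + 2                                                             # report interval + slack

# Drop a health file, wait one report cycle, then expect the master to echo it back.
with open(HEALTH_FILE, "w", encoding="utf-8") as fh:
    json.dump({"status": "ok", "message": "verify 123"}, fh)

time.sleep(SLEEP)
with urllib.request.urlopen(f"{MASTER_BASE}/api/v1/master/nodes/{NODE_ID}") as resp:
    detail = json.load(resp)
assert detail["health"]["verify-master"]["message"] == "verify 123"
```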
276 src/agent/scripts/build_binary.sh Executable file
@@ -0,0 +1,276 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BUILD_ROOT="$MODULE_ROOT/build"
DIST_DIR="$MODULE_ROOT/dist"
PYINSTALLER_BUILD="$BUILD_ROOT/pyinstaller"
PYINSTALLER_SPEC="$PYINSTALLER_BUILD/spec"
PYINSTALLER_WORK="$PYINSTALLER_BUILD/work"
VENV_DIR="$BUILD_ROOT/venv"

AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}"
AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}"
# Ignore proxies inside the container by default, so an intranet proxy that is
# unreachable from the Docker network cannot break pip (set to 0 to disable).
AGENT_BUILD_IGNORE_PROXY="${AGENT_BUILD_IGNORE_PROXY:-1}"
USED_DOCKER=0

run_host_build() {
  echo "[INFO] Using host Python environment for build" >&2
  rm -rf "$BUILD_ROOT" "$DIST_DIR"
  mkdir -p "$PYINSTALLER_BUILD" "$DIST_DIR"
  python3 -m venv --copies "$VENV_DIR"
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"

  pip install --upgrade pip
  pip install .
  pip install "pyinstaller==6.6.0"

  pyinstaller \
    --clean \
    --onefile \
    --name argus-agent \
    --distpath "$DIST_DIR" \
    --workpath "$PYINSTALLER_WORK" \
    --specpath "$PYINSTALLER_SPEC" \
    --add-data "$MODULE_ROOT/pyproject.toml:." \
    "$MODULE_ROOT/entry.py"

  chmod +x "$DIST_DIR/argus-agent"
  deactivate
}
run_docker_build() {
  if ! command -v docker >/dev/null 2>&1; then
    echo "[ERROR] docker command not found; cannot build inside a container. Install Docker or set AGENT_BUILD_USE_DOCKER=0" >&2
    exit 1
  fi

  USED_DOCKER=1
  echo "[INFO] Building agent binary inside $AGENT_BUILD_IMAGE" >&2

  local host_uid host_gid
  host_uid="$(id -u)"
  host_gid="$(id -g)"
  docker_env=("--rm" "-v" "$MODULE_ROOT:/workspace" "-w" "/workspace" "--env" "TARGET_UID=${host_uid}" "--env" "TARGET_GID=${host_gid}")

  pass_env_if_set() {
    local var="$1"
    local value="${!var:-}"
    if [[ -n "$value" ]]; then
      docker_env+=("--env" "$var=$value")
    fi
  }

  pass_env_if_set PIP_INDEX_URL
  pass_env_if_set PIP_EXTRA_INDEX_URL
  pass_env_if_set PIP_TRUSTED_HOST
  pass_env_if_set HTTP_PROXY
  pass_env_if_set HTTPS_PROXY
  pass_env_if_set NO_PROXY
  pass_env_if_set http_proxy
  pass_env_if_set https_proxy
  pass_env_if_set no_proxy
  pass_env_if_set AGENT_BUILD_IGNORE_PROXY

  build_script=$(cat <<'INNER'
set -euo pipefail
cd /workspace
apt-get update >/dev/null
apt-get install -y --no-install-recommends binutils >/dev/null
rm -rf /var/lib/apt/lists/*
rm -rf build dist
mkdir -p build/pyinstaller dist
python3 -m venv --copies build/venv
source build/venv/bin/activate
# When asked to ignore proxies, clear the common proxy and pip mirror
# variables so an unreachable proxy inside the container cannot break pip.
if [ "${AGENT_BUILD_IGNORE_PROXY:-1}" = "1" ]; then
  unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY PIP_INDEX_URL PIP_EXTRA_INDEX_URL PIP_TRUSTED_HOST
fi
pip install --upgrade pip
pip install .
pip install pyinstaller==6.6.0
pyinstaller \
  --clean \
  --onefile \
  --name argus-agent \
  --distpath dist \
  --workpath build/pyinstaller/work \
  --specpath build/pyinstaller/spec \
  --add-data /workspace/pyproject.toml:. \
  entry.py
chmod +x dist/argus-agent

TARGET_UID="${TARGET_UID:-0}"
TARGET_GID="${TARGET_GID:-0}"
chown -R "$TARGET_UID:$TARGET_GID" dist build 2>/dev/null || true

python3 - <<'PY'
from pathlib import Path
from PyInstaller.archive.readers import CArchiveReader
import sys

archive = Path('dist/argus-agent')
out_dir = Path('build/compat_check')
out_dir.mkdir(parents=True, exist_ok=True)

major, minor = sys.version_info[:2]
libpython = f'libpython{major}.{minor}.so.1.0'
expected_libs = [
    libpython,
    'libssl.so.3',
    'libcrypto.so.3',
]
reader = CArchiveReader(str(archive))
extracted = []
missing = []
for name in expected_libs:
    try:
        data = reader.extract(name)
    except KeyError:
        missing.append(name)
        continue
    (out_dir / name).write_bytes(data)
    extracted.append(name)
(out_dir / 'manifest').write_text('\n'.join(extracted))
if extracted:
    print('[INFO] Extracted libraries: ' + ', '.join(extracted))
if missing:
    print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
PY

compat_check() {
  local lib_path="$1"
  if [[ ! -f "$lib_path" ]]; then
    echo "[WARN] Missing $lib_path for GLIBC check"
    return
  fi
  local max_glibc
  max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
  if [[ -n "$max_glibc" ]]; then
    echo "[INFO] $lib_path references up to $max_glibc"
  else
    echo "[INFO] $lib_path does not expose GLIBC version strings"
  fi
}

compat_libs=()
if [[ -f build/compat_check/manifest ]]; then
  mapfile -t compat_libs < build/compat_check/manifest
fi

if [[ ${#compat_libs[@]} -eq 0 ]]; then
  echo "[WARN] No libraries captured for GLIBC inspection"
else
  for lib in "${compat_libs[@]}"; do
    compat_check "build/compat_check/$lib"
  done
fi

deactivate
INNER
)

  if ! docker run "${docker_env[@]}" "$AGENT_BUILD_IMAGE" bash -lc "$build_script"; then
    echo "[ERROR] Docker build failed; check Docker permissions, or set AGENT_BUILD_USE_DOCKER=0 to build on a compatible host" >&2
    exit 1
  fi
}

if [[ "$AGENT_BUILD_USE_DOCKER" == "1" ]]; then
  run_docker_build
else
  run_host_build
fi

if [[ ! -f "$DIST_DIR/argus-agent" ]]; then
  echo "[ERROR] Agent binary was not produced" >&2
  exit 1
fi

if [[ "$USED_DOCKER" != "1" ]]; then
  if [[ ! -x "$VENV_DIR/bin/python" ]]; then
    echo "[WARN] PyInstaller virtualenv missing at $VENV_DIR; skipping compatibility check" >&2
  else
    COMPAT_DIR="$BUILD_ROOT/compat_check"
    rm -rf "$COMPAT_DIR"
    mkdir -p "$COMPAT_DIR"

    EXTRACT_SCRIPT=$(cat <<'PY'
from pathlib import Path
from PyInstaller.archive.readers import CArchiveReader
import sys

archive = Path('dist/argus-agent')
out_dir = Path('build/compat_check')
out_dir.mkdir(parents=True, exist_ok=True)

major, minor = sys.version_info[:2]
libpython = f'libpython{major}.{minor}.so.1.0'
expected_libs = [
    libpython,
    'libssl.so.3',
    'libcrypto.so.3',
]
reader = CArchiveReader(str(archive))
extracted = []
missing = []
for name in expected_libs:
    try:
        data = reader.extract(name)
    except KeyError:
        missing.append(name)
        continue
    (out_dir / name).write_bytes(data)
    extracted.append(name)
(out_dir / 'manifest').write_text('\n'.join(extracted))
if extracted:
    print('[INFO] Extracted libraries: ' + ', '.join(extracted))
if missing:
    print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
PY
)

    "$VENV_DIR/bin/python" - <<PY
$EXTRACT_SCRIPT
PY

    compat_libs=()
    if [[ -f "$COMPAT_DIR/manifest" ]]; then
      mapfile -t compat_libs < "$COMPAT_DIR/manifest"
    fi

    check_glibc_version() {
      local lib_path="$1"
      if [[ ! -f "$lib_path" ]]; then
        echo "[WARN] Skipping GLIBC check; file not found: $lib_path" >&2
        return
      fi
      if command -v strings >/dev/null 2>&1; then
        local max_glibc
        max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
        if [[ -n "$max_glibc" ]]; then
          echo "[INFO] $lib_path references up to $max_glibc"
        else
          echo "[INFO] $lib_path does not expose GLIBC version strings"
        fi
      else
        echo "[WARN] strings command unavailable; cannot inspect $lib_path" >&2
      fi
    }

    if [[ ${#compat_libs[@]} -eq 0 ]]; then
      echo "[WARN] No libraries captured for GLIBC inspection" >&2
    else
      for lib in "${compat_libs[@]}"; do
        check_glibc_version "$COMPAT_DIR/$lib"
      done
    fi
  fi
else
  echo "[INFO] Compatibility check executed inside container"
fi

echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent"
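The `strings | grep GLIBC_` probe above only needs binutils; a dependency-free Python equivalent of the same scan is sketched below (the library filename is an example taken from the manifest's shape, not a guaranteed artifact):

```python
import re
from pathlib import Path

lib = Path("build/compat_check/libpython3.11.so.1.0")  # example manifest entry
blob = lib.read_bytes()

# Collect every GLIBC_x.y symbol-version string embedded in the binary.
versions = {m.decode() for m in re.findall(rb"GLIBC_\d+\.\d+", blob)}

def key(v: str) -> tuple[int, ...]:
    # "GLIBC_2.34" -> (2, 34) so versions sort numerically, like `sort -V`.
    return tuple(int(p) for p in v.split("_")[1].split("."))

print(max(versions, key=key) if versions else "no GLIBC version strings")
```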
2 src/agent/tests/.gitignore vendored Normal file
@@ -0,0 +1,2 @@
private/
tmp/
0 src/agent/tests/__init__.py Normal file
99 src/agent/tests/docker-compose.yml Normal file
@@ -0,0 +1,99 @@
services:
  bind:
    image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
    container_name: argus-bind-agent-e2e
    volumes:
      - ./private:/private
    networks:
      default:
        ipv4_address: 172.28.0.2
    environment:
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    restart: always

  master:
    image: argus-master:latest
    container_name: argus-master-agent-e2e
    depends_on:
      - bind
    environment:
      - OFFLINE_THRESHOLD_SECONDS=6
      - ONLINE_THRESHOLD_SECONDS=2
      - SCHEDULER_INTERVAL_SECONDS=1
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    ports:
      - "32300:3000"
    volumes:
      - ./private/argus/master:/private/argus/master
      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
      - ./private/argus/etc:/private/argus/etc
    networks:
      default:
        ipv4_address: 172.28.0.10
    restart: always

  agent:
    image: ubuntu:22.04
    container_name: argus-agent-e2e
    hostname: dev-e2euser-e2einst-pod-0
    depends_on:
      - master
      - bind
    environment:
      - MASTER_ENDPOINT=http://master.argus.com:3000
      - REPORT_INTERVAL_SECONDS=2
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    volumes:
      - ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
      - ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
      - ./private/argus/etc:/private/argus/etc
      - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
      - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
      - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
    entrypoint:
      - /usr/local/bin/agent-entrypoint.sh
    networks:
      default:
        ipv4_address: 172.28.0.20
    restart: always

  agent_env:
    image: ubuntu:22.04
    container_name: argus-agent-env-e2e
    hostname: host_abc
    depends_on:
      - master
      - bind
    environment:
      - MASTER_ENDPOINT=http://master.argus.com:3000
      - REPORT_INTERVAL_SECONDS=2
      - AGENT_ENV=prod
      - AGENT_USER=ml
      - AGENT_INSTANCE=node-3
      - AGENT_HOSTNAME=host_abc
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    volumes:
      - ./private/argus/agent/host_abc:/private/argus/agent/host_abc
      - ./private/argus/agent/host_abc/health:/private/argus/agent/host_abc/health
      - ./private/argus/etc:/private/argus/etc
      - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
      - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
      - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
    entrypoint:
      - /usr/local/bin/agent-entrypoint.sh
    networks:
      default:
        ipv4_address: 172.28.0.21
    restart: always

networks:
  default:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 172.28.0.0/16
23 src/agent/tests/scripts/00_e2e_test.sh Executable file
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS=(
  "01_bootstrap.sh"
  "02_up.sh"
  "03_wait_and_assert_registration.sh"
  "04_write_health_files.sh"
  "05_verify_agent.sh"
  "06_assert_status_on_master.sh"
  "07_restart_agent_and_reregister.sh"
  "08_down.sh"
)

for script in "${SCRIPTS[@]}"; do
  echo "[TEST] Running $script"
  "$SCRIPT_DIR/$script"
  echo "[TEST] $script completed"
  echo
done

echo "[TEST] Agent module E2E tests completed"
63 src/agent/tests/scripts/01_bootstrap.sh Executable file
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
AGENT_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
MASTER_ROOT="$(cd "$AGENT_ROOT/../master" && pwd)"
REPO_ROOT="$(cd "$AGENT_ROOT/../.." && pwd)"
PRIVATE_ROOT="$TEST_ROOT/private"
TMP_ROOT="$TEST_ROOT/tmp"

AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
AGENT_CONFIG_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME"
AGENT_HEALTH_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME/health"
MASTER_PRIVATE_DIR="$PRIVATE_ROOT/argus/master"
METRIC_PRIVATE_DIR="$PRIVATE_ROOT/argus/metric/prometheus"
DNS_DIR="$PRIVATE_ROOT/argus/etc"
BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:latest}"
BIND_ROOT="$(cd "$MASTER_ROOT/../bind" && pwd)"

ensure_image() {
  local image="$1"
  if ! docker image inspect "$image" >/dev/null 2>&1; then
    echo "[ERROR] Docker image '$image' not found; run the unified build script first (e.g. ./build/build_images.sh) to produce the required images" >&2
    exit 1
  fi
}

mkdir -p "$AGENT_CONFIG_DIR"
mkdir -p "$AGENT_HEALTH_DIR"
mkdir -p "$MASTER_PRIVATE_DIR"
mkdir -p "$METRIC_PRIVATE_DIR"
mkdir -p "$TMP_ROOT"
mkdir -p "$DNS_DIR"

touch "$AGENT_HEALTH_DIR/.keep"

# Stage the update-dns.sh shipped by the bind module to mimic the production rollout.
if [[ -f "$BIND_ROOT/build/update-dns.sh" ]]; then
  cp "$BIND_ROOT/build/update-dns.sh" "$DNS_DIR/update-dns.sh"
  chmod +x "$DNS_DIR/update-dns.sh"
else
  echo "[WARN] bind update script missing at $BIND_ROOT/build/update-dns.sh"
fi

ensure_image "argus-master:latest"
ensure_image "$BIND_IMAGE_TAG"

AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"

pushd "$AGENT_ROOT" >/dev/null
./scripts/build_binary.sh
popd >/dev/null

if [[ ! -x "$AGENT_BINARY" ]]; then
  echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
  exit 1
fi

echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag"

echo "[INFO] Agent E2E bootstrap complete"
53 src/agent/tests/scripts/02_up.sh Executable file
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"

TMP_ROOT="$TEST_ROOT/tmp"
ENV_FILE="$TEST_ROOT/.env"

source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user
export ARGUS_BUILD_UID ARGUS_BUILD_GID

cat > "$ENV_FILE" <<EOF
ARGUS_BUILD_UID=$ARGUS_BUILD_UID
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
EOF

if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
  echo "[ERROR] Agent binary path missing; run 01_bootstrap.sh first" >&2
  exit 1
fi

AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
  echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
  exit 1
fi

BIND_IMAGE_TAG_VALUE="argus-bind9:latest"
if [[ -f "$TMP_ROOT/bind_image_tag" ]]; then
  BIND_IMAGE_TAG_VALUE="$(cat "$TMP_ROOT/bind_image_tag")"
fi

compose() {
  if docker compose version >/dev/null 2>&1; then
    docker compose "$@"
  else
    docker-compose "$@"
  fi
}

docker container rm -f argus-agent-e2e argus-agent-env-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true

docker network rm tests_default >/dev/null 2>&1 || true

pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans || true
BIND_IMAGE_TAG="$BIND_IMAGE_TAG_VALUE" compose up -d
popd >/dev/null

echo "[INFO] Master+Agent stack started"
106 src/agent/tests/scripts/03_wait_and_assert_registration.sh Executable file
@@ -0,0 +1,106 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"
NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json"
ENV_NODE_FILE="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/node.json"

mkdir -p "$TMP_ROOT"

primary_node_id=""
env_node_id=""
for _ in {1..30}; do
  sleep 2
  response=$(curl -sS "$API_BASE/nodes" || true)
  if [[ -z "$response" ]]; then
    continue
  fi
  list_file="$TMP_ROOT/nodes_list.json"
  echo "$response" > "$list_file"
  readarray -t node_ids < <(python3 - "$list_file" "$AGENT_HOSTNAME" "$ENV_AGENT_HOSTNAME" <<'PY'
import json, sys

with open(sys.argv[1]) as handle:
    nodes = json.load(handle)

target_primary = sys.argv[2]
target_env = sys.argv[3]

primary_id = ""
env_id = ""

for node in nodes:
    if node.get("name") == target_primary:
        primary_id = node.get("id", "")
    if node.get("name") == target_env:
        env_id = node.get("id", "")

print(primary_id)
print(env_id)
PY
  )

  primary_node_id="${node_ids[0]}"
  env_node_id="${node_ids[1]}"

  if [[ -n "$primary_node_id" && -n "$env_node_id" ]]; then
    break
  fi
done

if [[ -z "$primary_node_id" ]]; then
  echo "[ERROR] Primary agent did not register within timeout" >&2
  exit 1
fi

if [[ -z "$env_node_id" ]]; then
  echo "[ERROR] Env-variable agent did not register within timeout" >&2
  exit 1
fi

echo "$primary_node_id" > "$TMP_ROOT/node_id"
echo "$env_node_id" > "$TMP_ROOT/node_id_host_abc"

if [[ ! -f "$NODE_FILE" ]]; then
  echo "[ERROR] node.json not created at $NODE_FILE" >&2
  exit 1
fi

python3 - "$NODE_FILE" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert "id" in node and node["id"], "node.json missing id"
PY

if [[ ! -f "$ENV_NODE_FILE" ]]; then
  echo "[ERROR] node.json not created at $ENV_NODE_FILE" >&2
  exit 1
fi

python3 - "$ENV_NODE_FILE" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert "id" in node and node["id"], "env agent node.json missing id"
PY

detail_file="$TMP_ROOT/initial_detail.json"
curl -sS "$API_BASE/nodes/$primary_node_id" -o "$detail_file"
python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY'
import json, sys, pathlib
with open(sys.argv[1]) as handle:
    node = json.load(handle)
ip = node["meta_data"].get("ip")
if not ip:
    raise SystemExit("meta_data.ip missing")
pathlib.Path(sys.argv[2]).write_text(ip)
PY

echo "[INFO] Agent registered with node id $primary_node_id"
echo "[INFO] Env-variable agent registered with node id $env_node_id"
22 src/agent/tests/scripts/04_write_health_files.sh Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health"

cat > "$HEALTH_DIR/log-fluentbit.json" <<JSON
{
  "status": "healthy",
  "timestamp": "2023-10-05T12:05:00Z"
}
JSON

cat > "$HEALTH_DIR/metric-node-exporter.json" <<JSON
{
  "status": "healthy",
  "timestamp": "2023-10-05T12:05:00Z"
}
JSON

echo "[INFO] Health files written"
60 src/agent/tests/scripts/05_verify_agent.sh Executable file
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
VERIFY_SCRIPT="$REPO_ROOT/scripts/agent_deployment_verify.sh"
ENV_NODE_ID_FILE="$TEST_ROOT/tmp/node_id_host_abc"
PRIMARY_CONTAINER="argus-agent-e2e"
ENV_CONTAINER="argus-agent-env-e2e"
PRIMARY_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_HOSTNAME="host_abc"

if ! docker ps --format '{{.Names}}' | grep -q "^${PRIMARY_CONTAINER}$"; then
  echo "[WARN] agent container not running; skip verification"
  exit 0
fi

if docker exec -i "$PRIMARY_CONTAINER" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
  echo "[INFO] curl/jq already installed in agent container"
else
  echo "[INFO] Installing curl/jq in agent container"
  docker exec -i "$PRIMARY_CONTAINER" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
fi

if [[ ! -f "$VERIFY_SCRIPT" ]]; then
  echo "[ERROR] Verification script missing at $VERIFY_SCRIPT" >&2
  exit 1
fi

run_verifier() {
  local container="$1" hostname="$2"

  if ! docker ps --format '{{.Names}}' | grep -q "^${container}$"; then
    echo "[WARN] container $container not running; skip"
    return
  fi

  if ! docker exec -i "$container" bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then
    echo "[ERROR] /usr/local/bin/agent_deployment_verify.sh missing in $container" >&2
    exit 1
  fi

  echo "[INFO] Running verification for $hostname in $container"
  docker exec -i "$container" env VERIFY_HOSTNAME="$hostname" /usr/local/bin/agent_deployment_verify.sh
}

run_verifier "$PRIMARY_CONTAINER" "$PRIMARY_HOSTNAME"

if docker ps --format '{{.Names}}' | grep -q "^${ENV_CONTAINER}$"; then
  if docker exec -i "$ENV_CONTAINER" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
    echo "[INFO] curl/jq already installed in env agent container"
  else
    echo "[INFO] Installing curl/jq in env agent container"
    docker exec -i "$ENV_CONTAINER" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
  fi
  run_verifier "$ENV_CONTAINER" "$ENV_HOSTNAME"
else
  echo "[WARN] env-driven agent container not running; skip secondary verification"
fi
src/agent/tests/scripts/06_assert_status_on_master.sh  (Executable file, 78 lines)
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
ENV_NODE_ID="$(cat "$TMP_ROOT/node_id_host_abc")"
ENV_HOSTNAME="host_abc"
NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"

success=false
detail_file="$TMP_ROOT/agent_status_detail.json"
for _ in {1..20}; do
  sleep 2
  if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
    continue
  fi
  if python3 - "$detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
if node["status"] != "online":
    raise SystemExit(1)
health = node.get("health", {})
if "log-fluentbit" not in health or "metric-node-exporter" not in health:
    raise SystemExit(1)
PY
  then
    success=true
    break
  fi
done

if [[ "$success" != true ]]; then
  echo "[ERROR] Node did not report health data in time" >&2
  exit 1
fi

if [[ ! -f "$NODES_JSON" ]]; then
  echo "[ERROR] nodes.json missing at $NODES_JSON" >&2
  exit 1
fi

python3 - "$NODES_JSON" "$NODE_ID" "$ENV_NODE_ID" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    nodes = json.load(handle)

expected_primary = sys.argv[2]
expected_env = sys.argv[3]

ids = {entry.get("node_id") for entry in nodes}
assert expected_primary in ids, nodes
assert expected_env in ids, nodes
assert len(nodes) >= 2, nodes
PY

echo "[INFO] Master reflects agent health and nodes.json entries"

env_detail_file="$TMP_ROOT/env_agent_detail.json"
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"
python3 - "$env_detail_file" "$ENV_HOSTNAME" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)

expected_name = sys.argv[2]

assert node.get("name") == expected_name, node
meta = node.get("meta_data", {})
assert meta.get("env") == "prod", meta
assert meta.get("user") == "ml", meta
assert meta.get("instance") == "node-3", meta
PY

echo "[INFO] Env-variable agent reports expected metadata"
src/agent/tests/scripts/07_restart_agent_and_reregister.sh  (Executable file, 254 lines)
@@ -0,0 +1,254 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc"
if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then
  echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2
  exit 1
fi

ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
NEW_ENV_AGENT_IP="172.28.0.210"
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
ENV_FILE="$TEST_ROOT/.env"

# The restart scenario needs the same entrypoint script so the DNS registration logic stays consistent
if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
  echo "[ERROR] agent entrypoint script missing at $ENTRYPOINT_SCRIPT" >&2
  exit 1
fi

if [[ ! -f "$VERIFY_SCRIPT" ]]; then
  echo "[ERROR] agent verification script missing at $VERIFY_SCRIPT" >&2
  exit 1
fi

if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
  echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
  exit 1
fi

AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
  echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
  exit 1
fi

if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
else
  REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
  # shellcheck disable=SC1090
  source "$REPO_ROOT/scripts/common/build_user.sh"
  load_build_user
fi

AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"

compose() {
  if docker compose version >/dev/null 2>&1; then
    docker compose "$@"
  else
    docker-compose "$@"
  fi
}

before_file="$TMP_ROOT/before_restart.json"
curl -sS "$API_BASE/nodes/$NODE_ID" -o "$before_file"
prev_last_updated=$(python3 - "$before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node.get("last_updated", ""))
PY
)
prev_ip=$(python3 - "$before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node["meta_data"].get("ip", ""))
PY
)
initial_ip=$(cat "$TMP_ROOT/initial_ip")
if [[ "$prev_ip" != "$initial_ip" ]]; then
  echo "[ERROR] Expected initial IP $initial_ip, got $prev_ip" >&2
  exit 1
fi

env_before_file="$TMP_ROOT/env_before_restart.json"
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file"
env_prev_last_updated=$(python3 - "$env_before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node.get("last_updated", ""))
PY
)
env_prev_ip=$(python3 - "$env_before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node["meta_data"].get("ip", ""))
PY
)

pushd "$TEST_ROOT" >/dev/null
compose rm -sf agent
compose rm -sf agent_env
popd >/dev/null

docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true

AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"

ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME"
ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health"

# Recreate the container with a fixed IP so we control the network state at re-registration time
if ! docker run -d \
  --name argus-agent-e2e \
  --hostname "$AGENT_HOSTNAME" \
  --network "$NETWORK_NAME" \
  --ip "$NEW_AGENT_IP" \
  -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
  -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health" \
  -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
  -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
  -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
  -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
  -e REPORT_INTERVAL_SECONDS=2 \
  -e ARGUS_BUILD_UID="$AGENT_UID" \
  -e ARGUS_BUILD_GID="$AGENT_GID" \
  --entrypoint /usr/local/bin/agent-entrypoint.sh \
  ubuntu:22.04 >/dev/null; then
  echo "[ERROR] Failed to start agent container with custom IP" >&2
  exit 1
fi

success=false
detail_file="$TMP_ROOT/post_restart.json"
for _ in {1..20}; do
  sleep 3
  if ! curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file"; then
    continue
  fi
  if python3 - "$detail_file" "$prev_last_updated" "$NODE_ID" "$prev_ip" "$NEW_AGENT_IP" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
prev_last_updated = sys.argv[2]
expected_id = sys.argv[3]
old_ip = sys.argv[4]
expected_ip = sys.argv[5]
last_updated = node.get("last_updated")
current_ip = node["meta_data"].get("ip")
assert node["id"] == expected_id
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
PY
  then
    success=true
    break
  fi
done

if [[ "$success" != true ]]; then
  echo "[ERROR] Agent did not report expected new IP $NEW_AGENT_IP after restart" >&2
  exit 1
fi

echo "[INFO] Agent restart produced successful re-registration with IP change"

# ---- Restart env-driven agent without metadata environment variables ----

if [[ ! -d "$ENV_AGENT_DIR" ]]; then
  echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2
  exit 1
fi

if [[ ! -d "$ENV_HEALTH_DIR" ]]; then
  mkdir -p "$ENV_HEALTH_DIR"
fi

if ! docker run -d \
  --name argus-agent-env-e2e \
  --hostname "$ENV_AGENT_HOSTNAME" \
  --network "$NETWORK_NAME" \
  --ip "$NEW_ENV_AGENT_IP" \
  -v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \
  -v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \
  -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
  -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
  -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
  -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
  -e REPORT_INTERVAL_SECONDS=2 \
  -e ARGUS_BUILD_UID="$AGENT_UID" \
  -e ARGUS_BUILD_GID="$AGENT_GID" \
  --entrypoint /usr/local/bin/agent-entrypoint.sh \
  ubuntu:22.04 >/dev/null; then
  echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2
  exit 1
fi

env_success=false
env_detail_file="$TMP_ROOT/env_post_restart.json"
for _ in {1..20}; do
  sleep 3
  if ! curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then
    continue
  fi
  if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
prev_last_updated = sys.argv[2]
expected_id = sys.argv[3]
old_ip = sys.argv[4]
expected_ip = sys.argv[5]
last_updated = node.get("last_updated")
current_ip = node["meta_data"].get("ip")
meta = node.get("meta_data", {})
assert node["id"] == expected_id
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3":
    raise SystemExit(1)
PY
  then
    env_success=true
    break
  fi
done

if [[ "$env_success" != true ]]; then
  echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2
  exit 1
fi

echo "[INFO] Env-driven agent restart succeeded with persisted metadata"
src/agent/tests/scripts/08_down.sh  (Executable file, 36 lines)
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$TEST_ROOT/.env"

compose() {
  if docker compose version >/dev/null 2>&1; then
    docker compose "$@"
  else
    docker-compose "$@"
  fi
}

docker container rm -f argus-agent-e2e argus-agent-env-e2e >/dev/null 2>&1 || true

pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans
popd >/dev/null

if [[ -d "$TEST_ROOT/private" ]]; then
  docker run --rm \
    -v "$TEST_ROOT/private:/target" \
    ubuntu:24.04 \
    chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
  rm -rf "$TEST_ROOT/private"
fi

rm -rf "$TEST_ROOT/tmp"

if [[ -f "$ENV_FILE" ]]; then
  rm -f "$ENV_FILE"
fi

echo "[INFO] Agent E2E environment cleaned up"
src/agent/tests/scripts/agent_entrypoint.sh  (Executable file, 79 lines)
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
set -euo pipefail

LOG_PREFIX="[AGENT-ENTRYPOINT]"
DNS_SCRIPT="/private/argus/etc/update-dns.sh"
DNS_CONF="/private/argus/etc/dns.conf"
TARGET_DOMAIN="master.argus.com"
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
AGENT_HOSTNAME="${HOSTNAME:-unknown}"
AGENT_DATA_DIR="/private/argus/agent/${AGENT_HOSTNAME}"
AGENT_HEALTH_DIR="${AGENT_DATA_DIR}/health"
RUNTIME_GROUP="argusagent"
RUNTIME_USER="argusagent"

log() {
  echo "${LOG_PREFIX} $*"
}

mkdir -p "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR"
chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" 2>/dev/null || true
chown -R "$AGENT_UID:$AGENT_GID" "/private/argus/etc" 2>/dev/null || true

if ! getent group "$AGENT_GID" >/dev/null 2>&1; then
  groupadd -g "$AGENT_GID" "$RUNTIME_GROUP"
else
  RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)"
fi

if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then
  useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER"
else
  RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)"
fi

log "Runtime user: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)"

# Wait for the update-dns.sh script distributed by bind
for _ in {1..30}; do
  if [[ -x "$DNS_SCRIPT" ]]; then
    break
  fi
  log "Waiting for update-dns.sh to become available..."
  sleep 1
done

if [[ -x "$DNS_SCRIPT" ]]; then
  log "Running update-dns.sh to update the container's DNS"
  while true; do
    if "$DNS_SCRIPT"; then
      log "update-dns.sh succeeded"
      break
    fi
    log "update-dns.sh failed; retrying in 3 seconds"
    sleep 3
  done
else
  log "update-dns.sh not available; using the image's default DNS"
fi

# Log the current dns.conf contents for troubleshooting
if [[ -f "$DNS_CONF" ]]; then
  log "dns.conf contents: $(tr '\n' ' ' < "$DNS_CONF")"
else
  log "dns.conf not generated yet"
fi

# Try to resolve the master domain; failure is logged but does not block startup
for _ in {1..30}; do
  if getent hosts "$TARGET_DOMAIN" >/dev/null 2>&1; then
    MASTER_IP=$(getent hosts "$TARGET_DOMAIN" | awk '{print $1}' | head -n 1)
    log "master.argus.com resolved to $MASTER_IP"
    break
  fi
  sleep 1
done

log "Starting argus-agent"
exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
src/agent/tests/test_config_metadata.py  (Normal file, 151 lines)
@@ -0,0 +1,151 @@
from __future__ import annotations

import os
import unittest
from contextlib import contextmanager
from unittest.mock import patch

from app.config import AgentConfig, load_config


@contextmanager
def temp_env(**overrides: str | None):
    originals: dict[str, str | None] = {}
    try:
        for key, value in overrides.items():
            originals[key] = os.environ.get(key)
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
        yield
    finally:
        for key, original in originals.items():
            if original is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = original


class LoadConfigMetadataTests(unittest.TestCase):
    @patch("app.config.Path.mkdir")
    def test_metadata_from_environment_variables(self, mock_mkdir):
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="dev-user-one-pod",
            AGENT_ENV="prod",
            AGENT_USER="ops",
            AGENT_INSTANCE="node-1",
        ):
            config = load_config()

        self.assertEqual(config.environment, "prod")
        self.assertEqual(config.user, "ops")
        self.assertEqual(config.instance, "node-1")
        mock_mkdir.assert_called()

    @patch("app.config.Path.mkdir")
    def test_metadata_falls_back_to_hostname(self, mock_mkdir):
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="qa-team-abc-pod-2",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            config = load_config()

        self.assertEqual(config.environment, "qa")
        self.assertEqual(config.user, "team")
        self.assertEqual(config.instance, "abc")
        mock_mkdir.assert_called()

    @patch("app.config._load_metadata_from_state", return_value=("prod", "ops", "node-1"))
    @patch("app.config.Path.mkdir")
    def test_metadata_from_node_state(self, mock_mkdir, mock_state):
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="host_abc",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            config = load_config()

        self.assertEqual(config.environment, "prod")
        self.assertEqual(config.user, "ops")
        self.assertEqual(config.instance, "node-1")
        mock_state.assert_called_once()
        mock_mkdir.assert_called()

    @patch("app.config.Path.mkdir")
    def test_partial_environment_variables_fallback(self, mock_mkdir):
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="stage-ml-001-node",
            AGENT_ENV="prod",
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            config = load_config()

        self.assertEqual(config.environment, "stage")
        self.assertEqual(config.user, "ml")
        self.assertEqual(config.instance, "001")
        mock_mkdir.assert_called()

    @patch("app.config.Path.mkdir")
    def test_invalid_hostname_raises_error(self, mock_mkdir):
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="invalidhostname",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            with self.assertRaises(ValueError):
                load_config()

        mock_mkdir.assert_not_called()


class CollectMetadataTests(unittest.TestCase):
    @patch("app.collector._detect_ip_address", return_value="127.0.0.1")
    @patch("app.collector._detect_gpu_count", return_value=0)
    @patch("app.collector._detect_memory_bytes", return_value=1024)
    @patch("app.collector._detect_cpu_count", return_value=8)
    def test_collect_metadata_uses_config_fields(
        self,
        mock_cpu,
        mock_memory,
        mock_gpu,
        mock_ip,
    ):
        config = AgentConfig(
            hostname="dev-user-001-pod",
            environment="prod",
            user="ops",
            instance="node-1",
            node_file="/tmp/node.json",
            version="1.0.0",
            master_endpoint="http://master.local",
            report_interval_seconds=60,
            health_dir="/tmp/health",
        )

        from app.collector import collect_metadata

        metadata = collect_metadata(config)

        self.assertEqual(metadata["env"], "prod")
        self.assertEqual(metadata["user"], "ops")
        self.assertEqual(metadata["instance"], "node-1")
        self.assertEqual(metadata["hostname"], "dev-user-001-pod")
        self.assertEqual(metadata["ip"], "127.0.0.1")
        self.assertEqual(metadata["cpu_number"], 8)
        self.assertEqual(metadata["memory_in_bytes"], 1024)
        self.assertEqual(metadata["gpu_number"], 0)


if __name__ == "__main__":
    unittest.main()
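Note: these tests pin down a hostname fallback of the form `env-user-instance-*`. A minimal bash sketch of that convention (a hypothetical illustration; the real parsing lives in `app.config.load_config`):

```bash
# Split a hostname like "qa-team-abc-pod-2" into env/user/instance,
# mirroring test_metadata_falls_back_to_hostname above
parse_hostname_metadata() {
  local hostname="$1" env user instance _rest
  IFS='-' read -r env user instance _rest <<< "$hostname"
  if [[ -z "$env" || -z "$user" || -z "$instance" ]]; then
    echo "hostname '$hostname' does not match env-user-instance-*" >&2
    return 1   # mirrors the ValueError raised for "invalidhostname"
  fi
  echo "$env $user $instance"
}

parse_hostname_metadata "qa-team-abc-pod-2"   # -> qa team abc
parse_hostname_metadata "stage-ml-001-node"   # -> stage ml 001
```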
src/alert/README.md  (Normal file, 31 lines)
@@ -0,0 +1,31 @@
# Alertmanager

## Build
1. First set the build and deployment environment variables. From the project root run:
```bash
cp src/alert/tests/.env.example src/alert/tests/.env
```

Then edit the copied .env file and adjust the environment variables.

2. Build with the script, from the project root:

```bash
bash src/alert/alertmanager/build/build.sh
```

On success, argus-alertmanager-latest.tar is produced in the project root.

## Deployment

Deployment uses docker-compose. In the src/alert/tests directory:
```bash
docker-compose up -d
```

## Dynamic configuration
The configuration file lives at `/private/argus/alert/alertmanager/alertmanager.yml`. After editing alertmanager.yml, a POST to `http://alertmanager.alert.argus.com:9093/-/reload` reloads the configuration:

```bash
curl -X POST http://localhost:9093/-/reload
```
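To confirm the reload took effect, the v2 status API echoes the currently loaded configuration. A quick check (a sketch; assumes the 9093 port mapping from the compose file and that `jq` is installed):

```bash
# Print the first lines of the configuration Alertmanager is currently running with
curl -s http://localhost:9093/api/v2/status | jq -r '.config.original' | head
```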
src/alert/alertmanager/build/Dockerfile  (Normal file, 96 lines)
@@ -0,0 +1,96 @@
# Based on Ubuntu 24.04
FROM ubuntu:24.04

# Switch to the root user
USER root

# Build arguments (USE_INTRANET was referenced below but never declared; declare it here)
ARG USE_INTRANET=false

# Configure the intranet apt source (when the intranet option is set);
# this must precede apt-get install for the mirror to take effect
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Install required dependencies
RUN apt-get update && \
    apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Pin the Alertmanager version (must match the local offline tarball)
ARG ALERTMANAGER_VERSION=0.28.1

# Build from the offline tarball shipped in the repository (no network access needed)
COPY src/alert/alertmanager/build/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz /tmp/
RUN tar xvf /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz -C /tmp && \
    mv /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
    rm -f /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz

ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager

ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

RUN mkdir -p /usr/share/alertmanager && \
    mkdir -p ${ALERTMANAGER_BASE_PATH} && \
    mkdir -p /private/argus/etc && \
    rm -rf /alertmanager && \
    ln -s ${ALERTMANAGER_BASE_PATH} /alertmanager

# Create the alertmanager user and group (UID/GID are configurable)
RUN set -eux; \
    # Make sure the target GID exists; if it is already taken, just use that GID (the group name does not matter) \
    if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
        groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
    fi; \
    # Make sure an alertmanager user exists; if the UID is already taken, skip it and keep the existing user for that UID
    if ! id alertmanager >/dev/null 2>&1; then \
        if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
            # UID taken: create a user with that name but no explicit UID (avoids the conflict); we only need the user to exist
            useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \
        else \
            useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \
        fi; \
    else \
        usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
    fi

RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true

# Configure the apt source used at deploy time
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi

# Create the supervisor log directory
RUN mkdir -p /var/log/supervisor

# Copy the supervisor configuration file
COPY src/alert/alertmanager/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Copy the startup script
COPY src/alert/alertmanager/build/start-am-supervised.sh /usr/local/bin/start-am-supervised.sh
RUN chmod +x /usr/local/bin/start-am-supervised.sh

# Copy the Alertmanager configuration file (a config file does not need the execute bit)
COPY src/alert/alertmanager/build/alertmanager.yml /etc/alertmanager/alertmanager.yml
RUN chmod 644 /etc/alertmanager/alertmanager.yml
# COPY src/alert/alertmanager/build/alertmanager.yml ${ALERTMANAGER_BASE_PATH}/alertmanager.yml

# Copy the DNS monitoring script
COPY src/alert/alertmanager/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
RUN chmod +x /usr/local/bin/dns-monitor.sh

# Stay root; supervisor handles the user switch
USER root

# Expose the Alertmanager default port (9093)
EXPOSE 9093

# Use supervisor as the entrypoint
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
Binary file not shown.
src/alert/alertmanager/build/alertmanager.yml  (Normal file, 19 lines)
@@ -0,0 +1,19 @@
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'instance']  # grouping: merge alerts sharing the same alertname + instance
  group_wait: 30s       # after the first alert, wait 30s for others in the same group
  group_interval: 5m    # after a group changes, wait at least 5 minutes before sending again
  repeat_interval: 3h   # re-send an unchanged alert every 3 hours
  receiver: 'null'

receivers:
  - name: 'null'

inhibit_rules:
  - source_match:
      severity: 'critical'   # while a critical alert is firing
    target_match:
      severity: 'warning'    # suppress warning alerts for the same instance
    equal: ['instance']
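Note: the routing and inhibition settings above can be validated before the image is built. A sketch (assumes the `amtool` binary shipped in the Alertmanager release tarball is on PATH):

```bash
# Validate the Alertmanager configuration file offline
amtool check-config src/alert/alertmanager/build/alertmanager.yml
```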
src/alert/alertmanager/build/build.sh  (Normal file, 13 lines)
@@ -0,0 +1,13 @@
#!/bin/bash
set -euo pipefail
docker pull ubuntu:24.04

source src/alert/tests/.env

docker build \
  --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
  --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
  -f src/alert/alertmanager/build/Dockerfile \
  -t argus-alertmanager:latest .

docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest
src/alert/alertmanager/build/dns-monitor.sh  (Normal file, 68 lines)
@@ -0,0 +1,68 @@
#!/bin/bash

# DNS monitor script: checks every 10 seconds whether dns.conf has changed,
# and runs update-dns.sh when it has.

DNS_CONF="/private/argus/etc/dns.conf"
DNS_BACKUP="/tmp/dns.conf.backup"
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
LOG_FILE="/var/log/supervisor/dns-monitor.log"

# Make sure the log file exists
touch "$LOG_FILE"

log_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
}

log_message "DNS monitor script started"

while true; do
    if [ -f "$DNS_CONF" ]; then
        if [ -f "$DNS_BACKUP" ]; then
            # Compare file contents
            if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
                log_message "DNS configuration change detected"

                # Refresh the backup file
                cp "$DNS_CONF" "$DNS_BACKUP"

                # Run the update script
                if [ -x "$UPDATE_SCRIPT" ]; then
                    log_message "Running DNS update script: $UPDATE_SCRIPT"
                    "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
                    if [ $? -eq 0 ]; then
                        log_message "DNS update script succeeded"
                    else
                        log_message "DNS update script failed"
                    fi
                else
                    log_message "Warning: update script missing or not executable: $UPDATE_SCRIPT"
                fi
            fi
        else
            # First time the config file is seen: run the update script
            if [ -x "$UPDATE_SCRIPT" ]; then
                log_message "Running DNS update script: $UPDATE_SCRIPT"
                "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
                if [ $? -eq 0 ]; then
                    log_message "DNS update script succeeded"
                    # First run: create the backup
                    cp "$DNS_CONF" "$DNS_BACKUP"
                    log_message "Created DNS configuration backup"
                else
                    log_message "DNS update script failed"
                fi
            else
                log_message "Warning: update script missing or not executable: $UPDATE_SCRIPT"
            fi
        fi
    else
        log_message "Warning: DNS configuration file missing: $DNS_CONF"
    fi

    sleep 10
done
src/alert/alertmanager/build/fetch-dist.sh  (Normal file, 22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail

# Download the Alertmanager offline tarball into this directory for COPY during the Docker build.
# Usage:
#   ./fetch-dist.sh [version]
# Example:
#   ./fetch-dist.sh 0.28.1

VER="${1:-0.28.1}"
OUT="alertmanager-${VER}.linux-amd64.tar.gz"
URL="https://github.com/prometheus/alertmanager/releases/download/v${VER}/${OUT}"

if [[ -f "$OUT" ]]; then
  echo "[INFO] $OUT already exists, skip download"
  exit 0
fi

echo "[INFO] Downloading $URL"
curl -fL --retry 3 --connect-timeout 10 -o "$OUT" "$URL"
echo "[OK] Saved to $(pwd)/$OUT"
src/alert/alertmanager/build/start-am-supervised.sh  (Normal file, 25 lines)
@@ -0,0 +1,25 @@
#!/bin/bash
set -euo pipefail

echo "[INFO] Starting Alertmanager under supervisor..."

ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanager}

echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}"

# Use /etc/alertmanager/alertmanager.yml inside the container as the config file,
# which avoids the permission problems of writing into the mounted volume
echo "[INFO] Using /etc/alertmanager/alertmanager.yml as configuration"

# Record the container IP address
DOMAIN=alertmanager.alert.argus.com
IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')
echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/${DOMAIN}
chmod 755 /private/argus/etc/${DOMAIN}

echo "[INFO] Starting Alertmanager process..."

# Start the Alertmanager main process
exec /usr/local/alertmanager/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager --cluster.listen-address=""
src/alert/alertmanager/build/supervisord.conf  (Normal file, 39 lines)
@@ -0,0 +1,39 @@
[supervisord]
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root

[program:alertmanager]
command=/usr/local/bin/start-am-supervised.sh
user=alertmanager
stdout_logfile=/var/log/supervisor/alertmanager.log
stderr_logfile=/var/log/supervisor/alertmanager_error.log
autorestart=true
startretries=3
startsecs=10
stopwaitsecs=20
killasgroup=true
stopasgroup=true

[program:dns-monitor]
command=/usr/local/bin/dns-monitor.sh
user=root
stdout_logfile=/var/log/supervisor/dns-monitor.log
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
autorestart=true
startretries=3
startsecs=5
stopwaitsecs=10
killasgroup=true
stopasgroup=true

[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
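Note: the `[unix_http_server]`/`[supervisorctl]` stanzas above expose a local control socket, so both supervised programs can be inspected from inside the container. A sketch (assumes a shell in the running argus-alertmanager container):

```bash
# Query both supervised programs over the control socket configured above
supervisorctl -c /etc/supervisor/conf.d/supervisord.conf status alertmanager dns-monitor
```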
src/alert/alertmanager/config/rule_files/README.md  (Normal file, 60 lines)
@@ -0,0 +1,60 @@
# Alert configuration

> Reference: [Custom Prometheus alerting rules](https://yunlzheng.gitbook.io/prometheus-book/parti-prometheus-ji-chu/alert/prometheus-alert-rule)

Configuring alerts in Prometheus takes two steps:

1. Write an alerting rules file (a rules file)
2. Load the rules in prometheus.yml and configure Alertmanager

## 1. Writing the alerting rules file
An alerting rule looks like this:
```yml
groups:
  - name: example-rules
    interval: 30s  # evaluate every 30 seconds
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} in job {{ $labels.job }} has been unresponsive for more than 1 minute."

      - alert: HighCpuUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "Instance {{ $labels.instance }} CPU usage has been above 80% for 5 minutes."
```

Where:

- `alert`: the name of the alerting rule.
- `expr`: the trigger condition as a PromQL expression, evaluated to decide whether any time series currently satisfies it.
- `for`: optional evaluation wait time; the alert only fires once the condition has held for this long, and newly triggered alerts stay in the pending state while waiting.
- `labels`: custom labels attached to the alert, which Alertmanager can use for routing and grouping.
- `annotations`: additional information, such as text describing the alert in detail; annotation contents are sent to Alertmanager along with the alert and can supply a summary and details. (A validation example follows this list.)
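A rules file can be validated offline before Prometheus loads it. A sketch (assumes the `promtool` binary that ships with Prometheus is on PATH):

```bash
# Validate rule syntax without starting Prometheus
promtool check rules src/alert/alertmanager/config/rule_files/example_rules.yml
```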
## 2. Referencing the rules in prometheus.yml
Add `rule_files` and `alerting` to prometheus.yml:

```yml
global:
  [ evaluation_interval: <duration> | default = 1m ]

rule_files:
  [ - <filepath_glob> ... ]

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager.alert.argus.com:9093"  # Alertmanager address
```
src/alert/alertmanager/config/rule_files/example_rules.yml  (Normal file, 37 lines)
@@ -0,0 +1,37 @@
groups:
  - name: example-rules
    interval: 30s  # evaluate every 30 seconds
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} in job {{ $labels.job }} has been unresponsive for more than 1 minute."

      - alert: HighCpuUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage"
          description: "Instance {{ $labels.instance }} CPU usage has been above 80% for 5 minutes."
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Instance {{ $labels.instance }} memory usage has been above 80% for 5 minutes."
      - alert: DiskSpaceLow
        expr: (node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{fstype!~"tmpfs|overlay"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space"
          description: "Instance {{ $labels.instance }} disk usage has been above 90% for 10 minutes."
src/alert/tests/.env  (Normal file, 5 lines)
@@ -0,0 +1,5 @@
DATA_ROOT=/home/argus/tmp/private/argus
ARGUS_BUILD_UID=1048
ARGUS_BUILD_GID=1048

USE_INTRANET=false
src/alert/tests/.env.example  (Normal file, 5 lines)
@@ -0,0 +1,5 @@
DATA_ROOT=/home/argus/tmp/private/argus
ARGUS_BUILD_UID=1048
ARGUS_BUILD_GID=1048

USE_INTRANET=false
src/alert/tests/data/alertmanager/alertmanager.yml  (Normal file, 19 lines)
@@ -0,0 +1,19 @@
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'instance']  # grouping: merge alerts sharing the same alertname + instance
  group_wait: 30s       # after the first alert, wait 30s for others in the same group
  group_interval: 5m    # after a group changes, wait at least 5 minutes before sending again
  repeat_interval: 3h   # re-send an unchanged alert every 3 hours
  receiver: 'null'

receivers:
  - name: 'null'

inhibit_rules:
  - source_match:
      severity: 'critical'   # while a critical alert is firing
    target_match:
      severity: 'warning'    # suppress warning alerts for the same instance
    equal: ['instance']
src/alert/tests/data/alertmanager/nflog  (Normal file, empty)
src/alert/tests/data/alertmanager/silences  (Normal file, empty)
src/alert/tests/data/etc/alertmanager.alert.argus.com  (Normal file, 1 line)
@@ -0,0 +1 @@
172.18.0.2
src/alert/tests/docker-compose.yml  (Normal file, 37 lines)
@@ -0,0 +1,37 @@
services:
  alertmanager:
    build:
      context: ../../../
      dockerfile: src/alert/alertmanager/build/Dockerfile
      args:
        ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
        ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
        USE_INTRANET: ${USE_INTRANET:-false}
    image: argus-alertmanager:latest
    container_name: argus-alertmanager
    environment:
      - ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${ARGUS_PORT:-9093}:9093"
    volumes:
      - ${DATA_ROOT:-./data}/alert/alertmanager:/private/argus/alert/alertmanager
      - ${DATA_ROOT:-./data}/etc:/private/argus/etc
    networks:
      - argus-debug-net
    restart: unless-stopped
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"

networks:
  argus-debug-net:
    driver: bridge
    name: argus-debug-net

volumes:
  alertmanager_data:
    driver: local
src/alert/tests/scripts/01_bootstrap.sh  (Normal file, 19 lines)
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
set -euo pipefail
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
project_root="$(cd "$root/../../.." && pwd)"

source "$project_root/scripts/common/build_user.sh"
load_build_user

# Create the new private directory structure (mirrors the argus directory layout)
echo "[INFO] Creating private directory structure for supervisor-based containers..."
mkdir -p "$root/private/argus/alert/alertmanager"
mkdir -p "$root/private/argus/etc/"

# Set permissions on the data directories
echo "[INFO] Setting permissions for data directories..."
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/alert/alertmanager" 2>/dev/null || true
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true

echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
src/alert/tests/scripts/02_up.sh  (Normal file, 10 lines)
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.."
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
  if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
    echo "Docker Compose is required; please install it and retry" >&2; exit 1; fi
fi
$compose_cmd -p alert-mvp up -d --remove-orphans
echo "[OK] Services started: Alertmanager http://localhost:9093"
src/alert/tests/scripts/03_alertmanager_add_alert.sh  (Normal file, 106 lines)
@@ -0,0 +1,106 @@
#!/bin/bash
set -euo pipefail

# ==========================================================
# Alertmanager test script
# ==========================================================

ALERTMANAGER_URL="http://localhost:9093"
TEST_ALERT_NAME_CRITICAL="NodeDown"
TEST_ALERT_NAME_WARNING="HighCPU"
TMP_LOG="/tmp/test-alertmanager.log"

# Wait parameters
am_wait_attempts=30
am_wait_interval=2

GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
RESET="\033[0m"

# ==========================================================
# Functions
# ==========================================================

wait_for_alertmanager() {
    local attempt=1
    echo "[INFO] Waiting for Alertmanager to start..."
    while (( attempt <= am_wait_attempts )); do
        if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
            echo -e "${GREEN}[OK] Alertmanager is ready (attempt=${attempt}/${am_wait_attempts})${RESET}"
            return 0
        fi
        echo "[..] Alertmanager not ready yet (${attempt}/${am_wait_attempts})"
        sleep "${am_wait_interval}"
        (( attempt++ ))
    done
    echo -e "${RED}[ERROR] Alertmanager still not ready after ${am_wait_attempts} attempts${RESET}"
    return 1
}

log_step() {
    echo -e "${YELLOW}==== $1 ====${RESET}"
}

# ==========================================================
# Main flow
# ==========================================================

log_step "Alertmanager test started"
echo "[INFO] Alertmanager address: $ALERTMANAGER_URL"

# Step 1: wait for Alertmanager to come up
wait_for_alertmanager

# Step 2: fire a critical test alert
# (with `set -e` a plain `$?` check after curl is dead code, so test the curl call directly)
echo "[INFO] Sending critical test alert..."
if curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
  -H "Content-Type: application/json" \
  -d '[
    {
      "labels": {
        "alertname": "'"${TEST_ALERT_NAME_CRITICAL}"'",
        "instance": "node-1",
        "severity": "critical"
      },
      "annotations": {
        "summary": "Node node-1 is down"
      }
    }
  ]' \
  -o "$TMP_LOG"; then
    echo -e "${GREEN}[OK] Critical test alert sent${RESET}"
else
    echo -e "${RED}[ERROR] Failed to send critical alert!${RESET}"
    cat "$TMP_LOG"
    exit 1
fi

# Step 3: fire a warning test alert
echo "[INFO] Sending warning test alert..."
if curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
  -H "Content-Type: application/json" \
  -d '[
    {
      "labels": {
        "alertname": "'"${TEST_ALERT_NAME_WARNING}"'",
        "instance": "node-1",
        "severity": "warning"
      },
      "annotations": {
        "summary": "Node node-1 CPU usage is high"
      }
    }
  ]' \
  -o "$TMP_LOG"; then
    echo -e "${GREEN}[OK] Warning test alert sent${RESET}"
else
    echo -e "${RED}[ERROR] Failed to send warning alert!${RESET}"
    cat "$TMP_LOG"
    exit 1
fi
src/alert/tests/scripts/04_query_alerts.sh  (Normal file, 71 lines)
@@ -0,0 +1,71 @@
#!/bin/bash
set -euo pipefail

# ==========================================================
# Alertmanager query script (waits for startup first)
# ==========================================================

ALERTMANAGER_URL="http://localhost:9093"
TEST_ALERT_NAME_CRITICAL="NodeDown"
TEST_ALERT_NAME_WARNING="HighCPU"
TMP_LOG="/tmp/test-alertmanager.log"

# Wait parameters
am_wait_attempts=30
am_wait_interval=2

GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
RESET="\033[0m"

# ==========================================================
# Functions
# ==========================================================

wait_for_alertmanager() {
    local attempt=1
    echo "[INFO] Waiting for Alertmanager to start..."
    while (( attempt <= am_wait_attempts )); do
        if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
            echo -e "${GREEN}[OK] Alertmanager is ready (attempt=${attempt}/${am_wait_attempts})${RESET}"
            return 0
        fi
        echo "[..] Alertmanager not ready yet (${attempt}/${am_wait_attempts})"
        sleep "${am_wait_interval}"
        (( attempt++ ))
    done
    echo -e "${RED}[ERROR] Alertmanager still not ready after ${am_wait_attempts} attempts${RESET}"
    return 1
}

log_step() {
    echo -e "${YELLOW}==== $1 ====${RESET}"
}

# ==========================================================
# Main flow
# ==========================================================

log_step "Querying the current Alertmanager alert list"
echo "[INFO] Alertmanager address: $ALERTMANAGER_URL"

# Step 1: wait for Alertmanager to come up
wait_for_alertmanager

# Step 2: query the current alerts
echo "[INFO] Querying current alerts..."
sleep 1
curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts" | jq '.' || {
    echo -e "${RED}[WARN] Could not parse the response JSON; check that jq is installed${RESET}"
    curl -s "${ALERTMANAGER_URL}/api/v2/alerts"
}

# Step 3: check whether the alerts include NodeDown
if curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts" | grep -q "${TEST_ALERT_NAME_CRITICAL}"; then
    echo -e "${GREEN}✅ Test passed: Alertmanager received alert ${TEST_ALERT_NAME_CRITICAL}${RESET}"
else
    echo -e "${RED}❌ Test failed: alert ${TEST_ALERT_NAME_CRITICAL} not found${RESET}"
fi

log_step "Test finished"
src/alert/tests/scripts/05_down.sh  (Normal file, 21 lines)
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.."
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
  if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
    echo "Docker Compose is required; please install it and retry" >&2; exit 1; fi
fi
$compose_cmd -p alert-mvp down
echo "[OK] All containers stopped"

# Clean up the contents of the private directory
echo "[INFO] Cleaning up the private directory..."
cd "$(dirname "$0")/.."
if [ -d "private" ]; then
  # Remove the private directory and everything in it
  rm -rf private
  echo "[OK] private directory cleaned"
else
  echo "[INFO] private directory does not exist; nothing to clean"
fi
src/alert/tests/scripts/e2e_test.sh  (Normal file, 105 lines)
@@ -0,0 +1,105 @@
#!/usr/bin/env bash
set -euo pipefail

echo "======================================="
echo "ARGUS Alert System End-to-End Test"
echo "======================================="
echo ""

# Record the test start time
test_start_time=$(date +%s)

# Wait until the services are ready
wait_for_services() {
    echo "[INFO] Waiting for all services to be ready..."
    local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
    local attempt=1

    while [ $attempt -le $max_attempts ]; do
        if curl -fs http://localhost:9093/api/v2/status >/dev/null 2>&1; then
            echo "[OK] All services are ready!"
            return 0
        fi
        echo "  Waiting for services... ($attempt/$max_attempts)"
        sleep 5
        ((attempt++))
    done

    echo "[ERROR] Services not ready after $max_attempts attempts"
    return 1
}

# Print a test step banner
show_step() {
    echo ""
    echo "🔄 Step $1: $2"
    echo "----------------------------------------"
}

# Run a step and verify its result.
# (Under `set -e` a later `$?` check never sees a failure, so run the
# command inside the `if` and report from there.)
verify_step() {
    local name="$1"; shift
    if "$@"; then
        echo "✅ $name - SUCCESS"
    else
        echo "❌ $name - FAILED"
        exit 1
    fi
}

# Start the end-to-end test
show_step "1" "Bootstrap - Initialize environment"
verify_step "Bootstrap" ./scripts/01_bootstrap.sh

show_step "2" "Startup - Start all services"
verify_step "Service startup" ./scripts/02_up.sh

# Wait until the services are fully ready
wait_for_services || exit 1

# Send the alert data
show_step "3" "Add alerts - Send test alerts to Alertmanager"
verify_step "Send test alerts" ./scripts/03_alertmanager_add_alert.sh

# Query the alert data
show_step "4" "Verify data - Query Alertmanager"
verify_step "Data verification" ./scripts/04_query_alerts.sh

# Check service health
show_step "Health" "Check service health"
echo "[INFO] Checking service health..."

# Check Alertmanager status
if curl -fs "http://localhost:9093/api/v2/status" >/dev/null 2>&1; then
    am_status="available"
    echo "✅ Alertmanager status: $am_status"
else
    am_status="unavailable"
    echo "⚠️ Alertmanager status: $am_status"
fi

# Clean up the environment
show_step "5" "Cleanup - Stop all services"
verify_step "Service cleanup" ./scripts/05_down.sh

# Compute the total test time
test_end_time=$(date +%s)
total_time=$((test_end_time - test_start_time))

echo ""
echo "======================================="
echo "🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!"
echo "======================================="
echo "📊 Test Summary:"
echo "  • Total time: ${total_time}s"
echo "  • Alertmanager status: $am_status"
echo "  • All services started and stopped successfully"
echo ""
echo "✅ The ARGUS Alert system is working correctly!"
echo ""
src/bind/.gitignore  (vendored, Normal file, 2 lines)
@@ -0,0 +1,2 @@

images/
90
src/bind/build/Dockerfile
Normal file
@@ -0,0 +1,90 @@
FROM ubuntu:22.04

# Set timezone and avoid interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai

# Build arguments
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015

ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

# Configure intranet apt sources (when the intranet option is set)
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Update package list and install required packages
RUN apt-get update && \
    apt-get install -y \
        bind9 \
        bind9utils \
        dnsutils \
        bind9-doc \
        supervisor \
        net-tools \
        inetutils-ping \
        vim \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Align the bind user and group IDs with the host configuration
RUN set -eux; \
    current_gid="$(getent group bind | awk -F: '{print $3}')"; \
    if [ -z "$current_gid" ]; then \
        groupadd -g "${ARGUS_BUILD_GID}" bind; \
    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
        groupmod -g "${ARGUS_BUILD_GID}" bind; \
    fi; \
    if id bind >/dev/null 2>&1; then \
        current_uid="$(id -u bind)"; \
        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
            usermod -u "${ARGUS_BUILD_UID}" bind; \
        fi; \
    else \
        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" bind; \
    fi; \
    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /var/cache/bind /var/lib/bind

# Configure the apt source used at deployment time
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi

# Create supervisor configuration directory
RUN mkdir -p /etc/supervisor/conf.d

# Copy supervisor configuration
COPY src/bind/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Copy BIND9 configuration files
COPY src/bind/build/named.conf.local /etc/bind/named.conf.local
COPY src/bind/build/db.argus.com /etc/bind/db.argus.com

# Copy startup and reload scripts
COPY src/bind/build/startup.sh /usr/local/bin/startup.sh
COPY src/bind/build/reload-bind9.sh /usr/local/bin/reload-bind9.sh
COPY src/bind/build/argus_dns_sync.sh /usr/local/bin/argus_dns_sync.sh
COPY src/bind/build/update-dns.sh /usr/local/bin/update-dns.sh

# Make scripts executable
RUN chmod +x /usr/local/bin/startup.sh /usr/local/bin/reload-bind9.sh /usr/local/bin/argus_dns_sync.sh /usr/local/bin/update-dns.sh

# Set proper ownership for BIND9 files
RUN chown bind:bind /etc/bind/named.conf.local /etc/bind/db.argus.com

# Expose DNS port
EXPOSE 53/tcp 53/udp

# Use root user as requested
USER root

# Start with startup script
CMD ["/usr/local/bin/startup.sh"]
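As a reference, a minimal sketch of building this image by hand from the repository root (the COPY paths assume the repo root as build context; the argus-bind9:latest tag matches the test compose file further down, and in practice build/build_images.sh drives the build with values from configs/build_user.conf):

    # Hypothetical manual build; the UID/GID shown are the Dockerfile defaults.
    docker build -f src/bind/build/Dockerfile \
      --build-arg USE_INTRANET=false \
      --build-arg ARGUS_BUILD_UID=2133 \
      --build-arg ARGUS_BUILD_GID=2015 \
      -t argus-bind9:latest .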
106
src/bind/build/argus_dns_sync.sh
Normal file
@@ -0,0 +1,106 @@
#!/usr/bin/env bash
set -euo pipefail

WATCH_DIR="/private/argus/etc"
ZONE_DB="/private/argus/bind/db.argus.com"
LOCKFILE="/var/lock/argus_dns_sync.lock"
BACKUP_DIR="/private/argus/bind/.backup"
SLEEP_SECONDS=10
RELOAD_SCRIPT="/usr/local/bin/reload-bind9.sh"  # path to the existing reload script

mkdir -p "$(dirname "$LOCKFILE")" "$BACKUP_DIR"
BACKUP_UID="${ARGUS_BUILD_UID:-2133}"
BACKUP_GID="${ARGUS_BUILD_GID:-2015}"
chown -R "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR" 2>/dev/null || true

is_ipv4() {
    local ip="$1"
    [[ "$ip" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]] || return 1
    IFS='.' read -r a b c d <<<"$ip"
    for n in "$a" "$b" "$c" "$d"; do
        (( n >= 0 && n <= 255 )) || return 1
    done
    return 0
}

get_current_ip() {
    local name="$1"
    sed -n -E "s/^${name}[[:space:]]+IN[[:space:]]+A[[:space:]]+([0-9.]+)[[:space:]]*$/\1/p" "$ZONE_DB" | head -n1
}

upsert_record() {
    local name="$1"
    local new_ip="$2"
    local ts
    ts="$(date +%Y%m%d-%H%M%S)"
    local changed=0

    cp -a "$ZONE_DB" "$BACKUP_DIR/db.argus.com.$ts.bak"
    chown "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR/db.argus.com.$ts.bak" 2>/dev/null || true

    local cur_ip
    cur_ip="$(get_current_ip "$name" || true)"

    if [[ -z "$cur_ip" ]]; then
        # Ensure the file ends with a newline before adding new record
        if [[ -s "$ZONE_DB" ]] && [[ $(tail -c1 "$ZONE_DB" | wc -l) -eq 0 ]]; then
            echo "" >> "$ZONE_DB"
        fi
        printf "%-20s IN A %s\n" "$name" "$new_ip" >> "$ZONE_DB"
        echo "[ADD] ${name} -> ${new_ip}"
        changed=1
    elif [[ "$cur_ip" != "$new_ip" ]]; then
        awk -v n="$name" -v ip="$new_ip" '
        {
            if ($1==n && $2=="IN" && $3=="A") {
                printf "%-20s IN A %s\n", n, ip
            } else {
                print
            }
        }
        ' "$ZONE_DB" > "${ZONE_DB}.tmp" && mv "${ZONE_DB}.tmp" "$ZONE_DB"
        echo "[UPDATE] ${name}: ${cur_ip} -> ${new_ip}"
        changed=1
    else
        echo "[SKIP] ${name} unchanged (${new_ip})"
    fi

    if [[ $changed -eq 1 ]]; then
        return 0
    fi
    return 1
}

while true; do
    exec 9>"$LOCKFILE"
    if flock -n 9; then
        shopt -s nullglob
        NEED_RELOAD=0

        for f in "$WATCH_DIR"/*.argus.com; do
            base="$(basename "$f")"
            name="${base%.argus.com}"
            ip="$(grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' "$f" | tail -n1 || true)"

            if [[ -z "$ip" ]] || ! is_ipv4 "$ip"; then
                echo "[WARN] no valid IPv4 found in $f, skipping"
                continue
            fi

            if upsert_record "$name" "$ip"; then
                NEED_RELOAD=1
            fi
        done

        if [[ $NEED_RELOAD -eq 1 ]]; then
            echo "[INFO] db.argus.com changed, running reload-bind9.sh"
            bash "$RELOAD_SCRIPT"
        fi

        flock -u 9
    else
        echo "[INFO] another sync run is already in progress, skipping this cycle"
    fi

    sleep "$SLEEP_SECONDS"
done
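To make the watch-directory convention concrete, here is a sketch of how a service would register itself (the hostname master and IP 172.28.0.10 are made-up example values):

    # A service writes its IP into /private/argus/etc/<name>.argus.com;
    # the loop above extracts the last IPv4 in the file and upserts the record.
    echo "172.28.0.10" > /private/argus/etc/master.argus.com
    # Within one cycle (SLEEP_SECONDS=10) the zone file gains or updates:
    #   master               IN A   172.28.0.10
    # and reload-bind9.sh runs once for the changed cycle.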
16
src/bind/build/db.argus.com
Normal file
@@ -0,0 +1,16 @@
$TTL    604800
@       IN      SOA     ns1.argus.com. admin.argus.com. (
                             2         ; Serial
                        604800         ; Refresh
                         86400         ; Retry
                       2419200         ; Expire
                        604800 )       ; Negative Cache TTL

; Define the DNS server
@       IN      NS      ns1.argus.com.

; Define the ns1 host
ns1     IN      A       127.0.0.1

; Point web at 12.4.5.6
web     IN      A       12.4.5.6
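Once the test container is up, the two seed records can be checked through the published port (1053 is the default HOST_DNS_PORT used by the test scripts below):

    dig @localhost -p 1053 web.argus.com A +short   # expected: 12.4.5.6
    dig @localhost -p 1053 ns1.argus.com A +short   # expected: 127.0.0.1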
71
src/bind/build/dns-monitor.sh
Normal file
@@ -0,0 +1,71 @@
#!/bin/bash

# DNS monitor script - every 10 seconds, check whether dns.conf has changed
# and run update-dns.sh when it has.

DNS_CONF="/private/argus/etc/dns.conf"
DNS_BACKUP="/tmp/dns.conf.backup"
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
LOG_FILE="/var/log/supervisor/dns-monitor.log"

# Make sure the log file exists
touch "$LOG_FILE"

log_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
}

log_message "DNS monitor started"

log_message "Removing DNS backup file (if present)"
rm -f $DNS_BACKUP

while true; do
    if [ -f "$DNS_CONF" ]; then
        if [ -f "$DNS_BACKUP" ]; then
            # Compare file contents
            if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
                log_message "DNS configuration change detected"

                # Refresh the backup file
                cp "$DNS_CONF" "$DNS_BACKUP"

                # Run the update script
                if [ -x "$UPDATE_SCRIPT" ]; then
                    log_message "Running DNS update script: $UPDATE_SCRIPT"
                    "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
                    if [ $? -eq 0 ]; then
                        log_message "DNS update script succeeded"
                    else
                        log_message "DNS update script failed"
                    fi
                else
                    log_message "Warning: update script missing or not executable: $UPDATE_SCRIPT"
                fi
            fi
        else

            # Config file seen for the first time: run the update script
            if [ -x "$UPDATE_SCRIPT" ]; then
                log_message "Running DNS update script: $UPDATE_SCRIPT"
                "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
                if [ $? -eq 0 ]; then
                    log_message "DNS update script succeeded"

                    # First run: create the backup after a successful update
                    cp "$DNS_CONF" "$DNS_BACKUP"
                    log_message "Created DNS configuration backup file"

                else
                    log_message "DNS update script failed"
                fi
            else
                log_message "Warning: update script missing or not executable: $UPDATE_SCRIPT"
            fi
        fi
    else
        log_message "Warning: DNS configuration file not found: $DNS_CONF"
    fi

    sleep 10
done
4
src/bind/build/named.conf.local
Normal file
@@ -0,0 +1,4 @@
zone "argus.com" {
    type master;
    file "/etc/bind/db.argus.com";
};
27
src/bind/build/reload-bind9.sh
Normal file
@@ -0,0 +1,27 @@
#!/bin/bash

echo "Reloading BIND9 configuration..."

# Check if configuration files are valid
echo "Checking named.conf.local syntax..."
if ! named-checkconf /etc/bind/named.conf.local; then
    echo "ERROR: named.conf.local has syntax errors!"
    exit 1
fi

echo "Checking zone file syntax..."
if ! named-checkzone argus.com /etc/bind/db.argus.com; then
    echo "ERROR: db.argus.com has syntax errors!"
    exit 1
fi

# Reload BIND9 via supervisor
echo "Reloading BIND9 service..."
supervisorctl restart bind9

if [ $? -eq 0 ]; then
    echo "BIND9 reloaded successfully!"
else
    echo "ERROR: Failed to reload BIND9!"
    exit 1
fi
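Both checkers can also be run ad hoc inside the container to vet a hand-edited zone before reloading; the success output typically looks like this:

    named-checkconf /etc/bind/named.conf.local
    named-checkzone argus.com /etc/bind/db.argus.com
    # zone argus.com/IN: loaded serial 2
    # OK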
42
src/bind/build/startup.sh
Normal file
@@ -0,0 +1,42 @@
#!/bin/bash

# Set /private permissions to 777 as requested
chmod 777 /private 2>/dev/null || true

# Create persistent directories for BIND9 configs and DNS sync
mkdir -p /private/argus/bind
mkdir -p /private/argus/etc
chown bind:bind /private/argus 2>/dev/null || true
chown -R bind:bind /private/argus/bind /private/argus/etc

# Copy configuration files to persistent storage if they don't exist
if [ ! -f /private/argus/bind/named.conf.local ]; then
    cp /etc/bind/named.conf.local /private/argus/bind/named.conf.local
fi

if [ ! -f /private/argus/bind/db.argus.com ]; then
    cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com
fi

# Copy update-dns.sh to /private/argus/etc/
cp /usr/local/bin/update-dns.sh /private/argus/etc/update-dns.sh
chown bind:bind /private/argus/etc/update-dns.sh
chmod a+x /private/argus/etc/update-dns.sh

# Create symlinks to use persistent configs
ln -sf /private/argus/bind/named.conf.local /etc/bind/named.conf.local
ln -sf /private/argus/bind/db.argus.com /etc/bind/db.argus.com

# Set proper ownership
chown bind:bind /private/argus/bind/named.conf.local /private/argus/bind/db.argus.com

# Record the container IP address into dns.conf
IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')
echo "current IP: ${IP}"
echo ${IP} > /private/argus/etc/dns.conf

# Create supervisor log directory
mkdir -p /var/log/supervisor

# Start supervisor
exec /usr/bin/supervisord -c /etc/supervisor/conf.d/supervisord.conf
37
src/bind/build/supervisord.conf
Normal file
@@ -0,0 +1,37 @@
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

[supervisord]
nodaemon=true
user=root
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

[program:bind9]
command=/usr/sbin/named -g -c /etc/bind/named.conf -u bind
user=bind
autostart=true
autorestart=true
stderr_logfile=/var/log/supervisor/bind9.err.log
stdout_logfile=/var/log/supervisor/bind9.out.log
priority=10

[program:argus-dns-sync]
command=/usr/local/bin/argus_dns_sync.sh
autostart=true
autorestart=true
startsecs=3
stopsignal=TERM
user=root
stdout_logfile=/var/log/argus_dns_sync.out.log
stderr_logfile=/var/log/argus_dns_sync.err.log
; Adjust environment variables per environment (optional)
; environment=RNDC_RELOAD="yes"
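In a running container, supervisor should report both programs healthy; a quick check via the test compose setup (the status output shown is illustrative):

    docker compose exec bind9 supervisorctl status
    # bind9                RUNNING   pid 12, uptime 0:02:10
    # argus-dns-sync       RUNNING   pid 13, uptime 0:02:10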
31
src/bind/build/update-dns.sh
Executable file
@@ -0,0 +1,31 @@
#!/bin/sh
# update-dns.sh
# Read IPs from /private/argus/etc/dns.conf and write /etc/resolv.conf

DNS_CONF="/private/argus/etc/dns.conf"
RESOLV_CONF="/etc/resolv.conf"

# Make sure the configuration file exists
if [ ! -f "$DNS_CONF" ]; then
    echo "configuration file not found: $DNS_CONF" >&2
    exit 1
fi

# Generate the resolv.conf content
{
    while IFS= read -r ip; do
        # Skip blank lines and comments
        case "$ip" in
            \#*) continue ;;
            "") continue ;;
        esac
        echo "nameserver $ip"
    done < "$DNS_CONF"
} > "$RESOLV_CONF".tmp

# Replace /etc/resolv.conf with the new content
cat "$RESOLV_CONF".tmp > "$RESOLV_CONF"
rm -f "$RESOLV_CONF".tmp

echo "updated $RESOLV_CONF"
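Concretely, with a one-line dns.conf the script rewrites resolv.conf as follows (the IP is an illustrative value such as the one startup.sh records):

    $ cat /private/argus/etc/dns.conf
    172.28.0.2
    $ /private/argus/etc/update-dns.sh
    updated /etc/resolv.conf
    $ cat /etc/resolv.conf
    nameserver 172.28.0.2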
16
src/bind/tests/docker-compose.yml
Normal file
@@ -0,0 +1,16 @@
services:
  bind9:
    image: argus-bind9:latest
    container_name: argus-bind9-test
    ports:
      - "${HOST_DNS_PORT:-1053}:53/tcp"
      - "${HOST_DNS_PORT:-1053}:53/udp"
    volumes:
      - ./private:/private
    restart: unless-stopped
    networks:
      - bind-test-network

networks:
  bind-test-network:
    driver: bridge
118
src/bind/tests/scripts/00_e2e_test.sh
Executable file
@@ -0,0 +1,118 @@
#!/bin/bash

# End-to-end test for BIND9 DNS server
# This script runs all tests in sequence to validate the complete functionality
# Usage: ./00_e2e_test.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

export HOST_DNS_PORT

echo "=========================================="
echo "BIND9 DNS Server End-to-End Test Suite"
echo "=========================================="

# Track test results
total_tests=0
passed_tests=0
failed_tests=0

# Function to run a test step
run_test_step() {
    local step_name="$1"
    local script_name="$2"
    local description="$3"

    echo ""
    echo "[$step_name] $description"
    echo "$(printf '=%.0s' {1..50})"

    ((total_tests++))

    if [ ! -f "$SCRIPT_DIR/$script_name" ]; then
        echo "✗ Test script not found: $script_name"
        ((failed_tests++))
        return 1
    fi

    # Make sure script is executable
    chmod +x "$SCRIPT_DIR/$script_name"

    # Run the test
    echo "Executing: $SCRIPT_DIR/$script_name"
    if "$SCRIPT_DIR/$script_name"; then
        echo "✓ $step_name completed successfully"
        ((passed_tests++))
        return 0
    else
        echo "✗ $step_name failed"
        ((failed_tests++))
        return 1
    fi
}

# Cleanup any previous test environment (but preserve the Docker image)
echo ""
echo "[SETUP] Cleaning up any previous test environment..."
if [ -f "$SCRIPT_DIR/05_cleanup.sh" ]; then
    chmod +x "$SCRIPT_DIR/05_cleanup.sh"
    "$SCRIPT_DIR/05_cleanup.sh" || true
fi

echo ""
echo "Starting BIND9 DNS server end-to-end test sequence..."

# Test sequence
run_test_step "TEST-01" "01_start_container.sh" "Start BIND9 container" || true

run_test_step "TEST-02" "02_dig_test.sh" "Initial DNS resolution test" || true

run_test_step "TEST-03" "03_reload_test.sh" "Configuration reload with IP modification" || true

run_test_step "TEST-03.5" "03.5_dns_sync_test.sh" "DNS auto-sync functionality test" || true

run_test_step "TEST-04" "04_persistence_test.sh" "Configuration persistence after restart" || true

# Final cleanup (but preserve logs for review)
echo ""
echo "[CLEANUP] Cleaning up test environment..."
run_test_step "CLEANUP" "05_cleanup.sh" "Clean up containers and networks" || true

# Test summary
echo ""
echo "=========================================="
echo "TEST SUMMARY"
echo "=========================================="
echo "Total tests: $total_tests"
echo "Passed: $passed_tests"
echo "Failed: $failed_tests"

if [ $failed_tests -eq 0 ]; then
    echo ""
    echo "✅ ALL TESTS PASSED!"
    echo ""
    echo "BIND9 DNS server functionality validated:"
    echo "  ✓ Container startup and basic functionality"
    echo "  ✓ DNS resolution for configured domains"
    echo "  ✓ Configuration modification and reload"
    echo "  ✓ DNS auto-sync from IP files"
    echo "  ✓ Configuration persistence across restarts"
    echo "  ✓ Cleanup and resource management"
    echo ""
    echo "The BIND9 DNS server is ready for production use."
    exit 0
else
    echo ""
    echo "❌ SOME TESTS FAILED!"
    echo ""
    echo "Please review the test output above to identify and fix issues."
    echo "You may need to:"
    echo "  - Check Docker installation and permissions"
    echo "  - Verify network connectivity"
    echo "  - Review BIND9 configuration files"
    echo "  - Check system resources and port availability"
    exit 1
fi
42
src/bind/tests/scripts/01_start_container.sh
Executable file
@@ -0,0 +1,42 @@
#!/bin/bash

# Start BIND9 test container
# Usage: ./01_start_container.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

export HOST_DNS_PORT

cd "$TEST_DIR"

echo "Starting BIND9 test container..."

# Ensure private directory exists with proper permissions
mkdir -p private/argus/bind
mkdir -p private/argus/etc
chmod 777 private

# Start the container
docker compose up -d

echo "Waiting for container to be ready..."
sleep 5

# Check if container is running
if docker compose ps | grep -q "Up"; then
    echo "✓ Container started successfully"
    echo "Container status:"
    docker compose ps
else
    echo "✗ Failed to start container"
    docker compose logs
    exit 1
fi

echo ""
echo "BIND9 test environment is ready!"
echo "DNS server listening on localhost:${HOST_DNS_PORT}"
75
src/bind/tests/scripts/02_dig_test.sh
Executable file
@@ -0,0 +1,75 @@
#!/bin/bash

# Test DNS resolution using dig
# Usage: ./02_dig_test.sh

set -e

HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

echo "Testing DNS resolution with dig..."
echo "Using DNS server localhost:${HOST_DNS_PORT}"

# Function to test DNS query
test_dns_query() {
    local hostname="$1"
    local expected_ip="$2"
    local description="$3"

    echo ""
    echo "Testing: $description"
    echo "Query: $hostname.argus.com"
    echo "Expected IP: $expected_ip"

    # Perform dig query
    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "QUERY_FAILED" ]; then
        echo "✗ DNS query failed"
        return 1
    elif [ "$result" = "$expected_ip" ]; then
        echo "✓ DNS query successful: $result"
        return 0
    else
        echo "✗ DNS query returned unexpected result: $result"
        return 1
    fi
}

# Check if dig is available
if ! command -v dig &> /dev/null; then
    echo "Installing dig (dnsutils)..."
    apt-get update && apt-get install -y dnsutils
fi

# Check if container is running
if ! docker compose ps | grep -q "Up"; then
    echo "Error: BIND9 container is not running"
    echo "Please start the container first with: ./01_start_container.sh"
    exit 1
fi

echo "=== DNS Resolution Tests ==="

# Test cases based on current configuration
failed_tests=0

# Test ns1.argus.com -> 127.0.0.1
if ! test_dns_query "ns1" "127.0.0.1" "Name server resolution"; then
    ((failed_tests++))
fi

# Test web.argus.com -> 12.4.5.6
if ! test_dns_query "web" "12.4.5.6" "Web server resolution"; then
    ((failed_tests++))
fi

echo ""
echo "=== Test Summary ==="
if [ $failed_tests -eq 0 ]; then
    echo "✓ All DNS tests passed!"
    exit 0
else
    echo "✗ $failed_tests test(s) failed"
    exit 1
fi
259
src/bind/tests/scripts/03.5_dns_sync_test.sh
Executable file
@@ -0,0 +1,259 @@
#!/bin/bash

# Test DNS auto-sync functionality using argus_dns_sync.sh
# This test validates the automatic DNS record updates from IP files
# Usage: ./03.5_dns_sync_test.sh

set -e

HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"

echo "=== DNS Auto-Sync Functionality Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"

# Check if container is running
if ! docker compose ps | grep -q "Up"; then
    echo "Error: BIND9 container is not running"
    echo "Please start the container first with: ./01_start_container.sh"
    exit 1
fi

# Check if dig is available
if ! command -v dig &> /dev/null; then
    echo "Installing dig (dnsutils)..."
    apt-get update && apt-get install -y dnsutils
fi

# Function to test DNS query
test_dns_query() {
    local hostname="$1"
    local expected_ip="$2"
    local description="$3"

    echo "Testing: $description"
    echo "Query: $hostname.argus.com -> Expected: $expected_ip"

    # Wait a moment for DNS cache
    sleep 2

    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "$expected_ip" ]; then
        echo "✓ $result"
        return 0
    else
        echo "✗ Got: $result, Expected: $expected_ip"
        return 1
    fi
}

# Function to wait for sync to complete
wait_for_sync() {
    local timeout=15
    local elapsed=0
    echo "Waiting for DNS sync to complete (max ${timeout}s)..."

    while [ $elapsed -lt $timeout ]; do
        if docker compose exec bind9 test -f /var/lock/argus_dns_sync.lock; then
            echo "Sync process is running..."
        else
            echo "Sync completed"
            sleep 2  # Extra wait for DNS propagation
            return 0
        fi
        sleep 2
        elapsed=$((elapsed + 2))
    done

    echo "Warning: Sync may still be running after ${timeout}s"
    return 0
}

echo ""
echo "Step 1: Preparing test environment..."

# Ensure required directories exist
docker compose exec bind9 mkdir -p /private/argus/etc
docker compose exec bind9 mkdir -p /private/argus/bind/.backup

# Backup original configuration if it exists
docker compose exec bind9 test -f /private/argus/bind/db.argus.com && \
    docker compose exec bind9 cp /private/argus/bind/db.argus.com /private/argus/bind/db.argus.com.backup.test || true

# Ensure initial configuration is available (may already be symlinked)
docker compose exec bind9 test -f /private/argus/bind/db.argus.com || \
    docker compose exec bind9 cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com

echo "✓ Test environment prepared"

echo ""
echo "Step 2: Testing initial DNS configuration..."

# Get current IP for web.argus.com (may have been changed by previous tests)
current_web_ip=$(dig @localhost -p "$HOST_DNS_PORT" web.argus.com A +short 2>/dev/null || echo "UNKNOWN")
echo "Current web.argus.com IP: $current_web_ip"

# Test that DNS is working (regardless of specific IP)
if [ "$current_web_ip" = "UNKNOWN" ] || [ -z "$current_web_ip" ]; then
    echo "DNS resolution not working for web.argus.com"
    exit 1
fi

echo "✓ DNS resolution is working"

echo ""
echo "Step 3: Creating IP files for auto-sync..."

# Create test IP files in the watch directory
echo "Creating test1.argus.com with IP 10.0.0.100"
docker compose exec bind9 bash -c 'echo "10.0.0.100" > /private/argus/etc/test1.argus.com'

echo "Creating test2.argus.com with IP 10.0.0.200"
docker compose exec bind9 bash -c 'echo "test2 service running on 10.0.0.200" > /private/argus/etc/test2.argus.com'

echo "Creating api.argus.com with IP 192.168.1.50"
docker compose exec bind9 bash -c 'echo "API server: 192.168.1.50 port 8080" > /private/argus/etc/api.argus.com'

echo "✓ IP files created"

echo ""
echo "Step 4: Checking DNS sync process..."

# Check if DNS sync process is already running (via supervisord)
if docker compose exec bind9 pgrep -f argus_dns_sync.sh > /dev/null; then
    echo "✓ DNS sync process already running (via supervisord)"
else
    echo "Starting DNS sync process manually..."
    # Start the DNS sync process in background if not running
    docker compose exec -d bind9 /usr/local/bin/argus_dns_sync.sh
    echo "✓ DNS sync process started manually"
fi

# Wait for first sync cycle
wait_for_sync

echo ""
echo "Step 5: Testing auto-synced DNS records..."

failed_tests=0

# Test new DNS records created by auto-sync
if ! test_dns_query "test1" "10.0.0.100" "Auto-synced test1.argus.com"; then
    ((failed_tests++))
fi

if ! test_dns_query "test2" "10.0.0.200" "Auto-synced test2.argus.com"; then
    ((failed_tests++))
fi

if ! test_dns_query "api" "192.168.1.50" "Auto-synced api.argus.com"; then
    ((failed_tests++))
fi

# Verify original records still work (use current IP from earlier)
if ! test_dns_query "web" "$current_web_ip" "Original web.argus.com still working"; then
    ((failed_tests++))
fi

if ! test_dns_query "ns1" "127.0.0.1" "Original ns1.argus.com still working"; then
    ((failed_tests++))
fi

echo ""
echo "Step 6: Testing IP update functionality..."

# Update an existing IP file
echo "Updating test1.argus.com IP from 10.0.0.100 to 10.0.0.150"
docker compose exec bind9 bash -c 'echo "10.0.0.150" > /private/argus/etc/test1.argus.com'

# Wait for sync
wait_for_sync

# Test updated record
if ! test_dns_query "test1" "10.0.0.150" "Updated test1.argus.com IP"; then
    ((failed_tests++))
fi

echo ""
echo "Step 7: Testing invalid IP handling..."

# Create file with invalid IP
echo "Creating invalid.argus.com with invalid IP"
docker compose exec bind9 bash -c 'echo "this is not an IP address" > /private/argus/etc/invalid.argus.com'

# Wait for sync (should skip invalid IP)
wait_for_sync

# Verify invalid record was not added (should fail to resolve)
result=$(dig @localhost -p "$HOST_DNS_PORT" invalid.argus.com A +short 2>/dev/null || echo "NO_RESULT")
if [ "$result" = "NO_RESULT" ] || [ -z "$result" ]; then
    echo "✓ Invalid IP correctly ignored"
else
    echo "✗ Invalid IP was processed: $result"
    ((failed_tests++))
fi

echo ""
echo "Step 8: Verifying backup functionality..."

# Check if backups were created
backup_count=$(docker compose exec bind9 ls -1 /private/argus/bind/.backup/ | wc -l || echo "0")
if [ "$backup_count" -gt 0 ]; then
    echo "✓ Configuration backups created ($backup_count files)"
    # Show latest backup
    docker compose exec bind9 ls -la /private/argus/bind/.backup/ | tail -1
else
    echo "✗ No backup files found"
    ((failed_tests++))
fi

echo ""
echo "Step 9: Cleanup..."

# Note: We don't stop the DNS sync process since it's managed by supervisord
echo "Note: DNS sync process will continue running (managed by supervisord)"

# Clean up test files
docker compose exec bind9 rm -f /private/argus/etc/test1.argus.com
docker compose exec bind9 rm -f /private/argus/etc/test2.argus.com
docker compose exec bind9 rm -f /private/argus/etc/api.argus.com
docker compose exec bind9 rm -f /private/argus/etc/invalid.argus.com

# Restore original configuration if backup exists
docker compose exec bind9 test -f /private/argus/bind/db.argus.com.backup.test && \
    docker compose exec bind9 cp /private/argus/bind/db.argus.com.backup.test /private/argus/bind/db.argus.com && \
    docker compose exec bind9 rm /private/argus/bind/db.argus.com.backup.test || true

# Reload original configuration
docker compose exec bind9 /usr/local/bin/reload-bind9.sh

echo "✓ Cleanup completed"

echo ""
echo "=== DNS Auto-Sync Test Summary ==="
if [ $failed_tests -eq 0 ]; then
    echo "✅ All DNS auto-sync tests passed!"
    echo ""
    echo "Validated functionality:"
    echo "  ✓ Automatic DNS record creation from IP files"
    echo "  ✓ IP address extraction from various file formats"
    echo "  ✓ Dynamic DNS record updates"
    echo "  ✓ Invalid IP address handling"
    echo "  ✓ Configuration backup mechanism"
    echo "  ✓ Preservation of existing DNS records"
    echo ""
    echo "The DNS auto-sync functionality is working correctly!"
    exit 0
else
    echo "❌ $failed_tests DNS auto-sync test(s) failed!"
    echo ""
    echo "Please check:"
    echo "  - argus_dns_sync.sh script configuration"
    echo "  - File permissions in /private/argus/etc/"
    echo "  - BIND9 reload functionality"
    echo "  - Network connectivity and DNS resolution"
    exit 1
fi
115
src/bind/tests/scripts/03_reload_test.sh
Executable file
@@ -0,0 +1,115 @@
#!/bin/bash

# Test DNS configuration reload with IP modification
# Usage: ./03_reload_test.sh

set -e

HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"

echo "=== DNS Configuration Reload Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"

# Check if container is running
if ! docker compose ps | grep -q "Up"; then
    echo "Error: BIND9 container is not running"
    echo "Please start the container first with: ./01_start_container.sh"
    exit 1
fi

# Check if dig is available
if ! command -v dig &> /dev/null; then
    echo "Installing dig (dnsutils)..."
    apt-get update && apt-get install -y dnsutils
fi

# Function to test DNS query
test_dns_query() {
    local hostname="$1"
    local expected_ip="$2"
    local description="$3"

    echo "Testing: $description"
    echo "Query: $hostname.argus.com -> Expected: $expected_ip"

    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "$expected_ip" ]; then
        echo "✓ $result"
        return 0
    else
        echo "✗ Got: $result, Expected: $expected_ip"
        return 1
    fi
}

echo ""
echo "Step 1: Testing initial DNS configuration..."

# Test initial configuration
if ! test_dns_query "web" "12.4.5.6" "Initial web.argus.com resolution"; then
    echo "Initial DNS test failed"
    exit 1
fi

echo ""
echo "Step 2: Modifying DNS configuration..."

# Backup original configuration
cp "$TEST_DIR/private/argus/bind/db.argus.com" "$TEST_DIR/private/argus/bind/db.argus.com.backup" 2>/dev/null || true

# Create new configuration with modified IP
DB_FILE="$TEST_DIR/private/argus/bind/db.argus.com"

# Check if persistent config exists, if not use from container
if [ ! -f "$DB_FILE" ]; then
    echo "Persistent config not found, copying from container..."
    docker compose exec bind9 cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com
    docker compose exec bind9 chown bind:bind /private/argus/bind/db.argus.com
fi

# Modify the IP address (12.4.5.6 -> 192.168.1.100)
sed -i 's/12\.4\.5\.6/192.168.1.100/g' "$DB_FILE"

# Increment serial number for DNS cache invalidation
current_serial=$(grep -o "2[[:space:]]*;" "$DB_FILE" | grep -o "2")
new_serial=$((current_serial + 1))
sed -i "s/2[[:space:]]*;/${new_serial} ;/" "$DB_FILE"

echo "Modified configuration:"
echo "- Changed web.argus.com IP: 12.4.5.6 -> 192.168.1.100"
echo "- Updated serial number: $current_serial -> $new_serial"

echo ""
echo "Step 3: Reloading BIND9 configuration..."

# Reload BIND9 configuration
docker compose exec bind9 /usr/local/bin/reload-bind9.sh

echo "Configuration reloaded"

# Wait a moment for changes to take effect
sleep 3

echo ""
echo "Step 4: Testing modified DNS configuration..."

# Test modified configuration
if ! test_dns_query "web" "192.168.1.100" "Modified web.argus.com resolution"; then
    echo "Modified DNS test failed"
    exit 1
fi

# Also verify ns1 still works
if ! test_dns_query "ns1" "127.0.0.1" "ns1.argus.com still working"; then
    echo "ns1 DNS test failed after reload"
    exit 1
fi

echo ""
echo "✓ DNS configuration reload test completed successfully!"
echo "✓ IP address changed from 12.4.5.6 to 192.168.1.100"
echo "✓ Configuration persisted and reloaded correctly"
118
src/bind/tests/scripts/04_persistence_test.sh
Executable file
@@ -0,0 +1,118 @@
#!/bin/bash

# Test configuration persistence after container restart
# Usage: ./04_persistence_test.sh

set -e

HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"

echo "=== Configuration Persistence Test ==="
echo "Using DNS server localhost:${HOST_DNS_PORT}"

# Check if dig is available
if ! command -v dig &> /dev/null; then
    echo "Installing dig (dnsutils)..."
    apt-get update && apt-get install -y dnsutils
fi

# Function to test DNS query
test_dns_query() {
    local hostname="$1"
    local expected_ip="$2"
    local description="$3"

    echo "Testing: $description"
    echo "Query: $hostname.argus.com -> Expected: $expected_ip"

    result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null || echo "QUERY_FAILED")

    if [ "$result" = "$expected_ip" ]; then
        echo "✓ $result"
        return 0
    else
        echo "✗ Got: $result, Expected: $expected_ip"
        return 1
    fi
}

echo ""
echo "Step 1: Stopping current container..."

# Stop the container
docker compose down

echo "Container stopped"

echo ""
echo "Step 2: Verifying persistent configuration exists..."

# Check if modified configuration exists
DB_FILE="$TEST_DIR/private/argus/bind/db.argus.com"

if [ ! -f "$DB_FILE" ]; then
    echo "✗ Persistent configuration file not found: $DB_FILE"
    exit 1
fi

# Check if the modified IP is in the configuration
if grep -q "192.168.1.100" "$DB_FILE"; then
    echo "✓ Modified IP (192.168.1.100) found in persistent configuration"
else
    echo "✗ Modified IP not found in persistent configuration"
    echo "Configuration content:"
    cat "$DB_FILE"
    exit 1
fi

echo ""
echo "Step 3: Restarting container with persistent configuration..."

# Start the container again
docker compose up -d

echo "Waiting for container to be ready..."
sleep 5

# Check if container is running
if ! docker compose ps | grep -q "Up"; then
    echo "✗ Failed to restart container"
    docker compose logs
    exit 1
fi

echo "✓ Container restarted successfully"

echo ""
echo "Step 4: Testing DNS resolution after restart..."

# Wait a bit more for DNS to be fully ready
sleep 5

# Test that the modified configuration is still active
if ! test_dns_query "web" "192.168.1.100" "Persistent web.argus.com resolution"; then
    echo "✗ Persistent configuration test failed"
    exit 1
fi

# Also verify ns1 still works
if ! test_dns_query "ns1" "127.0.0.1" "ns1.argus.com still working"; then
    echo "✗ ns1 DNS test failed after restart"
    exit 1
fi

echo ""
echo "Step 5: Verifying configuration files are linked correctly..."

# Check that the persistent files are properly linked
echo "Checking file links in container:"
docker compose exec bind9 ls -la /etc/bind/named.conf.local /etc/bind/db.argus.com

echo ""
echo "✓ Configuration persistence test completed successfully!"
echo "✓ Modified IP (192.168.1.100) persisted after container restart"
echo "✓ Configuration files properly linked to persistent storage"
90
src/bind/tests/scripts/05_cleanup.sh
Executable file
@@ -0,0 +1,90 @@
#!/bin/bash

# Clean up test environment and containers
# Usage: ./05_cleanup.sh [--full]

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"

export HOST_DNS_PORT

# Parse command line arguments
FULL_CLEANUP=true
while [[ $# -gt 0 ]]; do
    case $1 in
        --full)
            FULL_CLEANUP=true
            shift
            ;;
        *)
            echo "Unknown option: $1"
            echo "Usage: $0 [--full]"
            echo "  --full: Also remove persistent data"
            exit 1
            ;;
    esac
done

cd "$TEST_DIR"

echo "=== Cleaning up BIND9 test environment ==="

echo ""
echo "Step 1: Stopping and removing containers..."

# Stop and remove containers
docker compose down -v

echo "✓ Containers stopped and removed"

echo ""
echo "Step 2: Removing Docker networks..."

# Clean up networks
docker network prune -f > /dev/null 2>&1 || true

echo "✓ Docker networks cleaned"

if [ "$FULL_CLEANUP" = true ]; then
    echo ""
    echo "Step 3: Removing persistent data..."

    # Remove persistent data directory
    if [ -d "private" ]; then
        rm -rf private
        echo "✓ Persistent data directory removed"
    else
        echo "✓ No persistent data directory found"
    fi

else
    echo ""
    echo "Step 3: Preserving persistent data and Docker image..."
    echo "✓ Persistent data preserved in: private/"
    echo "✓ Docker image 'argus-bind9:latest' preserved"
    echo ""
    echo "To perform full cleanup including persistent data and image, run:"
    echo "  $0 --full"
fi

echo ""
echo "=== Cleanup Summary ==="
echo "✓ Containers stopped and removed"
echo "✓ Docker networks cleaned"

if [ "$FULL_CLEANUP" = true ]; then
    echo "✓ Persistent data removed"
    echo ""
    echo "Full cleanup completed! Test environment completely removed."
else
    echo "✓ Persistent data preserved"
    echo "✓ Docker image preserved"
    echo ""
    echo "Basic cleanup completed! Run './01_start_container.sh' to restart testing."
fi

echo ""
echo "Test environment cleanup finished."
5
src/log/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@

private/


images/
8
src/log/README.md
Normal file
@@ -0,0 +1,8 @@

Development notes for testing the log module

elasticsearch: deployment image build and startup scripts (fixes the account issue, mounts directories, supervised via supervisor)
kibana: image build
fluent-bit: install package and scripts prepared; handed off to Dapeng for the unified client-side installation flow
init: EK initialization scripts, e.g. data-view creation

75
src/log/elasticsearch/build/Dockerfile
Normal file
@@ -0,0 +1,75 @@
FROM docker.elastic.co/elasticsearch/elasticsearch:8.13.4

# Switch to root for system-level installation
USER root

ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015

ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

# Align the elasticsearch user and group IDs with the host configuration
RUN set -eux; \
    current_gid="$(getent group elasticsearch | awk -F: '{print $3}')"; \
    if [ -z "$current_gid" ]; then \
        groupadd -g "${ARGUS_BUILD_GID}" elasticsearch; \
    elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
        groupmod -g "${ARGUS_BUILD_GID}" elasticsearch; \
    fi; \
    if id elasticsearch >/dev/null 2>&1; then \
        current_uid="$(id -u elasticsearch)"; \
        if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
            usermod -u "${ARGUS_BUILD_UID}" elasticsearch; \
        fi; \
    else \
        useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" elasticsearch; \
    fi; \
    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/elasticsearch

# Build arguments
ARG USE_INTRANET=false

# Configure intranet apt sources (when the intranet option is set)
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Install supervisor, net-tools, vim
RUN apt-get update && \
    apt-get install -y supervisor net-tools inetutils-ping vim && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Configure the apt source used at deployment time
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi

# Create the supervisor log directory
RUN mkdir -p /var/log/supervisor

# Copy the supervisor configuration file
COPY src/log/elasticsearch/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Copy the startup script
COPY src/log/elasticsearch/build/start-es-supervised.sh /usr/local/bin/start-es-supervised.sh
RUN chmod +x /usr/local/bin/start-es-supervised.sh

# Copy the DNS monitor script
COPY src/log/elasticsearch/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
RUN chmod +x /usr/local/bin/dns-monitor.sh

# Stay root; supervisor handles the user switch
USER root

# Expose ports
EXPOSE 9200 9300

# Use supervisor as the entrypoint
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
1
src/log/elasticsearch/build/dns-monitor.sh
Symbolic link
@@ -0,0 +1 @@
../../../bind/build/dns-monitor.sh
32
src/log/elasticsearch/build/start-es-supervised.sh
Normal file
@@ -0,0 +1,32 @@
#!/bin/bash
set -euo pipefail

echo "[INFO] Starting Elasticsearch under supervisor..."

# Create the data directory (and its permissions) if it does not exist
mkdir -p /private/argus/log/elasticsearch

# Link it to the data directory Elasticsearch expects
if [ -L /usr/share/elasticsearch/data ]; then
    rm /usr/share/elasticsearch/data
elif [ -d /usr/share/elasticsearch/data ]; then
    rm -rf /usr/share/elasticsearch/data
fi

ln -sf /private/argus/log/elasticsearch /usr/share/elasticsearch/data

# Record the container IP address
DOMAIN=es.log.argus.com
IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')
echo "current IP: ${IP}"
echo ${IP} > /private/argus/etc/${DOMAIN}

echo "[INFO] Data directory linked: /usr/share/elasticsearch/data -> /private/argus/log/elasticsearch"

# Set environment variables (ES settings are passed via docker-compose)
export ES_JAVA_OPTS="${ES_JAVA_OPTS:-"-Xms512m -Xmx512m"}"

echo "[INFO] Starting Elasticsearch process..."

# Hand off to the original Elasticsearch entrypoint
exec /usr/local/bin/docker-entrypoint.sh elasticsearch
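The file written above follows exactly the convention that the bind module's argus_dns_sync.sh watches, so the ES container registers itself in DNS simply by starting up. Assuming a shared /private volume between the containers, the hand-off can be observed like this (IP illustrative):

    cat /private/argus/etc/es.log.argus.com
    # 172.28.0.5
    # After the next sync cycle, es.log.argus.com resolves via the bind container.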
39
src/log/elasticsearch/build/supervisord.conf
Normal file
@@ -0,0 +1,39 @@
[supervisord]
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root

[program:elasticsearch]
command=/usr/local/bin/start-es-supervised.sh
user=elasticsearch
stdout_logfile=/var/log/supervisor/elasticsearch.log
stderr_logfile=/var/log/supervisor/elasticsearch_error.log
autorestart=true
startretries=3
startsecs=30
stopwaitsecs=30
killasgroup=true
stopasgroup=true

[program:dns-monitor]
command=/usr/local/bin/dns-monitor.sh
user=root
stdout_logfile=/var/log/supervisor/dns-monitor.log
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
autorestart=true
startretries=3
startsecs=5
stopwaitsecs=10
killasgroup=true
stopasgroup=true

[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
37
src/log/fluent-bit/build/etc/fluent-bit.conf
Normal file
@@ -0,0 +1,37 @@
[SERVICE]
    Daemon Off
    Parsers_File parsers.conf
    HTTP_Server On
    HTTP_Listen 0.0.0.0
    HTTP_Port 2020
    storage.path /buffers
    storage.sync normal
    storage.checksum on
    storage.backlog.mem_limit 128M
    # Note: this image does not enable hot reload by default; restart the container after changing the configuration.

@INCLUDE inputs.d/*.conf

[FILTER]
    Name parser
    Match app.*
    Key_Name log
    Parser timestamp_parser
    Reserve_Data On
    Preserve_Key On
    Unescape_Key On

[FILTER]
    Name record_modifier
    Match *
    Record cluster ${CLUSTER}
    Record rack ${RACK}
    Record host ${HOSTNAME}

[FILTER]
    Name lua
    Match app.*
    script inject_labels.lua
    call add_labels

@INCLUDE outputs.d/*.conf
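With HTTP_Server enabled, Fluent Bit's built-in monitoring API can be queried on port 2020 to confirm inputs and outputs are flowing (these endpoints are part of the standard Fluent Bit HTTP API):

    curl -s http://localhost:2020/api/v1/metrics
    curl -s http://localhost:2020/api/v1/uptime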
15
src/log/fluent-bit/build/etc/inject_labels.lua
Normal file
@@ -0,0 +1,15 @@
function add_labels(tag, ts, record)
    record["job_id"] = os.getenv("FB_JOB_ID") or record["job_id"] or "unknown"
    record["user"] = os.getenv("FB_USER") or record["user"] or "unknown"
    record["model"] = os.getenv("FB_MODEL") or record["model"] or "unknown"
    record["gpu_id"] = os.getenv("FB_GPU_ID") or record["gpu_id"] or "na"
    local p = record["log_path"] or ""
    if string.find(p, "/logs/infer/") then
        record["role"] = "infer"
    elseif string.find(p, "/logs/train/") then
        record["role"] = "train"
    else
        record["role"] = record["role"] or "app"
    end
    return 1, ts, record
end
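The net effect on one record, with illustrative values (the FB_* variables are read from the Fluent Bit process environment):

    # Before: {"log": "...", "log_path": "/logs/train/run1.log"}
    # After : {"log": "...", "log_path": "/logs/train/run1.log",
    #          "job_id": "unknown", "user": "unknown", "model": "unknown",
    #          "gpu_id": "na", "role": "train"}
    # Exporting the variables before launch overrides the defaults:
    export FB_JOB_ID=job-42 FB_USER=alice FB_MODEL=llama FB_GPU_ID=0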
10
src/log/fluent-bit/build/etc/inputs.d/10-train.conf
Normal file
@@ -0,0 +1,10 @@
[INPUT]
    Name tail
    Path /logs/train/*.log
    Tag app.train
    Path_Key log_path
    Refresh_Interval 5
    DB /buffers/train.db
    Skip_Long_Lines On
    storage.type filesystem
    multiline.parser python,go,java
10
src/log/fluent-bit/build/etc/inputs.d/20-infer.conf
Normal file
@@ -0,0 +1,10 @@
[INPUT]
    Name tail
    Path /logs/infer/*.log
    Tag app.infer
    Path_Key log_path
    Refresh_Interval 5
    DB /buffers/infer.db
    Skip_Long_Lines On
    storage.type filesystem
    multiline.parser python,go,java
24
src/log/fluent-bit/build/etc/outputs.d/10-es.conf
Normal file
@@ -0,0 +1,24 @@
# Important: use Logstash_Format + Logstash_Prefix to generate train-*/infer-* indices
[OUTPUT]
    Name es
    Match app.train
    Host ${ES_HOST}
    Port ${ES_PORT}
    Logstash_Format On
    Logstash_Prefix train
    Replace_Dots On
    Generate_ID On
    Retry_Limit False
    Suppress_Type_Name On

[OUTPUT]
    Name es
    Match app.infer
    Host ${ES_HOST}
    Port ${ES_PORT}
    Logstash_Format On
    Logstash_Prefix infer
    Replace_Dots On
    Generate_ID On
    Retry_Limit False
    Suppress_Type_Name On
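With Logstash_Format On and these prefixes, documents land in daily indices named train-YYYY.MM.DD and infer-YYYY.MM.DD. A quick way to confirm against the target cluster (host and port placeholders match the ${ES_HOST}/${ES_PORT} variables above):

    curl -s "http://${ES_HOST}:${ES_PORT}/_cat/indices/train-*,infer-*?v"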
29
src/log/fluent-bit/build/etc/parsers.conf
Normal file
@@ -0,0 +1,29 @@
[MULTILINE_PARSER]
    Name python
    Type regex
    Flush 2
    Rule "start_state" "/^\d{4}-\d{2}-\d{2}[\sT]/" "cont"
    Rule "cont" "/^\s+|^Traceback|^\tat\s+/" "cont"

[MULTILINE_PARSER]
    Name go
    Type regex
    Flush 2
    Rule "start_state" "/^[0-9]{4}\/[0-9]{2}\/[0-9]{2}/" "cont"
    Rule "cont" "/^\s+|^\t/" "cont"

[MULTILINE_PARSER]
    Name java
    Type regex
    Flush 2
    Rule "start_state" "/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/" "cont"
    Rule "cont" "/^\s+at\s+|^\t.../" "cont"

[PARSER]
    Name timestamp_parser
    Format regex
    Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
    Time_Key timestamp
    Time_Format %Y-%m-%d %H:%M:%S
    Time_Offset +0800
    Time_Keep On
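For reference, a line in the format timestamp_parser expects (values illustrative; the train tail input above would pick it up):

    echo "2024-05-01 12:00:00 INFO training step 100" >> /logs/train/demo.log
    # parsed fields: timestamp=2024-05-01 12:00:00, level=INFO, message="training step 100"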
BIN
src/log/fluent-bit/build/packages/fluent-bit_3.1.9_amd64.deb
Normal file
Binary file not shown.
Some files were not shown because too many files have changed in this diff.