Compare commits
No commits in common. "dev_1.0.0_yuyr_6" and "main" have entirely different histories.
dev_1.0.0_
...
main
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -1 +0,0 @@
|
|||||||
src/metric/client-plugins/all-in-one-full/plugins/*/bin/* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +0,0 @@
|
|||||||
.idea/
|
|
||||||
@ -5,10 +5,3 @@
|
|||||||
项目文档:【腾讯文档】GPU集群运维系统
|
项目文档:【腾讯文档】GPU集群运维系统
|
||||||
https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk
|
https://docs.qq.com/doc/DQUxDdmhIZ1dpeERk
|
||||||
|
|
||||||
## 构建账号配置
|
|
||||||
|
|
||||||
镜像构建和运行账号的 UID/GID 可通过 `configs/build_user.conf` 配置,详细说明见 `doc/build-user-config.md`。
|
|
||||||
|
|
||||||
## 本地端口占用提示
|
|
||||||
|
|
||||||
如需运行 BIND 模块端到端测试且宿主机 53 端口已占用,可通过环境变量 `HOST_DNS_PORT`(默认 1053)指定对外映射端口,例如 `HOST_DNS_PORT=12053 ./scripts/00_e2e_test.sh`。
|
|
||||||
|
|||||||
@ -1,409 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
#######################################
# Print usage information for the image build tool.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout
#######################################
show_help() {
  # The heredoc delimiter is deliberately unquoted so that $0 expands to the
  # path the script was invoked with instead of being printed literally.
  cat <<EOF
ARGUS Unified Build System - Image Build Tool

Usage: $0 [OPTIONS]

Options:
  --intranet          Use intranet mirror for log/bind builds
  --master            Build master image (enabled by default)
  --master-offline    Build master offline image (requires src/master/offline_wheels.tar.gz)
  --metric            Build metric module images (ftp, prometheus, grafana, test nodes)
  --no-cache          Build all images without using Docker layer cache
  --only LIST         Comma-separated targets to build: core,master,metric,web,alert,sys,all
  -h, --help          Show this help message

Examples:
  $0                          # Build with default sources
  $0 --intranet               # Build with intranet mirror
  $0 --master-offline         # Additionally build argus-master:offline
  $0 --metric                 # Additionally build metric module images
  $0 --intranet --master-offline --metric
EOF
}
|
|
||||||
|
|
||||||
# Defaults: every build target is enabled; the intranet mirror, the offline
# master image and --no-cache are opt-in.
use_intranet=false
build_core=true
build_master=true
build_master_offline=false
build_metric=true
build_web=true
build_alert=true
build_sys=true
no_cache=false

# Consume the command line one option at a time.
while (( $# > 0 )); do
  arg=$1
  shift
  case "$arg" in
    --intranet)
      use_intranet=true
      ;;
    --master)
      build_master=true
      ;;
    --master-offline)
      build_master=true
      build_master_offline=true
      ;;
    --metric)
      build_metric=true
      ;;
    --no-cache)
      no_cache=true
      ;;
    --only)
      sel=${1:-}
      if [[ -z "$sel" ]]; then
        echo "--only requires a target list" >&2
        exit 1
      fi
      shift
      # reset all, then enable selected
      build_core=false
      build_master=false
      build_metric=false
      build_web=false
      build_alert=false
      build_sys=false
      IFS=',' read -ra parts <<< "$sel"
      for p in "${parts[@]}"; do
        case "$p" in
          core | master | metric | web | alert | sys)
            # Target names map one-to-one onto the build_<target> switches.
            printf -v "build_$p" '%s' true
            ;;
          all)
            build_core=true
            build_master=true
            build_metric=true
            build_web=true
            build_alert=true
            build_sys=true
            ;;
          *)
            echo "Unknown --only target: $p" >&2
            exit 1
            ;;
        esac
      done
      ;;
    -h | --help)
      show_help
      exit 0
      ;;
    *)
      echo "Unknown option: $arg" >&2
      show_help
      exit 1
      ;;
  esac
done
|
|
||||||
|
|
||||||
# Repository root is the parent of this script's directory; the sourced helper
# is expected to provide load_build_user (defined outside this file).
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
source "$root/scripts/common/build_user.sh"

# Options shared by every `docker build` invocation.
declare -a build_args=()
case "$use_intranet" in
  true) build_args+=("--build-arg" "USE_INTRANET=true") ;;
esac

cd "$root"

# NOTE(review): load_build_user presumably exports ARGUS_BUILD_UID/GID - confirm.
load_build_user
build_args+=(
  "--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}"
  "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}"
)

case "$no_cache" in
  true) build_args+=("--no-cache") ;;
esac

master_root="$root/src/master"
master_offline_tar="$master_root/offline_wheels.tar.gz"
master_offline_dir="$master_root/offline_wheels"

# Offline master build: unpack the wheel archive and make sure it contains at
# least one *.whl directly inside offline_wheels/.
if [[ "$build_master_offline" == true ]]; then
  [[ -f "$master_offline_tar" ]] || {
    echo "❌ offline wheels tar not found: $master_offline_tar" >&2
    echo " 请提前准备好 offline_wheels.tar.gz 后再执行 --master-offline" >&2
    exit 1
  }
  echo "📦 Preparing offline wheels for master (extracting $master_offline_tar)"
  rm -rf "$master_offline_dir"
  mkdir -p "$master_offline_dir"
  tar -xzf "$master_offline_tar" -C "$master_root"
  has_wheel=$(find "$master_offline_dir" -maxdepth 1 -type f -name '*.whl' -print -quit)
  [[ -n "$has_wheel" ]] || {
    echo "❌ offline_wheels extraction failed或无 wheel: $master_offline_dir" >&2
    exit 1
  }
fi

# Banner describing the effective build configuration.
printf '%s\n' \
  "=======================================" \
  "ARGUS Unified Build System" \
  "======================================="

case "$use_intranet" in
  true) echo "🌐 Mode: Intranet (Using internal mirror: 10.68.64.1)" ;;
  *) echo "🌐 Mode: Public (Using default package sources)" ;;
esac

printf '%s\n' "👤 Build user UID:GID -> ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"

printf '%s\n' "📁 Build context: $root"
printf '\n'
|
|
||||||
|
|
||||||
#######################################
# Build one Docker image and report the outcome.
# Globals:   build_args (read) - options shared by every docker build
# Arguments: $1 - human-readable image name used in log lines
#            $2 - Dockerfile path
#            $3 - image tag
#            $4 - build context (optional, defaults to ".")
#            $5... - extra options passed through to `docker build`
# Outputs:   progress on stdout, failure message on stderr
# Returns:   0 if `docker build` succeeded, 1 otherwise
#######################################
build_image() {
  local image_name=$1
  local dockerfile_path=$2
  local tag=$3
  local context="."
  shift 3

  if (( $# > 0 )); then
    context=$1
    shift
  fi

  # Assemble the command as an array. The ${arr[@]+...} form keeps an empty
  # build_args from tripping `set -u` on bash releases older than 4.4; "$@"
  # (the remaining extra options) is always safe to expand.
  local -a cmd=(docker build)
  cmd+=(${build_args[@]+"${build_args[@]}"})
  cmd+=("$@")
  cmd+=(-f "$dockerfile_path" -t "$tag" "$context")

  echo "🔄 Building $image_name image..."
  echo " Dockerfile: $dockerfile_path"
  echo " Tag: $tag"
  echo " Context: $context"

  if "${cmd[@]}"; then
    echo "✅ $image_name image built successfully"
    return 0
  fi

  echo "❌ Failed to build $image_name image" >&2
  return 1
}
|
|
||||||
|
|
||||||
#######################################
# Make sure a base image is available locally, pulling it with retries.
# Arguments: $1 - image reference
#            $2 - maximum number of pull attempts (default 3)
#            $3 - seconds to sleep between attempts (default 5)
# Outputs:   progress on stdout, final failure message on stderr
# Returns:   0 if the image is present or was pulled, 1 after all attempts fail
#######################################
pull_base_image() {
  local image_ref=$1
  local attempts=${2:-3}
  local delay=${3:-5}
  # Keep the loop counter local so callers' variables are not clobbered.
  local i

  # If the image already exists locally, skip pulling.
  if docker image inspect "$image_ref" >/dev/null 2>&1; then
    echo " Local image present; skip pull: $image_ref"
    return 0
  fi

  for ((i = 1; i <= attempts; i++)); do
    echo " Pulling base image ($i/$attempts): $image_ref"
    if docker pull "$image_ref" >/dev/null; then
      echo " Base image ready: $image_ref"
      return 0
    fi
    echo " Pull failed: $image_ref"
    # No point in sleeping after the last attempt.
    if (( i < attempts )); then
      echo " Retrying in ${delay}s..."
      sleep "$delay"
    fi
  done

  echo "❌ Unable to pull base image after ${attempts} attempts: $image_ref" >&2
  return 1
}
|
|
||||||
|
|
||||||
# Tags of images that built successfully, and a flag recording any failure.
# Failures never abort the run; they are reported in the final summary.
images_built=()
build_failed=false

# Pre-pull every base image passed as an argument; a failed pull only marks
# the run as failed.
_prefetch_base_images() {
  local base_image
  for base_image in "$@"; do
    if ! pull_base_image "$base_image"; then
      build_failed=true
    fi
  done
}

# Build every "label|dockerfile|tag|context" spec passed as an argument,
# recording the tag on success and flagging the run on failure.
_build_from_specs() {
  local build_spec image_label dockerfile_path image_tag build_context
  for build_spec in "$@"; do
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      images_built+=("$image_tag")
    else
      build_failed=true
    fi
    echo ""
  done
}

# Core images (log stack and DNS), built with the default "." context.
if [[ "$build_core" == true ]]; then
  if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
    images_built+=("argus-elasticsearch:latest")
  else
    build_failed=true
  fi

  echo ""

  if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
    images_built+=("argus-kibana:latest")
  else
    build_failed=true
  fi

  echo ""

  if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
    images_built+=("argus-bind9:latest")
  else
    build_failed=true
  fi
fi

echo ""

# Master image: delegated to the master module's own build script.
if [[ "$build_master" == true ]]; then
  echo ""
  echo "🔄 Building Master image..."
  pushd "$master_root" >/dev/null
  master_args=("--tag" "argus-master:latest")
  if [[ "$use_intranet" == true ]]; then
    master_args+=("--intranet")
  fi
  if [[ "$build_master_offline" == true ]]; then
    master_args+=("--offline")
  fi
  if [[ "$no_cache" == true ]]; then
    master_args+=("--no-cache")
  fi
  if ./scripts/build_images.sh "${master_args[@]}"; then
    # NOTE(review): the recorded tag is :offline for offline builds although
    # --tag passes argus-master:latest; confirm against the master script.
    if [[ "$build_master_offline" == true ]]; then
      images_built+=("argus-master:offline")
    else
      images_built+=("argus-master:latest")
    fi
  else
    build_failed=true
  fi
  popd >/dev/null
fi

if [[ "$build_metric" == true ]]; then
  echo ""
  echo "Building Metric module images..."

  metric_base_images=(
    "ubuntu:22.04"
    "ubuntu/prometheus:3-24.04_stable"
    "grafana/grafana:11.1.0"
  )
  _prefetch_base_images "${metric_base_images[@]}"

  metric_builds=(
    "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:latest|src/metric/ftp/build"
    "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:latest|src/metric/prometheus/build"
    "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:latest|src/metric/grafana/build"
  )
  _build_from_specs "${metric_builds[@]}"
fi

# =======================================
# Sys (system tests) node images
# =======================================

if [[ "$build_sys" == true ]]; then
  echo ""
  echo "Building Sys node images..."

  sys_base_images=(
    "ubuntu:22.04"
    "nvidia/cuda:12.2.2-runtime-ubuntu22.04"
  )
  _prefetch_base_images "${sys_base_images[@]}"

  sys_builds=(
    "Sys Node|src/sys/build/node/Dockerfile|argus-sys-node:latest|."
    "Sys Metric Test Node|src/sys/build/test-node/Dockerfile|argus-sys-metric-test-node:latest|."
    "Sys Metric Test GPU Node|src/sys/build/test-gpu-node/Dockerfile|argus-sys-metric-test-gpu-node:latest|."
  )
  _build_from_specs "${sys_builds[@]}"
fi

# =======================================
# Web & Alert module images
# =======================================

if [[ "$build_web" == true || "$build_alert" == true ]]; then
  echo ""
  echo "Building Web and Alert module images..."

  # Pre-pull commonly used base images for stability
  web_alert_base_images=(
    "node:20"
    "ubuntu:24.04"
  )
  _prefetch_base_images "${web_alert_base_images[@]}"

  if [[ "$build_web" == true ]]; then
    web_builds=(
      "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|."
      "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|."
    )
    _build_from_specs "${web_builds[@]}"
  fi

  if [[ "$build_alert" == true ]]; then
    alert_builds=(
      "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|."
    )
    _build_from_specs "${alert_builds[@]}"
  fi
fi
|
|
||||||
# Final report: list what was built, fail the run if anything broke, then
# print follow-up hints.
printf '%s\n' \
  "=======================================" \
  "📦 Build Summary" \
  "======================================="

if (( ${#images_built[@]} > 0 )); then
  printf '%s\n' "✅ Successfully built images:"
  for image in "${images_built[@]}"; do
    printf '%s\n' " • $image"
  done
fi

# Any failed build or base-image pull turns the whole run into a failure.
case "$build_failed" in
  true)
    printf '\n'
    printf '%s\n' "❌ Some images failed to build. Please check the errors above."
    exit 1
    ;;
esac

case "$use_intranet" in
  true)
    printf '\n'
    printf '%s\n' "🌐 Built with intranet mirror configuration"
    ;;
esac

case "$build_master_offline" in
  true)
    printf '\n'
    printf '%s\n' "🧳 Master offline wheels 已解压到 $master_offline_dir"
    ;;
esac

printf '\n'
printf '%s\n' \
  "🚀 Next steps:" \
  " ./build/save_images.sh --compress # 导出镜像" \
  " cd src/master/tests && MASTER_IMAGE_TAG=argus-master:offline ./scripts/00_e2e_test.sh"
printf '\n'
|
|
||||||
@ -1,229 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Print usage information for the image export tool on stdout.
# $0 is expanded, so the text shows the path the script was invoked with.
show_help() {
  printf '%s\n' \
    "ARGUS Unified Build System - Image Export Tool" \
    "" \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Options:" \
    "--compress Compress exported images with gzip" \
    "-h, --help Show this help message" \
    "" \
    "Examples:" \
    "$0 # Export all images without compression" \
    "$0 --compress # Export all images with gzip compression" \
    ""
}
|
|
||||||
|
|
||||||
# Parse command-line arguments.
# --compress switches on gzip compression of the exported tarballs.
use_compression=false

while [[ $# -gt 0 ]]; do
  case "$1" in
    --compress)
      use_compression=true
      shift
      ;;
    -h | --help)
      show_help
      exit 0
      ;;
    *)
      # Diagnostics belong on stderr so they are not lost when stdout is
      # redirected or piped.
      echo "Unknown option: $1" >&2
      show_help
      exit 1
      ;;
  esac
done
|
|
||||||
|
|
||||||
# Locate the project root (parent of this script's directory) and work from it.
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$root"

# Create the output directory for exported images.
images_dir="$root/images"
mkdir -p "$images_dir"

printf '%s\n' \
  "=======================================" \
  "ARGUS Unified Build System - Image Export" \
  "=======================================" \
  ""

case "$use_compression" in
  true) echo "🗜️ Mode: With gzip compression" ;;
  *) echo "📦 Mode: No compression" ;;
esac

printf '%s\n' "📁 Output directory: $images_dir" ""
|
|
||||||
|
|
||||||
# Image list: maps each image reference to its export file name, which is the
# reference with ':' replaced by '-' plus a ".tar" suffix.
declare -A images=()
for image_ref in \
  "argus-elasticsearch:latest" \
  "argus-kibana:latest" \
  "argus-bind9:latest" \
  "argus-master:offline" \
  "argus-metric-ftp:latest" \
  "argus-metric-prometheus:latest" \
  "argus-metric-grafana:latest" \
  "argus-web-frontend:latest" \
  "argus-web-proxy:latest" \
  "argus-alertmanager:latest"; do
  images["$image_ref"]="${image_ref/:/-}.tar"
done
unset image_ref
|
|
||||||
|
|
||||||
#######################################
# Check whether an image with exactly this repository:tag exists locally.
# Arguments: $1 - image reference (repository:tag)
# Outputs:   a found / not-found line on stdout
# Returns:   0 if the image is listed by `docker images`, 1 otherwise
#######################################
check_image() {
  local image_name="$1"
  local listing

  # Capture the listing before matching. Piping `docker images` straight into
  # `grep -q` lets grep exit on the first hit, which can kill docker with
  # SIGPIPE and, under `set -o pipefail`, report an existing image as missing.
  listing=$(docker images --format "{{.Repository}}:{{.Tag}}") || listing=""

  # -x -F: whole-line, literal comparison, so '.' and other regex
  # metacharacters in the image name cannot match unrelated images.
  if grep -qxF -- "$image_name" <<< "$listing"; then
    echo "✅ Image found: $image_name"
    return 0
  else
    echo "❌ Image not found: $image_name"
    return 1
  fi
}
|
|
||||||
|
|
||||||
# Print size, age and ID of the given image as reported by `docker images`.
# Arguments: $1 - image reference
show_image_info() {
  local image_name="$1"
  local -r info_format=" Size: {{.Size}}, Created: {{.CreatedSince}}, ID: {{.ID}}"

  printf '%s\n' "📋 Image info for ${image_name}:"
  docker images "$image_name" --format "$info_format"
}
|
|
||||||
|
|
||||||
#######################################
# Export one image with `docker save`, optionally gzip-compressing the result.
# Globals:   images_dir (read)      - directory receiving the export
#            use_compression (read) - "true" to gzip the tarball
# Arguments: $1 - image reference
#            $2 - output file name (without .gz)
# Outputs:   progress on stdout, failure message on stderr
# Returns:   0 on success, non-zero if saving, compressing or sizing fails
#######################################
save_image() {
  local image_name="$1"
  local output_file="$2"
  local output_path="$images_dir/$output_file"
  local file_size

  echo "🔄 Saving $image_name to $output_file..."

  # Remove a stale export (if any) so the new file never mixes with old data.
  if [[ -f "$output_path" ]]; then
    echo " Removing existing file: $output_file"
    rm -f -- "$output_path"
  fi

  if [[ "$use_compression" == true && -f "$output_path.gz" ]]; then
    echo " Removing existing compressed file: $output_file.gz"
    rm -f -- "$output_path.gz"
  fi

  # Save the image; do not leave a truncated tarball behind on failure.
  if ! docker save "$image_name" -o "$output_path"; then
    rm -f -- "$output_path"
    echo "❌ Failed to save image: $image_name" >&2
    return 1
  fi

  if [[ "$use_compression" == true ]]; then
    echo " Compressing with gzip..."
    gzip "$output_path" || return
    output_path="$output_path.gz"
    output_file="$output_file.gz"
  fi

  # Report the file size. Assignment is kept separate from `local` so a
  # failing pipeline is not masked by `local`'s own exit status.
  file_size=$(du -h "$output_path" | cut -f1) || return
  echo "✅ Saved successfully: $output_file ($file_size)"
}
|
|
||||||
|
|
||||||
printf '%s\n' "🔍 Checking for ARGUS images..." ""

# Sort every known image into "available" or "missing".
available_images=()
missing_images=()

for image_name in "${!images[@]}"; do
  if check_image "$image_name"; then
    show_image_info "$image_name"
    available_images+=("$image_name")
  else
    missing_images+=("$image_name")
  fi
  printf '\n'
done

# Nothing to export: tell the user to build first.
if (( ${#available_images[@]} == 0 )); then
  printf '%s\n' \
    "❌ No ARGUS images found to export." \
    "" \
    "🔧 Please build the images first with:" \
    " ./build/build_images.sh"
  exit 1
fi

# List the images that will be skipped.
if (( ${#missing_images[@]} > 0 )); then
  printf '%s\n' "⚠️ Missing images (will be skipped):"
  for image_name in "${missing_images[@]}"; do
    printf '%s\n' " • $image_name"
  done
  printf '\n'
fi

printf '%s\n' "💾 Starting image export process..." ""

# Export every available image, remembering the resulting file names
# (with the .gz suffix when compression is on).
exported_files=()
for image_name in "${available_images[@]}"; do
  output_file="${images[$image_name]}"
  save_image "$image_name" "$output_file"

  case "$use_compression" in
    true) exported_files+=("$output_file.gz") ;;
    *) exported_files+=("$output_file") ;;
  esac
  printf '\n'
done
|
|
||||||
|
|
||||||
printf '%s\n' \
  "=======================================" \
  "📦 Export Summary" \
  "======================================="

# List the exported files and accumulate their total size in bytes.
printf '%s\n' "📁 Exported files in $images_dir:"
total_size=0
for file in "${exported_files[@]}"; do
  full_path="$images_dir/$file"
  [[ -f "$full_path" ]] || continue
  size=$(du -h "$full_path" | cut -f1)
  size_bytes=$(du -b "$full_path" | cut -f1)
  total_size=$((total_size + size_bytes))
  printf '%s\n' " ✅ $file ($size)"
done

# Show the total size in human-readable form.
if (( total_size > 0 )); then
  total_size_human=$(numfmt --to=iec --suffix=B "$total_size")
  printf '\n'
  printf '%s\n' "📊 Total size: $total_size_human"
fi

printf '\n'
printf '%s\n' \
  "🚀 Usage instructions:" \
  " To load these images on another system:"

# One load command per exported file; compressed files need gunzip first.
for file in "${exported_files[@]}"; do
  [[ -f "$images_dir/$file" ]] || continue
  if [[ "$use_compression" == true ]]; then
    base_name="${file%.gz}"
    printf '%s\n' " gunzip $file && docker load -i $base_name"
  else
    printf '%s\n' " docker load -i $file"
  fi
done

printf '\n'
printf '%s\n' "✅ Image export completed successfully!"
printf '\n'
|
|
||||||
2
configs/.gitignore
vendored
2
configs/.gitignore
vendored
@ -1,2 +0,0 @@
|
|||||||
# Local overrides for build user/group settings
|
|
||||||
build_user.local.conf
|
|
||||||
@ -1,6 +0,0 @@
|
|||||||
# Default build-time UID/GID for Argus images
|
|
||||||
# Override by creating configs/build_user.local.conf with the same format.
|
|
||||||
# Syntax: KEY=VALUE, supports UID/GID only. Whitespace and lines starting with # are ignored.
|
|
||||||
|
|
||||||
UID=2133
|
|
||||||
GID=2015
|
|
||||||
1
deployment/.gitignore
vendored
1
deployment/.gitignore
vendored
@ -1 +0,0 @@
|
|||||||
artifact/
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
# Deployment Build Toolkit
|
|
||||||
|
|
||||||
This folder provides scripts to produce offline server/client packages and publish the client package to FTP.
|
|
||||||
|
|
||||||
Commands
|
|
||||||
- build_server_package.sh [--version YYYYMMDD]
|
|
||||||
- build_client_package.sh [--version YYYYMMDD] [--out DIR]
|
|
||||||
- publish_client.sh --version YYYYMMDD --server <host> --user ftpuser --password <pass> [--port 21]
|
|
||||||
|
|
||||||
Outputs
|
|
||||||
- deployment/artifact/server/<YYYYMMDD>/
|
|
||||||
- deployment/artifact/client/<YYYYMMDD>/
|
|
||||||
|
|
||||||
Notes
|
|
||||||
- Server package contains docker images (single all-images.tar.gz), compose/, scripts/, docs/, private/ skeleton.
|
|
||||||
- Client package reuses all-in-one-full artifact, repacked as argus-metric_<YYYYMMDD>.tar.gz (compatible with setup.sh).
|
|
||||||
@ -1,90 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
||||||
BUILD_DIR="$ROOT_DIR/deployment/build"
|
|
||||||
ART_ROOT="$ROOT_DIR/deployment/artifact"
|
|
||||||
|
|
||||||
. "$BUILD_DIR/common.sh"
|
|
||||||
|
|
||||||
# Print the help text for the client package builder on stdout.
usage() {
  printf '%s\n' \
    "Build Argus Client Offline Package" \
    "" \
    "Usage: build_client_package.sh [--version YYYYMMDD] [--out DIR]" \
    "" \
    "Produces: deployment/artifact/client/<YYYYMMDD>/argus-metric_<YYYYMMDD>.tar.gz"
}
|
|
||||||
|
|
||||||
# Option parsing. VERSION defaults to today_version (provided by common.sh);
# OUT_DIR stays empty unless --out is given.
VERSION="$(today_version)"
OUT_DIR=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --version | --out)
      # Without this check a missing value aborts with a bare
      # "unbound variable" error because of `set -u`.
      if (( $# < 2 )); then
        err "option $1 requires a value"
        usage
        exit 1
      fi
      if [[ "$1" == --version ]]; then
        VERSION="$2"
      else
        OUT_DIR="$2"
      fi
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      err "unknown arg: $1"
      usage
      exit 1
      ;;
  esac
done
|
|
||||||
|
|
||||||
# Destination directory: an explicit --out wins, otherwise
# deployment/artifact/client/<VERSION>.
PKG_DIR="${OUT_DIR:-$ART_ROOT/client/$VERSION}"
make_dir "$PKG_DIR"

log "Packaging client from all-in-one-full artifact"
PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"
require_cmd bash tar gzip

(cd "$PLUGIN_DIR" && bash scripts/package_artifact.sh --force)

# pick latest artifact dir (ls -t sorts by modification time, newest first)
ART_BASE="$PLUGIN_DIR/artifact"
latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true)
[[ -n "$latest_dir" ]] || { err "no client artifact found in $ART_BASE"; exit 1; }

tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT

# Filter-only copy: keep install_order files + scripts + deps + version.json
mkdir -p "$tmpdir/src"
cp -f "$latest_dir/version.json" "$tmpdir/src/version.json"

# Read the install order one entry per line so names are never word-split.
files=()
if command -v jq >/dev/null 2>&1; then
  mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json")
else
  # Fallback without jq: scrape *.tar.gz names from the lines following
  # "install_order" (limited to 10 lines by grep -A10).
  mapfile -t files < <(
    grep -E '"install_order"' -A10 "$latest_dir/version.json" \
      | sed -n 's/\s*"\(.*\.tar\.gz\)".*/\1/p'
  )
fi

# ${arr[@]+...} keeps an empty list from tripping `set -u` on bash < 4.4.
for f in ${files[@]+"${files[@]}"}; do
  if [[ -f "$latest_dir/$f" ]]; then
    cp -f "$latest_dir/$f" "$tmpdir/src/$f"
  fi
done

for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do
  if [[ -f "$latest_dir/$aux" ]]; then
    cp -f "$latest_dir/$aux" "$tmpdir/src/$aux"
  fi
done

if [[ -d "$latest_dir/deps" ]]; then
  mkdir -p "$tmpdir/src/deps"
  # Prefer rsync; fall back to cp only when rsync is missing or fails.
  if ! rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1; then
    cp -r "$latest_dir/deps/." "$tmpdir/src/deps/"
  fi
fi

# Dots in the version become underscores in the archive name.
out_name="argus-metric_${VERSION//./_}.tar.gz"

# -C changes directory only for the archived members, so a relative --out
# directory is still resolved against the caller's working directory.
tar -czf "$PKG_DIR/$out_name" -C "$tmpdir/src" .

log "Client package ready: $PKG_DIR/$out_name"
echo "$VERSION" > "$PKG_DIR/LATEST_VERSION"

# include publish helper and setup.sh for convenience (place first)
PUBLISH_TPL="$BUILD_DIR/templates/client/publish.sh"
if [[ -f "$PUBLISH_TPL" ]]; then
  cp "$PUBLISH_TPL" "$PKG_DIR/publish.sh"
fi

# also place a copy of setup.sh alongside
SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
if [[ -f "$SETUP_SRC" ]]; then
  # Best effort, as before: a failed copy must not abort the packaging.
  cp "$SETUP_SRC" "$PKG_DIR/setup.sh" || true
fi

# docs for end users (this may overwrite file modes), then fix execute bits
CLIENT_DOC_DIR="$BUILD_DIR/templates/client"
if [[ -d "$CLIENT_DOC_DIR" ]]; then
  if ! rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1; then
    cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/"
  fi
fi

# ensure helpers are executable (either file may be absent, hence best effort)
chmod +x "$PKG_DIR/publish.sh" "$PKG_DIR/setup.sh" 2>/dev/null || true

exit 0
|
|
||||||
@ -1,39 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
||||||
cd "$ROOT_DIR"
|
|
||||||
|
|
||||||
# Print the help text for the CPU node-bundle wrapper on stdout.
usage() {
  local self
  self=$(basename "$0")
  printf '%s\n' \
    "Build CPU node-bundle image (wrapper)" \
    "" \
    "Usage: ${self} [--client-version YYYYMMDD]" \
    "" \
    "Examples:" \
    "${self} --client-version 20251106" \
    "${self} # auto-detect artifact version via packaging"
}
|
|
||||||
|
|
||||||
# Option parsing: --client-version is optional and forwarded when non-empty.
VERSION=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --client-version)
      # `shift 2` with only one argument left fails, and under `set -e` that
      # used to end the script without any message.
      if (( $# < 2 )); then
        echo "Missing value for $1" >&2
        usage
        exit 1
      fi
      VERSION="$2"
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Delegate the actual work to the generic image build script.
CMD=("./deployment/build/build_images.sh" "--with-node-bundle")
if [[ -n "$VERSION" ]]; then CMD+=("--client-version" "$VERSION"); fi

echo "[CPU-BUNDLE] invoking: ${CMD[*]}"
"${CMD[@]}"

# Verify the image exists before announcing it as built.
if ! docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1; then
  echo "[ERR] expected image not found" >&2
  exit 1
fi
echo "[CPU-BUNDLE] built image: argus-sys-metric-test-node-bundle:latest"

echo "[CPU-BUNDLE] done"
|
|
||||||
|
|
||||||
@ -1,49 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
||||||
cd "$ROOT_DIR"
|
|
||||||
|
|
||||||
# Print the help text for the GPU node-bundle wrapper on stdout.
usage() {
  local self
  self=$(basename "$0")
  printf '%s\n' \
    "Build GPU node-bundle image (wrapper)" \
    "" \
    "Usage: ${self} [--client-version YYYYMMDD] [--tag IMAGE:TAG]" \
    "" \
    "Defaults:" \
    "base-image = argus-sys-metric-test-gpu-node:latest" \
    "output tag = argus-sys-metric-test-node-bundle-gpu:latest" \
    "" \
    "Examples:" \
    "${self} --client-version 20251106" \
    "${self} --client-version 20251106 --tag myrepo/node-bundle-gpu:20251106"
}
|
|
||||||
|
|
||||||
# Option parsing: --client-version is forwarded when non-empty, --tag selects
# the final image tag.
VERSION=""
OUT_TAG="argus-sys-metric-test-node-bundle-gpu:latest"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --client-version | --tag)
      # `shift 2` with only one argument left fails, and under `set -e` that
      # used to end the script without any message.
      if (( $# < 2 )); then
        echo "Missing value for $1" >&2
        usage
        exit 1
      fi
      if [[ "$1" == --tag ]]; then
        # An empty tag would only surface later as an opaque `docker tag` error.
        if [[ -z "$2" ]]; then
          echo "--tag requires a non-empty IMAGE:TAG" >&2
          exit 1
        fi
        OUT_TAG="$2"
      else
        VERSION="$2"
      fi
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 1
      ;;
  esac
done

BASE_IMAGE="argus-sys-metric-test-gpu-node:latest"

# Delegate the build to the generic script, using the GPU base image.
CMD=("./deployment/build/build_images.sh" "--with-node-bundle" "--base-image" "$BASE_IMAGE")
if [[ -n "$VERSION" ]]; then CMD+=("--client-version" "$VERSION"); fi

echo "[GPU-BUNDLE] invoking: ${CMD[*]}"
"${CMD[@]}"

# The generic script produces argus-sys-metric-test-node-bundle:latest;
# re-tag that image to the requested output tag and verify the result.
echo "[GPU-BUNDLE] re-tagging to $OUT_TAG"
if ! docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1; then
  echo "[ERR] base bundle image missing: argus-sys-metric-test-node-bundle:latest" >&2
  exit 1
fi
docker tag argus-sys-metric-test-node-bundle:latest "$OUT_TAG"
if ! docker image inspect "$OUT_TAG" >/dev/null 2>&1; then
  echo "[ERR] re-tag failed" >&2
  exit 1
fi

echo "[GPU-BUNDLE] built image: $OUT_TAG (base=$BASE_IMAGE)"
|
|
||||||
|
|
||||||
@ -1,98 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
||||||
. "$ROOT_DIR/deployment/build/common.sh"
|
|
||||||
|
|
||||||
# Print the help text for build_images.sh to stdout.
usage() {
  cat <<EOF
Build Argus images (optional node-bundle)

Usage: build_images.sh [--with-node-bundle] [--client-version YYYYMMDD] [--base-image NAME[:TAG]]

Examples:
./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
EOF
}
|
|
||||||
|
|
||||||
# Option defaults.
WITH_BUNDLE=false
CLIENT_VERSION=""
BASE_IMAGE="argus-sys-metric-test-node:latest"

# Parse command-line options into WITH_BUNDLE, CLIENT_VERSION and BASE_IMAGE.
# Arguments: the script's positional parameters.
# Exits 0 after printing help, 1 on an unknown option or a missing value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --with-node-bundle)
        WITH_BUNDLE=true
        shift
        ;;
      --client-version|--base-image)
        # Without this check a trailing option dies under `set -u` with a
        # bare "unbound variable" message instead of a usage hint.
        if [[ $# -lt 2 ]]; then
          err "missing value for $1"
          usage
          exit 1
        fi
        if [[ "$1" == "--client-version" ]]; then
          CLIENT_VERSION="$2"
        else
          BASE_IMAGE="$2"
        fi
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        err "unknown arg: $1"
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
||||||
|
|
||||||
# Build the node-bundle image when --with-node-bundle was given.
# Steps: stage the client tarball (plus setup.sh when available) under
# $BUNDLE_DIR/bundle, then run `docker build` with that directory as context.
if [[ "$WITH_BUNDLE" == true ]]; then
  require_cmd docker tar gzip
  BUNDLE_DIR="$ROOT_DIR/src/sys/build/node-bundle"
  TMP_BUNDLE="$BUNDLE_DIR/bundle"
  # Start from an empty staging directory; `:?` guards against an empty path.
  rm -rf "${TMP_BUNDLE:?}"
  mkdir -p "$TMP_BUNDLE"

  # Build or locate client artifact
  PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"
  # CLIENT_VERSION accepts two forms:
  #   - an artifact version such as 1.42.0 (default)
  #   - a packaging date YYYYMMDD; the inner artifact version is then read
  #     from the package under deployment/artifact/client/
  if [[ -z "$CLIENT_VERSION" ]]; then
    # No version requested: package the plugin tree and read the version back.
    pushd "$PLUGIN_DIR" >/dev/null
    bash scripts/package_artifact.sh --force
    # `|| true`: with pipefail a missing version.json makes the pipeline
    # fail and `set -e` would abort before the explicit error below.
    CLIENT_VERSION=$(cat artifact/*/version.json 2>/dev/null \
      | sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' \
      | tail -n1 || true)
    popd >/dev/null
    [[ -n "$CLIENT_VERSION" ]] || { err "failed to detect client version"; exit 1; }
  elif [[ "$CLIENT_VERSION" =~ ^[0-9]{8}$ ]]; then
    PKG_DIR="$ROOT_DIR/deployment/artifact/client/$CLIENT_VERSION"
    TAR_PKG="$PKG_DIR/argus-metric_${CLIENT_VERSION}.tar.gz"
    [[ -f "$TAR_PKG" ]] || { err "client date package not found: $TAR_PKG"; exit 1; }
    # Unpack into a scratch dir to read the embedded version.json.
    tmpd=$(mktemp -d); trap 'rm -rf "$tmpd"' EXIT
    tar -xzf "$TAR_PKG" -C "$tmpd"
    if [[ ! -f "$tmpd/version.json" ]]; then
      err "version.json missing in client date package"
      exit 1
    fi
    ART_VER=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$tmpd/version.json" | head -n1)
    [[ -n "$ART_VER" ]] || { err "failed to parse artifact version from date package"; exit 1; }
    CLIENT_VERSION="$ART_VER"
    # Use the date package itself as the bundle source, renamed to the
    # artifact-version form (dots replaced by underscores).
    cp "$TAR_PKG" "$TMP_BUNDLE/argus-metric_${ART_VER//./_}.tar.gz"
    # Best effort: also ship setup.sh when the date package directory has one.
    if [[ -f "$PKG_DIR/setup.sh" ]]; then
      cp "$PKG_DIR/setup.sh" "$TMP_BUNDLE/" || warn "could not copy setup.sh from $PKG_DIR"
    fi
  else
    # Treated as an artifact version directory; package only if it is absent.
    pushd "$PLUGIN_DIR" >/dev/null
    [[ -d "artifact/$CLIENT_VERSION" ]] || bash scripts/package_artifact.sh --force
    popd >/dev/null
  fi

  # If the date-package branch did not already stage the tar, take it from
  # the plugin artifact directory.
  TAR_NAME="argus-metric_${CLIENT_VERSION//./_}.tar.gz"
  if [[ ! -f "$TMP_BUNDLE/$TAR_NAME" ]]; then
    SRC_TAR="$PLUGIN_DIR/artifact/$CLIENT_VERSION/$TAR_NAME"
    [[ -f "$SRC_TAR" ]] || { err "missing client tar: $SRC_TAR"; exit 1; }
    cp "$SRC_TAR" "$TMP_BUNDLE/"
    # also include setup.sh for fallback (only today's client package is checked)
    TODAY_SETUP="$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh"
    if [[ -f "$TODAY_SETUP" ]]; then
      cp "$TODAY_SETUP" "$TMP_BUNDLE/" || true
    fi
  fi

  log "Building node-bundle image with client version: $CLIENT_VERSION"
  DOCKER_BUILDKIT=0 docker build \
    --build-arg CLIENT_VER="$CLIENT_VERSION" \
    --build-arg BASE_IMAGE="$BASE_IMAGE" \
    -t argus-sys-metric-test-node-bundle:latest \
    -f "$BUNDLE_DIR/Dockerfile" "$BUNDLE_DIR"
  log "Built image: argus-sys-metric-test-node-bundle:latest"
fi

log "Done."
|
|
||||||
@ -1,139 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
||||||
BUILD_DIR="$ROOT_DIR/deployment/build"
|
|
||||||
ART_ROOT="$ROOT_DIR/deployment/artifact"
|
|
||||||
|
|
||||||
. "$BUILD_DIR/common.sh"
|
|
||||||
|
|
||||||
# Print the help text for build_server_package.sh to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally.
usage() {
  cat <<'EOF'
Build Argus Server Offline Package

Usage: build_server_package.sh [--version YYYYMMDD] [--out DIR] [--resave-image]

Outputs into deployment/artifact/server/<YYYYMMDD>/ by default.
EOF
}
|
|
||||||
|
|
||||||
# Option defaults. VERSION defaults to today's date as given by today_version.
VERSION="$(today_version)"
OUT_DIR=""
RESAVE_IMAGE=false

# Parse command-line options into VERSION, OUT_DIR and RESAVE_IMAGE.
# Arguments: the script's positional parameters.
# Exits 0 after printing help, 1 on an unknown option or a missing value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --version|--out)
        # Without this check a trailing option dies under `set -u` with a
        # bare "unbound variable" message instead of a usage hint.
        if [[ $# -lt 2 ]]; then
          err "missing value for $1"
          usage
          exit 1
        fi
        if [[ "$1" == "--version" ]]; then
          VERSION="$2"
        else
          OUT_DIR="$2"
        fi
        shift 2
        ;;
      --resave-image)
        RESAVE_IMAGE=true
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        err "unknown arg: $1"
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
||||||
|
|
||||||
# Final package directory: --out if given, otherwise the versioned artifact dir.
PKG_DIR="${OUT_DIR:-$ART_ROOT/server/$VERSION}"
# The package is assembled in a scratch directory that is removed on exit.
STAGE="$(mktemp -d)"
trap 'rm -rf "$STAGE"' EXIT

log "Version: $VERSION"
log "Staging: $STAGE"

# 1) Layout
for sub in images compose scripts docs private/argus; do
  make_dir "$STAGE/$sub"
done
|
|
||||||
|
|
||||||
# 2) Compose: derive from sys/tests by removing test-only services
SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
PKG_COMPOSE="$STAGE/compose/docker-compose.yml"
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }

# 2.1 filter out test services
tmp_compose1="$STAGE/compose/docker-compose.filtered.yml"
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" \
  -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1"

# 2.2 transform to external overlay network (remove sysnet and per-service blocks)
awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$PKG_COMPOSE"
# The filtered file is only an intermediate; remove it so it is not shipped
# in the package or listed in the manifest/checksums.
rm -f "$tmp_compose1"
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"

# Path fix-ups, applied in order in a single pass:
#   1. ./private/ -> ../private/ (compose/ and private/ are siblings in the package)
#   2. the same for the bind-mount form without a trailing slash
#   3. drop the /etc/timezone bind, which may not exist on target distros (e.g. NixOS)
sed -i -E \
  -e 's#\./private/#../private/#g' \
  -e 's#- \./private:/private#- ../private:/private#g' \
  -e '/\/etc\/timezone:\/etc\/timezone:ro/d' \
  "$PKG_COMPOSE"

# sanity-check: ensure test services are absent and external network present
if grep -Eq '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' "$PKG_COMPOSE"; then
  err "compose filter failed: test services still present"
  exit 1
fi
if ! grep -q '^networks:' "$PKG_COMPOSE" || ! grep -q 'argus-sys-net:' "$PKG_COMPOSE"; then
  err "compose overlay transform failed: external network missing"
  exit 1
fi
|
|
||||||
|
|
||||||
# 3) Images (reuse if already exported unless --resave-image)
# Lookup order: this version's package dir, then the newest server_*.tar.gz,
# then a fresh `docker save` from the local daemon.
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
staged_images_tar="$STAGE/images/all-images.tar.gz"
if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then
  log "Reusing existing images tar: $existing_images_tar"
  cp "$existing_images_tar" "$STAGE/images/"
elif [[ "$RESAVE_IMAGE" == false ]]; then
  # Try cross-version reuse from latest server_*.tar.gz (newest by mtime).
  latest_pkg=$(ls -1t "$ART_ROOT/server"/server_*.tar.gz 2>/dev/null | head -n1 || true)
  if [[ -n "$latest_pkg" ]]; then
    log "Reusing images from: $latest_pkg"
    mkdir -p "$STAGE/images"
    # extract matching file regardless of top-level dir
    if tar -xzf "$latest_pkg" -C "$STAGE" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then
      found=$(find "$STAGE" -type f -path '*/images/all-images.tar.gz' | head -n1 || true)
      # Skip the move when the file already landed at the target path:
      # `mv` would fail on identical source/target, and the cleanup path
      # below would then resolve to $STAGE itself.
      if [[ -n "$found" && "$found" != "$staged_images_tar" ]]; then
        mv "$found" "$staged_images_tar"
        # cleanup leftover extracted dir
        dir_to_clean=$(dirname "$found")
        rm -rf "${dir_to_clean%/images}" 2>/dev/null || true
      fi
    fi
  fi
fi

# If still not present, save from local docker daemon
if [[ ! -f "$staged_images_tar" ]]; then
  require_cmd docker gzip
  images=(
    argus-bind9:latest
    argus-master:latest
    argus-elasticsearch:latest
    argus-kibana:latest
    argus-metric-ftp:latest
    argus-metric-prometheus:latest
    argus-metric-grafana:latest
    argus-alertmanager:latest
    argus-web-frontend:latest
    argus-web-proxy:latest
  )
  log "Saving images: ${#images[@]}"
  tarfile="$STAGE/images/all-images.tar"
  docker save -o "$tarfile" "${images[@]}"
  gzip -f "$tarfile"
fi
|
|
||||||
|
|
||||||
# 4) Scripts & Docs
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true

# 5) Manifests
gen_manifest "$STAGE" "$STAGE/manifest.txt"
checksum_dir "$STAGE" "$STAGE/checksums.txt"

# 6) Move to artifact
make_dir "$PKG_DIR"
# Resolve to an absolute path: with a relative --out the tarball path below
# would be interpreted relative to the directory we `cd` into, not the
# caller's working directory.
PKG_DIR="$(cd "$PKG_DIR" && pwd)"
rsync -a "$STAGE/" "$PKG_DIR/" 2>/dev/null || cp -r "$STAGE/." "$PKG_DIR/"
log "Server package ready: $PKG_DIR"

# NOTE(review): written after step 5, so version.json is absent from
# manifest.txt and checksums.txt; its content is the bare version string,
# not a JSON document — confirm consumers expect that.
echo "$VERSION" > "$PKG_DIR/version.json"

# 7) Create distributable tarball
OUT_TAR_DIR="$(dirname "$PKG_DIR")"
OUT_TAR="$OUT_TAR_DIR/server_${VERSION}.tar.gz"
log "Creating tarball: $OUT_TAR"
(cd "$OUT_TAR_DIR" && tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")")
log "Tarball ready: $OUT_TAR"

exit 0
|
|
||||||
@ -1,33 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Colourised logging helpers. All arguments are joined with spaces and
# printed verbatim after the tag: printf '%s' is used instead of `echo -e`
# so backslash sequences inside the message (paths, regexes, "\n") are not
# expanded.
#   log  -> "[INFO]" in blue,   stdout
#   warn -> "[WARN]" in yellow, stdout
#   err  -> "[ERR ]" in red,    stderr
log() { printf '\033[0;34m[INFO]\033[0m %s\n' "$*"; }

warn() { printf '\033[1;33m[WARN]\033[0m %s\n' "$*"; }

err() { printf '\033[0;31m[ERR ]\033[0m %s\n' "$*" >&2; }
|
|
||||||
|
|
||||||
# Abort the script unless every named command is available.
# Arguments: one or more command names.
# Exits 1 (after reporting the first missing command via err) on failure.
require_cmd() {
  local cmd   # keep the loop variable out of the caller's global scope
  for cmd in "$@"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      err "missing command: $cmd"
      exit 1
    fi
  done
}
|
|
||||||
|
|
||||||
# Print today's date as YYYYMMDD (local time), used as the default version.
today_version() {
  date +%Y%m%d
}
|
|
||||||
|
|
||||||
# Write SHA-256 checksums of every regular file under a directory.
# Arguments: $1 - directory to hash; $2 - output file (truncated first).
# Output lines use paths relative to $1 ("./sub/file"), sorted by path, so
# the result can be verified with `cd "$1" && sha256sum -c <out>`.
checksum_dir() {
  local dir="$1" out="$2" out_dir out_abs
  : > "$out"
  out_dir="$(cd "$(dirname "$out")" && pwd)" || return
  out_abs="$out_dir/$(basename "$out")"
  # Exclude the output file itself: when it lives inside $dir it would be
  # hashed while still being written and could never verify.
  # `xargs -r` avoids hashing empty stdin when there are no files at all.
  (
    cd "$dir" &&
      find . -type f ! -samefile "$out_abs" -print0 |
      sort -z |
      xargs -0 -r sha256sum
  ) >> "$out"
}
|
|
||||||
|
|
||||||
# Create the directory given as $1, including missing parents; succeeds if
# it already exists.
make_dir() {
  local target="$1"
  mkdir -p "$target"
}
|
|
||||||
|
|
||||||
# Copy the contents of directory $1 (including dotfiles) into directory $2.
# Prefers `rsync -a --delete`; if rsync is missing or fails, falls back to
# `cp -r`, which does not remove files that exist only in the destination.
copy_tree() {
  local src="$1" dst="$2"
  # Create the destination first so both code paths behave the same:
  # the cp fallback fails outright when $dst does not exist yet.
  mkdir -p "$dst"
  rsync -a --delete "$src/" "$dst/" 2>/dev/null || cp -r "$src/." "$dst/"
}
|
|
||||||
|
|
||||||
# Write a sorted list of the regular files found at most three levels below
# a directory.
# Arguments: $1 - root directory; $2 - output file (truncated first).
# Paths are written relative to $1 ("./a/b/file"). If $2 lies inside $1
# within that depth, the list includes the output file itself.
gen_manifest() {
  local root="$1" out="$2"
  : > "$out"
  (cd "$root" && find . -maxdepth 3 -type f -printf "%p\n" | sort) >> "$out"
}
|
|
||||||
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
# UID/GID for service processes
|
|
||||||
ARGUS_BUILD_UID=1000
|
|
||||||
ARGUS_BUILD_GID=1000
|
|
||||||
|
|
||||||
# Host ports (adjust if occupied)
|
|
||||||
MASTER_PORT=32300
|
|
||||||
ES_HTTP_PORT=9200
|
|
||||||
KIBANA_PORT=5601
|
|
||||||
NODE_A_PORT=2020
|
|
||||||
NODE_B_PORT=2021
|
|
||||||
PROMETHEUS_PORT=9090
|
|
||||||
GRAFANA_PORT=3000
|
|
||||||
ALERTMANAGER_PORT=9093
|
|
||||||
WEB_PROXY_PORT_8080=8080
|
|
||||||
WEB_PROXY_PORT_8081=8081
|
|
||||||
WEB_PROXY_PORT_8082=8082
|
|
||||||
WEB_PROXY_PORT_8083=8083
|
|
||||||
WEB_PROXY_PORT_8084=8084
|
|
||||||
WEB_PROXY_PORT_8085=8085
|
|
||||||
|
|
||||||
# FTP
|
|
||||||
FTP_PORT=21
|
|
||||||
FTP_DATA_PORT=20
|
|
||||||
FTP_PASSIVE_HOST_RANGE=21100-21110
|
|
||||||
FTP_PASSWORD=ZGClab1234!
|
|
||||||
FTP_DOMAIN=ftp.metric.argus.com
|
|
||||||
|
|
||||||
# GPU profile disabled by default
|
|
||||||
ENABLE_GPU=false
|
|
||||||
|
|
||||||
# External overlay network (Swarm attachable)
|
|
||||||
OVERLAY_NET_NAME=argus-sys-net
|
|
||||||
@ -1,44 +0,0 @@
|
|||||||
# Argus Metric 客户端安装指南(容器内普通用户场景)
|
|
||||||
|
|
||||||
## 准备与连通性检查
|
|
||||||
- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`)
|
|
||||||
- `curl -u ftpuser:ZGClab1234! -I ftp://<FTP_IP>:21/LATEST_VERSION`
|
|
||||||
- `curl -u ftpuser:ZGClab1234! -s ftp://<FTP_IP>:21/ | head`
|
|
||||||
- 下载安装脚本
|
|
||||||
- `curl -u ftpuser:ZGClab1234! -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh`
|
|
||||||
- `chmod +x /tmp/setup.sh`
|
|
||||||
|
|
||||||
## 元数据与主机名
|
|
||||||
- Agent 需要元数据(env/user/instance)与 Master 地址:
|
|
||||||
- 方式A:hostname 形如 `env-user-instance-xxx`(推荐)
|
|
||||||
- 方式B:导出环境变量:
|
|
||||||
- `export AGENT_ENV=dev`
|
|
||||||
- `export AGENT_USER=<your_user>`
|
|
||||||
- `export AGENT_INSTANCE=<node_id>`
|
|
||||||
- Master 地址:
|
|
||||||
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
|
|
||||||
|
|
||||||
> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。
|
|
||||||
|
|
||||||
## 执行安装
|
|
||||||
- 以 root 运行(容器内如为非 root 用户请切换为 root):
|
|
||||||
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password 'ZGClab1234!' --port 21`
|
|
||||||
- 如需自定义安装根目录:`--install-dir /opt/argus-metric`
|
|
||||||
|
|
||||||
提示(容器接入 overlay 网络时):
|
|
||||||
- 在执行 setup 前,先将容器内 DNS 指向 Bind9 的 overlay IP:
|
|
||||||
- `echo "nameserver <BIND_OVERLAY_IP>" > /etc/resolv.conf`
|
|
||||||
- 这样 `master.argus.com`、`es.log.argus.com` 等域名即可解析;首次下载 `setup.sh` 仍建议使用 FTP 的 overlay IP。
|
|
||||||
|
|
||||||
更多快速步骤请参考:`QUICK_NODE_DEPLOY_zh.md`。
|
|
||||||
|
|
||||||
## 安装后自检(setup 自动执行)
|
|
||||||
- setup 会等待最多 5 分钟,确认以下条件后才报告完成:
|
|
||||||
- `/private/argus/agent/<hostname>/node.json` 已生成;
|
|
||||||
- `last_report` 在持续更新;
|
|
||||||
- `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy` 且 `error` 为空。
|
|
||||||
|
|
||||||
## 手工验证(可选)
|
|
||||||
- `cat /private/argus/agent/$(hostname)/node.json | jq '.'`
|
|
||||||
- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200
|
|
||||||
- 查看日志:`/var/log/argus-agent.log`、`/opt/argus-metric/versions/*/.install.log`
|
|
||||||
@ -1,57 +0,0 @@
|
|||||||
# Argus Metric 客户端发布说明(FTP)
|
|
||||||
|
|
||||||
本说明面向“发布人员”,讲清楚如何把客户端离线包发布到 FTP,供各节点通过 `curl` 自动安装。
|
|
||||||
|
|
||||||
## 目录结构(构建后)
|
|
||||||
- `client-YYYYMMDD/`
|
|
||||||
- `argus-metric_YYYYMMDD.tar.gz` 客户端离线包
|
|
||||||
- `setup.sh` 客户端安装入口脚本(提供给节点用 curl 下载)
|
|
||||||
- `publish.sh` 发布脚本(将上述两项与 `LATEST_VERSION` 上传到 FTP)
|
|
||||||
- `LATEST_VERSION` 文本(内容为 `YYYYMMDD`,或 `YYYYMMDD-rN`)
|
|
||||||
- `INSTALL_CLIENT_zh.md` 本地安装指南(给使用者看,不会上载到 FTP)
|
|
||||||
- `PUBLISH_CLIENT_zh.md` 本说明
|
|
||||||
|
|
||||||
> 注意:`publish.sh`/`setup.sh` 为可执行脚本;构建脚本已保证二者具有执行权限。
|
|
||||||
|
|
||||||
## 前置条件
|
|
||||||
- FTP 服务已运行(默认容器:`argus-ftp`),并打开端口:21、20、21100–21110(被动模式)。
|
|
||||||
- FTP 账号:默认 `ftpuser / ZGClab1234!`(如有更改,以实际为准)。
|
|
||||||
|
|
||||||
## 发布步骤(在 server 机器或能直连 FTP 的任意机器上)
|
|
||||||
1) 进入发布目录:
|
|
||||||
- `cd client-YYYYMMDD`
|
|
||||||
|
|
||||||
2) 执行发布:
|
|
||||||
- `./publish.sh --server <FTP_HOST> --user <USER> --password '<PASS>' [--port 21]`
|
|
||||||
- 例如在服务端本机:`./publish.sh --server localhost --user ftpuser --password 'ZGClab1234!' --port 21`
|
|
||||||
|
|
||||||
脚本会上传三类文件到 FTP 根:
|
|
||||||
- `setup.sh`
|
|
||||||
- `argus-metric_YYYYMMDD[-rN].tar.gz`
|
|
||||||
- `LATEST_VERSION`(内容为当前版本号)
|
|
||||||
|
|
||||||
3) 发布后验证:
|
|
||||||
- `curl -u ftpuser:****** -I ftp://<FTP_HOST>:21/LATEST_VERSION` 应成功返回文件信息(如 `Content-Length`;FTP 没有 HTTP 状态码 200,以 curl 退出码 0 为准)
|
|
||||||
- `curl -u ftpuser:****** -fsSL ftp://<FTP_HOST>:21/LATEST_VERSION` 内容为版本号(如 `20251104`)
|
|
||||||
- `curl -u ftpuser:****** -I ftp://<FTP_HOST>:21/argus-metric_YYYYMMDD.tar.gz` 应成功返回文件信息(curl 退出码为 0)
|
|
||||||
|
|
||||||
## 节点侧使用方式(摘要)
|
|
||||||
- 首次下载用 FTP 的“IP 地址”:
|
|
||||||
- `curl -u ftpuser:****** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
|
|
||||||
- 执行安装:
|
|
||||||
- 必需元数据:`AGENT_ENV/AGENT_USER/AGENT_INSTANCE`,以及 `MASTER_ENDPOINT=http://master.argus.com:3000`
|
|
||||||
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password '******' --port 21`
|
|
||||||
- overlay 容器场景:
|
|
||||||
- 先将容器内 DNS 指向 Bind9 的 overlay IP:`echo "nameserver <BIND_OVERLAY_IP>" > /etc/resolv.conf`
|
|
||||||
- 然后再执行上述安装;安装后约 1–2 分钟内 DNS 即可解析 `*.argus.com` 域名。
|
|
||||||
|
|
||||||
## 常见问题
|
|
||||||
- `530 Access denied`:用户名/密码错误或 FTP 目录无权限;请核对账号与 FTP 容器状态。
|
|
||||||
- `Permission denied` 执行 `publish.sh`:为脚本权限问题;`chmod +x publish.sh`。构建脚本已修复默认权限。
|
|
||||||
- 被动端口不通导致失败:请开放 21100–21110。
|
|
||||||
- 客户端安装后短时 `curl http://master.argus.com:3000` 为 000:服务冷启动或 DNS 同步延迟,等待 1–2 分钟再试。
|
|
||||||
|
|
||||||
## 版本与回滚
|
|
||||||
- `LATEST_VERSION` 决定客户端默认安装的版本号。
|
|
||||||
- 如需回滚:将旧版本号写回 `LATEST_VERSION` 并重新发布(或手动指定 `--version` 安装)。
|
|
||||||
|
|
||||||
@ -1,58 +0,0 @@
|
|||||||
# Argus Metric 节点快速部署(Overlay 网络容器)
|
|
||||||
|
|
||||||
本文档给出在 Docker Swarm external overlay 网络中,快速拉起一个测试节点并完成注册的最小可行步骤。
|
|
||||||
|
|
||||||
## 前提
|
|
||||||
- 服务端已在 Manager 机安装完成并运行良好(`server-selfcheck` 通过)。
|
|
||||||
- Overlay 网络名称:`argus-sys-net`(默认)。
|
|
||||||
- 已通过 FTP 发布 `setup.sh` 与客户端包,且能从 FTP 获取 `LATEST_VERSION`。
|
|
||||||
- 用于测试的镜像:`argus-sys-metric-test-node:latest` 已存在于目标机器。
|
|
||||||
|
|
||||||
## 步骤
|
|
||||||
|
|
||||||
- 获取 FTP 和 Bind 的 overlay IP(在 Manager 上执行)
|
|
||||||
- `FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)`
|
|
||||||
- `BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)`
|
|
||||||
- `echo "FTP=$FTPIP BIND=$BINDIP"`
|
|
||||||
|
|
||||||
- 准备宿主挂载目录(以 s4 为例)
|
|
||||||
- `mkdir -p /home2/yuyr/deploy/test-metric-node/s4`
|
|
||||||
|
|
||||||
- 启动测试节点容器(接入 overlay)
|
|
||||||
- `docker run -d --name argus-metric-test-node-s4 \
|
|
||||||
--hostname dev2-yuyr-node002s4 \
|
|
||||||
--network argus-sys-net \
|
|
||||||
-v /home2/yuyr/deploy/test-metric-node/s4:/private/argus/agent \
|
|
||||||
argus-sys-metric-test-node:latest sleep infinity`
|
|
||||||
|
|
||||||
- 在容器内执行安装(先用 FTP IP 引导,DNS 指向 Bind)
|
|
||||||
- `docker exec -it argus-metric-test-node-s4 bash`
|
|
||||||
- `echo "nameserver $BINDIP" > /etc/resolv.conf`
|
|
||||||
- `curl --ftp-method nocwd -u ftpuser:ZGClab1234! -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh`
|
|
||||||
- `chmod +x /tmp/setup.sh`
|
|
||||||
- `export AGENT_ENV=dev2 AGENT_USER=yuyr AGENT_INSTANCE=node002s4`
|
|
||||||
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
|
|
||||||
- `/tmp/setup.sh --server "$FTPIP" --user ftpuser --password 'ZGClab1234!' --port 21`
|
|
||||||
- 说明:setup 会自动执行安装后自检(最多 5 分钟),无需手动轮询。
|
|
||||||
|
|
||||||
## 验证(推荐在容器内执行,避免宿主权限问题)
|
|
||||||
|
|
||||||
- 查看 node.json 关键字段
|
|
||||||
- `cat /private/argus/agent/dev2-yuyr-node002s4/node.json | jq '{last_report, health}'`
|
|
||||||
- 期望:四个 health 全部 healthy;等待 ≥70s 再查看,`last_report` 持续更新。
|
|
||||||
|
|
||||||
- 指标端口
|
|
||||||
- `curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9100/metrics`(期望 200)
|
|
||||||
- (如测试 GPU)`curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9400/metrics`(有 GPU 时 200)
|
|
||||||
|
|
||||||
- 与服务端连通(域名经 Bind 解析)
|
|
||||||
- `curl -s -o /dev/null -w '%{http_code}\n' http://master.argus.com:3000/readyz`(期望 200)
|
|
||||||
- `curl -s -o /dev/null -w '%{http_code}\n' http://es.log.argus.com:9200/_cluster/health`(期望 200)
|
|
||||||
|
|
||||||
## (可选)在服务器主机侧观察 Prometheus 目标更新
|
|
||||||
- `cat /home2/yuyr/deploy/versions/<VERSION>/private/argus/metric/prometheus/nodes.json | jq '.'`
|
|
||||||
|
|
||||||
## 常见提示
|
|
||||||
- 初次安装后短时 `curl` 域名返回 000/超时属正常,多等待 1–2 分钟 DNS 同步/组件冷启动完成。
|
|
||||||
- 如在宿主直接读取挂载的 node.json 报 Permission denied,请使用 `docker exec` 在容器内查看。
|
|
||||||
- MASTER_ENDPOINT 固定使用域名 `http://master.argus.com:3000`,客户端无需固定 IP。
|
|
||||||
@ -1,54 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Print the help text for publish.sh to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally.
usage() {
  cat <<'EOF'
Publish Argus client package to FTP

Usage:
./publish.sh --server HOST --user USER --password PASS [--port 21]

Notes:
- This script expects to run inside the built client artifact directory.
- It reads LATEST_VERSION and uploads setup.sh, argus-metric_<ver>.tar.gz, and LATEST_VERSION.
EOF
}
|
|
||||||
|
|
||||||
# Connection settings, filled in from the command line. PORT defaults to 21.
HOST=""
USERNAME=""
PASSWORD=""
PORT=21

# Parse command-line options into HOST, USERNAME, PASSWORD and PORT.
# Arguments: the script's positional parameters.
# Exits 0 after printing help, 1 on an unknown option or a missing value.
parse_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --server|--user|--password|--port)
        # Without this check a trailing option dies under `set -u` with a
        # bare "unbound variable" message instead of a usage hint.
        if [[ $# -lt 2 ]]; then
          echo "missing value for $opt" >&2
          usage
          exit 1
        fi
        case "$opt" in
          --server)   HOST="$2" ;;
          --user)     USERNAME="$2" ;;
          --password) PASSWORD="$2" ;;
          --port)     PORT="$2" ;;
        esac
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "unknown arg: $opt" >&2
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
||||||
|
|
||||||
# Validate inputs, then upload setup.sh, the client tarball and
# LATEST_VERSION to the FTP root. LATEST_VERSION goes last so the version
# pointer is only updated after the files it refers to were uploaded.
if [[ -z "$HOST" || -z "$USERNAME" || -z "$PASSWORD" ]]; then
  echo "missing required option(s): --server, --user and --password are all required" >&2
  usage
  exit 1
fi

here="$(pwd)"
if [[ ! -f "$here/LATEST_VERSION" ]]; then
  echo "LATEST_VERSION not found in $here" >&2
  exit 1
fi
# Strip all whitespace (including a stray CR from Windows editors) so the
# derived tarball name and the uploaded version string are clean.
VER="$(tr -d '[:space:]' < "$here/LATEST_VERSION")"
if [[ -z "$VER" ]]; then
  echo "LATEST_VERSION is empty in $here" >&2
  exit 1
fi
PKG="argus-metric_${VER}.tar.gz"

if [[ ! -f "$here/$PKG" ]]; then
  echo "client tar not found: $PKG" >&2
  exit 1
fi

# setup.sh must sit next to the tarball in the artifact directory.
SETUP="${here}/setup.sh"
if [[ ! -f "$SETUP" ]]; then
  echo "setup.sh not found in $here" >&2
  exit 1
fi

echo "[PUBLISH] server=$HOST port=$PORT version=$VER"

# -s silent, -f fail on server errors, -T upload; any failure aborts the
# script through `set -e`.
curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP" "ftp://$HOST:$PORT/setup.sh"
curl -u "$USERNAME:$PASSWORD" -sfT "$here/$PKG" "ftp://$HOST:$PORT/$PKG"
printf "%s" "$VER" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION"

echo "[OK] publish completed"
|
|
||||||
|
|
||||||
@ -1,41 +0,0 @@
|
|||||||
#!/usr/bin/awk -f
|
|
||||||
# Remove specific service blocks from a docker-compose.yml by service name.
|
|
||||||
# Usage: awk -f docker-compose.filter.awk -v remove="node-a,node-b,test-node,test-gpu-node" input.yml > output.yml
|
|
||||||
|
|
||||||
# Build the set of service names to drop from the comma-separated list
# passed as -v remove="a,b,c", and initialise the parser state.
BEGIN {
  n_remove = split(remove, rm, ",");
  for (i = 1; i <= n_remove; i++) {
    # tolerate "a, b" style lists
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", rm[i]);
    if (rm[i] != "") skipname[rm[i]] = 1;
  }
  in_services = 0;   # 1 while inside the top-level "services:" section
  skipping = 0;      # 1 while dropping the lines of a removed service
}
|
|
||||||
|
|
||||||
# Return the service name if `line` is a service header, i.e. exactly two
# leading spaces followed by "name:" and nothing else; otherwise return "".
# `m` is a local scratch array (gawk three-argument match()).
function service_header(line,    m) {
  # The two spaces are written as [ ][ ] so they cannot be collapsed into
  # one, which would make the pattern match one-space-indented keys instead.
  if (match(line, /^[ ][ ]([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
  return "";
}
|
|
||||||
|
|
||||||
# Main rule: print every line except those belonging to a removed service.
{
  # Track top-level sections (keys with no indentation).
  top_level_key = ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/);
  if (top_level_key) {
    in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
  }

  if (skipping) {
    # Stop skipping at the next service header or at ANY top-level section;
    # checking only networks:/volumes: would swallow sections such as
    # configs: or secrets: that follow a removed service.
    if (service_header($0) != "" || top_level_key) {
      skipping = 0;
    } else {
      next;
    }
  }

  if (in_services) {
    name = service_header($0);
    if (name != "" && (name in skipname)) { skipping = 1; next; }
  }

  print;
}
|
|
||||||
@ -1,74 +0,0 @@
|
|||||||
#!/usr/bin/awk -f
|
|
||||||
# Transform docker-compose.yml to use an external overlay network for all services
|
|
||||||
# - Remove top-level networks definition
|
|
||||||
# - Remove per-service networks block (including ipv4_address and sysnet refs)
|
|
||||||
# - Insert per-service networks: [argus-sys-net]
|
|
||||||
# - Append external networks mapping at the end
|
|
||||||
|
|
||||||
# Initialise parser state.
BEGIN {
  in_top_networks = 0;    # inside the original top-level "networks:" block
  in_services = 0;        # inside the top-level "services:" section
  in_service = 0;         # inside one service's body
  svc_indent = 0;         # indentation of the current service header
  curr_name = "";         # name of the current service
  networks_inserted = 0;  # overlay network already injected for this service
  skipping_nets = 0;      # dropping a service's existing networks block
}
|
|
||||||
|
|
||||||
# True if `line` is a service header (see svc_name).
function is_service_header(line) { return svc_name(line) != ""; }

# Return the service name if `line` is exactly two spaces + "name:",
# otherwise "". `m` is a local scratch array (gawk three-argument match()).
# The two spaces are written as [ ][ ] so they cannot be collapsed into one.
function svc_name(line,    m) {
  if (match(line, /^[ ][ ]([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
  return "";
}

# Number of leading spaces on `s`; 0 for an empty or all-space line.
function indent_len(s,    n) { n = match(s, /[^ ]/) - 1; if (n < 0) n = 0; return n; }
|
|
||||||
|
|
||||||
# Main rule: drop the original top-level networks block and every
# per-service networks block, and attach each service to argus-sys-net.
{
  # Detect entry into top-level sections (keys with no indentation).
  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/) {
    in_services = ($0 ~ /^services:[ ]*$/);
    # A new top-level section ends the skipped top-level networks block.
    in_top_networks = 0;
  }

  # Remove the original top-level 'networks:' block (header and body).
  if ($0 ~ /^networks:[ ]*$/) {
    in_top_networks = 1; next;
  }
  if (in_top_networks) {
    next;
  }

  if (in_services) {
    # Track service boundaries.
    if (is_service_header($0)) {
      in_service = 1; svc_indent = 2; networks_inserted = 0; skipping_nets = 0;
      curr_name = svc_name($0);
      print; next;
    }
    # A non-blank line indented no deeper than the header ends the service.
    if (in_service && indent_len($0) <= svc_indent && $0 !~ /^\s*$/) {
      in_service = 0;
    }

    if (in_service) {
      # Skip any existing networks block under the service.
      if ($0 ~ /^\s{4}networks:[ ]*$/) { skipping_nets = 1; next; }
      if (skipping_nets) {
        # Blank lines have indent 0 but do not end the block; pass them
        # through and keep skipping so later entries are not printed.
        if ($0 ~ /^\s*$/) { print; next; }
        if (indent_len($0) <= 4) { skipping_nets = 0; }
        else next;
      }

      # After the first container_name or image key, inject networks once.
      # A service with neither key gets no networks entry.
      if (!networks_inserted && ($0 ~ /^\s{4}container_name:/ || $0 ~ /^\s{4}image:/)) {
        print;
        print "    networks:";
        print "      - argus-sys-net";
        networks_inserted = 1; next;
      }
      # no host port injection; bind serves DNS inside overlay only
    }
  }

  print;
}
|
|
||||||
|
|
||||||
# Append the external overlay network definition. The nesting matters:
# argus-sys-net is a key under networks:, and external/name are its
# properties. ${OVERLAY_NET_NAME:-argus-sys-net} is emitted literally so
# docker compose substitutes it at run time.
END {
  print "";
  print "networks:";
  print "  argus-sys-net:";
  print "    external: true";
  print "    name: ${OVERLAY_NET_NAME:-argus-sys-net}";
}
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
# Argus Server Offline Installation
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
- Linux x86_64 (Ubuntu 22.04 recommended; see OS compatibility for NixOS)
|
|
||||||
- Docker & Docker Compose installed
|
|
||||||
- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110 (or auto-fallback to high ports)
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
|
|
||||||
2. `./server-install.sh` (non‑root is supported: it will precreate minimal dirs and auto-fix Kibana/ES/Bind in containers)
|
|
||||||
3. `./server-status.sh`
|
|
||||||
4. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
|
|
||||||
5. `./server-uninstall.sh` to tear down
|
|
||||||
|
|
||||||
## What the Installer Does
|
|
||||||
- Loads local images (`images/all-images.tar.gz`)
|
|
||||||
- Generates OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind `tmpfs:/run/named`)
|
|
||||||
- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
|
|
||||||
- DNS Bootstrap:
|
|
||||||
- Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing);
|
|
||||||
- Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
|
|
||||||
- Wait for `*.argus.com` hint files, then reload bind;
|
|
||||||
- Restart web‑proxy to re-render nginx resolver from `dns.conf`;
|
|
||||||
- Writes `logs/selfcheck.json` as final summary
|
|
||||||
|
|
||||||
## OS Compatibility
|
|
||||||
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`.
|
|
||||||
- If you cannot use sudo, the installer will:
|
|
||||||
- create minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive perms when possible;
|
|
||||||
- ensure inside containers: Kibana `data` → `/private/argus/log/kibana`, Elasticsearch `data` → `/private/argus/log/elasticsearch`, and Bind `rndc.key` is generated.
|
|
||||||
(Manual pre-creation scripts are no longer required.)
|
|
||||||
|
|
||||||
## Files & Layout
|
|
||||||
- `compose/` (docker-compose.yml, .env)
|
|
||||||
- `private/` (data mounts)
|
|
||||||
- `scripts/` (install/uninstall/status/selfcheck/diagnose)
|
|
||||||
- `logs/` (selfcheck + diagnose outputs)
|
|
||||||
|
|
||||||
## Troubleshooting (Quick)
|
|
||||||
- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
|
|
||||||
- Run `./server-diagnose.sh` → produces timestamped logs:
|
|
||||||
- `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
|
|
||||||
- `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
|
|
||||||
And updates `diagnose_details.log`/`diagnose_error.log` to the latest
|
|
||||||
- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
|
|
||||||
|
|
||||||
Common issues:
|
|
||||||
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
|
|
||||||
- web‑proxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
|
|
||||||
- EACCES/locks: the installer normally fixes directory permissions itself; if the error persists, run `sudo ./server-prepare-dirs.sh` manually and ensure ownership matches UID:GID
|
|
||||||
@ -1,29 +0,0 @@
|
|||||||
# Argus 服务端离线安装指南
|
|
||||||
|
|
||||||
## 先决条件
|
|
||||||
- Linux x86_64(推荐 Ubuntu 22.04;NixOS 见“兼容说明”)
|
|
||||||
- 已安装 Docker 与 Docker Compose
|
|
||||||
- 端口:32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110
|
|
||||||
|
|
||||||
## 快速开始
|
|
||||||
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`)
|
|
||||||
2. 安装:`./server-install.sh`(支持普通用户:会自动创建最小目录并在容器内修复 Kibana/ES/Bind)
|
|
||||||
3. 状态:`./server-status.sh`
|
|
||||||
4. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
|
|
||||||
5. 卸载:`./server-uninstall.sh`
|
|
||||||
|
|
||||||
## 安装流程要点
|
|
||||||
- 仅启动 10 个服务端组件(不包含测试节点);
|
|
||||||
- DNS Bootstrap:补齐首次部署 DNS 依赖(生成/确认 `dns.conf`、统一容器 resolv.conf、写入 `*.argus.com`、reload bind、重启 web‑proxy);
|
|
||||||
- 输出自检结果到 `logs/selfcheck.json`。
|
|
||||||
|
|
||||||
## 兼容说明(NixOS 等)
|
|
||||||
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`。
|
|
||||||
- 非 root 场景:安装器会创建最小目录(含 `private/argus/log/{elasticsearch,kibana}`),并在容器内完成:
|
|
||||||
- Kibana 的 `data` 软链到 `/private/argus/log/kibana`
|
|
||||||
- Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch`
|
|
||||||
- Bind 生成 `/etc/bind/rndc.key`
|
|
||||||
|
|
||||||
## 故障排查(见下文 Troubleshooting_zh)
|
|
||||||
- `./server-selfcheck.sh` → `logs/selfcheck.json`
|
|
||||||
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
# Argus 多机部署(Docker Swarm + External Overlay)
|
|
||||||
|
|
||||||
- 前提:Docker ≥ 20.10;Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。
|
|
||||||
- DNS:Bind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。
|
|
||||||
|
|
||||||
## 在部署机(Manager)
|
|
||||||
- 初始化 Swarm:`docker swarm init --advertise-addr <manager_ip>`
|
|
||||||
- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net`
|
|
||||||
- 解压离线包后执行:
|
|
||||||
- `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf)
|
|
||||||
- `./server-selfcheck.sh`(失败会自动触发诊断)
|
|
||||||
|
|
||||||
## 在节点机(Worker 或非 Docker 主机)
|
|
||||||
- Swarm Worker:执行 Manager 的 `docker swarm join ...`;
|
|
||||||
- 运行客户端容器:
|
|
||||||
- `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity`
|
|
||||||
- 进入容器安装(先 IP 引导,后域名):
|
|
||||||
- `curl -u ftpuser:*** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
|
|
||||||
- `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21`
|
|
||||||
|
|
||||||
## 关键点
|
|
||||||
- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf)
|
|
||||||
- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000`
|
|
||||||
- docker compose 改为 external overlay;容器内不使用 Docker 服务名;web-proxy 与组件上游统一用域名
|
|
||||||
|
|
||||||
## 找回/轮换 Swarm 加入令牌与解锁密钥
|
|
||||||
|
|
||||||
在任意一个 Manager 节点上执行以下命令即可查看或轮换加入令牌(join token):
|
|
||||||
|
|
||||||
- 查看加入 Worker 的命令:
|
|
||||||
- `docker swarm join-token worker`
|
|
||||||
- 只打印 Worker 的 token:
|
|
||||||
- `docker swarm join-token -q worker`
|
|
||||||
- 查看加入 Manager 的命令:
|
|
||||||
- `docker swarm join-token manager`
|
|
||||||
- 只打印 Manager 的 token:
|
|
||||||
- `docker swarm join-token -q manager`
|
|
||||||
|
|
||||||
在待加入节点执行(示例,替换 Manager_IP):
|
|
||||||
- `docker swarm join --token <上面查到的token> <Manager_IP>:2377`
|
|
||||||
|
|
||||||
轮换 token(怀疑泄露或需要更新时):
|
|
||||||
- 轮换 Worker:`docker swarm join-token --rotate worker`
|
|
||||||
- 轮换 Manager:`docker swarm join-token --rotate manager`
|
|
||||||
|
|
||||||
如需查看或轮换“解锁密钥”(autolock 的 unlock key),在 Manager 节点上执行:
|
|
||||||
- 查看:`docker swarm unlock-key`
|
|
||||||
- 轮换:`docker swarm unlock-key --rotate`
|
|
||||||
|
|
||||||
提示:当看到 “This node is not a swarm manager.” 时,说明当前节点不是 Manager,需要到 Manager 节点执行,或在现有 Manager 上 `docker node promote <NODE-ID>` 将其提升为 Manager。
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
# Troubleshooting
|
|
||||||
|
|
||||||
- Status: `scripts/server-status.sh`
|
|
||||||
- Selfcheck: `scripts/server-selfcheck.sh`
|
|
||||||
- Diagnose: `scripts/server-diagnose.sh`
|
|
||||||
|
|
||||||
Outputs:
|
|
||||||
- `logs/selfcheck.json`
|
|
||||||
- `logs/diagnose_details_*.log` (full details)
|
|
||||||
- `logs/diagnose_error_*.log` (tagged errors)
|
|
||||||
|
|
||||||
Web‑Proxy:
|
|
||||||
- 8083 expects 200/302/403; 8084/8085 must include CORS header
|
|
||||||
- nginx resolver should be `172.31.0.2 127.0.0.11`
|
|
||||||
|
|
||||||
Kibana/ES:
|
|
||||||
- Verify `es.log.argus.com` resolves inside Kibana
|
|
||||||
|
|
||||||
Permissions:
|
|
||||||
- The installer auto-creates minimal dirs and applies container-side fixes (Kibana/ES/Bind). If you still see EACCES/lock errors, rerun `./server-install.sh` and review diagnose logs.
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
# 故障排查
|
|
||||||
|
|
||||||
- 状态:`scripts/server-status.sh`
|
|
||||||
- 自检:`scripts/server-selfcheck.sh`
|
|
||||||
- 诊断:`scripts/server-diagnose.sh`
|
|
||||||
|
|
||||||
输出:
|
|
||||||
- `logs/selfcheck.json`
|
|
||||||
- `logs/diagnose_error_*.log`(错误摘要)
|
|
||||||
- `logs/diagnose_details_*.log`(详细信息)
|
|
||||||
|
|
||||||
Web‑Proxy:8083=200/302/403;8084/8085 需包含 CORS
|
|
||||||
Kibana:确认可解析 `es.log.argus.com`
|
|
||||||
权限:
|
|
||||||
- 非 root 安装时,安装器会创建最小目录并在容器内修复 Kibana/ES/Bind;
|
|
||||||
- 如仍有 `EACCES`/锁文件报错,先重跑 `./server-install.sh`(会重复容器内修复),并查看诊断日志。
|
|
||||||
@ -1,82 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
|
||||||
|
|
||||||
ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"
|
|
||||||
|
|
||||||
# Tunables (env overrides)
|
|
||||||
RELAX_WM_LOW="${RELAX_WM_LOW:-99%}"
|
|
||||||
RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}"
|
|
||||||
RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}"
|
|
||||||
DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}"
|
|
||||||
SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}"
|
|
||||||
CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}"
|
|
||||||
|
|
||||||
echo "[RELAX] Checking Elasticsearch at $ES_URL"
|
|
||||||
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
|
|
||||||
if [[ "$code" != "200" ]]; then
|
|
||||||
echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[RELAX] Applying transient cluster settings (watermarks)"
|
|
||||||
th_enabled=$([[ "$DISABLE_WATERMARK" == "1" ]] && echo false || echo true)
|
|
||||||
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "{
|
|
||||||
\"transient\": {
|
|
||||||
\"cluster.routing.allocation.disk.threshold_enabled\": $th_enabled,
|
|
||||||
\"cluster.routing.allocation.disk.watermark.low\": \"$RELAX_WM_LOW\",
|
|
||||||
\"cluster.routing.allocation.disk.watermark.high\": \"$RELAX_WM_HIGH\",
|
|
||||||
\"cluster.routing.allocation.disk.watermark.flood_stage\": \"$RELAX_WM_FLOOD\"
|
|
||||||
}
|
|
||||||
}" | sed -n '1,5p'
|
|
||||||
|
|
||||||
if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
|
|
||||||
echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
|
|
||||||
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{
|
|
||||||
"index.blocks.read_only": false,
|
|
||||||
"index.blocks.read_only_allow_delete": false
|
|
||||||
}' >/dev/null || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then
|
|
||||||
echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
|
|
||||||
# high priority template for .kibana* only, avoid impacting other indices
|
|
||||||
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
|
|
||||||
"index_patterns": [".kibana*"],
|
|
||||||
"priority": 200,
|
|
||||||
"template": { "settings": { "number_of_replicas": 0 } }
|
|
||||||
}' >/dev/null || true
|
|
||||||
# set existing .kibana* to replicas=0
|
|
||||||
idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}')
|
|
||||||
for i in $idxs; do
|
|
||||||
[[ -n "$i" ]] || continue
|
|
||||||
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$i/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Retry failed shard allocations (best-effort)
|
|
||||||
curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true
|
|
||||||
|
|
||||||
echo "[RELAX] Cluster health (post):"
|
|
||||||
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'
|
|
||||||
|
|
||||||
# Simple current status summary
|
|
||||||
ch=$(curl -sS "$ES_URL/_cluster/health" || true)
|
|
||||||
status=$(printf '%s' "$ch" | awk -F'"' '/"status"/{print $4; exit}')
|
|
||||||
unassigned=$(printf '%s' "$ch" | awk -F'[,: ]+' '/"unassigned_shards"/{print $3; exit}')
|
|
||||||
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
|
|
||||||
settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
|
|
||||||
th=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.threshold_enabled"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
|
|
||||||
low=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.low"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
|
|
||||||
high=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.high"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
|
|
||||||
flood=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.flood_stage"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
|
|
||||||
ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
|
|
||||||
total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
|
|
||||||
started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
|
|
||||||
unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
|
|
||||||
echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"
|
|
||||||
|
|
||||||
echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."
|
|
||||||
@ -1,37 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
|
||||||
|
|
||||||
ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"
|
|
||||||
|
|
||||||
echo "[RESTORE] Checking Elasticsearch at $ES_URL"
|
|
||||||
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
|
|
||||||
if [[ "$code" != "200" ]]; then
|
|
||||||
echo "[RESTORE][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[RESTORE] Re-enabling disk threshold and clearing relaxed watermarks (transient)"
|
|
||||||
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d '{
|
|
||||||
"transient": {
|
|
||||||
"cluster.routing.allocation.disk.threshold_enabled": true,
|
|
||||||
"cluster.routing.allocation.disk.watermark.low": null,
|
|
||||||
"cluster.routing.allocation.disk.watermark.high": null,
|
|
||||||
"cluster.routing.allocation.disk.watermark.flood_stage": null
|
|
||||||
}
|
|
||||||
}' | sed -n '1,5p'
|
|
||||||
|
|
||||||
# Optionally restore default replicas to 1 (set RESTORE_DEFAULT_REPLICAS=1 to enable)
|
|
||||||
if [[ "${RESTORE_DEFAULT_REPLICAS:-0}" == "1" ]]; then
|
|
||||||
echo "[RESTORE] Setting transient default index.number_of_replicas=1"
|
|
||||||
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d '{"transient":{"index.number_of_replicas":"1"}}' >/dev/null || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[RESTORE] Cluster health:"
|
|
||||||
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'
|
|
||||||
|
|
||||||
echo "[RESTORE] Done. Verify shards and consider keeping replicas=0 for single-node deployments."
|
|
||||||
|
|
||||||
@ -1,103 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Quick fix tool: replace 172.22/16 targets in nodes.json with overlay IPs resolved from hostname.
|
|
||||||
# Usage: run on server package host: scripts/fix-prom-targets-overlay.sh
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
NODES_JSON="$ROOT/private/argus/metric/prometheus/nodes.json"
|
|
||||||
|
|
||||||
# Abort the script (exit 1, message on stderr) unless the command named by $1
# can be resolved with `command -v`.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "[ERROR] missing command: $1" >&2
    exit 1
  fi
}
|
|
||||||
|
|
||||||
backup() {
|
|
||||||
local src="$1"; local ts; ts=$(date -u +%Y%m%d-%H%M%SZ)
|
|
||||||
cp "$src" "${src%.json}_bak_${ts}.json"
|
|
||||||
}
|
|
||||||
|
|
||||||
prefer_overlay_ip() {
|
|
||||||
local host="$1"
|
|
||||||
# prefer 10.0/8 then 172.31/16
|
|
||||||
getent hosts "$host" | awk '{print $1}' | while read -r ip; do
|
|
||||||
if [[ "$ip" =~ ^10\. ]]; then echo "$ip"; return; fi
|
|
||||||
done
|
|
||||||
getent hosts "$host" | awk '{print $1}' | while read -r ip; do
|
|
||||||
if [[ "$ip" =~ ^172\.31\. ]]; then echo "$ip"; return; fi
|
|
||||||
done
|
|
||||||
# fallback: first A record
|
|
||||||
getent hosts "$host" | awk '{print $1; exit}'
|
|
||||||
}
|
|
||||||
|
|
||||||
require_cmd awk
|
|
||||||
require_cmd sed
|
|
||||||
|
|
||||||
if [[ ! -f "$NODES_JSON" ]]; then
|
|
||||||
echo "[WARN] nodes.json not found: $NODES_JSON" >&2
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
backup "$NODES_JSON"
|
|
||||||
|
|
||||||
tmp=$(mktemp)
|
|
||||||
trap 'rm -f "$tmp"' EXIT
|
|
||||||
|
|
||||||
changed=0
|
|
||||||
# Run the embedded Python program (heredoc that follows) to rewrite nodes.json into $tmp.
# The failure handler is closed on this line: the previous `|| {` opened a brace group
# that was never terminated, which made bash stop with "unexpected end of file".
python3 - "$NODES_JSON" <<'PY' > "$tmp" || { echo "[ERROR] failed to rewrite nodes.json" >&2; exit 1; }
|
|
||||||
import ipaddress, json, sys, socket
|
|
||||||
path=sys.argv[1]
|
|
||||||
data=json.load(open(path)) if path else []
|
|
||||||
def resolve(host):
|
|
||||||
try:
|
|
||||||
infos=socket.getaddrinfo(host,None,family=socket.AF_INET)
|
|
||||||
ips=[i[4][0] for i in infos]
|
|
||||||
# prefer 10. over 172.31.
|
|
||||||
for ip in ips:
|
|
||||||
if ip.startswith('10.'): return ip
|
|
||||||
for ip in ips:
|
|
||||||
if ip.startswith('172.31.'): return ip
|
|
||||||
return ips[0] if ips else None
|
|
||||||
except OSError:
|
|
||||||
return None
|
|
||||||
gw=ipaddress.ip_network('172.22.0.0/16')
|
|
||||||
out=[]
|
|
||||||
changed=False
|
|
||||||
for item in data:
|
|
||||||
ip=item.get('ip')
|
|
||||||
host=item.get('hostname') or ''
|
|
||||||
try:
|
|
||||||
bad = ip and ipaddress.ip_address(ip) in gw
|
|
||||||
except Exception:
|
|
||||||
bad = False
|
|
||||||
if bad and host:
|
|
||||||
new=resolve(host)
|
|
||||||
if new:
|
|
||||||
item=dict(item)
|
|
||||||
item['ip']=new
|
|
||||||
changed=True
|
|
||||||
out.append(item)
|
|
||||||
json.dump(out, sys.stdout, ensure_ascii=False)
|
|
||||||
sys.stderr.write('CHANGED' if changed else 'UNCHANGED')
|
|
||||||
PY
|
|
||||||
|
|
||||||
status=$?
|
|
||||||
marker=$(tail -n1 /dev/stderr 2>/dev/null || true)
|
|
||||||
if [[ "$status" -ne 0 ]]; then
|
|
||||||
echo "[ERROR] failed to rewrite nodes.json" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if grep -q '"ip"\s*:\s*"172\.22\.' "$tmp"; then
|
|
||||||
echo "[WARN] some gwbridge targets remain; manual fix may be required" >&2
|
|
||||||
fi
|
|
||||||
|
|
||||||
mv "$tmp" "$NODES_JSON"
|
|
||||||
echo "[OK] nodes.json updated"
|
|
||||||
|
|
||||||
# try to reload Prometheus
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
|
|
||||||
docker exec argus-prometheus sh -lc 'pidof prometheus >/dev/null 2>&1 && kill -HUP $(pidof prometheus) || supervisorctl restart prometheus' >/dev/null 2>&1 || true
|
|
||||||
echo "[INFO] Prometheus reloaded"
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
|
|
||||||
@ -1,198 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
|
|
||||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
|
||||||
|
|
||||||
ts="$(date -u +%Y%m%d-%H%M%SZ)"
|
|
||||||
LOG_DIR="$ROOT/logs"
|
|
||||||
mkdir -p "$LOG_DIR" || true
|
|
||||||
# Fallback to /tmp when logs dir is not writable
|
|
||||||
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then
|
|
||||||
LOG_DIR="/tmp/argus-logs"
|
|
||||||
mkdir -p "$LOG_DIR" || true
|
|
||||||
fi
|
|
||||||
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"
|
|
||||||
ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
|
|
||||||
: > "$DETAILS"; : > "$ERRORS"
|
|
||||||
|
|
||||||
# Append one line to the $DETAILS log: a "YYYY-MM-DD HH:MM:SS" timestamp followed
# by all arguments joined with spaces.
logd() {
  local stamp
  stamp=$(date '+%F %T')
  printf '%s %s\n' "$stamp" "$*" >> "$DETAILS"
}
|
|
||||||
# Append the message (all arguments joined with spaces) as one line to the $ERRORS log.
# printf is used instead of echo so that a message such as "-n" or "-e" is written
# verbatim rather than being consumed as an echo option.
append_err() {
  printf '%s\n' "$*" >> "$ERRORS"
}
|
|
||||||
|
|
||||||
# Print the HTTP status code returned for URL $1, or exactly "000" when the request fails.
# curl writes "000" through -w itself before exiting non-zero on a transport failure, so the
# output is captured and replaced on error; the former `|| echo 000` appended a second
# "000" and produced "000000".
http_code() {
  local code
  code=$(curl -s -o /dev/null -w '%{http_code}' "$1") || code=000
  printf '%s' "${code:-000}"
}
|
|
||||||
# Print the first five lines of the response body for URL $1 (curl limited to 3 seconds).
# Best-effort: curl's stderr is discarded and the function always returns 0.
http_body_head() {
  local url=$1
  if ! curl -s --max-time 3 "$url" 2>/dev/null | sed -n '1,5p'; then
    return 0
  fi
}
|
|
||||||
# Print the value of the Access-Control-Allow-Origin response header for a curl request.
# All arguments are passed straight to curl (extra -H options, URL, ...).
# Prints nothing when the header is absent.
# The header name is compared via tolower() so the match is case-insensitive on every awk;
# the former IGNORECASE=1 only has an effect in gawk.
header_val() {
  curl -s -D - -o /dev/null "$@" \
    | awk -F': ' 'tolower($1) == "access-control-allow-origin" { gsub("\r", "", $2); print $2 }'
}
|
|
||||||
|
|
||||||
section() {
|
|
||||||
local name="$1"; logd "===== [$name] ====="; }
|
|
||||||
|
|
||||||
svc() {
|
|
||||||
local svc_name="$1"; local cname="$2"; shift 2
|
|
||||||
section "$svc_name ($cname)"
|
|
||||||
logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
|
|
||||||
logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
|
|
||||||
logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
# extract error lines from container logs
|
|
||||||
docker logs --tail 200 "$cname" 2>&1 | \
|
|
||||||
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
|
|
||||||
sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true
|
|
||||||
|
|
||||||
# supervisor status and logs
|
|
||||||
if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
|
|
||||||
logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
|
|
||||||
# iterate supervisor logs and collect tails + errors per file
|
|
||||||
local files
|
|
||||||
files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
|
|
||||||
for f in $files; do
|
|
||||||
logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
|
|
||||||
docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \
|
|
||||||
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
|
|
||||||
sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Core services
|
|
||||||
svc bind argus-bind-sys
|
|
||||||
svc master argus-master-sys
|
|
||||||
svc es argus-es-sys
|
|
||||||
svc kibana argus-kibana-sys
|
|
||||||
svc ftp argus-ftp
|
|
||||||
svc prometheus argus-prometheus
|
|
||||||
svc grafana argus-grafana
|
|
||||||
svc alertmanager argus-alertmanager
|
|
||||||
svc web-frontend argus-web-frontend
|
|
||||||
svc web-proxy argus-web-proxy
|
|
||||||
|
|
||||||
# HTTP checks (host side)
|
|
||||||
section HTTP
|
|
||||||
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"
|
|
||||||
http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"
|
|
||||||
http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
|
|
||||||
|
|
||||||
logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
|
|
||||||
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"
|
|
||||||
http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
|
|
||||||
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
|
|
||||||
|
|
||||||
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
|
|
||||||
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
|
|
||||||
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
|
|
||||||
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
|
|
||||||
logd "Web-Proxy 8084 CORS: ${cors8084}"
|
|
||||||
logd "Web-Proxy 8085 CORS: ${cors8085}"
|
|
||||||
|
|
||||||
# Elasticsearch deep checks: disk watermark and Kibana index status
|
|
||||||
section ES-CHECKS
|
|
||||||
ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
|
|
||||||
status=$(printf '%s' "$ch" | awk -F'\"' '/"status"/{print $4; exit}')
|
|
||||||
if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
|
|
||||||
if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi
|
|
||||||
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
|
|
||||||
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
|
|
||||||
logd "es.data.df_use=$duse"
|
|
||||||
usep=${duse%%%}
|
|
||||||
if [[ -n "$usep" ]] && (( usep >= 90 )); then
|
|
||||||
append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks"
|
|
||||||
echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true)
|
|
||||||
# Word-boundary match on non-started shard states. Inside single quotes the regex needs a
# single backslash: the former '\\b' matched a literal backslash followed by "b", so this
# test could never succeed.
if printf '%s' "$ks" | grep -Eiq '\b(UNASSIGNED|INITIALIZING|RELOCATING)\b'; then
|
|
||||||
append_err "[kibana][index] .kibana* shards not green"; logd "$ks"
|
|
||||||
echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Overlay network diagnostics
|
|
||||||
section OVERLAY-NET
|
|
||||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
|
||||||
logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
|
|
||||||
docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
|
|
||||||
else
|
|
||||||
append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Domain resolution & reachability from inside web-proxy (bind-backed)
|
|
||||||
section DOMAIN
|
|
||||||
for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
|
|
||||||
logd "getent $d (web-proxy):"
|
|
||||||
docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
|
|
||||||
done
|
|
||||||
# Probe master /readyz from inside the web-proxy container. The sh -lc script is passed as one
# plain double-quoted word: quoting inside $(...) is independent of the outer string, and the
# former \" made sh receive the broken script '"curl', so the result was always 000.
logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz" 2>/dev/null || echo 000)"
|
|
||||||
# Probe ES cluster health from inside the web-proxy container. The sh -lc script is passed as
# one plain double-quoted word: quoting inside $(...) is independent of the outer string, and
# the former \" made sh receive the broken script '"curl', so the result was always 000.
logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health" 2>/dev/null || echo 000)"
|
|
||||||
# Probe Kibana /api/status from inside the web-proxy container. The sh -lc script is passed as
# one plain double-quoted word: quoting inside $(...) is independent of the outer string, and
# the former \" made sh receive the broken script '"curl', so the result was always 000.
logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.argus.com:5601/api/status" 2>/dev/null || echo 000)"
|
|
||||||
|
|
||||||
# FTP share writability (container perspective)
|
|
||||||
section FTP-SHARE
|
|
||||||
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
# Collect system info for context
|
|
||||||
section SYSTEM
|
|
||||||
logd "uname -a:"; uname -a >> "$DETAILS"
|
|
||||||
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
|
|
||||||
logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
section SUMMARY
|
|
||||||
# Add HTTP failures and CORS problems to error log with tags
|
|
||||||
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
|
|
||||||
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
|
|
||||||
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
|
|
||||||
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
|
|
||||||
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
|
|
||||||
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
|
|
||||||
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
|
|
||||||
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"
|
|
||||||
|
|
||||||
# Deduplicate errors
|
|
||||||
sort -u -o "$ERRORS" "$ERRORS"
|
|
||||||
|
|
||||||
# --- Prometheus targets & nodes.json checks ---
|
|
||||||
section PROMETHEUS-TARGETS
|
|
||||||
nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json"
|
|
||||||
if [[ -f "$nodes_json_path" ]]; then
|
|
||||||
logd "nodes.json present: $nodes_json_path"
|
|
||||||
# detect gwbridge addresses (172.22/16)
|
|
||||||
if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then
|
|
||||||
append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)."
|
|
||||||
echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." >&2
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
logd "nodes.json missing at $nodes_json_path"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Query Prometheus activeTargets and list down items when possible
|
|
||||||
pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true)
|
|
||||||
if command -v jq >/dev/null 2>&1; then
|
|
||||||
downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true)
|
|
||||||
if [[ -n "$downs" ]]; then
|
|
||||||
printf '%s\n' "$downs" >> "$ERRORS"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
# best-effort grep when jq is unavailable
|
|
||||||
if printf '%s' "$pt_json" | grep -q '"health":"down"'; then
|
|
||||||
append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Diagnostic details -> $DETAILS"
|
|
||||||
echo "Detected errors -> $ERRORS"
|
|
||||||
|
|
||||||
if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
|
|
||||||
# maintain latest symlinks when writing under package logs
|
|
||||||
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
|
|
||||||
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
|
|
||||||
else
|
|
||||||
echo "Diagnostic details -> $DETAILS"
|
|
||||||
echo "Detected errors -> $ERRORS"
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
@ -1,365 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" # version root
|
|
||||||
|
|
||||||
PROJECT_NAME="argus-sys"
|
|
||||||
|
|
||||||
# Print an informational installer message (blue [INSTALL] tag) to stdout.
log() {
  echo -e "\033[0;34m[INSTALL]\033[0m $*"
}

# Print an error message (red [ERROR ] tag) to stderr.
err() {
  echo -e "\033[0;31m[ERROR ]\033[0m $*" >&2
}

# Abort the script with exit code 1 unless command $1 can be found.
require() {
  if ! command -v "$1" >/dev/null 2>&1; then
    err "missing command: $1"
    exit 1
  fi
}
|
|
||||||
|
|
||||||
require docker
|
|
||||||
if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else require docker-compose; COMPOSE=(docker-compose); fi
|
|
||||||
|
|
||||||
ENV_FILE="$PKG_ROOT/compose/.env"
|
|
||||||
ENV_TEMPLATE="$PKG_ROOT/compose/.env.example"
|
|
||||||
|
|
||||||
# Print a TCP port that no local socket is currently listening on.
# Arguments:
#   $1 - preferred port; printed as-is when it is not in use
#   $2 - first port of the fallback scan range (default 20000)
#   $3 - last port of the fallback scan range (default 65000)
# Outputs:  the chosen port on stdout
# Returns:  0 when a port was printed, 1 when every candidate is in use
find_free_port() {
  local prefer="$1"
  local start="${2:-20000}"
  local max="${3:-65000}"
  local busy p

  # Take one snapshot of the listening ports instead of running `ss` for
  # every candidate. If `ss` is missing or fails, the list is empty and every
  # port is considered free (same outcome as before).
  busy=$(ss -ltnH 2>/dev/null | awk '{ n = split($4, a, ":"); print a[n] }' || true)
  # Wrap in newlines so a whole-line match can be done with a glob.
  busy=$'\n'"${busy}"$'\n'

  if [[ "$busy" != *$'\n'"${prefer}"$'\n'* ]]; then
    echo "$prefer"
    return 0
  fi

  for ((p = start; p <= max; p++)); do
    if [[ "$busy" != *$'\n'"${p}"$'\n'* ]]; then
      echo "$p"
      return 0
    fi
  done
  return 1
}
|
|
||||||
|
|
||||||
# Create compose/.env from the template on first install.
# An existing .env is left untouched; a missing template aborts the script.
prepare_env() {
  if [[ -f "$ENV_FILE" ]]; then
    log ".env exists, keep as-is"
    return
  fi
  if [[ ! -f "$ENV_TEMPLATE" ]]; then
    err "missing $ENV_TEMPLATE"
    exit 1
  fi
  cp "$ENV_TEMPLATE" "$ENV_FILE"
  # In overlay mode, avoid handing the same new port to different services;
  # the template ports are kept here without any automatic rewriting.
}
|
|
||||||
|
|
||||||
# read VAR from .env (simple parser)
|
|
||||||
# Print the value of variable $1 from $ENV_FILE (text after the first '=').
# Only the first matching KEY=... line is used.
# Returns 1 when the file does not exist or the key is not present.
_read_env_var() {
  local name="$1"
  local env_path="$ENV_FILE"
  if [[ ! -f "$env_path" ]]; then
    return 1
  fi
  awk -F'=' -v k="$name" '
    BEGIN { hit = 0 }
    $1 == k { print substr($0, index($0, "=") + 1); hit = 1; exit }
    END { exit hit ? 0 : 1 }
  ' "$env_path"
}
|
|
||||||
|
|
||||||
# set or append VAR=VAL in .env atomically
|
|
||||||
# Set VAR=VAL in $ENV_FILE.
# Arguments: $1 - variable name, $2 - value
# An existing "VAR=" line is rewritten through a temp file in the same
# directory followed by mv (so readers never see a half-written file);
# otherwise "VAR=VAL" is appended, creating the file when needed.
# Returns non-zero when the rewrite fails; the temp file is removed then.
_set_env_var() {
  local var="$1"
  local val="$2"
  local f="$ENV_FILE"
  local tmp="$ENV_FILE.tmp$$"
  local escaped

  if [[ -f "$f" ]] && grep -qE "^${var}=" "$f"; then
    # Escape the characters that are special in the sed replacement below:
    # backslash, '&' (whole match) and '#' (the delimiter in use).
    escaped=$(printf '%s' "$val" | sed -e 's/[\\&#]/\\&/g')
    if sed -E "s#^(${var}=).*#\\1${escaped}#" "$f" >"$tmp"; then
      mv "$tmp" "$f"
    else
      rm -f -- "$tmp"
      return 1
    fi
  else
    [[ -f "$f" ]] || : >"$f"
    printf '%s=%s\n' "$var" "$val" >>"$f"
  fi
}
|
|
||||||
|
|
||||||
# Make sure every published host port in .env is free and unique.
# Globals:  AUTO_ASSIGN_PORTS (read; 0/false/no/off disables), ENV_FILE (read,
#           rewritten via _set_env_var after a timestamped backup copy)
# For each VAR:default pair the current value is read from .env (falling back
# to the default). A value that is already listening on this host, or already
# handed out earlier in this run, is replaced by a free port from the
# 20000-65000 range. Every variable is written back so it exists in .env.
# Exits 1 when no free port can be found.
auto_assign_ports() {
  local enable="${AUTO_ASSIGN_PORTS:-true}"
  case "$enable" in
    0|false|no|off)
      log "AUTO_ASSIGN_PORTS disabled"
      return
      ;;
  esac
  [[ -f "$ENV_FILE" ]] || return 0
  log "auto-assigning free host ports (with fallback)"
  cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true

  # VAR:default pairs to process; FTP ports and the passive port range are
  # deliberately not rewritten automatically.
  local pairs=(
    "MASTER_PORT:32300"
    "ES_HTTP_PORT:9200"
    "KIBANA_PORT:5601"
    "PROMETHEUS_PORT:9090"
    "ALERTMANAGER_PORT:9093"
    "GRAFANA_PORT:3000"
    "WEB_PROXY_PORT_8080:8080"
    "WEB_PROXY_PORT_8081:8081"
    "WEB_PROXY_PORT_8082:8082"
    "WEB_PROXY_PORT_8083:8083"
    "WEB_PROXY_PORT_8084:8084"
    "WEB_PROXY_PORT_8085:8085"
  )

  # Ports reserved in this run; pre-seeded with the ports that are already
  # listening so they are never picked as a replacement.
  local -A reserved=()
  local lp
  while read -r lp; do
    if [[ -n "$lp" ]]; then
      reserved["$lp"]=1
    fi
  done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p')

  local ent var def cur cand attempts start next
  for ent in "${pairs[@]}"; do
    var=${ent%%:*}
    def=${ent##*:}
    if ! cur=$(_read_env_var "$var"); then
      cur="$def"
    fi
    # Strip a trailing CR/LF (CRLF-edited .env) and any double quotes.
    cur=${cur%$'\r'}
    cur=${cur%$'\n'}
    cur=${cur//\"/}
    if [[ -z "$cur" ]]; then
      cur="$def"
    elif [[ ! "$cur" =~ ^[0-9]+$ ]]; then
      # Not a plain port number: leave it alone rather than clobbering it.
      log "  $var has a non-numeric value ('$cur'); leaving it untouched"
      continue
    fi

    cand="$cur"
    # If the configured port is already listening, pick a free one.
    if ss -ltnH 2>/dev/null | awk -v pat=":${cand}\$" '$4 ~ pat{f=1} END{exit f?0:1}'; then
      if ! cand=$(find_free_port "$cand" 20000 65000); then
        err "no free port available while assigning for $var (last tried: $cur)"
        exit 1
      fi
    fi

    # Avoid handing out a port that was already reserved in this run.
    attempts=0
    while [[ -n "${reserved[$cand]:-}" ]]; do
      attempts=$((attempts + 1))
      start=$((cand + 1))
      if (( start < 20000 )); then
        start=20000
      fi
      next=$(find_free_port "$start" "$start" 65000 || true)
      if [[ -z "$next" ]]; then
        next=$(find_free_port 20000 20000 65000 || true)
      fi
      if [[ -z "$next" || "$next" == "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cand)"
        exit 1
      fi
      cand="$next"
      if (( attempts > 1000 )); then
        err "port assignment loop exceeded for $var"
        exit 1
      fi
    done
    reserved["$cand"]=1

    if [[ "$cand" != "$cur" ]]; then
      log "  port reassigned: $var $cur -> $cand"
    fi
    # Always write the value so the variable exists in .env for clarity.
    _set_env_var "$var" "$cand"
  done
}
|
|
||||||
|
|
||||||
# Best-effort creation of the data directories when installing without root.
# As root this function does nothing. As a non-root user it prints a warning,
# creates the directory tree under $PKG_ROOT/private/argus (no chown) and
# opens its permissions so containers with a different UID can still write.
prepare_data_dirs() {
  if [[ $EUID -eq 0 ]]; then
    return 0
  fi

  echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
  echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"

  local base="$PKG_ROOT/private/argus"
  local rel
  local -a wanted=()
  for rel in \
    etc \
    log/elasticsearch \
    log/kibana \
    metric/prometheus \
    metric/prometheus/data \
    metric/prometheus/rules \
    metric/grafana \
    metric/grafana/data \
    metric/grafana/logs \
    metric/grafana/plugins \
    metric/grafana/provisioning/datasources \
    metric/grafana/provisioning/dashboards \
    metric/grafana/data/sessions \
    metric/grafana/data/dashboards \
    alert/alertmanager \
    metric/ftp/share; do
    wanted+=("$base/$rel")
  done

  # Directories are still created even though ownership cannot be changed.
  mkdir -p "${wanted[@]}"
  # Non-root: relax permissions so a container UID mismatch cannot block writes.
  chmod -R a+rwx "$base" 2>/dev/null || true
}
|
|
||||||
|
|
||||||
# Verify that Docker Swarm is active and that the overlay network exists.
# Globals: OVERLAY_NET_NAME (read; defaults to argus-sys-net)
# Exits 1 with a hint when swarm is not active; creates an attachable overlay
# network when `docker network inspect` does not find it.
ensure_swarm_and_overlay() {
  local overlay="${OVERLAY_NET_NAME:-argus-sys-net}"
  local swarm_state
  swarm_state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")

  case "$swarm_state" in
    active) ;;
    *)
      err "Docker Swarm is not active. On this host run:"
      err " docker swarm init --advertise-addr <this_host_ip>"
      exit 1
      ;;
  esac

  # Nothing to do when the network is already there.
  if docker network inspect "$overlay" >/dev/null 2>&1; then
    return 0
  fi
  log "creating attachable overlay network: $overlay"
  docker network create --driver overlay --attachable "$overlay" >/dev/null
}
|
|
||||||
|
|
||||||
# Seed $PKG_ROOT/private/argus/etc/dns.conf with this host's primary IP.
# Does nothing when dns.conf already exists and is non-empty.
# The IP is the "src" address reported by `ip route get 1.1.1.1`, falling back
# to the first address printed by `hostname -I`. When neither yields an
# address only an error is logged; the function still returns 0.
bootstrap_dns_conf() {
  local etc_dir="$PKG_ROOT/private/argus/etc"
  local dns_file="$etc_dir/dns.conf"
  local host_ip=""

  mkdir -p "$etc_dir"
  if [[ -s "$dns_file" ]]; then
    return 0
  fi

  # Look for the token after "src" instead of a fixed column: the column
  # shifts when the route has no "via <gateway>" part. `|| true` keeps a
  # failing `ip` from aborting the script under `set -euo pipefail`, so the
  # fallback below is actually reachable.
  host_ip=$(ip route get 1.1.1.1 2>/dev/null \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }' \
    || true)
  if [[ -z "$host_ip" ]]; then
    host_ip=$(hostname -I 2>/dev/null | awk '{print $1}' || true)
  fi

  if [[ -n "$host_ip" ]]; then
    echo "$host_ip" > "$dns_file"
    log "wrote initial dns.conf with host IP: $host_ip"
  else
    err "failed to determine host IP for dns.conf; please edit $dns_file manually"
  fi
}
|
|
||||||
|
|
||||||
# Load the bundled image archive ($PKG_ROOT/images/all-images.tar.gz) into
# the local Docker daemon. Exits 1 when the archive is missing.
load_images() {
  local archive="$PKG_ROOT/images/all-images.tar.gz"
  if [[ ! -f "$archive" ]]; then
    err "missing images tar: $archive"
    exit 1
  fi
  log "loading images from $(basename "$archive") (may take minutes)"
  # Decompress on the fly and stream straight into `docker load`.
  gunzip -c "$archive" | docker load >/dev/null
}
|
|
||||||
|
|
||||||
# Start the server-side services with docker compose.
# Steps: verify swarm/overlay, seed dns.conf, generate the OS-compat compose
# override once (only when the file does not exist yet), then `up -d` the
# server services listed below under project $PROJECT_NAME.
bring_up() {
  log "starting services via compose"
  ensure_swarm_and_overlay
  bootstrap_dns_conf

  local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
  if [[ ! -f "$ov" ]]; then
    # The YAML nesting below is significant; keep the indentation intact.
    cat > "$ov" <<'YAML'
services:
  bind:
    security_opt: ["label=disable"]
    userns_mode: "host"
    tmpfs:
      - /run/named
  master:
    security_opt: ["label=disable"]
    userns_mode: "host"
  es:
    security_opt: ["label=disable"]
    userns_mode: "host"
  kibana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  ftp:
    security_opt: ["label=disable"]
    userns_mode: "host"
  prometheus:
    security_opt: ["label=disable"]
    userns_mode: "host"
  grafana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  alertmanager:
    security_opt: ["label=disable"]
    userns_mode: "host"
    # ensure runtime path matches container expectation
    volumes:
      - ../private/argus/etc:/private/argus/etc
      - ../private/argus/alert/alertmanager:/alertmanager
  web-frontend:
    security_opt: ["label=disable"]
    userns_mode: "host"
  web-proxy:
    security_opt: ["label=disable"]
    userns_mode: "host"
YAML
    log "generated OS-compat override: $(basename "$ov")"
  fi

  # Start only the server-side components so the test nodes
  # (node-a/node-b/test-node/test-gpu-node) are not started by accident.
  local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
  log "services: ${services[*]}"
  # Compose runs from the compose directory, so the override is passed by
  # file name; the substitution is quoted to keep it a single argument.
  (
    cd "$PKG_ROOT/compose" \
      && "${COMPOSE[@]}" -p "$PROJECT_NAME" \
        -f docker-compose.yml -f "$(basename "$ov")" \
        up -d "${services[@]}"
  )
}
|
|
||||||
|
|
||||||
# Post bootstrap container-side fixes that do not require sudo on host.
|
|
||||||
# Container-side fixes applied after start-up; none of them needs sudo on the
# host. Each fix runs only when its container shows up in `docker ps`, and
# every failure is ignored (best effort).
post_bootstrap_fixes() {
  local kibana_fix es_fix bind_fix

  # Kibana: make /usr/share/kibana/data a symlink into the mounted path to
  # avoid EACCES.
  kibana_fix='
set -e
mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
'
  # Elasticsearch: point the data path at the mounted, writable path.
  es_fix='
set -e
mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
'
  # Bind9: make sure rndc.key exists.
  bind_fix='
set -e
mkdir -p /etc/bind
if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
chmod 644 /etc/bind/rndc.key || true
'

  if docker ps --format '{{.Names}}' | grep -q '^argus-kibana-sys$'; then
    docker exec argus-kibana-sys bash -lc "$kibana_fix" >/dev/null 2>&1 || true
  fi
  if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
    docker exec argus-es-sys bash -lc "$es_fix" >/dev/null 2>&1 || true
  fi
  if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
    docker exec argus-bind-sys bash -lc "$bind_fix" >/dev/null 2>&1 || true
  fi
}
|
|
||||||
|
|
||||||
# Wire the running containers to the bind service (best effort throughout).
#   1) make sure the shared dns.conf exists (fallback IP 172.31.0.2)
#   2) wait up to ~10s for update-dns.sh to appear in the shared etc dir
#   3) run update-dns.sh inside the key containers
#   4) wait up to 15s for the per-service A-record hint files
#   5) reload the bind zone
#   6) restart web-proxy so nginx re-reads the resolver
# Counters are advanced with plain assignments: `((i++))` has exit status 1
# when the old value is 0, which aborts the script under `set -e`.
dns_bootstrap() {
  log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"

  # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2)
  if [[ ! -s "$etc_dir/dns.conf" ]]; then
    if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then
      log "wrote fallback dns.conf with 172.31.0.2"
    else
      # host-side write denied (ownership 1000:1000); write via bind container instead
      if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
        docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true
        log "fallback dns.conf written via bind container"
      else
        log "bind not ready; skip writing fallback dns.conf"
      fi
    fi
  fi

  # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this)
  local i=0
  while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do
    sleep 0.5
    i=$((i + 1))
  done
  if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
    log "update-dns.sh not present yet; continuing with existing resolv.conf"
  fi

  # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
  local c
  for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do
    if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then
      docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true
    fi
  done

  # 4) wait for service A-record hint files generated by services (best-effort)
  local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com )
  local waited=0
  local missing=1
  local f
  while (( waited < 15 )); do
    missing=0
    for f in "${need[@]}"; do
      if [[ ! -s "$etc_dir/$f" ]]; then
        missing=1
        break
      fi
    done
    if (( missing == 0 )); then
      break
    fi
    sleep 1
    waited=$((waited + 1))
  done

  # 5) reload bind zone (script uses supervisor to restart bind9)
  if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
    docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
  fi

  # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf
  if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then
    docker restart argus-web-proxy >/dev/null 2>&1 || true
  fi
}
|
|
||||||
|
|
||||||
# Run scripts/server-selfcheck.sh, retrying to absorb cold starts.
# Globals:
#   SELF_CHECK_RETRIES      - retries after the first attempt (default 5)
#   SELF_CHECK_WAIT_SECONDS - pause before each retry in seconds (default 30)
# Returns 0 as soon as one run succeeds; exits 1 once all attempts failed.
selfcheck() {
  local retries="${SELF_CHECK_RETRIES:-5}"
  local pause="${SELF_CHECK_WAIT_SECONDS:-30}"
  local n=0
  local label

  while true; do
    n=$((n + 1))
    if (( n > 1 )); then
      label="attempt ${n}/${retries}+1"
    else
      label="attempt ${n}"
    fi
    log "running selfcheck (${label})"

    bash "$PKG_ROOT/scripts/server-selfcheck.sh" && return 0

    # This run failed: give up when the retry budget is spent.
    if (( n > retries )); then
      err "selfcheck failed after ${n} attempt(s)"
      exit 1
    fi
    log "selfcheck not ready yet; retrying in ${pause}s..."
    sleep "$pause"
  done
}
|
|
||||||
|
|
||||||
# Installer entry point: create the logs directory, then run every install
# step in order. The script runs under `set -e`, so a failing step aborts.
main() {
  local step
  mkdir -p "$PKG_ROOT/logs"
  for step in \
    prepare_env \
    auto_assign_ports \
    prepare_data_dirs \
    load_images \
    bring_up \
    post_bootstrap_fixes \
    dns_bootstrap \
    selfcheck; do
    "$step"
  done
  log "install completed. See logs in $PKG_ROOT/logs/"
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
@ -1,73 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Create the Argus server data directories under <package>/private/argus and
# hand them over to the runtime account. Must be run as root.
# The owner comes from ARGUS_BUILD_UID / ARGUS_BUILD_GID (environment or
# compose/.env), defaulting to 1000:1000.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

if [[ $EUID -ne 0 ]]; then
  echo "[PREPARE] This script requires root (sudo)." >&2
  echo " Try: sudo $0" >&2
  exit 1
fi

ENV_FILE="$PKG_ROOT/compose/.env"
# Export everything defined in .env while it is being sourced.
if [[ -f "$ENV_FILE" ]]; then
  set -a && source "$ENV_FILE" && set +a
fi
UIDV="${ARGUS_BUILD_UID:-1000}"
GIDV="${ARGUS_BUILD_GID:-1000}"

echo "[PREPARE] Using owner ${UIDV}:${GIDV}"

DATA_ROOT="$PKG_ROOT/private/argus"

data_dirs=(
  # Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
  "$DATA_ROOT/etc"
  "$DATA_ROOT/bind"
  "$DATA_ROOT/master"
  "$DATA_ROOT/agent"
  "$DATA_ROOT/log/elasticsearch"
  "$DATA_ROOT/log/kibana"
  # Prometheus
  "$DATA_ROOT/metric/prometheus"
  "$DATA_ROOT/metric/prometheus/data"
  "$DATA_ROOT/metric/prometheus/rules"
  "$DATA_ROOT/metric/prometheus/targets"
  # Grafana
  "$DATA_ROOT/metric/grafana"
  "$DATA_ROOT/metric/grafana/data"
  "$DATA_ROOT/metric/grafana/logs"
  "$DATA_ROOT/metric/grafana/plugins"
  "$DATA_ROOT/metric/grafana/provisioning/datasources"
  "$DATA_ROOT/metric/grafana/provisioning/dashboards"
  "$DATA_ROOT/metric/grafana/data/sessions"
  "$DATA_ROOT/metric/grafana/data/dashboards"
  "$DATA_ROOT/metric/grafana/config"
  # FTP
  "$DATA_ROOT/metric/ftp/share"
  # Alertmanager
  "$DATA_ROOT/alert/alertmanager"
)
mkdir -p "${data_dirs[@]}"

# Trees handed over recursively to the runtime account.
owned_trees=(
  "$DATA_ROOT/etc"
  "$DATA_ROOT/bind"
  "$DATA_ROOT/master"
  "$DATA_ROOT/agent"
  "$DATA_ROOT/log/elasticsearch"
  "$DATA_ROOT/log/kibana"
  "$DATA_ROOT/metric/prometheus"
  "$DATA_ROOT/metric/grafana"
  "$DATA_ROOT/metric/ftp"
  "$DATA_ROOT/alert"
)
chown -R "$UIDV":"$GIDV" "${owned_trees[@]}"

chmod -R g+w "$DATA_ROOT/alert" "$DATA_ROOT/etc" || true

# Ensure parent directories also owned by runtime user for consistency
chown "$UIDV":"$GIDV" \
  "$DATA_ROOT" \
  "$DATA_ROOT/log" \
  "$DATA_ROOT/metric" || true

echo "[PREPARE] Done. You can now run server-install.sh"
|
|
||||||
@ -1,104 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
|
|
||||||
# Print a progress message (blue [CHECK] tag) to stdout.
log() {
  echo -e "\033[0;34m[CHECK]\033[0m $*"
}

# Print an error message (red [ERROR] tag) to stderr.
err() {
  echo -e "\033[0;31m[ERROR]\033[0m $*" >&2
}
|
|
||||||
|
|
||||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
|
||||||
|
|
||||||
# Poll URL $1 until curl succeeds.
# Arguments: $1 - URL; $2 - maximum number of attempts (default 120)
# Prints a progress line and sleeps 5s after every failed attempt.
# Returns 0 on the first success, 1 when all attempts failed.
wait_http() {
  local url="$1"
  local attempts=${2:-120}
  local n
  for ((n = 1; n <= attempts; n++)); do
    if curl -fsS "$url" >/dev/null 2>&1; then
      return 0
    fi
    echo "[..] waiting $url ($n/$attempts)"
    sleep 5
  done
  return 1
}
|
|
||||||
# Print the HTTP status code returned for URL $1, or 000 when the request
# itself fails.
# curl's -w output already prints "000" when no response was received, so the
# code is captured first and replaced (not appended to) on failure; appending
# would yield "000000".
code_for() {
  local code
  code=$(curl -s -o /dev/null -w "%{http_code}" "$1") || code=000
  printf '%s\n' "${code:-000}"
}
|
|
||||||
# Print the value of the Access-Control-Allow-Origin response header.
# Arguments: passed through to curl (request headers, URL, ...)
# Prints nothing when the header is absent.
# The header name is compared case-insensitively with tolower(), which works
# in every awk; the IGNORECASE variable is honoured by gawk only.
header_val() {
  curl -s -D - -o /dev/null "$@" \
    | awk -F': ' 'tolower($1) == "access-control-allow-origin" { gsub("\r", "", $2); print $2 }'
}
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
# Selfcheck body: probe every service, write the results to selfcheck.json
# and exit 0 only when all probes passed. On failure server-diagnose.sh is run
# (when present and executable) to collect more details.
# ---------------------------------------------------------------------------
LOG_DIR="$ROOT/logs"
mkdir -p "$LOG_DIR" || true
OUT_JSON="$LOG_DIR/selfcheck.json"
tmp=$(mktemp)

ok=1

log "checking overlay network"
net_ok=false
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
  if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then
    net_ok=true
  fi
fi
[[ "$net_ok" == true ]] || ok=0

log "checking Elasticsearch (via domain inside web-proxy)"
es_ok=false
# The URL is single-quoted for the inner shell: an unquoted '&' would put
# curl in the background and make this check succeed unconditionally.
if docker exec argus-web-proxy sh -lc "curl -fsS 'http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s'" >/dev/null 2>&1; then
  es_ok=true
else
  ok=0
fi

log "checking Kibana (via domain inside web-proxy)"
kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status") || kb_code=000
kb_ok=false
if [[ "$kb_code" == "200" ]]; then
  # `|| true`: a failing fetch must fail this check, not abort the script
  # before the report is written.
  body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status" || true)
  if grep -q '"level":"available"' <<<"$body"; then
    kb_ok=true
  fi
fi
[[ "$kb_ok" == true ]] || ok=0

log "checking Master (via domain inside web-proxy)"
master_ok=false
if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then
  master_ok=true
else
  ok=0
fi

log "checking FTP"
ftp_ok=false
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
  if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then
    ftp_ok=true
  else
    ok=0
  fi
else
  ok=0
fi

log "checking Prometheus"
prom_ok=false
if wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60; then
  prom_ok=true
else
  ok=0
fi

log "checking Grafana"
gf_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${GRAFANA_PORT:-3000}/api/health") || gf_code=000
gf_ok=false
if [[ "$gf_code" == "200" ]]; then
  body=$(curl -sS "http://localhost:${GRAFANA_PORT:-3000}/api/health" || true)
  if grep -q '"database"\s*:\s*"ok"' <<<"$body"; then
    gf_ok=true
  fi
fi
[[ "$gf_ok" == true ]] || ok=0

log "checking Alertmanager"
am_ok=false
if wait_http "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status" 60; then
  am_ok=true
else
  ok=0
fi

log "checking Web-Proxy"
p8080=$(code_for "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")
p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
wp_ok=true
# In some environments the home page may answer 403, so 200/403 are accepted.
[[ "$p8080" == 200 || "$p8080" == 403 ]] || wp_ok=false
[[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]] || wp_ok=false
[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
[[ "$wp_ok" == true ]] || ok=0

# Every field reports the result of its own probe.
cat > "$tmp" <<JSON
{
  "es": $es_ok,
  "kibana": $kb_ok,
  "master_readyz": $master_ok,
  "ftp_share_writable": $ftp_ok,
  "prometheus": $prom_ok,
  "grafana": $gf_ok,
  "alertmanager": $am_ok,
  "web_proxy": $wp_ok,
  "overlay_net": $net_ok,
  "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
JSON

if ! mv "$tmp" "$OUT_JSON" 2>/dev/null; then
  # fallback when logs dir not writable (no sudo allowed)
  OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json"
  cp "$tmp" "$OUT_JSON"
  rm -f -- "$tmp"
  log "selfcheck.json written to $OUT_JSON (logs dir not writable)"
fi

if [[ "$ok" == 1 ]]; then
  log "selfcheck OK"
  exit 0
else
  err "selfcheck FAILED (see $OUT_JSON)"
  # If diagnose script exists, run it to collect more details
  if [[ -x "$SCRIPT_DIR/server-diagnose.sh" ]]; then
    # run diagnose; it will print the actual timestamped file paths and update 'latest' symlinks
    "$SCRIPT_DIR/server-diagnose.sh" || true
  fi
  exit 1
fi
|
|
||||||
@ -1,28 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Show the compose container list, the key service endpoints (ports taken
# from compose/.env when present) and the last selfcheck result.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

PROJECT_NAME="argus-sys"

# Prefer the compose plugin; fall back to the standalone binary.
if docker compose version >/dev/null 2>&1; then
  COMPOSE=(docker compose)
else
  COMPOSE=(docker-compose)
fi

echo "== Containers =="
(
  cd "$ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" ps
)

echo
echo "== Key Endpoints =="
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a && source "$ENV_FILE" && set +a
fi
# One format, reused by printf for every (label, port, path) triple.
printf "%s http://localhost:%s%s\n" \
  master "${MASTER_PORT:-32300}" "/readyz" \
  es "${ES_HTTP_PORT:-9200}" "/_cluster/health" \
  kibana "${KIBANA_PORT:-5601}" "/api/status" \
  prom "${PROMETHEUS_PORT:-9090}" "/-/ready" \
  grafana "${GRAFANA_PORT:-3000}" "/api/health" \
  alert "${ALERTMANAGER_PORT:-9093}" "/api/v2/status" \
  web "${WEB_PROXY_PORT_8080:-8080}" "/ (8080)"

echo
echo "== Selfcheck result =="
cat "$ROOT/logs/selfcheck.json" 2>/dev/null || echo "(no selfcheck yet)"
|
|
||||||
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Tear down the argus-sys compose stack (containers and volumes).
# Data under <package>/private is left in place.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

PROJECT_NAME="argus-sys"

# Print a message with a blue [UNINSTALL] tag.
log() {
  echo -e "\033[0;34m[UNINSTALL]\033[0m $*"
}

# Prefer the compose plugin; fall back to the standalone binary.
if docker compose version >/dev/null 2>&1; then
  COMPOSE=(docker compose)
else
  COMPOSE=(docker-compose)
fi

# Best effort: a failing `down` does not stop the script.
(
  cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" down -v
) || true
log "compose stack removed"
log "you may remove data under $PKG_ROOT/private if you want a clean slate"
|
|
||||||
|
|
||||||
@ -1,38 +0,0 @@
|
|||||||
# Argus 镜像构建 UID/GID 配置说明
|
|
||||||
|
|
||||||
通过统一配置文件可以为 Kibana、Elasticsearch、Bind、Master 等容器指定运行账号,解决跨机器部署时 UID/GID 不一致导致的权限问题。
|
|
||||||
|
|
||||||
## 配置入口
|
|
||||||
|
|
||||||
- 默认配置存放在 `configs/build_user.conf`,内容示例:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
UID=2133
|
|
||||||
GID=2015
|
|
||||||
```
|
|
||||||
|
|
||||||
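- 如果需要本地覆盖,可在 `configs/` 下新建 `build_user.local.conf`,字段与默认文件一致。该文件已列入 `.gitignore`,不会被意外提交。注意:一旦本地文件存在,脚本只读取该文件、不再读取 `build_user.conf`;本地文件中未填写的字段会回落到脚本内置默认值(UID=2133、GID=2015),而不是 `build_user.conf` 中的值。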
|
|
||||||
- 亦可在执行脚本前通过环境变量 `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID` 强制指定值,优先级最高。
|
|
||||||
|
|
||||||
## 作用范围
|
|
||||||
|
|
||||||
- `build/build_images.sh` 在构建 log/bind/master 镜像时读取配置,并传递 `--build-arg ARGUS_BUILD_UID/GID`;控制台会输出当前使用的 UID/GID。
|
|
||||||
- `src/master/scripts/build_images.sh` 同步使用配置,确保单独构建 master 镜像时行为一致。
|
|
||||||
- 各镜像 Dockerfile 会根据传入的 UID/GID 调整容器内账号(如 `elasticsearch`、`kibana`、`bind`、`argus`),并以环境变量形式暴露运行时可见值。
|
|
||||||
- Master 启动脚本会在执行 DNS 逻辑后,降权到配置的账号运行 `gunicorn`,确保写入 `/private/argus/**` 的文件具备正确属主。
|
|
||||||
- Log 模块测试脚本 `01_bootstrap.sh` 会根据配置修正挂载目录属主,方便端到端测试在任意用户下运行。
|
|
||||||
|
|
||||||
## 使用建议
|
|
||||||
|
|
||||||
1. 初次克隆仓库后无需修改,默认 UID/GID 保持向后兼容。
|
|
||||||
2. 如果在目标环境中使用新的账号(例如 `uid=4001,gid=4001`):
|
|
||||||
- 编辑 `configs/build_user.local.conf` 填入新值;
|
|
||||||
- 使用新账号登录,并确保其加入宿主机的 `docker` 组;
|
|
||||||
- 重新执行 `build/build_images.sh` 或相关模块的构建脚本。
|
|
||||||
3. 切换配置后建议重新运行目标模块的端到端脚本(如 `src/log/tests/scripts/01_bootstrap.sh`、`src/master/tests/scripts/00_e2e_test.sh`、`src/agent/tests/scripts/00_e2e_test.sh`),验证 `/private/argus` 下文件属主是否为期望账号。
|
|
||||||
|
|
||||||
## 故障排查
|
|
||||||
|
|
||||||
- **镜像构建报错 `groupmod: GID already in use`**:说明所选 GID 已存在于基础镜像,建议换用未占用的值,或在自定义基础镜像中先移除冲突。
|
|
||||||
- **容器内运行时报写权限不足**:检查宿主机挂载目录是否已经由目标 UID/GID 创建;必要时重新执行模块的 `01_bootstrap.sh` 之类的准备脚本。
|
|
||||||
- **仍看到旧 UID/GID**:确认脚本执行时未继承旧缓存,可运行 `ARGUS_BUILD_UID=... ARGUS_BUILD_GID=... ./build/build_images.sh` 强制覆盖。
|
|
||||||
Binary file not shown.
@ -1,115 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Shared helper to load Argus build user/group configuration.
|
|
||||||
# Usage:
|
|
||||||
# source "${PROJECT_ROOT}/scripts/common/build_user.sh"
|
|
||||||
# load_build_user
|
|
||||||
# echo "$ARGUS_BUILD_UID:$ARGUS_BUILD_GID"
|
|
||||||
|
|
||||||
ARGUS_BUILD_UID_DEFAULT=2133
|
|
||||||
ARGUS_BUILD_GID_DEFAULT=2015
|
|
||||||
|
|
||||||
shopt -s extglob
|
|
||||||
|
|
||||||
_ARGUS_BUILD_USER_LOADED="${_ARGUS_BUILD_USER_LOADED:-0}"
|
|
||||||
|
|
||||||
# Print the absolute directory of the file this function is defined in.
_argus_build_user_script_dir() {
  local here
  here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  printf '%s\n' "$here"
}
|
|
||||||
|
|
||||||
# Print the project root: two directory levels above this helper's directory.
# Returns non-zero (printing nothing) when that directory cannot be entered.
argus_project_root() {
  local helper_dir root
  helper_dir="$(_argus_build_user_script_dir)"
  root=$(cd "$helper_dir/../.." >/dev/null && pwd) || return
  printf '%s\n' "$root"
}
|
|
||||||
|
|
||||||
# Print $1 with leading and trailing whitespace removed (no trailing newline).
_argus_trim() {
  local text="$1"
  # Remove the longest all-whitespace prefix, then the longest all-whitespace
  # suffix, using plain parameter expansion.
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "$text"
}
|
|
||||||
|
|
||||||
# Return 0 when $1 is a non-empty string of decimal digits, 1 otherwise.
_argus_is_number() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}
|
|
||||||
|
|
||||||
# Resolve the build UID/GID and export ARGUS_BUILD_UID / ARGUS_BUILD_GID.
# Precedence (highest first):
#   1. ARGUS_BUILD_UID / ARGUS_BUILD_GID already set in the environment
#   2. the first existing file of configs/build_user.local.conf and
#      configs/build_user.conf (only that one file is read)
#   3. ARGUS_BUILD_UID_DEFAULT / ARGUS_BUILD_GID_DEFAULT
# Config lines are KEY=VALUE with keys UID and GID; '#' starts a comment.
# The result is cached via _ARGUS_BUILD_USER_LOADED.
# Returns 1 when the resulting UID or GID is not a plain number.
load_build_user() {
  if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then
    return 0
  fi

  # All loop variables are local: this file is sourced, so anything global
  # here would leak into the calling script.
  local project_root config_files config uid gid
  local raw_line line key value
  project_root="$(argus_project_root)"
  config_files=(
    "$project_root/configs/build_user.local.conf"
    "$project_root/configs/build_user.conf"
  )

  uid="$ARGUS_BUILD_UID_DEFAULT"
  gid="$ARGUS_BUILD_GID_DEFAULT"

  for config in "${config_files[@]}"; do
    if [[ -f "$config" ]]; then
      # `|| [[ -n ... ]]` also processes a last line without trailing newline.
      while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do
        line="${raw_line%%#*}"
        line="$(_argus_trim "${line}")"
        [[ -z "$line" ]] && continue
        if [[ "$line" != *=* ]]; then
          echo "[ARGUS build_user] Ignoring malformed line in $config: $raw_line" >&2
          continue
        fi
        key="${line%%=*}"
        value="${line#*=}"
        key="$(_argus_trim "$key")"
        value="$(_argus_trim "$value")"
        case "$key" in
          UID)
            uid="$value"
            ;;
          GID)
            gid="$value"
            ;;
          *)
            echo "[ARGUS build_user] Unknown key '$key' in $config" >&2
            ;;
        esac
      done < "$config"
      # Only the first config file found is used.
      break
    fi
  done

  # Environment overrides win over any config file.
  if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then
    uid="$ARGUS_BUILD_UID"
  fi
  if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then
    gid="$ARGUS_BUILD_GID"
  fi

  if ! _argus_is_number "$uid"; then
    echo "[ARGUS build_user] Invalid UID '$uid'" >&2
    return 1
  fi
  if ! _argus_is_number "$gid"; then
    echo "[ARGUS build_user] Invalid GID '$gid'" >&2
    return 1
  fi

  export ARGUS_BUILD_UID="$uid"
  export ARGUS_BUILD_GID="$gid"
  _ARGUS_BUILD_USER_LOADED=1
}
|
|
||||||
|
|
||||||
#######################################
# Print the docker --build-arg flags carrying the build UID/GID.
# Globals:  ARGUS_BUILD_UID, ARGUS_BUILD_GID (read after load_build_user)
# Outputs:  the flags on stdout, without a trailing newline
# Returns:  non-zero (and prints nothing) when load_build_user fails
#######################################
argus_build_user_args() {
  # Propagate the failure instead of printing flags with invalid values.
  load_build_user || return
  printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}"
}
|
|
||||||
|
|
||||||
#######################################
# Print a human-readable summary of the resolved build UID/GID.
# Globals:  ARGUS_BUILD_UID, ARGUS_BUILD_GID (read after load_build_user)
# Outputs:  one summary line on stdout
# Returns:  non-zero (and prints nothing) when load_build_user fails
#######################################
print_build_user() {
  # Do not report a build user when it could not be resolved.
  load_build_user || return
  echo "ARGUS build user: UID=${ARGUS_BUILD_UID} GID=${ARGUS_BUILD_GID}"
}
|
|
||||||
2
src/.gitignore
vendored
2
src/.gitignore
vendored
@ -1,2 +0,0 @@
|
|||||||
|
|
||||||
__pycache__/
|
|
||||||
5
src/agent/.gitignore
vendored
5
src/agent/.gitignore
vendored
@ -1,5 +0,0 @@
|
|||||||
build/
|
|
||||||
*.egg-info/
|
|
||||||
__pycache__/
|
|
||||||
|
|
||||||
.env
|
|
||||||
@ -1,78 +0,0 @@
|
|||||||
# Argus Agent 模块
|
|
||||||
|
|
||||||
Argus Agent 是一个轻量级 Python 进程,负责向 Argus Master 注册节点、汇报健康数据,并维护本地持久化信息。模块现以 PyInstaller 打包为独立可执行文件,便于在普通容器或虚机中直接运行。
|
|
||||||
|
|
||||||
## 构建可执行文件
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd src/agent
|
|
||||||
./scripts/build_binary.sh # 生成 dist/argus-agent
|
|
||||||
```
|
|
||||||
|
|
||||||
脚本默认会在 Docker 容器 (`python:3.11-slim-bullseye`) 内执行 PyInstaller,确保产物运行时兼容 glibc 2.31+(覆盖 2.35 环境)。构建流程注意事项:
|
|
||||||
|
|
||||||
- 每次构建前会清理 `build/`、`dist/` 并在容器内重新创建虚拟环境。
|
|
||||||
- 需要使用内网 Python 镜像时,可通过 `PIP_INDEX_URL`、`PIP_EXTRA_INDEX_URL`、`PIP_TRUSTED_HOST` 等环境变量传入,脚本会自动透传给容器。
|
|
||||||
- 如果宿主机无法运行 Docker,可设置 `AGENT_BUILD_USE_DOCKER=0` 回退到本地构建;此时代码必须在 glibc ≤ 2.35 的机器上执行。
|
|
||||||
|
|
||||||
构建结束后脚本会在 `build/compat_check/` 下解包关键动态库并输出最高 `GLIBC_x.y` 版本,便于快速核对兼容性。如果结果中缺少 `libssl.so.3` / `libcrypto.so.3`,表示系统会在目标宿主机上使用本地 OpenSSL 库,无需额外处理。
|
|
||||||
|
|
||||||
例如:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
strings build/compat_check/libpython*.so.1.0 | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n1
|
|
||||||
```
|
|
||||||
|
|
||||||
如遇构建失败,常见原因是 Docker 不可用(请改用 `AGENT_BUILD_USE_DOCKER=0`)或无法访问 Python 包镜像(先设置上述镜像环境变量后重试)。
|
|
||||||
|
|
||||||
## 运行时配置
|
|
||||||
|
|
||||||
Agent 不再依赖配置文件;所有参数均由环境变量与主机名推导:
|
|
||||||
|
|
||||||
| 变量 | 必填 | 默认值 | 说明 |
|
|
||||||
| --- | --- | --- | --- |
|
|
||||||
| `MASTER_ENDPOINT` | 是 | N/A | Master 基础地址,可写 `http://host:3000` 或 `host:3000`(自动补全 `http://`)。 |
|
|
||||||
| `REPORT_INTERVAL_SECONDS` | 否 | `60` | 状态上报间隔(秒)。必须为正整数。 |
|
|
||||||
| `AGENT_HOSTNAME` | 否 | `$(hostname)` | 覆盖容器内主机名,便于测试或特殊命名需求。 |
|
|
||||||
| `AGENT_ENV` | 否 | 来源于主机名 | 运行环境标识(如 `dev`、`prod`)。与 `AGENT_USER`、`AGENT_INSTANCE` 必须同时设置。 |
|
|
||||||
| `AGENT_USER` | 否 | 来源于主机名 | 归属用户或团队标识。与 `AGENT_ENV`、`AGENT_INSTANCE` 必须同时设置。 |
|
|
||||||
| `AGENT_INSTANCE` | 否 | 来源于主机名 | 实例编号或别名。与 `AGENT_ENV`、`AGENT_USER` 必须同时设置。 |
|
|
||||||
|
|
||||||
主机名与元数据的解析优先级:
|
|
||||||
|
|
||||||
1. 若设置 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 且全部存在,则直接使用这些值。
|
|
||||||
2. 否则检查历史 `node.json`(注册成功后由 Master 返回的信息),若包含 `env` / `user` / `instance` 则沿用。
|
|
||||||
3. 若以上均不可用,则按历史约定从主机名解析 `env-user-instance` 前缀。
|
|
||||||
4. 如果仍无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。
|
|
||||||
|
|
||||||
> 提示:在首次部署时需确保环境变量或主机名能够提供完整信息。完成注册后,Agent 会把 Master 返回的元数据写入 `node.json`,后续重启无需再次提供环境变量就能保持一致性。
|
|
||||||
|
|
||||||
派生路径:
|
|
||||||
|
|
||||||
- 节点信息:`/private/argus/agent/<hostname>/node.json`
|
|
||||||
- 子模块健康目录:`/private/argus/agent/<hostname>/health/`
|
|
||||||
|
|
||||||
健康目录中的文件需遵循 `<模块前缀>-*.json` 命名(例如 `log-fluentbit.json`、`metric-node-exporter.json`),文件内容会原样并入上报的 `health` 字段。
|
|
||||||
|
|
||||||
## 日志与持久化
|
|
||||||
|
|
||||||
- Agent 会在成功注册、状态上报、异常重试等关键节点输出结构化日志,便于聚合分析。
|
|
||||||
- `node.json` 保存 Master 返回的最新节点对象,用于重启后继续使用既有节点 ID。
|
|
||||||
|
|
||||||
## 端到端测试
|
|
||||||
|
|
||||||
仓库内提供 Docker Compose 测试栈(master + ubuntu 容器):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd src/agent/tests
|
|
||||||
./scripts/00_e2e_test.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
测试脚本会:
|
|
||||||
|
|
||||||
1. 构建 master 镜像与 agent 可执行文件。
|
|
||||||
2. 以 `ubuntu:24.04` 启动 agent 容器,并通过环境变量注入 `MASTER_ENDPOINT`、`REPORT_INTERVAL_SECONDS`。
|
|
||||||
3. 验证注册、健康上报、nodes.json 生成、统计接口,以及“容器重启 + IP 变化”重注册流程。
|
|
||||||
4. 清理 `tests/private/` 与临时容器网络。
|
|
||||||
|
|
||||||
如需在真实环境部署,只需将 `dist/argus-agent` 连同健康目录挂载到目标主机,并按上表设置环境变量即可。
|
|
||||||
@ -1,60 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from .log import get_logger
|
|
||||||
|
|
||||||
LOGGER = get_logger("argus.agent.client")
|
|
||||||
|
|
||||||
|
|
||||||
class MasterAPIError(Exception):
    """Failure reported for a master API call.

    Attributes:
        status_code: HTTP status code associated with the failure.
        payload: decoded response body; an empty dict when none was given.
    """

    def __init__(self, message: str, status_code: int, payload: Optional[Dict[str, Any]] = None) -> None:
        super().__init__(message)
        self.status_code = status_code
        # Falsy payloads (None, {}) are normalised to a fresh empty dict.
        self.payload = payload if payload else {}
|
|
||||||
|
|
||||||
|
|
||||||
class AgentClient:
    """Thin HTTP client for the master node API, backed by one requests session."""

    def __init__(self, base_url: str, *, timeout: int = 10) -> None:
        # Trailing slashes are dropped so URLs can be built with a plain "/" join.
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._session = requests.Session()

    def register_node(self, body: Dict[str, Any]) -> Dict[str, Any]:
        """POST the registration body to the master and return the node object."""
        url = f"{self._base_url}/api/v1/master/nodes"
        response = self._session.post(url, json=body, timeout=self._timeout)
        return self._parse_response(response, "Failed to register node")

    def update_status(self, node_id: str, body: Dict[str, Any]) -> Dict[str, Any]:
        """PUT a health report for ``node_id`` and return the master's response."""
        url = f"{self._base_url}/api/v1/master/nodes/{node_id}/status"
        response = self._session.put(url, json=body, timeout=self._timeout)
        return self._parse_response(response, "Failed to update node status")

    def _parse_response(self, response: requests.Response, error_prefix: str) -> Dict[str, Any]:
        """Decode a response, raising MasterAPIError for HTTP errors or non-JSON bodies.

        Raises:
            MasterAPIError: when the status code is >= 400, or when a successful
                response does not carry a JSON body.
        """
        content_type = response.headers.get("Content-Type", "")
        payload: Dict[str, Any] | None = None
        if "application/json" in content_type:
            try:
                payload = response.json()
            # ValueError is the common base of json.JSONDecodeError and of the
            # decode errors requests raises, whichever JSON backend is in use.
            except ValueError:
                LOGGER.warning("Response contained invalid JSON", extra={"status": response.status_code})

        if response.status_code >= 400:
            message = payload.get("error") if isinstance(payload, dict) else response.text
            raise MasterAPIError(
                f"{error_prefix}: {message}",
                status_code=response.status_code,
                payload=payload if isinstance(payload, dict) else None,
            )

        if payload is None:
            # Success without a JSON content type: still try to decode the body.
            try:
                payload = response.json()
            except ValueError as exc:
                raise MasterAPIError("Master returned non-JSON payload", response.status_code) from exc
        return payload
|
|
||||||
@ -1,262 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import socket
|
|
||||||
import subprocess
|
|
||||||
import ipaddress
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Dict
|
|
||||||
|
|
||||||
from .config import AgentConfig
|
|
||||||
from .log import get_logger
|
|
||||||
|
|
||||||
LOGGER = get_logger("argus.agent.collector")
|
|
||||||
|
|
||||||
_HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
|
|
||||||
|
|
||||||
|
|
||||||
def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
    """Assemble the static metadata sent with a node registration.

    Environment variables read here:
      * AGENT_PREFER_NET_CIDRS - preferred networks, default
        "10.0.0.0/8,172.31.0.0/16".
      * AGENT_EXCLUDE_IFACES - interfaces removed from the inventory, default
        "docker_gwbridge,lo".

    The published ``ip`` is whatever ``_select_publish_ips`` selects; that
    helper already honours a non-empty AGENT_PUBLISH_IP. ``overlay_ip``,
    ``gwbridge_ip`` and ``interfaces`` are published alongside it.
    """
    hostname = config.hostname

    prefer_cidrs = _read_cidrs_env(
        os.environ.get("AGENT_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16")
    )
    # Build the exclusion set once instead of once per interface.
    excluded = set(
        _read_csv_env(os.environ.get("AGENT_EXCLUDE_IFACES", "docker_gwbridge,lo"))
    )

    # Interface inventory, minus excluded interfaces.
    interfaces = [
        (iface, ip) for iface, ip in _list_global_ipv4_addrs() if iface not in excluded
    ]

    # Addresses the hostname resolves to.
    host_ips = _resolve_hostname_ips(hostname)

    selected_ip, overlay_ip, gwbridge_ip = _select_publish_ips(
        interfaces=interfaces,
        host_ips=host_ips,
        prefer_cidrs=prefer_cidrs,
    )

    return {
        "hostname": hostname,
        # selected_ip already applies AGENT_PUBLISH_IP when it is non-empty, so
        # an empty override can no longer publish "" as the node address.
        "ip": selected_ip,
        "overlay_ip": overlay_ip or selected_ip,
        "gwbridge_ip": gwbridge_ip,
        "interfaces": [{"iface": name, "ip": ip} for name, ip in interfaces],
        "env": config.environment,
        "user": config.user,
        "instance": config.instance,
        "cpu_number": _detect_cpu_count(),
        "memory_in_bytes": _detect_memory_bytes(),
        "gpu_number": _detect_gpu_count(),
    }
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_hostname(hostname: str) -> tuple[str, str, str]:
    """Split a hostname into (env, user, instance) using _HOSTNAME_PATTERN.

    Returns three empty strings, after logging a warning, when the hostname
    does not match the pattern.
    """
    matched = _HOSTNAME_PATTERN.match(hostname)
    if matched:
        return matched.group(1, 2, 3)
    LOGGER.warning("Hostname does not match expected pattern", extra={"hostname": hostname})
    return "", "", ""
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_cpu_count() -> int:
|
|
||||||
count = os.cpu_count()
|
|
||||||
return count if count is not None else 0
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_memory_bytes() -> int:
    """Return the memory size in bytes, or 0 when it cannot be determined.

    Reads /sys/fs/cgroup/memory.max first; when that file is missing,
    unreadable, unparsable or contains "max", falls back to the MemTotal line
    of /proc/meminfo (a kB value, multiplied by 1024).
    """
    cgroup_path = Path("/sys/fs/cgroup/memory.max")
    try:
        raw = cgroup_path.read_text(encoding="utf-8").strip()
    except FileNotFoundError:
        LOGGER.debug("cgroup memory.max not found, falling back to /proc/meminfo")
    except OSError as exc:
        # e.g. PermissionError: treat like a missing file rather than crashing.
        LOGGER.warning("Failed to read memory.max, falling back", extra={"error": str(exc)})
    else:
        if raw and raw != "max":
            try:
                return int(raw)
            except ValueError:
                LOGGER.warning("Failed to parse memory.max, falling back", extra={"value": raw})

    try:
        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("MemTotal:"):
                    parts = line.split()
                    if len(parts) >= 2:
                        return int(parts[1]) * 1024
    except FileNotFoundError:
        LOGGER.error("/proc/meminfo not found; defaulting memory to 0")
    except (OSError, ValueError) as exc:
        # Unreadable file or a non-numeric MemTotal value.
        LOGGER.error("Failed to read /proc/meminfo; defaulting memory to 0", extra={"error": str(exc)})
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_gpu_count() -> int:
    """Count GPUs as the number of non-blank lines printed by ``nvidia-smi -L``.

    Returns 0 when nvidia-smi is missing, cannot be executed, times out
    (5 seconds) or exits with a non-zero status.
    """
    try:
        proc = subprocess.run(
            ["nvidia-smi", "-L"],
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=5,
        )
    except FileNotFoundError:
        LOGGER.debug("nvidia-smi not available; assuming 0 GPUs")
        return 0
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError covers e.g. PermissionError for a non-executable binary;
        # SubprocessError covers the timeout.
        LOGGER.warning("nvidia-smi invocation failed", extra={"error": str(exc)})
        return 0

    if proc.returncode != 0:
        LOGGER.debug("nvidia-smi returned non-zero", extra={"stderr": proc.stderr.strip()})
        return 0

    return sum(1 for line in proc.stdout.splitlines() if line.strip())
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_ip_address() -> str:
    """Last-resort address detection.

    Tries, in order: the local address of a UDP socket connected to
    8.8.8.8:80, the address the local hostname resolves to, and finally the
    literal "127.0.0.1".
    """
    try:
        probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        with probe:
            probe.connect(("8.8.8.8", 80))
            local_addr = probe.getsockname()
            return local_addr[0]
    except OSError:
        LOGGER.debug("UDP socket trick failed; falling back to hostname lookup")

    try:
        own_name = socket.gethostname()
        return socket.gethostbyname(own_name)
    except OSError:
        LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")

    return "127.0.0.1"
|
|
||||||
|
|
||||||
|
|
||||||
def _read_csv_env(raw: str | None) -> list[str]:
|
|
||||||
if not raw:
|
|
||||||
return []
|
|
||||||
return [x.strip() for x in raw.split(",") if x.strip()]
|
|
||||||
|
|
||||||
|
|
||||||
def _read_cidrs_env(raw: str | None) -> list[ipaddress.IPv4Network]:
    """Parse a comma-separated CIDR list into IPv4 networks, in input order.

    Invalid entries are logged and skipped; valid non-IPv4 networks are
    skipped silently.
    """
    networks: list[ipaddress.IPv4Network] = []
    for entry in _read_csv_env(raw):
        try:
            parsed = ipaddress.ip_network(entry, strict=False)
        except ValueError:
            LOGGER.warning("Ignoring invalid CIDR in AGENT_PREFER_NET_CIDRS", extra={"cidr": entry})
            continue
        if isinstance(parsed, ipaddress.IPv4Network):
            networks.append(parsed)
    return networks
|
|
||||||
|
|
||||||
|
|
||||||
def _list_global_ipv4_addrs() -> list[tuple[str, str]]:
    """List global-scope IPv4 addresses as (iface, ip) pairs.

    Runs ``ip -4 -o addr show scope global`` (iproute2) directly, with no
    shell or awk, and takes the 2nd field as the interface and the 4th as the
    address in CIDR form. Returns an empty list when the command is missing,
    fails, or times out (3 seconds).
    """
    results: list[tuple[str, str]] = []
    try:
        proc = subprocess.run(
            ["ip", "-4", "-o", "addr", "show", "scope", "global"],
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=3,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        LOGGER.debug("Failed to list interfaces", extra={"error": str(exc)})
        return results

    if proc.returncode != 0:
        return results

    for line in proc.stdout.splitlines():
        fields = line.split()
        # Expected shape: "<idx>: <iface> inet <addr>/<prefix> ..."
        if len(fields) < 4:
            continue
        iface, cidr = fields[1], fields[3]
        ip = cidr.split("/")[0]
        try:
            ipaddress.IPv4Address(ip)
        except ValueError:
            continue
        results.append((iface, ip))
    return results
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_hostname_ips(name: str) -> list[str]:
|
|
||||||
ips: list[str] = []
|
|
||||||
try:
|
|
||||||
infos = socket.getaddrinfo(name, None, family=socket.AF_INET)
|
|
||||||
for info in infos:
|
|
||||||
ip = info[4][0]
|
|
||||||
if ip not in ips:
|
|
||||||
ips.append(ip)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
return ips
|
|
||||||
|
|
||||||
|
|
||||||
def _pick_by_cidrs(candidates: list[str], prefer_cidrs: list[ipaddress.IPv4Network]) -> str | None:
|
|
||||||
for net in prefer_cidrs:
|
|
||||||
for ip in candidates:
|
|
||||||
try:
|
|
||||||
if ipaddress.ip_address(ip) in net:
|
|
||||||
return ip
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _select_publish_ips(
    *,
    interfaces: list[tuple[str, str]],
    host_ips: list[str],
    prefer_cidrs: list[ipaddress.IPv4Network],
) -> tuple[str, str | None, str | None]:
    """Return (selected_ip, overlay_ip, gwbridge_ip).

    - overlay_ip: first interface address matching ``prefer_cidrs`` (earlier
      networks win).
    - gwbridge_ip: first interface address inside 172.22.0.0/16, if any.
    - selected_ip: a non-empty AGENT_PUBLISH_IP; otherwise overlay_ip;
      otherwise a hostname address matching ``prefer_cidrs``; otherwise the
      result of ``_detect_ip_address()``.
    """
    gwbridge_net = ipaddress.ip_network("172.22.0.0/16")

    def _on_gwbridge(address: str) -> bool:
        # Unparsable addresses simply do not match.
        try:
            return ipaddress.ip_address(address) in gwbridge_net
        except ValueError:
            return False

    iface_ips = [address for _, address in interfaces]
    gwbridge_ip = next((address for address in iface_ips if _on_gwbridge(address)), None)

    # Candidates from interfaces and from hostname resolution.
    overlay_ip = _pick_by_cidrs(iface_ips, prefer_cidrs)
    host_pref = _pick_by_cidrs(host_ips, prefer_cidrs)

    env_ip = os.environ.get("AGENT_PUBLISH_IP")
    if not env_ip:
        # The fallback probe only runs when nothing else matched.
        return overlay_ip or host_pref or _detect_ip_address(), overlay_ip, gwbridge_ip
    return env_ip, overlay_ip, gwbridge_ip
|
|
||||||
@ -1,141 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import socket
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Final
|
|
||||||
|
|
||||||
from .state import load_node_state
|
|
||||||
from .version import VERSION
|
|
||||||
from .log import get_logger
|
|
||||||
|
|
||||||
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
|
|
||||||
|
|
||||||
LOGGER = get_logger("argus.agent.config")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
class AgentConfig:
    """Immutable runtime configuration of the agent."""

    # Node identity.
    hostname: str
    environment: str
    user: str
    instance: str
    # Path of the persisted node state file.
    node_file: str
    # Agent version string.
    version: str
    # Master base URL.
    master_endpoint: str
    # Seconds between two status reports.
    report_interval_seconds: int
    # Directory scanned for health files.
    health_dir: str
    # HTTP request timeout, in seconds.
    request_timeout_seconds: int = 10
|
|
||||||
|
|
||||||
|
|
||||||
def _normalise_master_endpoint(value: str) -> str:
|
|
||||||
value = value.strip()
|
|
||||||
if not value:
|
|
||||||
raise ValueError("MASTER_ENDPOINT environment variable is required")
|
|
||||||
if not value.startswith("http://") and not value.startswith("https://"):
|
|
||||||
value = f"http://{value}"
|
|
||||||
return value.rstrip("/")
|
|
||||||
|
|
||||||
|
|
||||||
def _read_report_interval(raw_value: str | None) -> int:
|
|
||||||
if raw_value is None or raw_value.strip() == "":
|
|
||||||
return DEFAULT_REPORT_INTERVAL_SECONDS
|
|
||||||
try:
|
|
||||||
interval = int(raw_value)
|
|
||||||
except ValueError as exc:
|
|
||||||
raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
|
|
||||||
if interval <= 0:
|
|
||||||
raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
|
|
||||||
return interval
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_hostname() -> str:
|
|
||||||
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
|
|
||||||
|
|
||||||
|
|
||||||
def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None:
    """Read (env, user, instance) from the persisted node state, if complete.

    Each field is looked up under ``meta_data`` first and then at the top
    level of the state. Returns ``None`` when the state is missing or empty,
    is not a JSON object, or lacks any of the three fields.
    """
    state = load_node_state(node_file)
    if not state:
        return None
    if not isinstance(state, dict):
        # A node.json holding e.g. a JSON list must not crash startup.
        LOGGER.warning(
            "node.json missing metadata fields; ignoring",
            extra={"node_file": node_file, "meta_data": {}},
        )
        return None

    meta = state.get("meta_data")
    if not isinstance(meta, dict):
        meta = {}
    env = meta.get("env") or state.get("env")
    user = meta.get("user") or state.get("user")
    instance = meta.get("instance") or state.get("instance")

    if env and user and instance:
        LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file})
        return env, user, instance

    LOGGER.warning(
        "node.json missing metadata fields; ignoring",
        extra={"node_file": node_file, "meta_data": meta},
    )
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]:
    """Determine (env, user, instance) for this agent.

    Sources, in priority order:
      1. AGENT_ENV / AGENT_USER / AGENT_INSTANCE, only when all three are set;
      2. the persisted node state file;
      3. the hostname, parsed by ``_parse_hostname``.

    Raises:
        ValueError: when no source yields all three fields.
    """
    from_env = (
        os.environ.get("AGENT_ENV"),
        os.environ.get("AGENT_USER"),
        os.environ.get("AGENT_INSTANCE"),
    )
    if all(from_env):
        return from_env

    if any(from_env):
        # A partial set of variables is ignored, but worth flagging.
        LOGGER.warning(
            "Incomplete metadata environment variables; falling back to persisted metadata",
            extra={
                "has_env": bool(from_env[0]),
                "has_user": bool(from_env[1]),
                "has_instance": bool(from_env[2]),
            },
        )

    persisted = _load_metadata_from_state(node_file)
    if persisted is not None:
        return persisted

    from .collector import _parse_hostname  # Local import to avoid circular dependency

    parsed = _parse_hostname(hostname)
    if all(parsed):
        return parsed
    raise ValueError(
        "Failed to determine metadata fields; set AGENT_ENV/USER/INSTANCE or use supported hostname pattern"
    )
|
|
||||||
|
|
||||||
|
|
||||||
def load_config() -> AgentConfig:
    """Build the agent configuration from environment variables and hostname.

    Creates the node-state directory and the health directory as a side
    effect.

    Raises:
        ValueError: when metadata cannot be resolved, MASTER_ENDPOINT is
            unset or empty, or REPORT_INTERVAL_SECONDS is invalid.
    """
    hostname = _resolve_hostname()
    base_dir = f"/private/argus/agent/{hostname}"
    node_file = f"{base_dir}/node.json"
    environment, user, instance = _resolve_metadata_fields(hostname, node_file)
    health_dir = f"{base_dir}/health/"

    raw_endpoint = os.environ.get("MASTER_ENDPOINT")
    if raw_endpoint is None:
        raise ValueError("MASTER_ENDPOINT environment variable is not set")
    master_endpoint = _normalise_master_endpoint(raw_endpoint)

    report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))

    # Make sure both on-disk locations exist before the agent starts using them.
    for directory in (Path(node_file).parent, Path(health_dir)):
        directory.mkdir(parents=True, exist_ok=True)

    return AgentConfig(
        hostname=hostname,
        environment=environment,
        user=user,
        instance=instance,
        node_file=node_file,
        version=VERSION,
        master_endpoint=master_endpoint,
        report_interval_seconds=report_interval_seconds,
        health_dir=health_dir,
    )
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Dict
|
|
||||||
|
|
||||||
from .log import get_logger
|
|
||||||
|
|
||||||
LOGGER = get_logger("argus.agent.health")
|
|
||||||
|
|
||||||
|
|
||||||
def read_health_directory(path: str) -> Dict[str, Any]:
    """Load every ``<prefix>-*.json`` file in ``path`` into one mapping.

    Keys are the file names without the ``.json`` suffix; values are the
    decoded JSON contents. Files without a ``-`` in their stem are skipped.
    Files that cannot be read or decoded (invalid JSON or invalid UTF-8) are
    logged and skipped, so one bad file never hides the healthy ones.
    A missing directory yields an empty mapping.
    """
    result: Dict[str, Any] = {}
    directory = Path(path)
    if not directory.exists():
        LOGGER.debug("Health directory does not exist", extra={"path": str(directory)})
        return result

    for health_file in sorted(directory.glob("*.json")):
        if "-" not in health_file.stem:
            LOGGER.debug("Skipping non-prefixed health file", extra={"file": health_file.name})
            continue
        try:
            with health_file.open("r", encoding="utf-8") as handle:
                content = json.load(handle)
        except ValueError as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            LOGGER.warning("Failed to parse health file", extra={"file": health_file.name, "error": str(exc)})
        except OSError as exc:
            LOGGER.warning("Failed to read health file", extra={"file": health_file.name, "error": str(exc)})
        else:
            result[health_file.stem] = content
    return result
|
|
||||||
@ -1,18 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s - %(message)s"
|
|
||||||
|
|
||||||
|
|
||||||
def setup_logging() -> None:
|
|
||||||
level_name = os.environ.get("AGENT_LOG_LEVEL", "INFO").upper()
|
|
||||||
level = getattr(logging, level_name, logging.INFO)
|
|
||||||
logging.basicConfig(level=level, format=_LOG_FORMAT)
|
|
||||||
|
|
||||||
|
|
||||||
def get_logger(name: str) -> logging.Logger:
    """Return the logger called ``name``, running ``setup_logging()`` first."""
    setup_logging()
    logger = logging.getLogger(name)
    return logger
|
|
||||||
@ -1,163 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import signal
|
|
||||||
import time
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from .client import AgentClient, MasterAPIError
|
|
||||||
from .collector import collect_metadata
|
|
||||||
from .config import AgentConfig, load_config
|
|
||||||
from .health_reader import read_health_directory
|
|
||||||
from .log import get_logger, setup_logging
|
|
||||||
from .state import clear_node_state, load_node_state, save_node_state
|
|
||||||
|
|
||||||
LOGGER = get_logger("argus.agent")
|
|
||||||
|
|
||||||
|
|
||||||
def _current_timestamp() -> str:
|
|
||||||
return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
|
||||||
|
|
||||||
|
|
||||||
class StopSignal:
    """Minimal stop flag whose ``set`` method can be used as a signal handler."""

    def __init__(self) -> None:
        self._stop = False

    def set(self, *_args) -> None:  # type: ignore[override]
        """Raise the flag; positional arguments are accepted and ignored."""
        self._stop = True

    def is_set(self) -> bool:
        """Report whether ``set`` has been called."""
        return self._stop
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv: Optional[list[str]] = None) -> int:  # noqa: ARG001 - signature kept for entry-point compatibility
    """Agent entry point: load config, register with the master, then report.

    Returns:
        0 after a clean shutdown (including a stop signal during
        registration); 1 when the configuration cannot be loaded or the
        master returns no node id.
    """
    setup_logging()

    stop_signal = StopSignal()
    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, stop_signal.set)

    try:
        config = load_config()
    except Exception as exc:
        LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
        return 1

    startup_details = {
        "hostname": config.hostname,
        "master_endpoint": config.master_endpoint,
        "node_file": config.node_file,
    }
    LOGGER.info("Agent starting", extra=startup_details)

    client = AgentClient(config.master_endpoint, timeout=config.request_timeout_seconds)

    # Reuse the node id persisted by a previous run, when there is one.
    previous_state = load_node_state(config.node_file) or {}
    node_id = previous_state.get("id")

    # Register (or re-register) with the master, retrying on failure.
    register_response = _register_with_retry(client, config, node_id, stop_signal)
    if register_response is None:
        LOGGER.info("Registration aborted due to shutdown signal")
        return 0

    node_id = register_response.get("id")
    if not node_id:
        LOGGER.error("Master did not return node id; aborting")
        return 1
    save_node_state(config.node_file, register_response)

    LOGGER.info("Entering status report loop", extra={"node_id": node_id})
    _status_loop(client, config, node_id, stop_signal)
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
def _register_with_retry(
    client: AgentClient,
    config: AgentConfig,
    node_id: Optional[str],
    stop_signal: StopSignal,
):
    """Register with the master until it succeeds or a stop is requested.

    Each attempt rebuilds the payload (fresh metadata) and includes the known
    node id, if any. A 404 for a known id clears the local state so the next
    attempt registers as a new node. Failed attempts back off from 5 seconds,
    doubling up to 60; the wait is sliced into 1-second sleeps so a stop
    signal is honoured promptly.

    Returns:
        The master's response on success (also persisted to the node state
        file), or ``None`` when stopped before succeeding.
    """

    def _interruptible_wait(seconds: int) -> None:
        # Sleep in 1-second slices, returning early once a stop is requested.
        for _ in range(seconds):
            if stop_signal.is_set():
                return
            time.sleep(1)

    backoff = 5
    while not stop_signal.is_set():
        payload = {
            "name": config.hostname,
            "type": "agent",
            "meta_data": collect_metadata(config),
            "version": config.version,
        }
        if node_id:
            payload["id"] = node_id

        try:
            response = client.register_node(payload)
            LOGGER.info("Registration successful", extra={"node_id": response.get("id")})
            save_node_state(config.node_file, response)
            return response
        except MasterAPIError as exc:
            if exc.status_code == 404 and node_id:
                LOGGER.warning(
                    "Master does not recognise node id; clearing local node state",
                    extra={"node_id": node_id},
                )
                clear_node_state(config.node_file)
                node_id = None
            elif exc.status_code == 500 and node_id:
                # An id/name mismatch usually means misconfiguration; log it and keep retrying.
                LOGGER.error(
                    "Master rejected node due to id/name mismatch; will retry",
                    extra={"node_id": node_id},
                )
            else:
                LOGGER.error("Registration failed", extra={"status_code": exc.status_code, "error": str(exc)})
        except Exception as exc:  # pragma: no cover - defensive
            LOGGER.exception("Unexpected error during registration", extra={"error": str(exc)})

        _interruptible_wait(min(backoff, 60))
        backoff = min(backoff * 2, 60)
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def _status_loop(
    client: AgentClient,
    config: AgentConfig,
    node_id: str,
    stop_signal: StopSignal,
) -> None:
    """Report health to the master every ``report_interval_seconds`` until stopped.

    Each cycle reads the health directory, sends it with a UTC timestamp and
    persists the master's response. Failures are logged and the loop carries
    on with the next cycle.
    """
    interval = config.report_interval_seconds
    while not stop_signal.is_set():
        health_payload = None
        try:
            timestamp = _current_timestamp()
            health_payload = read_health_directory(config.health_dir)
        except Exception:
            # Keep the original contract: errors raised while preparing the
            # report propagate to the caller.
            raise
        body = {"timestamp": timestamp, "health": health_payload}

        try:
            response = client.update_status(node_id, body)
            LOGGER.info(
                "Status report succeeded",
                extra={"node_id": node_id, "health_keys": list(health_payload.keys())},
            )
            save_node_state(config.node_file, response)
        except MasterAPIError as exc:
            # Keep looping; the next cycle acts as the retry.
            LOGGER.error(
                "Failed to report status",
                extra={"status_code": exc.status_code, "error": str(exc)},
            )
        except Exception as exc:  # pragma: no cover - defensive
            LOGGER.exception("Unexpected error during status report", extra={"error": str(exc)})

        # Wait in 1-second slices so a stop request is noticed quickly.
        waited = 0
        while waited < interval and not stop_signal.is_set():
            time.sleep(1)
            waited += 1

    LOGGER.info("Stop signal received; exiting status loop")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Imported here because this module's top-level imports do not include
    # sys; without it the sys.exit call raises NameError.
    import sys

    sys.exit(main())
|
|
||||||
@ -1,44 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
|
|
||||||
from .log import get_logger
|
|
||||||
|
|
||||||
LOGGER = get_logger("argus.agent.state")
|
|
||||||
|
|
||||||
|
|
||||||
def load_node_state(path: str) -> Optional[Dict[str, Any]]:
    """Load the persisted node state (node.json) from ``path``.

    Returns ``None`` when the file does not exist or holds invalid JSON (the
    latter is logged as a warning).
    """
    try:
        handle = open(path, "r", encoding="utf-8")
    except FileNotFoundError:
        return None

    with handle:
        try:
            state = json.load(handle)
        except json.JSONDecodeError as exc:
            LOGGER.warning("node.json is invalid JSON; ignoring", extra={"error": str(exc)})
            return None
    return state
|
|
||||||
|
|
||||||
|
|
||||||
def save_node_state(path: str, data: Dict[str, Any]) -> None:
    """Atomically write ``data`` as compact JSON to ``path``.

    The parent directory is created if needed. Data is written to a temporary
    file in that directory, flushed and fsynced, then moved over ``path`` with
    ``os.replace`` so readers never see a partial file. If anything fails, the
    temporary file is removed and the exception is re-raised, leaving an
    existing ``path`` untouched.
    """
    directory = Path(path).parent
    directory.mkdir(parents=True, exist_ok=True)

    temp_path: Optional[str] = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=directory, delete=False, encoding="utf-8") as tmp:
            temp_path = tmp.name
            json.dump(data, tmp, separators=(",", ":"))
            tmp.flush()
            os.fsync(tmp.fileno())
        os.replace(temp_path, path)
    except BaseException:
        # Do not leave orphaned temp files behind on failure.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                # Best-effort cleanup: the original error is what matters.
                pass
        raise
|
|
||||||
|
|
||||||
|
|
||||||
def clear_node_state(path: str) -> None:
    """Delete the node state file at ``path``.

    A missing file is not an error; any other OS error is logged as a warning
    and swallowed.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        LOGGER.warning("Failed to remove node state file", extra={"error": str(exc), "path": path})
|
|
||||||
@ -1,69 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import importlib.metadata
|
|
||||||
|
|
||||||
try:
|
|
||||||
import tomllib
|
|
||||||
except ModuleNotFoundError: # pragma: no cover
|
|
||||||
import tomli as tomllib # type: ignore[no-redef]
|
|
||||||
|
|
||||||
|
|
||||||
def _candidate_paths() -> list[Path]:
|
|
||||||
paths = []
|
|
||||||
bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None)
|
|
||||||
if bundle_dir:
|
|
||||||
paths.append(Path(bundle_dir) / "pyproject.toml")
|
|
||||||
paths.append(Path(__file__).resolve().parent.parent / "pyproject.toml")
|
|
||||||
paths.append(Path(__file__).resolve().parent / "pyproject.toml")
|
|
||||||
paths.append(Path.cwd() / "pyproject.toml")
|
|
||||||
return paths
|
|
||||||
|
|
||||||
|
|
||||||
def _read_from_pyproject() -> Optional[str]:
    """Return the first version string found in a candidate ``pyproject.toml``.

    Each candidate file is tried in order; unreadable or malformed files are
    skipped. Within a file, ``[project].version`` is preferred over
    ``[tool.argus].version``. Returns ``None`` when no candidate yields a
    string version.
    """
    for path in _candidate_paths():
        if not path.exists():
            continue
        try:
            with path.open("rb") as handle:
                data = tomllib.load(handle)
        except (OSError, tomllib.TOMLDecodeError):
            continue

        tool = data.get("tool")
        sections = (
            data.get("project"),
            tool.get("argus") if isinstance(tool, dict) else None,
        )
        for section in sections:
            if not isinstance(section, dict):
                continue
            version = section.get("version")
            if isinstance(version, str):
                return version
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_version() -> str:
|
|
||||||
try:
|
|
||||||
return importlib.metadata.version("argus-agent")
|
|
||||||
except importlib.metadata.PackageNotFoundError:
|
|
||||||
pass
|
|
||||||
override = os.environ.get("AGENT_VERSION_OVERRIDE")
|
|
||||||
if override:
|
|
||||||
return override
|
|
||||||
fallback = _read_from_pyproject()
|
|
||||||
if fallback:
|
|
||||||
return fallback
|
|
||||||
return "0.0.0"
|
|
||||||
|
|
||||||
|
|
||||||
# Resolved once, when this module is first imported.
VERSION: str = _detect_version()


def get_version() -> str:
    """Return the version string computed at import time (``VERSION``)."""
    return VERSION
|
|
||||||
BIN
src/agent/dist/argus-agent
vendored
BIN
src/agent/dist/argus-agent
vendored
Binary file not shown.
@ -1,10 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from app.main import main as agent_main
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Propagate the value returned by the agent's main() as the process exit status.
    raise SystemExit(agent_main())
|
|
||||||
@ -1,19 +0,0 @@
|
|||||||
[project]
|
|
||||||
name = "argus-agent"
|
|
||||||
version = "1.1.0"
|
|
||||||
description = "Argus agent binary"
|
|
||||||
readme = "README.md"
|
|
||||||
requires-python = ">=3.11"
|
|
||||||
dependencies = [
|
|
||||||
"requests==2.31.0"
|
|
||||||
]
|
|
||||||
|
|
||||||
[build-system]
|
|
||||||
requires = ["setuptools>=69", "wheel"]
|
|
||||||
build-backend = "setuptools.build_meta"
|
|
||||||
|
|
||||||
[tool.argus]
|
|
||||||
entry = "app.main:main"
|
|
||||||
|
|
||||||
[tool.setuptools]
|
|
||||||
packages = ["app"]
|
|
||||||
@ -1,690 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Agent deployment verification: probes the master API and the local agent
# files, then prints a PASS/WARN/FAIL summary.
set -euo pipefail

# Tag prepended to every log line.
LOG_PREFIX='[AGENT-VERIFY]'

# Fallbacks used when the corresponding environment variable is unset or empty.
MASTER_ENDPOINT_DEFAULT=''
AGENT_DATA_ROOT_DEFAULT='/private/argus/agent'
AGENT_ETC_ROOT_DEFAULT='/private/argus/etc'
REPORT_INTERVAL_DEFAULT='2'

# Command-line toggles; switched to "true" by the option parser.
ALLOW_CONFIG_TOUCH='false'
KEEP_TEST_HEALTH='false'
|
|
||||||
|
|
||||||
# Write an INFO line, tagged with LOG_PREFIX, to stdout.
log_info() {
  printf '%s\n' "${LOG_PREFIX} INFO $*"
}
|
|
||||||
|
|
||||||
# Write a WARN line, tagged with LOG_PREFIX, to stderr.
log_warn() {
  printf '%s\n' "${LOG_PREFIX} WARN $*" >&2
}
|
|
||||||
|
|
||||||
# Write an ERROR line, tagged with LOG_PREFIX, to stderr.
log_error() {
  printf '%s\n' "${LOG_PREFIX} ERROR $*" >&2
}
|
|
||||||
|
|
||||||
# Print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    'Usage: agent_deployment_verify.sh [options]' \
    '' \
    'Options:' \
    '--allow-config-touch Enable optional config PUT dry-run check.' \
    '--keep-test-health Keep the temporary verify health file after checks.' \
    '-h, --help Show this help message.' \
    '' \
    'Environment variables:' \
    'MASTER_ENDPOINT (required) Master API base endpoint, e.g. http://master:3000' \
    'AGENT_DATA_ROOT (default: /private/argus/agent)' \
    'AGENT_ETC_ROOT (default: /private/argus/etc)' \
    'VERIFY_HOSTNAME (default: output of hostname)' \
    'REPORT_INTERVAL_SECONDS (default: 2) Agent report interval in seconds'
}
|
|
||||||
|
|
||||||
# Parse command-line flags. None of the supported options takes a value, so
# every argument can be classified on its own.
for arg in "$@"; do
  case "$arg" in
    --allow-config-touch)
      ALLOW_CONFIG_TOUCH="true"
      ;;
    --keep-test-health)
      KEEP_TEST_HEALTH="true"
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      log_error "Unknown option: $arg"
      usage >&2
      exit 2
      ;;
  esac
done
# Every flag has been consumed; leave no positional parameters behind.
shift "$#"
|
|
||||||
|
|
||||||
# Resolve runtime settings: keep a non-empty value from the environment,
# otherwise fall back to the built-in default.
: "${MASTER_ENDPOINT:=$MASTER_ENDPOINT_DEFAULT}"
: "${AGENT_DATA_ROOT:=$AGENT_DATA_ROOT_DEFAULT}"
: "${AGENT_ETC_ROOT:=$AGENT_ETC_ROOT_DEFAULT}"
# hostname is only invoked when VERIFY_HOSTNAME was not supplied.
: "${VERIFY_HOSTNAME:=$(hostname)}"
: "${REPORT_INTERVAL_SECONDS:=$REPORT_INTERVAL_DEFAULT}"

# The master endpoint has no usable default, so it must come from the caller.
[[ -n "$MASTER_ENDPOINT" ]] || {
  log_error "MASTER_ENDPOINT is required"
  exit 2
}
|
|
||||||
|
|
||||||
# Validate the report interval: it must be a positive decimal integer.
# The 10# prefix forces base 10, so a value with a leading zero such as "08"
# is not rejected by bash as an invalid octal literal, neither here nor in the
# later $(( ... )) arithmetic that uses REPORT_INTERVAL_SECONDS.
if [[ "$REPORT_INTERVAL_SECONDS" =~ ^[0-9]+$ ]] && ((10#$REPORT_INTERVAL_SECONDS > 0)); then
  # Canonicalise ("08" -> "8") for all later arithmetic.
  REPORT_INTERVAL_SECONDS=$((10#$REPORT_INTERVAL_SECONDS))
else
  log_warn "Invalid REPORT_INTERVAL_SECONDS='$REPORT_INTERVAL_SECONDS', fallback to $REPORT_INTERVAL_DEFAULT"
  REPORT_INTERVAL_SECONDS="$REPORT_INTERVAL_DEFAULT"
fi
|
|
||||||
|
|
||||||
# Normalise a master endpoint.
# Arguments: $1 - endpoint, with or without an http:// or https:// scheme
# Outputs:   the endpoint with "http://" prepended when no scheme was given
#            and one trailing "/" removed
normalize_endpoint() {
  local url="$1"
  case "$url" in
    http://* | https://*) ;;
    *) url="http://$url" ;;
  esac
  printf '%s\n' "${url%/}"
}
|
|
||||||
|
|
||||||
# Base URL used for every master API request.
MASTER_BASE="$(normalize_endpoint "$MASTER_ENDPOINT")"

# Paths inspected by the checks below.
NODE_DIR="${AGENT_DATA_ROOT}/${VERIFY_HOSTNAME}"
NODE_JSON="${NODE_DIR}/node.json"
HEALTH_DIR="${NODE_DIR}/health"
DNS_CONF="${AGENT_ETC_ROOT}/dns.conf"
UPDATE_SCRIPT="${AGENT_ETC_ROOT}/update-dns.sh"

# Result buckets filled by add_result and printed in the final summary.
RESULTS_PASS=()
RESULTS_WARN=()
RESULTS_FAIL=()
|
|
||||||
|
|
||||||
# Record one check outcome and log it.
# Globals:   RESULTS_PASS, RESULTS_WARN, RESULTS_FAIL (appended to)
# Arguments: $1 - level: PASS, WARN, FAIL or INFO
#            $2 - human-readable message
# PASS/WARN/FAIL are stored for the final summary. INFO is logged only; it is
# not a verdict, so it does not appear in the summary. Any other level is
# treated as a warning so the message is never silently lost.
add_result() {
  local level="$1" message="$2"
  case "$level" in
    PASS)
      RESULTS_PASS+=("$message")
      log_info "$message"
      ;;
    WARN)
      RESULTS_WARN+=("$message")
      log_warn "$message"
      ;;
    FAIL)
      RESULTS_FAIL+=("$message")
      log_error "$message"
      ;;
    INFO)
      log_info "$message"
      ;;
    *)
      RESULTS_WARN+=("$message")
      log_warn "Unknown result level '$level': $message"
      ;;
  esac
}
|
|
||||||
|
|
||||||
# Tool discovery. jq is preferred for JSON handling, python3 is the fallback,
# and curl is mandatory.
if command -v jq >/dev/null 2>&1; then
  HAS_JQ="1"
else
  HAS_JQ="0"
fi

command -v curl >/dev/null 2>&1 || {
  log_error "curl command not found; please install curl (e.g. apt-get install -y curl)"
  exit 2
}

if [[ "$HAS_JQ" == "0" ]]; then
  command -v python3 >/dev/null 2>&1 || {
    log_error "Neither jq nor python3 is available for JSON processing"
    exit 2
  }
fi

# Options shared by every curl invocation: fail on HTTP errors, stay quiet
# except for error text, and give up after 10 seconds.
CURL_OPTS=(--fail --show-error --silent --max-time 10)
|
|
||||||
|
|
||||||
# Fetch a URL with the shared CURL_OPTS.
# Arguments: $1 - URL
# Outputs:   response body on stdout
# Returns:   0 on success, 1 on any curl failure
curl_json() {
  local target="$1"
  curl "${CURL_OPTS[@]}" "$target" || return 1
}
|
|
||||||
|
|
||||||
# Extract a value from a JSON document.
# Globals:   HAS_JQ (read)
# Arguments: $1 - JSON text
#            $2 - jq filter, used when jq is available
#            $3 - Python expression over the name `data`, used otherwise
# Outputs:   the value on stdout (objects and arrays are printed as JSON)
# Returns:   0 on success; 1 when parsing or evaluation fails or the value
#            is null/false (matching `jq -e`)
json_query() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    output=$(printf '%s' "$json" | jq -e -r "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi

  # The JSON travels on stdin and the program is passed with -c. Feeding the
  # program through a heredoc would occupy stdin and leave nothing for
  # json.load to read.
  # NOTE(review): py_expr is eval()'d; callers must only pass trusted,
  # hard-coded expressions.
  printf '%s' "$json" | python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
    value = eval(sys.argv[1], {}, {"data": data})
except Exception:
    sys.exit(1)
if value is None or value is False:
    sys.exit(1)
if isinstance(value, (dict, list)):
    print(json.dumps(value))
elif value is True:
    print("true")
else:
    print(value)
' "$py_expr"
}
|
|
||||||
|
|
||||||
# Print the length of a JSON value.
# Globals:   HAS_JQ (read)
# Arguments: $1 - JSON text
#            $2 - jq filter that yields the length (e.g. 'length')
#            $3 - Python expression over `data`; it may yield either the
#                 container itself ('data') or an already computed length
#                 ('len(data)')
# Outputs:   the length on stdout
# Returns:   0 on success, 1 on parse or evaluation failure
json_length() {
  local json="$1" jq_expr="$2" py_expr="$3"
  local output
  if [[ "$HAS_JQ" == "1" ]]; then
    output=$(printf '%s' "$json" | jq -e "$jq_expr" 2>/dev/null) || return 1
    printf '%s' "$output"
    return 0
  fi

  # JSON on stdin, program via -c. A heredoc program would consume stdin.
  # NOTE(review): py_expr is eval()'d; pass trusted expressions only.
  printf '%s' "$json" | python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
    value = eval(sys.argv[1], {}, {"data": data})
    # An int is taken to be a precomputed length; anything else is measured.
    size = value if type(value) is int else len(value)
except Exception:
    sys.exit(1)
print(size)
' "$py_expr"
}
|
|
||||||
|
|
||||||
# Test a boolean condition against a JSON document.
# Globals:   HAS_JQ (read)
# Arguments: $1 - JSON text
#            $2 - jq filter that yields true/false
#            $3 - Python expression over `data` that yields a truthy/falsy value
# Returns:   0 when the condition holds; 1 when it does not or when parsing
#            or evaluation fails
json_has_key() {
  local json="$1" jq_expr="$2" py_expr="$3"
  if [[ "$HAS_JQ" == "1" ]]; then
    printf '%s' "$json" | jq -e "$jq_expr" >/dev/null 2>&1 || return 1
    return 0
  fi

  # JSON on stdin, program via -c. A heredoc program would consume stdin.
  # NOTE(review): py_expr is eval()'d; pass trusted expressions only.
  printf '%s' "$json" | python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
    value = eval(sys.argv[1], {}, {"data": data})
except Exception:
    sys.exit(1)
sys.exit(0 if value else 1)
' "$py_expr"
}
|
|
||||||
|
|
||||||
# Convert an ISO-8601 timestamp to seconds since the Unix epoch.
# Arguments: $1 - timestamp, e.g. 2024-01-01T00:00:00Z
# Outputs:   epoch seconds on stdout
# Returns:   0 on success, 1 when the value is empty or cannot be parsed
# Tries `date -d` first and falls back to python3 when date cannot parse it.
iso_to_epoch() {
  local value="$1"
  # GNU date accepts an empty -d string (it means the start of today), which
  # would make a missing timestamp look valid. Reject it up front.
  [[ -n "$value" ]] || return 1

  if command -v date >/dev/null 2>&1; then
    date -d "$value" +%s 2>/dev/null && return 0
  fi
  if command -v python3 >/dev/null 2>&1; then
    python3 - "$value" <<'PY' || return 1
import sys
from datetime import datetime

value = sys.argv[1]
if value is None or value == "":
    sys.exit(1)
# fromisoformat() on older Python versions does not accept a trailing "Z".
if value.endswith('Z'):
    value = value[:-1] + '+00:00'
try:
    dt = datetime.fromisoformat(value)
except ValueError:
    sys.exit(1)
print(int(dt.timestamp()))
PY
    return 0
  fi
  return 1
}
|
|
||||||
|
|
||||||
# Check that a file contains well-formed JSON.
# Globals:   HAS_JQ (read)
# Arguments: $1 - file path
# Returns:   0 when the file parses (or when neither jq nor python3 is
#            available to check it); non-zero otherwise
validate_json_file() {
  local file="$1"
  if [[ "$HAS_JQ" == "1" ]]; then
    if jq empty "$file" >/dev/null 2>&1; then
      return 0
    fi
    return 1
  fi
  if ! command -v python3 >/dev/null 2>&1; then
    return 0
  fi
  # An unparsable file makes python3 exit non-zero via the uncaught exception.
  python3 - "$file" <<'PY'
import json
import sys
path = sys.argv[1]
with open(path, 'r', encoding='utf-8') as handle:
    json.load(handle)
PY
  return $?
}
|
|
||||||
|
|
||||||
# Create a directory (and any missing parents) when it does not exist yet,
# logging a warning before doing so.
# Arguments: $1 - directory path
ensure_directory() {
  local target="$1"
  [[ -d "$target" ]] && return 0
  log_warn "Creating missing directory $target"
  mkdir -p "$target"
}
|
|
||||||
|
|
||||||
# State shared with the health simulation: path of the temporary health file,
# its previous content, and whether it existed before this run.
TEST_HEALTH_FILE=""
TEST_HEALTH_BACKUP=""
TEST_HEALTH_EXISTED="false"

# EXIT handler. Restores a pre-existing health file from the backup;
# otherwise removes the test file unless --keep-test-health was given.
# Does nothing until TEST_HEALTH_FILE has been set.
cleanup() {
  [[ -n "$TEST_HEALTH_FILE" ]] || return 0
  if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
    printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
    return
  fi
  if [[ "$KEEP_TEST_HEALTH" != "true" ]]; then
    rm -f "$TEST_HEALTH_FILE"
  fi
}

trap cleanup EXIT
|
|
||||||
|
|
||||||
log_info "Starting agent deployment verification for hostname '$VERIFY_HOSTNAME'"

# 4.2 Master health checks
# curl appends "<http_code> <time_total>" on its own final line (-w), so the
# last line of the captured output is metadata and everything before it is
# the response body.
# curl's stderr is captured in a private mktemp file instead of a fixed /tmp
# name, so concurrent runs cannot clobber each other and a pre-created file
# or symlink cannot be abused.
probe_err_file="$(mktemp)"

health_resp=""
if health_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/healthz" 2>"$probe_err_file"); then
  http_meta=$(tail -n1 <<<"$health_resp")
  payload=$(head -n -1 <<<"$health_resp" || true)
  status_code=${http_meta%% *}
  elapsed=${http_meta##* }
  add_result PASS "GET /healthz status=$status_code elapsed=${elapsed}s payload=$payload"
else
  error_detail=$(cat "$probe_err_file" || true)
  add_result FAIL "GET /healthz failed: $error_detail"
fi

if readyz_resp=$(curl "${CURL_OPTS[@]}" -w '\n%{http_code} %{time_total}' "$MASTER_BASE/readyz" 2>"$probe_err_file"); then
  readyz_meta=$(tail -n1 <<<"$readyz_resp")
  readyz_payload=$(head -n -1 <<<"$readyz_resp" || true)
  readyz_status=${readyz_meta%% *}
  readyz_elapsed=${readyz_meta##* }
  add_result PASS "GET /readyz status=$readyz_status elapsed=${readyz_elapsed}s"
else
  error_detail=$(cat "$probe_err_file" || true)
  add_result FAIL "GET /readyz failed: $error_detail"
  readyz_payload=""
fi
rm -f -- "$probe_err_file"
|
|
||||||
|
|
||||||
# 4.3 Nodes list and detail
# Looks the current host up in the master's node list, fetches its detail
# record, and cross-checks the statistics endpoint.
# Sets: nodes_json, NODE_ENTRY, NODE_ID, NODE_IP, NODE_DETAIL_JSON.
# curl's stderr goes to a private mktemp file (no predictable /tmp names).
nodes_err_file="$(mktemp)"

if ! nodes_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes" 2>"$nodes_err_file"); then
  error_detail=$(cat "$nodes_err_file" || true)
  add_result FAIL "GET /api/v1/master/nodes failed: $error_detail"
  nodes_json=""
fi

NODE_ENTRY=""
NODE_ID=""
NODE_IP=""
NODE_DETAIL_JSON=""
if [[ -n "$nodes_json" ]]; then
  if [[ "$HAS_JQ" == "1" ]]; then
    NODE_ENTRY=$(printf '%s' "$nodes_json" | jq -e --arg name "$VERIFY_HOSTNAME" '.[] | select(.name == $name)') || NODE_ENTRY=""
  else
    # The node list is piped on stdin and the program passed with -c. A
    # heredoc program would occupy stdin and the list would never be read.
    NODE_ENTRY=$(printf '%s' "$nodes_json" | python3 -c '
import json
import sys

hostname = sys.argv[1]
for node in json.load(sys.stdin):
    if node.get("name") == hostname:
        print(json.dumps(node))
        sys.exit(0)
sys.exit(1)
' "$VERIFY_HOSTNAME") || NODE_ENTRY=""
  fi

  if [[ -z "$NODE_ENTRY" ]]; then
    add_result FAIL "Current node '$VERIFY_HOSTNAME' not found in master nodes list"
  elif NODE_ID=$(json_query "$NODE_ENTRY" '.id' 'data["id"]'); then
    add_result PASS "Discovered node id '$NODE_ID' for hostname '$VERIFY_HOSTNAME'"
  else
    add_result FAIL "Failed to extract node id from master response"
  fi

  # Only ask for the detail record when an id is known. With an empty id the
  # URL would degrade to the list endpoint and be mistaken for a detail record.
  : >"$nodes_err_file"
  if [[ -n "$NODE_ID" ]] && NODE_DETAIL=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$nodes_err_file"); then
    NODE_DETAIL_JSON="$NODE_DETAIL"
    add_result PASS "Fetched node detail for $NODE_ID"
    if NODE_IP=$(json_query "$NODE_DETAIL_JSON" '.meta_data.ip // .meta_data.host_ip // empty' 'data.get("meta_data", {}).get("ip") or data.get("meta_data", {}).get("host_ip") or ""'); then
      if [[ -n "$NODE_IP" ]]; then
        add_result PASS "Registered node IP=$NODE_IP"
      else
        # Informational only: neither a pass nor a failure.
        log_info "Node detail does not expose IP fields"
      fi
    fi
  else
    error_detail=$(cat "$nodes_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail for $NODE_ID: $error_detail"
    NODE_DETAIL_JSON=""
  fi

  if stats_json=$(curl_json "$MASTER_BASE/api/v1/master/nodes/statistics" 2>"$nodes_err_file"); then
    if total_nodes=$(json_query "$stats_json" '.total // .total_nodes' 'data.get("total") or data.get("total_nodes")'); then
      if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -ge 1 ]]; then
        add_result PASS "Statistics total=$total_nodes"
      else
        add_result WARN "Statistics total field not numeric: $total_nodes"
      fi
    else
      add_result WARN "Unable to read total field from statistics"
    fi

    # Best effort: a parse problem here must not abort the whole run.
    active_nodes=""
    if [[ "$HAS_JQ" == "1" ]]; then
      active_nodes=$(printf '%s' "$stats_json" | jq -e 'if .status_statistics then (.status_statistics[] | select(.status == "online") | .count) else empty end' 2>/dev/null | head -n1 || true)
    elif command -v python3 >/dev/null 2>&1; then
      active_nodes=$(printf '%s' "$stats_json" | python3 -c 'import json,sys; data=json.load(sys.stdin); print(next((row.get("count") for row in data.get("status_statistics", []) if row.get("status")=="online"), ""))' 2>/dev/null || true)
    fi
    if [[ -n "$active_nodes" ]]; then
      add_result PASS "Online nodes reported by master: $active_nodes"
    fi

    # Number of entries in the node list, compared against the statistics total.
    if [[ "$HAS_JQ" == "1" ]]; then
      node_count=$(printf '%s' "$nodes_json" | jq 'length' 2>/dev/null || true)
    else
      node_count=$(printf '%s' "$nodes_json" | python3 -c 'import json,sys; print(len(json.load(sys.stdin)))' 2>/dev/null || true)
    fi
    if [[ "$total_nodes" =~ ^[0-9]+$ ]] && [[ "$node_count" =~ ^[0-9]+$ ]] && [[ "$total_nodes" -lt "$node_count" ]]; then
      add_result WARN "Statistics total=$total_nodes less than nodes list count=$node_count"
    fi
  else
    error_detail=$(cat "$nodes_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node statistics: $error_detail"
  fi
fi
rm -f -- "$nodes_err_file"
|
|
||||||
|
|
||||||
# 4.4 Agent persistence checks
# Compares the local node.json with what the master reported, inspects the
# health directory, and checks that master.argus.com resolves.
if [[ ! -f "$NODE_JSON" ]]; then
  add_result FAIL "node.json not found at $NODE_JSON"
  node_file_content=""
else
  node_file_content="$(<"$NODE_JSON")"

  if ! node_id_local=$(json_query "$node_file_content" '.id' 'data["id"]'); then
    add_result FAIL "Unable to extract id from node.json"
  elif [[ -n "$NODE_ID" && "$node_id_local" == "$NODE_ID" ]]; then
    add_result PASS "node.json id matches master ($NODE_ID)"
  else
    add_result FAIL "node.json id '$node_id_local' differs from master id '$NODE_ID'"
  fi

  if ! node_name_local=$(json_query "$node_file_content" '.name' 'data["name"]'); then
    add_result FAIL "Unable to extract name from node.json"
  elif [[ "$node_name_local" == "$VERIFY_HOSTNAME" ]]; then
    add_result PASS "node.json name matches $VERIFY_HOSTNAME"
  else
    add_result FAIL "node.json name '$node_name_local' differs from hostname '$VERIFY_HOSTNAME'"
  fi

  # Both timestamp fields get the same treatment: present and parseable.
  for ts_field in register_time last_updated; do
    if ! ts_value=$(json_query "$node_file_content" ".${ts_field}" "data.get(\"${ts_field}\")"); then
      add_result WARN "node.json missing ${ts_field}"
    elif iso_to_epoch "$ts_value" >/dev/null 2>&1; then
      add_result PASS "node.json ${ts_field} valid ISO timestamp"
    else
      add_result WARN "node.json ${ts_field} invalid: $ts_value"
    fi
  done
fi

ensure_directory "$HEALTH_DIR"

if [[ ! -d "$HEALTH_DIR" ]]; then
  add_result WARN "Health directory $HEALTH_DIR missing"
else
  # nullglob: an empty directory must yield an empty array, not the pattern.
  shopt -s nullglob
  health_files=("$HEALTH_DIR"/*.json)
  shopt -u nullglob
  if ((${#health_files[@]} == 0)); then
    add_result WARN "Health directory $HEALTH_DIR is empty"
  else
    for hf in "${health_files[@]}"; do
      base=${hf##*/}
      case "$base" in
        *-*)
          if ! validate_json_file "$hf" >/dev/null 2>&1; then
            add_result WARN "Health file $base is not valid JSON"
          fi
          ;;
        *)
          add_result WARN "Health file $base does not follow <module>-*.json"
          ;;
      esac
    done
  fi
fi

if getent hosts master.argus.com >/dev/null 2>&1; then
  # First column of each getent line is the address; xargs joins them with spaces.
  resolved_ips=$(getent hosts master.argus.com | awk '{print $1}' | xargs)
  add_result PASS "master.argus.com resolves to $resolved_ips"
else
  add_result FAIL "Failed to resolve master.argus.com"
fi
|
|
||||||
|
|
||||||
# 4.5 Master-Node status consistency
# Samples the node detail twice, one report interval (plus slack) apart, and
# expects both last_report timestamps to change in between. Also compares the
# master's timestamp with node.json's last_updated.
# curl's stderr goes to a private mktemp file (no predictable /tmp names).
sleep_interval=$((REPORT_INTERVAL_SECONDS + 2))
detail_err_file="$(mktemp)"

# Reuse the detail record fetched earlier when there is one.
detail_pre="$NODE_DETAIL_JSON"

if [[ -z "$detail_pre" && -n "$NODE_ID" ]]; then
  if detail_pre=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$detail_err_file"); then
    add_result PASS "Fetched node detail pre-check"
  else
    error_detail=$(cat "$detail_err_file" 2>/dev/null || true)
    add_result FAIL "Unable to fetch node detail for status check: $error_detail"
  fi
fi

server_ts_pre=""
agent_ts_pre=""
server_ts_post=""
agent_ts_post=""

if [[ -n "$detail_pre" ]]; then
  server_ts_pre=$(json_query "$detail_pre" '.last_report' 'data.get("last_report")' || echo "")
  agent_ts_pre=$(json_query "$detail_pre" '.agent_last_report' 'data.get("agent_last_report")' || echo "")
  log_info "Captured initial last_report timestamps server='$server_ts_pre' agent='$agent_ts_pre'"

  sleep "$sleep_interval"

  if detail_post=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$detail_err_file"); then
    server_ts_post=$(json_query "$detail_post" '.last_report' 'data.get("last_report")' || echo "")
    agent_ts_post=$(json_query "$detail_post" '.agent_last_report' 'data.get("agent_last_report")' || echo "")

    if [[ "$server_ts_post" != "$server_ts_pre" ]]; then
      add_result PASS "last_report.server_timestamp advanced (pre=$server_ts_pre post=$server_ts_post)"
    else
      add_result FAIL "last_report.server_timestamp did not change after ${sleep_interval}s"
    fi
    if [[ "$agent_ts_post" != "$agent_ts_pre" ]]; then
      add_result PASS "last_report.agent_timestamp advanced"
    else
      add_result FAIL "last_report.agent_timestamp did not change"
    fi

    # The gap check is skipped silently when either side is missing or unparseable.
    if [[ -n "$node_file_content" ]] \
      && node_last_updated=$(json_query "$node_file_content" '.last_updated' 'data.get("last_updated")') \
      && epoch_post=$(iso_to_epoch "$server_ts_post" 2>/dev/null) \
      && node_epoch=$(iso_to_epoch "$node_last_updated" 2>/dev/null); then
      diff=$((epoch_post - node_epoch))
      if ((diff < 0)); then
        diff=$((-diff))
      fi
      tolerance=$((REPORT_INTERVAL_SECONDS * 2))
      if ((diff <= tolerance)); then
        add_result PASS "last_report.server_timestamp and node.json last_updated within tolerance ($diff s)"
      else
        add_result WARN "Timestamp gap between master ($server_ts_post) and node.json ($node_last_updated) is ${diff}s"
      fi
    fi

    NODE_DETAIL_JSON="$detail_post"
  else
    error_detail=$(cat "$detail_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail post-check: $error_detail"
  fi
fi
rm -f -- "$detail_err_file"
|
|
||||||
|
|
||||||
# 4.6 Health simulation
# Pick the test health file and remember any content it already had, so the
# EXIT handler can put it back.
TEST_HEALTH_FILE="$HEALTH_DIR/verify-master.json"
ensure_directory "$HEALTH_DIR"

TEST_HEALTH_EXISTED="false"
if [[ -f "$TEST_HEALTH_FILE" ]]; then
  TEST_HEALTH_EXISTED="true"
  TEST_HEALTH_BACKUP="$(<"$TEST_HEALTH_FILE")"
fi
|
|
||||||
|
|
||||||
# Write the test health file with status "ok" and the given message.
# Globals:   TEST_HEALTH_FILE (read) - destination path
# Arguments: $1 - message text
# Backslashes and double quotes in the message are escaped so the file stays
# valid JSON. Control characters (e.g. newlines) are not handled.
create_health_file() {
  local message="$1"
  local escaped
  escaped=${message//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  printf '{"status":"ok","message":"%s"}\n' "$escaped" > "$TEST_HEALTH_FILE"
}
|
|
||||||
|
|
||||||
# Check that the master's node detail carries the expected verify-master message.
# Arguments: $1 - expected message
#            $2 - node detail JSON
# Returns:   0 when health["verify-master"].message equals $1 exactly,
#            1 otherwise (including when the field cannot be read)
validate_health_in_master() {
  local want="$1" detail="$2" got
  got=$(json_query "$detail" '.health["verify-master"].message' 'data.get("health", {}).get("verify-master", {}).get("message")') || return 1
  [[ "$got" == "$want" ]]
}
|
|
||||||
|
|
||||||
# Check that the master's node detail no longer lists verify-master.
# Arguments: $1 - node detail JSON
# Returns:   0 when the health map has no "verify-master" key, 1 when it does
remove_health_from_master() {
  local detail="$1"
  ! json_has_key "$detail" '(.health | has("verify-master"))' '"verify-master" in data.get("health", {})'
}
|
|
||||||
|
|
||||||
# Health simulation flow: create the test health file, update it, then delete
# it, and after each step (one sleep_interval later) check that the master's
# node detail reflects the change.
# Without a node id the detail URL would degrade to the list endpoint, and the
# removal check could then report a false PASS, so the simulation is skipped.
if [[ -z "$NODE_ID" ]]; then
  add_result FAIL "Health simulation skipped: node id unknown"
else
  # Private scratch file for curl's stderr (no predictable /tmp names).
  health_err_file="$(mktemp)"

  health_message_one="verify $(date +%s)"
  create_health_file "$health_message_one"
  add_result PASS "Created test health file $TEST_HEALTH_FILE"

  sleep "$sleep_interval"
  if detail_health_one=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$health_err_file"); then
    if validate_health_in_master "$health_message_one" "$detail_health_one"; then
      add_result PASS "Master reflects verify-master health message"
    else
      add_result FAIL "Master health payload does not match test message"
    fi
  else
    error_detail=$(cat "$health_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail during health validation: $error_detail"
    detail_health_one=""
  fi

  health_message_two="verify $(date +%s)-update"
  create_health_file "$health_message_two"
  sleep "$sleep_interval"
  if detail_health_two=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$health_err_file"); then
    if validate_health_in_master "$health_message_two" "$detail_health_two"; then
      add_result PASS "Master health updated to new message"
    else
      add_result FAIL "Master health message did not update"
    fi
  else
    error_detail=$(cat "$health_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail after health update: $error_detail"
    detail_health_two=""
  fi

  rm -f "$TEST_HEALTH_FILE"
  sleep "$sleep_interval"
  if detail_health_three=$(curl_json "$MASTER_BASE/api/v1/master/nodes/$NODE_ID" 2>"$health_err_file"); then
    if remove_health_from_master "$detail_health_three"; then
      add_result PASS "Master health no longer lists verify-master after removal"
    else
      add_result FAIL "Master health still contains verify-master after file deletion"
    fi
  else
    error_detail=$(cat "$health_err_file" 2>/dev/null || true)
    add_result FAIL "Failed to fetch node detail after health removal: $error_detail"
  fi

  rm -f -- "$health_err_file"
fi

# Put back whatever the health file contained before this run.
if [[ "$TEST_HEALTH_EXISTED" == "true" ]]; then
  printf '%s' "$TEST_HEALTH_BACKUP" > "$TEST_HEALTH_FILE"
fi
|
|
||||||
|
|
||||||
# Optional config touch
# Only runs with --allow-config-touch: sends a PUT with a "verify" label to the
# node's config endpoint and records whether the request succeeded.
if [[ "$ALLOW_CONFIG_TOUCH" != "true" ]]; then
  add_result WARN "Config PUT dry-run skipped (enable with --allow-config-touch)"
elif [[ -z "$NODE_ID" ]]; then
  # Report the skip instead of silently doing nothing.
  add_result WARN "Config PUT dry-run skipped (node id unknown)"
else
  payload='{"label": {"verify": "true"}}'
  # Private scratch file for curl's output (no predictable /tmp name).
  config_log_file="$(mktemp)"
  if curl "${CURL_OPTS[@]}" -X PUT -H 'Content-Type: application/json' -d "$payload" "$MASTER_BASE/api/v1/master/nodes/$NODE_ID/config" >"$config_log_file" 2>&1; then
    add_result PASS "Config PUT dry-run succeeded"
  else
    add_result WARN "Config PUT dry-run failed: $(cat "$config_log_file")"
  fi
  rm -f -- "$config_log_file"
fi
|
|
||||||
|
|
||||||
# Result summary
# Prints every recorded result grouped by level, then exits 1 when at least
# one FAIL was recorded and 0 otherwise.

# Print "<label>: <entry>" for each remaining argument.
print_results() {
  local label="$1"
  shift
  local entry
  for entry in "$@"; do
    printf '%s: %s\n' "$label" "$entry"
  done
}

echo
echo "==== Verification Summary ===="
# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A plain
# "${arr[@]}" under `set -u` aborts with "unbound variable" on bash < 4.4.
print_results PASS ${RESULTS_PASS[@]+"${RESULTS_PASS[@]}"}
print_results WARN ${RESULTS_WARN[@]+"${RESULTS_WARN[@]}"}
print_results FAIL ${RESULTS_FAIL[@]+"${RESULTS_FAIL[@]}"}

if [[ ${#RESULTS_FAIL[@]} -gt 0 ]]; then
  exit 1
fi

exit 0
|
|
||||||
@ -1,276 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Builds the argus-agent single-file binary with PyInstaller, either on the
# host or inside a Docker image.
set -euo pipefail

# Directory layout, derived from this script's own location.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODULE_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
BUILD_ROOT="${MODULE_ROOT}/build"
DIST_DIR="${MODULE_ROOT}/dist"
PYINSTALLER_BUILD="${BUILD_ROOT}/pyinstaller"
PYINSTALLER_SPEC="${PYINSTALLER_BUILD}/spec"
PYINSTALLER_WORK="${PYINSTALLER_BUILD}/work"
VENV_DIR="${BUILD_ROOT}/venv"

# Build knobs; a non-empty value from the environment wins.
: "${AGENT_BUILD_IMAGE:=python:3.11-slim-bullseye}"
: "${AGENT_BUILD_USE_DOCKER:=1}"
# By default, ignore proxy settings inside the container: a corporate intranet
# proxy that is unreachable from the Docker network makes pip fail. Set to 0
# to keep the proxy settings.
: "${AGENT_BUILD_IGNORE_PROXY:=1}"
USED_DOCKER=0
|
|
||||||
|
|
||||||
# Build the agent binary with the host's python3.
# Globals:   BUILD_ROOT, DIST_DIR, PYINSTALLER_BUILD, PYINSTALLER_WORK,
#            PYINSTALLER_SPEC, VENV_DIR, MODULE_ROOT (read)
# Outputs:   progress on stderr/stdout; the binary at $DIST_DIR/argus-agent
# Wipes any previous build and dist directories, creates a fresh virtualenv,
# installs the project and PyInstaller into it, and produces a one-file binary
# that bundles pyproject.toml.
run_host_build() {
  echo "[INFO] Using host Python environment for build" >&2
  # :? aborts instead of running `rm -rf` on an empty path.
  rm -rf -- "${BUILD_ROOT:?}" "${DIST_DIR:?}"
  mkdir -p "$PYINSTALLER_BUILD" "$DIST_DIR"
  python3 -m venv --copies "$VENV_DIR"
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"

  pip install --upgrade pip
  # Install from the module root explicitly so the build does not depend on
  # the caller's current working directory.
  pip install "$MODULE_ROOT"
  pip install "pyinstaller==6.6.0"

  pyinstaller \
    --clean \
    --onefile \
    --name argus-agent \
    --distpath "$DIST_DIR" \
    --workpath "$PYINSTALLER_WORK" \
    --specpath "$PYINSTALLER_SPEC" \
    --add-data "$MODULE_ROOT/pyproject.toml:." \
    "$MODULE_ROOT/entry.py"

  chmod +x "$DIST_DIR/argus-agent"
  deactivate
}
|
|
||||||
|
|
||||||
run_docker_build() {
|
|
||||||
if ! command -v docker >/dev/null 2>&1; then
|
|
||||||
echo "[ERROR] docker 命令不存在,无法在容器内构建。请安装 Docker 或设置 AGENT_BUILD_USE_DOCKER=0" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
USED_DOCKER=1
|
|
||||||
echo "[INFO] Building agent binary inside $AGENT_BUILD_IMAGE" >&2
|
|
||||||
|
|
||||||
local host_uid host_gid
|
|
||||||
host_uid="$(id -u)"
|
|
||||||
host_gid="$(id -g)"
|
|
||||||
docker_env=("--rm" "-v" "$MODULE_ROOT:/workspace" "-w" "/workspace" "--env" "TARGET_UID=${host_uid}" "--env" "TARGET_GID=${host_gid}")
|
|
||||||
|
|
||||||
pass_env_if_set() {
|
|
||||||
local var="$1"
|
|
||||||
local value="${!var:-}"
|
|
||||||
if [[ -n "$value" ]]; then
|
|
||||||
docker_env+=("--env" "$var=$value")
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
pass_env_if_set PIP_INDEX_URL
|
|
||||||
pass_env_if_set PIP_EXTRA_INDEX_URL
|
|
||||||
pass_env_if_set PIP_TRUSTED_HOST
|
|
||||||
pass_env_if_set HTTP_PROXY
|
|
||||||
pass_env_if_set HTTPS_PROXY
|
|
||||||
pass_env_if_set NO_PROXY
|
|
||||||
pass_env_if_set http_proxy
|
|
||||||
pass_env_if_set https_proxy
|
|
||||||
pass_env_if_set no_proxy
|
|
||||||
pass_env_if_set AGENT_BUILD_IGNORE_PROXY
|
|
||||||
|
|
||||||
build_script=$(cat <<'INNER'
|
|
||||||
set -euo pipefail
|
|
||||||
cd /workspace
|
|
||||||
apt-get update >/dev/null
|
|
||||||
apt-get install -y --no-install-recommends binutils >/dev/null
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
rm -rf build dist
|
|
||||||
mkdir -p build/pyinstaller dist
|
|
||||||
python3 -m venv --copies build/venv
|
|
||||||
source build/venv/bin/activate
|
|
||||||
# 若指定忽略代理,则清空常见代理与 pip 镜像环境变量,避免容器内代理不可达
|
|
||||||
if [ "${AGENT_BUILD_IGNORE_PROXY:-1}" = "1" ]; then
|
|
||||||
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY PIP_INDEX_URL PIP_EXTRA_INDEX_URL PIP_TRUSTED_HOST
|
|
||||||
fi
|
|
||||||
pip install --upgrade pip
|
|
||||||
pip install .
|
|
||||||
pip install pyinstaller==6.6.0
|
|
||||||
pyinstaller \
|
|
||||||
--clean \
|
|
||||||
--onefile \
|
|
||||||
--name argus-agent \
|
|
||||||
--distpath dist \
|
|
||||||
--workpath build/pyinstaller/work \
|
|
||||||
--specpath build/pyinstaller/spec \
|
|
||||||
--add-data /workspace/pyproject.toml:. \
|
|
||||||
entry.py
|
|
||||||
chmod +x dist/argus-agent
|
|
||||||
|
|
||||||
TARGET_UID="${TARGET_UID:-0}"
|
|
||||||
TARGET_GID="${TARGET_GID:-0}"
|
|
||||||
chown -R "$TARGET_UID:$TARGET_GID" dist build 2>/dev/null || true
|
|
||||||
|
|
||||||
python3 - <<'PY'
|
|
||||||
from pathlib import Path
|
|
||||||
from PyInstaller.archive.readers import CArchiveReader
|
|
||||||
import sys
|
|
||||||
|
|
||||||
archive = Path('dist/argus-agent')
|
|
||||||
out_dir = Path('build/compat_check')
|
|
||||||
out_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
major, minor = sys.version_info[:2]
|
|
||||||
libpython = f'libpython{major}.{minor}.so.1.0'
|
|
||||||
expected_libs = [
|
|
||||||
libpython,
|
|
||||||
'libssl.so.3',
|
|
||||||
'libcrypto.so.3',
|
|
||||||
]
|
|
||||||
reader = CArchiveReader(str(archive))
|
|
||||||
extracted = []
|
|
||||||
missing = []
|
|
||||||
for name in expected_libs:
|
|
||||||
try:
|
|
||||||
data = reader.extract(name)
|
|
||||||
except KeyError:
|
|
||||||
missing.append(name)
|
|
||||||
continue
|
|
||||||
(out_dir / name).write_bytes(data)
|
|
||||||
extracted.append(name)
|
|
||||||
(out_dir / 'manifest').write_text('\n'.join(extracted))
|
|
||||||
if extracted:
|
|
||||||
print('[INFO] Extracted libraries: ' + ', '.join(extracted))
|
|
||||||
if missing:
|
|
||||||
print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
|
|
||||||
PY
|
|
||||||
|
|
||||||
compat_check() {
|
|
||||||
local lib_path="$1"
|
|
||||||
if [[ ! -f "$lib_path" ]]; then
|
|
||||||
echo "[WARN] Missing $lib_path for GLIBC check"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
local max_glibc
|
|
||||||
max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
|
|
||||||
if [[ -n "$max_glibc" ]]; then
|
|
||||||
echo "[INFO] $lib_path references up to $max_glibc"
|
|
||||||
else
|
|
||||||
echo "[INFO] $lib_path does not expose GLIBC version strings"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
compat_libs=()
|
|
||||||
if [[ -f build/compat_check/manifest ]]; then
|
|
||||||
mapfile -t compat_libs < build/compat_check/manifest
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ${#compat_libs[@]} -eq 0 ]]; then
|
|
||||||
echo "[WARN] No libraries captured for GLIBC inspection"
|
|
||||||
else
|
|
||||||
for lib in "${compat_libs[@]}"; do
|
|
||||||
compat_check "build/compat_check/$lib"
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
deactivate
|
|
||||||
INNER
|
|
||||||
)
|
|
||||||
|
|
||||||
if ! docker run "${docker_env[@]}" "$AGENT_BUILD_IMAGE" bash -lc "$build_script"; then
|
|
||||||
echo "[ERROR] Docker 构建失败,请检查 Docker 权限或设置 AGENT_BUILD_USE_DOCKER=0 在兼容主机上构建" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Choose the build backend: the containerised build runs only when
# AGENT_BUILD_USE_DOCKER is exactly "1"; any other value builds on the host.
case "$AGENT_BUILD_USE_DOCKER" in
  1) run_docker_build ;;
  *) run_host_build ;;
esac

# Whichever backend ran, the binary has to exist before continuing.
[[ -f "$DIST_DIR/argus-agent" ]] || {
  echo "[ERROR] Agent binary was not produced" >&2
  exit 1
}
|
|
||||||
|
|
||||||
# Post-build GLIBC compatibility report.
#
# When USED_DOCKER is "1" only an informational line is printed. Otherwise
# selected shared libraries are pulled out of the PyInstaller bundle and the
# highest GLIBC symbol version each one references is reported.
# Reads:  USED_DOCKER, VENV_DIR, BUILD_ROOT, DIST_DIR
# Writes: COMPAT_DIR (recreated from scratch), compat_libs
if [[ "$USED_DOCKER" == "1" ]]; then
  echo "[INFO] Compatibility check executed inside container"
elif [[ ! -x "$VENV_DIR/bin/python" ]]; then
  echo "[WARN] PyInstaller virtualenv missing at $VENV_DIR; skipping compatibility check" >&2
else
  COMPAT_DIR="$BUILD_ROOT/compat_check"
  rm -rf "$COMPAT_DIR"
  mkdir -p "$COMPAT_DIR"

  # The archive and output directory are passed as arguments so the Python
  # helper and the shell code below always agree on where the manifest is,
  # regardless of the current working directory. The heredoc delimiter is
  # quoted so the shell never expands anything inside the Python source.
  "$VENV_DIR/bin/python" - "$DIST_DIR/argus-agent" "$COMPAT_DIR" <<'PY'
import sys
from pathlib import Path

from PyInstaller.archive.readers import CArchiveReader

archive = Path(sys.argv[1])
out_dir = Path(sys.argv[2])
out_dir.mkdir(parents=True, exist_ok=True)

major, minor = sys.version_info[:2]
libpython = f'libpython{major}.{minor}.so.1.0'
expected_libs = [
    libpython,
    'libssl.so.3',
    'libcrypto.so.3',
]
reader = CArchiveReader(str(archive))
extracted = []
missing = []
for name in expected_libs:
    try:
        data = reader.extract(name)
    except KeyError:
        missing.append(name)
        continue
    (out_dir / name).write_bytes(data)
    extracted.append(name)
# One library name per line; consumed by mapfile in the calling shell.
(out_dir / 'manifest').write_text('\n'.join(extracted))
if extracted:
    print('[INFO] Extracted libraries: ' + ', '.join(extracted))
if missing:
    print('[WARN] Missing expected libraries in bundle: ' + ', '.join(missing))
PY

  compat_libs=()
  if [[ -f "$COMPAT_DIR/manifest" ]]; then
    mapfile -t compat_libs < "$COMPAT_DIR/manifest"
  fi

  # Report the highest GLIBC_x.y version string found in a library file.
  # Arguments: $1 - path to the extracted shared library
  # Outputs:   [INFO] lines on stdout, [WARN] lines on stderr
  check_glibc_version() {
    local lib_path="$1"
    if [[ ! -f "$lib_path" ]]; then
      echo "[WARN] Skipping GLIBC check; file not found: $lib_path" >&2
      return
    fi
    if command -v strings >/dev/null 2>&1; then
      local max_glibc
      # "|| true": grep exits non-zero when nothing matches, which must not
      # abort the script under errexit/pipefail.
      max_glibc=$(strings -a "$lib_path" | grep -Eo 'GLIBC_[0-9]+\.[0-9]+' | sort -Vu | tail -n 1 || true)
      if [[ -n "$max_glibc" ]]; then
        echo "[INFO] $lib_path references up to $max_glibc"
      else
        echo "[INFO] $lib_path does not expose GLIBC version strings"
      fi
    else
      echo "[WARN] strings command unavailable; cannot inspect $lib_path" >&2
    fi
  }

  if [[ ${#compat_libs[@]} -eq 0 ]]; then
    echo "[WARN] No libraries captured for GLIBC inspection" >&2
  else
    for lib in "${compat_libs[@]}"; do
      check_glibc_version "$COMPAT_DIR/$lib"
    done
  fi
fi

echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent"
|
|
||||||
2
src/agent/tests/.gitignore
vendored
2
src/agent/tests/.gitignore
vendored
@ -1,2 +0,0 @@
|
|||||||
private/
|
|
||||||
tmp/
|
|
||||||
@ -1,99 +0,0 @@
|
|||||||
# Compose stack for the agent end-to-end test: a bind container, the master,
# and two agent containers, all on one bridge network with fixed addresses.
services:
  # Container built from the bind image (tag overridable via BIND_IMAGE_TAG).
  bind:
    image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
    container_name: argus-bind-agent-e2e
    volumes:
      - ./private:/private
    networks:
      default:
        ipv4_address: 172.28.0.2
    environment:
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    restart: always

  # Master service; container port 3000 is published on host port 32300.
  master:
    image: argus-master:latest
    container_name: argus-master-agent-e2e
    depends_on:
      - bind
    environment:
      # Short thresholds and scheduler interval (seconds).
      - OFFLINE_THRESHOLD_SECONDS=6
      - ONLINE_THRESHOLD_SECONDS=2
      - SCHEDULER_INTERVAL_SECONDS=1
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    ports:
      - "32300:3000"
    volumes:
      - ./private/argus/master:/private/argus/master
      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
      - ./private/argus/etc:/private/argus/etc
    networks:
      default:
        ipv4_address: 172.28.0.10
    restart: always

  # Primary agent: no AGENT_* metadata variables are set here.
  # NOTE(review): presumably the agent derives its metadata from the
  # hostname in this case — confirm against the agent implementation.
  agent:
    image: ubuntu:22.04
    container_name: argus-agent-e2e
    hostname: dev-e2euser-e2einst-pod-0
    depends_on:
      - master
      - bind
    environment:
      - MASTER_ENDPOINT=http://master.argus.com:3000
      - REPORT_INTERVAL_SECONDS=2
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    volumes:
      - ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
      - ./private/argus/agent/dev-e2euser-e2einst-pod-0/health:/private/argus/agent/dev-e2euser-e2einst-pod-0/health
      - ./private/argus/etc:/private/argus/etc
      # Agent binary, entrypoint and verification script are mounted read-only
      # from the host into the stock ubuntu image.
      - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
      - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
      - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
    entrypoint:
      - /usr/local/bin/agent-entrypoint.sh
    networks:
      default:
        ipv4_address: 172.28.0.20
    restart: always

  # Second agent: metadata is supplied explicitly through AGENT_* variables.
  agent_env:
    image: ubuntu:22.04
    container_name: argus-agent-env-e2e
    hostname: host_abc
    depends_on:
      - master
      - bind
    environment:
      - MASTER_ENDPOINT=http://master.argus.com:3000
      - REPORT_INTERVAL_SECONDS=2
      - AGENT_ENV=prod
      - AGENT_USER=ml
      - AGENT_INSTANCE=node-3
      - AGENT_HOSTNAME=host_abc
      - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
      - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
    volumes:
      - ./private/argus/agent/host_abc:/private/argus/agent/host_abc
      - ./private/argus/agent/host_abc/health:/private/argus/agent/host_abc/health
      - ./private/argus/etc:/private/argus/etc
      - ../dist/argus-agent:/usr/local/bin/argus-agent:ro
      - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
      - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
    entrypoint:
      - /usr/local/bin/agent-entrypoint.sh
    networks:
      default:
        ipv4_address: 172.28.0.21
    restart: always

# Bridge network with an explicit subnet so the static addresses above are valid.
networks:
  default:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 172.28.0.0/16
|
|
||||||
@ -1,23 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Driver for the agent end-to-end suite: runs each stage script in order and
# stops at the first stage that exits non-zero (errexit).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Stage scripts, resolved relative to this file and executed in listed order.
SCRIPTS=(
  "01_bootstrap.sh"
  "02_up.sh"
  "03_wait_and_assert_registration.sh"
  "04_write_health_files.sh"
  "05_verify_agent.sh"
  "06_assert_status_on_master.sh"
  "07_restart_agent_and_reregister.sh"
  "08_down.sh"
)

# Run one stage script with start/finish banners.
# Arguments: $1 - stage script file name inside SCRIPT_DIR
run_stage() {
  echo "[TEST] Running $1"
  "$SCRIPT_DIR/$1"
  echo "[TEST] $1 completed"
  echo
}

for script in "${SCRIPTS[@]}"; do
  run_stage "$script"
done

echo "[TEST] Agent module E2E tests completed"
|
|
||||||
@ -1,63 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Bootstrap for the agent end-to-end test: resolves the directory layout used
# by the rest of this script.
set -euo pipefail

# Absolute locations, each derived from the directory holding this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
AGENT_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
MASTER_ROOT="$(cd "$AGENT_ROOT/../master" && pwd)"
REPO_ROOT="$(cd "$AGENT_ROOT/../.." && pwd)"
BIND_ROOT="$(cd "$MASTER_ROOT/../bind" && pwd)"

# Scratch areas under the test root.
PRIVATE_ROOT="$TEST_ROOT/private"
TMP_ROOT="$TEST_ROOT/tmp"

# Per-service directories under the private tree.
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
AGENT_CONFIG_DIR="$PRIVATE_ROOT/argus/agent/$AGENT_HOSTNAME"
AGENT_HEALTH_DIR="$AGENT_CONFIG_DIR/health"
MASTER_PRIVATE_DIR="$PRIVATE_ROOT/argus/master"
METRIC_PRIVATE_DIR="$PRIVATE_ROOT/argus/metric/prometheus"
DNS_DIR="$PRIVATE_ROOT/argus/etc"

# Bind image tag; the caller's environment wins over the default.
BIND_IMAGE_TAG="${BIND_IMAGE_TAG:-argus-bind9:latest}"
|
|
||||||
|
|
||||||
# Abort the script unless a Docker image is available locally.
# Arguments: $1 - image reference (name:tag)
# Outputs:   error message on stderr when the image is missing
# Returns:   0 when the image exists; otherwise exits the shell with status 1
ensure_image() {
  local image="$1"
  if docker image inspect "$image" >/dev/null 2>&1; then
    return 0
  fi
  echo "[ERROR] Docker image '$image' 未找到,请先运行统一构建脚本 (例如 ./build/build_images.sh) 生成所需镜像" >&2
  exit 1
}
|
|
||||||
|
|
||||||
# Create every host directory the test needs before the stack starts.
mkdir -p \
  "$AGENT_CONFIG_DIR" \
  "$AGENT_HEALTH_DIR" \
  "$MASTER_PRIVATE_DIR" \
  "$METRIC_PRIVATE_DIR" \
  "$TMP_ROOT" \
  "$DNS_DIR"

touch "$AGENT_HEALTH_DIR/.keep"

# Stage the update-dns.sh shipped by the bind module, imitating how it is
# delivered in production. A missing script only produces a warning.
dns_update_src="$BIND_ROOT/build/update-dns.sh"
if [[ ! -f "$dns_update_src" ]]; then
  echo "[WARN] bind update script missing at $BIND_ROOT/build/update-dns.sh"
else
  cp "$dns_update_src" "$DNS_DIR/update-dns.sh"
  chmod +x "$DNS_DIR/update-dns.sh"
fi

# Both images must already be built; ensure_image exits otherwise.
for required_image in "argus-master:latest" "$BIND_IMAGE_TAG"; do
  ensure_image "$required_image"
done

AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"

# Build the agent binary from the agent root; the subshell keeps the
# caller's working directory untouched.
(
  cd "$AGENT_ROOT"
  ./scripts/build_binary.sh
)

[[ -x "$AGENT_BINARY" ]] || {
  echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
  exit 1
}

# Hand the binary path and bind image tag to later stages through files.
echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
echo "$BIND_IMAGE_TAG" > "$TMP_ROOT/bind_image_tag"

echo "[INFO] Agent E2E bootstrap complete"
|
|
||||||
@ -1,53 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Prepares the environment for starting the compose stack: loads the build
# user, writes the compose .env file and validates bootstrap artefacts.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"

TMP_ROOT="$TEST_ROOT/tmp"
ENV_FILE="$TEST_ROOT/.env"

# load_build_user comes from the sourced helper.
# NOTE(review): presumably it sets ARGUS_BUILD_UID/GID — confirm in build_user.sh.
source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user
export ARGUS_BUILD_UID ARGUS_BUILD_GID

# Persist the UID/GID pair to the .env file next to the compose file.
printf 'ARGUS_BUILD_UID=%s\nARGUS_BUILD_GID=%s\n' \
  "$ARGUS_BUILD_UID" "$ARGUS_BUILD_GID" > "$ENV_FILE"

[[ -f "$TMP_ROOT/agent_binary_path" ]] || {
  echo "[ERROR] Agent binary path missing; run 01_bootstrap.sh first" >&2
  exit 1
}

AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
[[ -x "$AGENT_BINARY" ]] || {
  echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
  exit 1
}

# Prefer the bind image tag recorded on disk; fall back to the default.
if [[ -f "$TMP_ROOT/bind_image_tag" ]]; then
  BIND_IMAGE_TAG_VALUE="$(cat "$TMP_ROOT/bind_image_tag")"
else
  BIND_IMAGE_TAG_VALUE="argus-bind9:latest"
fi
|
|
||||||
|
|
||||||
# Run a Compose command with whichever CLI is available.
# Arguments: passed through unchanged to the Compose CLI
# Uses the "docker compose" plugin when "docker compose version" succeeds,
# otherwise falls back to the standalone docker-compose binary.
# Returns:   exit status of the Compose command that was run
compose() {
  if ! docker compose version >/dev/null 2>&1; then
    docker-compose "$@"
    return
  fi
  docker compose "$@"
}
|
|
||||||
|
|
||||||
# Best-effort removal of leftovers from an earlier run; failures are ignored
# on purpose because the containers/network usually do not exist.
stale_containers=(
  argus-agent-e2e
  argus-agent-env-e2e
  argus-master-agent-e2e
  argus-bind-agent-e2e
)
docker container rm -f "${stale_containers[@]}" >/dev/null 2>&1 || true

docker network rm tests_default >/dev/null 2>&1 || true

# Compose is run from the test root (subshell, so the caller's cwd is kept).
(
  cd "$TEST_ROOT"
  compose down --remove-orphans || true
  BIND_IMAGE_TAG="$BIND_IMAGE_TAG_VALUE" compose up -d
)

echo "[INFO] Master+Agent stack started"
|
|
||||||
@ -1,106 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Waits for both agents to show up in the master's node list, then checks the
# node.json files they wrote and records ids / initial IP for later stages.
# Outputs (under TMP_ROOT): nodes_list.json, node_id, node_id_host_abc,
#                           initial_detail.json, initial_ip
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"
NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json"
ENV_NODE_FILE="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/node.json"

mkdir -p "$TMP_ROOT"

primary_node_id=""
env_node_id=""
# Poll up to 30 times, 2 seconds apart.
for _ in {1..30}; do
  sleep 2
  # curl failures (master not up yet) yield an empty response and a retry.
  response=$(curl -sS "$API_BASE/nodes" || true)
  if [[ -z "$response" ]]; then
    continue
  fi
  list_file="$TMP_ROOT/nodes_list.json"
  echo "$response" > "$list_file"
  # The helper prints two lines: primary id, then env-agent id (either may
  # be empty when the node is not listed yet).
  readarray -t node_ids < <(python3 - "$list_file" "$AGENT_HOSTNAME" "$ENV_AGENT_HOSTNAME" <<'PY'
import json, sys

with open(sys.argv[1]) as handle:
    nodes = json.load(handle)

target_primary = sys.argv[2]
target_env = sys.argv[3]

primary_id = ""
env_id = ""

for node in nodes:
    if node.get("name") == target_primary:
        primary_id = node.get("id", "")
    if node.get("name") == target_env:
        env_id = node.get("id", "")

print(primary_id)
print(env_id)
PY
)

  # If the helper failed (e.g. the response was not a JSON list) it prints
  # nothing and node_ids is empty. The ":-" defaults keep "set -u" from
  # aborting so the loop simply retries.
  primary_node_id="${node_ids[0]:-}"
  env_node_id="${node_ids[1]:-}"

  if [[ -n "$primary_node_id" && -n "$env_node_id" ]]; then
    break
  fi
done

if [[ -z "$primary_node_id" ]]; then
  echo "[ERROR] Primary agent did not register within timeout" >&2
  exit 1
fi

if [[ -z "$env_node_id" ]]; then
  echo "[ERROR] Env-variable agent did not register within timeout" >&2
  exit 1
fi

echo "$primary_node_id" > "$TMP_ROOT/node_id"
echo "$env_node_id" > "$TMP_ROOT/node_id_host_abc"

if [[ ! -f "$NODE_FILE" ]]; then
  echo "[ERROR] node.json not created at $NODE_FILE" >&2
  exit 1
fi

# node.json must carry a non-empty "id".
python3 - "$NODE_FILE" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert "id" in node and node["id"], "node.json missing id"
PY

if [[ ! -f "$ENV_NODE_FILE" ]]; then
  echo "[ERROR] node.json not created at $ENV_NODE_FILE" >&2
  exit 1
fi

python3 - "$ENV_NODE_FILE" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
assert "id" in node and node["id"], "env agent node.json missing id"
PY

# Record the IP the master currently has for the primary agent.
detail_file="$TMP_ROOT/initial_detail.json"
curl -sS "$API_BASE/nodes/$primary_node_id" -o "$detail_file"
python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY'
import json, sys, pathlib
with open(sys.argv[1]) as handle:
    node = json.load(handle)
ip = node["meta_data"].get("ip")
if not ip:
    raise SystemExit("meta_data.ip missing")
pathlib.Path(sys.argv[2]).write_text(ip)
PY

echo "[INFO] Agent registered with node id $primary_node_id"
echo "[INFO] Env-variable agent registered with node id $env_node_id"
|
|
||||||
@ -1,22 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Writes two fixed "healthy" JSON reports into the primary agent's health
# directory.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/dev-e2euser-e2einst-pod-0/health"

# Both components get the same payload; only the file name differs.
for component in log-fluentbit metric-node-exporter; do
  cat > "$HEALTH_DIR/$component.json" <<JSON
{
  "status": "healthy",
  "timestamp": "2023-10-05T12:05:00Z"
}
JSON
done

echo "[INFO] Health files written"
|
|
||||||
@ -1,60 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Prepares in-container verification of the agents: checks the primary
# container is up, makes sure curl and jq exist in it, and confirms the
# verification script is present on the host.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
VERIFY_SCRIPT="$REPO_ROOT/scripts/agent_deployment_verify.sh"
ENV_NODE_ID_FILE="$TEST_ROOT/tmp/node_id_host_abc"
PRIMARY_CONTAINER="argus-agent-e2e"
ENV_CONTAINER="argus-agent-env-e2e"
PRIMARY_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_HOSTNAME="host_abc"

# Without a running primary container there is nothing to verify; this is
# treated as a skip (exit 0), not a failure.
docker ps --format '{{.Names}}' | grep -q "^${PRIMARY_CONTAINER}$" || {
  echo "[WARN] agent container not running; skip verification"
  exit 0
}

# Install curl/jq in the container when absent. Installation failures are
# tolerated ("|| true").
if ! docker exec -i "$PRIMARY_CONTAINER" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
  echo "[INFO] Installing curl/jq in agent container"
  docker exec -i "$PRIMARY_CONTAINER" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
else
  echo "[INFO] curl/jq already installed in agent container"
fi

[[ -f "$VERIFY_SCRIPT" ]] || {
  echo "[ERROR] Verification script missing at $VERIFY_SCRIPT" >&2
  exit 1
}
|
|
||||||
|
|
||||||
# Run the deployment verification script inside one agent container.
# Arguments: $1 - container name
#            $2 - hostname handed to the script as VERIFY_HOSTNAME
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 (with a warning) when the container is not running; exits the
#            shell with status 1 when the script is missing in the container;
#            otherwise the status of the verification script
run_verifier() {
  local container="$1" hostname="$2"

  docker ps --format '{{.Names}}' | grep -q "^${container}$" || {
    echo "[WARN] container $container not running; skip"
    return
  }

  docker exec -i "$container" bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null' || {
    echo "[ERROR] /usr/local/bin/agent_deployment_verify.sh missing in $container" >&2
    exit 1
  }

  echo "[INFO] Running verification for $hostname in $container"
  docker exec -i "$container" env VERIFY_HOSTNAME="$hostname" /usr/local/bin/agent_deployment_verify.sh
}
|
|
||||||
|
|
||||||
# Verify the primary agent first.
run_verifier "$PRIMARY_CONTAINER" "$PRIMARY_HOSTNAME"

# The env-driven agent is verified only when its container is running.
if ! docker ps --format '{{.Names}}' | grep -q "^${ENV_CONTAINER}$"; then
  echo "[WARN] env-driven agent container not running; skip secondary verification"
else
  # Same tool check as for the primary container; install is best-effort.
  if ! docker exec -i "$ENV_CONTAINER" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
    echo "[INFO] Installing curl/jq in env agent container"
    docker exec -i "$ENV_CONTAINER" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
  else
    echo "[INFO] curl/jq already installed in env agent container"
  fi
  run_verifier "$ENV_CONTAINER" "$ENV_HOSTNAME"
fi
|
|
||||||
@ -1,78 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Asserts, via the master API and the generated nodes.json, that the primary
# agent is online with both health entries and that the env-driven agent
# reports the expected metadata.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
ENV_NODE_ID="$(cat "$TMP_ROOT/node_id_host_abc")"
ENV_HOSTNAME="host_abc"
NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"

success=false
detail_file="$TMP_ROOT/agent_status_detail.json"
# Poll up to 20 times, 2 seconds apart. A failed fetch or an unmet condition
# (Python exits non-zero) just moves on to the next attempt.
for _ in {1..20}; do
  sleep 2
  if curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file" &&
    python3 - "$detail_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
if node["status"] != "online":
    raise SystemExit(1)
health = node.get("health", {})
if "log-fluentbit" not in health or "metric-node-exporter" not in health:
    raise SystemExit(1)
PY
  then
    success=true
    break
  fi
done

if [[ "$success" != true ]]; then
  echo "[ERROR] Node did not report health data in time" >&2
  exit 1
fi

[[ -f "$NODES_JSON" ]] || {
  echo "[ERROR] nodes.json missing at $NODES_JSON" >&2
  exit 1
}

# nodes.json must list both node ids and contain at least two entries.
python3 - "$NODES_JSON" "$NODE_ID" "$ENV_NODE_ID" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    nodes = json.load(handle)

expected_primary = sys.argv[2]
expected_env = sys.argv[3]

ids = {entry.get("node_id") for entry in nodes}
assert expected_primary in ids, nodes
assert expected_env in ids, nodes
assert len(nodes) >= 2, nodes
PY

echo "[INFO] Master reflects agent health and nodes.json entries"

# The env-driven agent must report its name plus env/user/instance metadata.
env_detail_file="$TMP_ROOT/env_agent_detail.json"
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"
python3 - "$env_detail_file" "$ENV_HOSTNAME" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)

expected_name = sys.argv[2]

assert node.get("name") == expected_name, node
meta = node.get("meta_data", {})
assert meta.get("env") == "prod", meta
assert meta.get("user") == "ml", meta
assert meta.get("instance") == "node-3", meta
PY

echo "[INFO] Env-variable agent reports expected metadata"
|
|
||||||
@ -1,254 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Preamble of the agent restart / re-registration test: loads the recorded
# node ids, validates required files and resolves the UID/GID for the agents.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc"
[[ -f "$ENV_NODE_ID_FILE" ]] || {
  echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2
  exit 1
}

ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"
NETWORK_NAME="tests_default"
# Addresses the agents are restarted with.
NEW_AGENT_IP="172.28.0.200"
NEW_ENV_AGENT_IP="172.28.0.210"
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
ENV_FILE="$TEST_ROOT/.env"

# The restart scenario needs the same entrypoint script as the initial start
# so that the DNS registration logic stays consistent.
[[ -f "$ENTRYPOINT_SCRIPT" ]] || {
  echo "[ERROR] agent entrypoint script missing at $ENTRYPOINT_SCRIPT" >&2
  exit 1
}

[[ -f "$VERIFY_SCRIPT" ]] || {
  echo "[ERROR] agent verification script missing at $VERIFY_SCRIPT" >&2
  exit 1
}

[[ -f "$TMP_ROOT/agent_binary_path" ]] || {
  echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
  exit 1
}

AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
[[ -x "$AGENT_BINARY" ]] || {
  echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
  exit 1
}

# Take the UID/GID from the .env file when it exists ("set -a" exports what
# it defines); otherwise fall back to the shared build-user helper.
if [[ ! -f "$ENV_FILE" ]]; then
  REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
  # shellcheck disable=SC1090
  source "$REPO_ROOT/scripts/common/build_user.sh"
  load_build_user
else
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi

AGENT_UID="${ARGUS_BUILD_UID:-2133}"
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
|
|
||||||
|
|
||||||
# Dispatch a Compose command to the available CLI.
# Arguments: forwarded verbatim
# "docker compose" is used when its "version" subcommand succeeds; otherwise
# the standalone docker-compose executable is invoked.
# Returns:   exit status of the invoked Compose command
compose() {
  if ! docker compose version >/dev/null 2>&1; then
    docker-compose "$@"
    return
  fi
  docker compose "$@"
}
|
|
||||||
|
|
||||||
# Print the "last_updated" field (empty string when absent) of a node JSON file.
# Arguments: $1 - path to the JSON file
snapshot_last_updated() {
  python3 -c '
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node.get("last_updated", ""))
' "$1"
}

# Print meta_data.ip (empty string when absent) of a node JSON file.
# Arguments: $1 - path to the JSON file
snapshot_meta_ip() {
  python3 -c '
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
print(node["meta_data"].get("ip", ""))
' "$1"
}

# Snapshot the primary agent as the master sees it before the restart.
before_file="$TMP_ROOT/before_restart.json"
curl -sS "$API_BASE/nodes/$NODE_ID" -o "$before_file"
prev_last_updated=$(snapshot_last_updated "$before_file")
prev_ip=$(snapshot_meta_ip "$before_file")

# The IP must still equal the one recorded in TMP_ROOT/initial_ip.
initial_ip=$(cat "$TMP_ROOT/initial_ip")
if [[ "$prev_ip" != "$initial_ip" ]]; then
  echo "[ERROR] Expected initial IP $initial_ip, got $prev_ip" >&2
  exit 1
fi

# Same snapshot for the env-driven agent.
env_before_file="$TMP_ROOT/env_before_restart.json"
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file"
env_prev_last_updated=$(snapshot_last_updated "$env_before_file")
env_prev_ip=$(snapshot_meta_ip "$env_before_file")

# Stop and remove both agent services through compose (run from the test
# root), then force-remove the containers by name in case any remain.
(
  cd "$TEST_ROOT"
  compose rm -sf agent
  compose rm -sf agent_env
)

docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true

# Host directories that get bind-mounted into the restarted containers.
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$AGENT_DIR/health"

ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME"
ENV_HEALTH_DIR="$ENV_AGENT_DIR/health"
|
|
||||||
|
|
||||||
# Original note (translated): start the container in "sleep" mode first so
# that the network state at registration time is under our control.
# NOTE(review): the command below launches the regular entrypoint rather than
# a sleep, so this note may be stale — confirm.
primary_run_args=(
  -d
  --name argus-agent-e2e
  --hostname "$AGENT_HOSTNAME"
  --network "$NETWORK_NAME"
  --ip "$NEW_AGENT_IP"
  -v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME"
  -v "$HEALTH_DIR:/private/argus/agent/$AGENT_HOSTNAME/health"
  -v "$TEST_ROOT/private/argus/etc:/private/argus/etc"
  -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro"
  -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro"
  -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro"
  -e MASTER_ENDPOINT=http://master.argus.com:3000
  -e REPORT_INTERVAL_SECONDS=2
  -e ARGUS_BUILD_UID="$AGENT_UID"
  -e ARGUS_BUILD_GID="$AGENT_GID"
  --entrypoint /usr/local/bin/agent-entrypoint.sh
)
if ! docker run "${primary_run_args[@]}" ubuntu:22.04 >/dev/null; then
  echo "[ERROR] Failed to start agent container with custom IP" >&2
  exit 1
fi

success=false
detail_file="$TMP_ROOT/post_restart.json"
# Poll up to 20 times, 3 seconds apart, until the master shows the same node
# id with the new IP and a changed last_updated value.
for _ in {1..20}; do
  sleep 3
  if curl -sS "$API_BASE/nodes/$NODE_ID" -o "$detail_file" &&
    python3 - "$detail_file" "$prev_last_updated" "$NODE_ID" "$prev_ip" "$NEW_AGENT_IP" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
    node = json.load(handle)
prev_last_updated = sys.argv[2]
expected_id = sys.argv[3]
old_ip = sys.argv[4]
expected_ip = sys.argv[5]
last_updated = node.get("last_updated")
current_ip = node["meta_data"].get("ip")
assert node["id"] == expected_id
if current_ip != expected_ip:
    raise SystemExit(1)
if current_ip == old_ip:
    raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
    raise SystemExit(1)
PY
  then
    success=true
    break
  fi
done

if [[ "$success" != true ]]; then
  echo "[ERROR] Agent did not report expected new IP $NEW_AGENT_IP after restart" >&2
  exit 1
fi

echo "[INFO] Agent restart produced successful re-registration with IP change"
|
|
||||||
|
|
||||||
# ---- Restart env-driven agent without metadata environment variables ----
|
|
||||||
|
|
||||||
if [[ ! -d "$ENV_AGENT_DIR" ]]; then
|
|
||||||
echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -d "$ENV_HEALTH_DIR" ]]; then
|
|
||||||
mkdir -p "$ENV_HEALTH_DIR"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! docker run -d \
|
|
||||||
--name argus-agent-env-e2e \
|
|
||||||
--hostname "$ENV_AGENT_HOSTNAME" \
|
|
||||||
--network "$NETWORK_NAME" \
|
|
||||||
--ip "$NEW_ENV_AGENT_IP" \
|
|
||||||
-v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \
|
|
||||||
-v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \
|
|
||||||
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
|
|
||||||
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
|
|
||||||
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
|
|
||||||
-v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
|
|
||||||
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
|
|
||||||
-e REPORT_INTERVAL_SECONDS=2 \
|
|
||||||
-e ARGUS_BUILD_UID="$AGENT_UID" \
|
|
||||||
-e ARGUS_BUILD_GID="$AGENT_GID" \
|
|
||||||
--entrypoint /usr/local/bin/agent-entrypoint.sh \
|
|
||||||
ubuntu:22.04 >/dev/null; then
|
|
||||||
echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
env_success=false
|
|
||||||
env_detail_file="$TMP_ROOT/env_post_restart.json"
|
|
||||||
for _ in {1..20}; do
|
|
||||||
sleep 3
|
|
||||||
if ! curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY'
|
|
||||||
import json, sys
|
|
||||||
with open(sys.argv[1]) as handle:
|
|
||||||
node = json.load(handle)
|
|
||||||
prev_last_updated = sys.argv[2]
|
|
||||||
expected_id = sys.argv[3]
|
|
||||||
old_ip = sys.argv[4]
|
|
||||||
expected_ip = sys.argv[5]
|
|
||||||
last_updated = node.get("last_updated")
|
|
||||||
current_ip = node["meta_data"].get("ip")
|
|
||||||
meta = node.get("meta_data", {})
|
|
||||||
assert node["id"] == expected_id
|
|
||||||
if current_ip != expected_ip:
|
|
||||||
raise SystemExit(1)
|
|
||||||
if current_ip == old_ip:
|
|
||||||
raise SystemExit(1)
|
|
||||||
if not last_updated or last_updated == prev_last_updated:
|
|
||||||
raise SystemExit(1)
|
|
||||||
if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3":
|
|
||||||
raise SystemExit(1)
|
|
||||||
PY
|
|
||||||
then
|
|
||||||
env_success=true
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ "$env_success" != true ]]; then
|
|
||||||
echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[INFO] Env-driven agent restart succeeded with persisted metadata"
|
|
||||||
@ -1,36 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
ENV_FILE="$TEST_ROOT/.env"
|
|
||||||
|
|
||||||
compose() {
|
|
||||||
if docker compose version >/dev/null 2>&1; then
|
|
||||||
docker compose "$@"
|
|
||||||
else
|
|
||||||
docker-compose "$@"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
docker container rm -f argus-agent-e2e argus-agent-env-e2e >/dev/null 2>&1 || true
|
|
||||||
|
|
||||||
pushd "$TEST_ROOT" >/dev/null
|
|
||||||
compose down --remove-orphans
|
|
||||||
popd >/dev/null
|
|
||||||
|
|
||||||
if [[ -d "$TEST_ROOT/private" ]]; then
|
|
||||||
docker run --rm \
|
|
||||||
-v "$TEST_ROOT/private:/target" \
|
|
||||||
ubuntu:24.04 \
|
|
||||||
chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
|
|
||||||
rm -rf "$TEST_ROOT/private"
|
|
||||||
fi
|
|
||||||
|
|
||||||
rm -rf "$TEST_ROOT/tmp"
|
|
||||||
|
|
||||||
if [[ -f "$ENV_FILE" ]]; then
|
|
||||||
rm -f "$ENV_FILE"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[INFO] Agent E2E environment cleaned up"
|
|
||||||
@ -1,79 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
LOG_PREFIX="[AGENT-ENTRYPOINT]"
|
|
||||||
DNS_SCRIPT="/private/argus/etc/update-dns.sh"
|
|
||||||
DNS_CONF="/private/argus/etc/dns.conf"
|
|
||||||
TARGET_DOMAIN="master.argus.com"
|
|
||||||
AGENT_UID="${ARGUS_BUILD_UID:-2133}"
|
|
||||||
AGENT_GID="${ARGUS_BUILD_GID:-2015}"
|
|
||||||
AGENT_HOSTNAME="${HOSTNAME:-unknown}"
|
|
||||||
AGENT_DATA_DIR="/private/argus/agent/${AGENT_HOSTNAME}"
|
|
||||||
AGENT_HEALTH_DIR="${AGENT_DATA_DIR}/health"
|
|
||||||
RUNTIME_GROUP="argusagent"
|
|
||||||
RUNTIME_USER="argusagent"
|
|
||||||
|
|
||||||
log() {
|
|
||||||
echo "${LOG_PREFIX} $*"
|
|
||||||
}
|
|
||||||
|
|
||||||
mkdir -p "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR"
|
|
||||||
chown -R "$AGENT_UID:$AGENT_GID" "$AGENT_DATA_DIR" "$AGENT_HEALTH_DIR" 2>/dev/null || true
|
|
||||||
chown -R "$AGENT_UID:$AGENT_GID" "/private/argus/etc" 2>/dev/null || true
|
|
||||||
|
|
||||||
if ! getent group "$AGENT_GID" >/dev/null 2>&1; then
|
|
||||||
groupadd -g "$AGENT_GID" "$RUNTIME_GROUP"
|
|
||||||
else
|
|
||||||
RUNTIME_GROUP="$(getent group "$AGENT_GID" | cut -d: -f1)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! getent passwd "$AGENT_UID" >/dev/null 2>&1; then
|
|
||||||
useradd -u "$AGENT_UID" -g "$AGENT_GID" -M -s /bin/bash "$RUNTIME_USER"
|
|
||||||
else
|
|
||||||
RUNTIME_USER="$(getent passwd "$AGENT_UID" | cut -d: -f1)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
log "运行用户: $RUNTIME_USER ($AGENT_UID:$AGENT_GID)"
|
|
||||||
|
|
||||||
# 中文提示:等待 bind 下发的 update-dns.sh 脚本
|
|
||||||
for _ in {1..30}; do
|
|
||||||
if [[ -x "$DNS_SCRIPT" ]]; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
log "等待 update-dns.sh 准备就绪..."
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ -x "$DNS_SCRIPT" ]]; then
|
|
||||||
log "执行 update-dns.sh 更新容器 DNS"
|
|
||||||
while true; do
|
|
||||||
if "$DNS_SCRIPT"; then
|
|
||||||
log "update-dns.sh 执行成功"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
log "update-dns.sh 执行失败,3 秒后重试"
|
|
||||||
sleep 3
|
|
||||||
done
|
|
||||||
else
|
|
||||||
log "未获取到 update-dns.sh,使用镜像默认 DNS"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 中文提示:记录当前 dns.conf 内容,便于排查
|
|
||||||
if [[ -f "$DNS_CONF" ]]; then
|
|
||||||
log "dns.conf 内容: $(tr '\n' ' ' < "$DNS_CONF")"
|
|
||||||
else
|
|
||||||
log "dns.conf 暂未生成"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 中文提示:尝试解析 master 域名,失败不阻塞但会打日志
|
|
||||||
for _ in {1..30}; do
|
|
||||||
if getent hosts "$TARGET_DOMAIN" >/dev/null 2>&1; then
|
|
||||||
MASTER_IP=$(getent hosts "$TARGET_DOMAIN" | awk '{print $1}' | head -n 1)
|
|
||||||
log "master.argus.com 解析成功: $MASTER_IP"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
|
|
||||||
log "启动 argus-agent"
|
|
||||||
exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
|
|
||||||
@ -1,151 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import unittest
|
|
||||||
from contextlib import contextmanager
|
|
||||||
from unittest.mock import patch
|
|
||||||
|
|
||||||
from app.config import AgentConfig, load_config
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
|
||||||
def temp_env(**overrides: str | None):
|
|
||||||
originals: dict[str, str | None] = {}
|
|
||||||
try:
|
|
||||||
for key, value in overrides.items():
|
|
||||||
originals[key] = os.environ.get(key)
|
|
||||||
if value is None:
|
|
||||||
os.environ.pop(key, None)
|
|
||||||
else:
|
|
||||||
os.environ[key] = value
|
|
||||||
yield
|
|
||||||
finally:
|
|
||||||
for key, original in originals.items():
|
|
||||||
if original is None:
|
|
||||||
os.environ.pop(key, None)
|
|
||||||
else:
|
|
||||||
os.environ[key] = original
|
|
||||||
|
|
||||||
|
|
||||||
class LoadConfigMetadataTests(unittest.TestCase):
|
|
||||||
@patch("app.config.Path.mkdir")
|
|
||||||
def test_metadata_from_environment_variables(self, mock_mkdir):
|
|
||||||
with temp_env(
|
|
||||||
MASTER_ENDPOINT="http://master.local",
|
|
||||||
AGENT_HOSTNAME="dev-user-one-pod",
|
|
||||||
AGENT_ENV="prod",
|
|
||||||
AGENT_USER="ops",
|
|
||||||
AGENT_INSTANCE="node-1",
|
|
||||||
):
|
|
||||||
config = load_config()
|
|
||||||
|
|
||||||
self.assertEqual(config.environment, "prod")
|
|
||||||
self.assertEqual(config.user, "ops")
|
|
||||||
self.assertEqual(config.instance, "node-1")
|
|
||||||
mock_mkdir.assert_called()
|
|
||||||
|
|
||||||
@patch("app.config.Path.mkdir")
|
|
||||||
def test_metadata_falls_back_to_hostname(self, mock_mkdir):
|
|
||||||
with temp_env(
|
|
||||||
MASTER_ENDPOINT="http://master.local",
|
|
||||||
AGENT_HOSTNAME="qa-team-abc-pod-2",
|
|
||||||
AGENT_ENV=None,
|
|
||||||
AGENT_USER=None,
|
|
||||||
AGENT_INSTANCE=None,
|
|
||||||
):
|
|
||||||
config = load_config()
|
|
||||||
|
|
||||||
self.assertEqual(config.environment, "qa")
|
|
||||||
self.assertEqual(config.user, "team")
|
|
||||||
self.assertEqual(config.instance, "abc")
|
|
||||||
mock_mkdir.assert_called()
|
|
||||||
|
|
||||||
@patch("app.config._load_metadata_from_state", return_value=("prod", "ops", "node-1"))
|
|
||||||
@patch("app.config.Path.mkdir")
|
|
||||||
def test_metadata_from_node_state(self, mock_mkdir, mock_state):
|
|
||||||
with temp_env(
|
|
||||||
MASTER_ENDPOINT="http://master.local",
|
|
||||||
AGENT_HOSTNAME="host_abc",
|
|
||||||
AGENT_ENV=None,
|
|
||||||
AGENT_USER=None,
|
|
||||||
AGENT_INSTANCE=None,
|
|
||||||
):
|
|
||||||
config = load_config()
|
|
||||||
|
|
||||||
self.assertEqual(config.environment, "prod")
|
|
||||||
self.assertEqual(config.user, "ops")
|
|
||||||
self.assertEqual(config.instance, "node-1")
|
|
||||||
mock_state.assert_called_once()
|
|
||||||
mock_mkdir.assert_called()
|
|
||||||
|
|
||||||
@patch("app.config.Path.mkdir")
|
|
||||||
def test_partial_environment_variables_fallback(self, mock_mkdir):
|
|
||||||
with temp_env(
|
|
||||||
MASTER_ENDPOINT="http://master.local",
|
|
||||||
AGENT_HOSTNAME="stage-ml-001-node",
|
|
||||||
AGENT_ENV="prod",
|
|
||||||
AGENT_USER=None,
|
|
||||||
AGENT_INSTANCE=None,
|
|
||||||
):
|
|
||||||
config = load_config()
|
|
||||||
|
|
||||||
self.assertEqual(config.environment, "stage")
|
|
||||||
self.assertEqual(config.user, "ml")
|
|
||||||
self.assertEqual(config.instance, "001")
|
|
||||||
mock_mkdir.assert_called()
|
|
||||||
|
|
||||||
@patch("app.config.Path.mkdir")
|
|
||||||
def test_invalid_hostname_raises_error(self, mock_mkdir):
|
|
||||||
with temp_env(
|
|
||||||
MASTER_ENDPOINT="http://master.local",
|
|
||||||
AGENT_HOSTNAME="invalidhostname",
|
|
||||||
AGENT_ENV=None,
|
|
||||||
AGENT_USER=None,
|
|
||||||
AGENT_INSTANCE=None,
|
|
||||||
):
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
load_config()
|
|
||||||
|
|
||||||
mock_mkdir.assert_not_called()
|
|
||||||
|
|
||||||
|
|
||||||
class CollectMetadataTests(unittest.TestCase):
|
|
||||||
@patch("app.collector._detect_ip_address", return_value="127.0.0.1")
|
|
||||||
@patch("app.collector._detect_gpu_count", return_value=0)
|
|
||||||
@patch("app.collector._detect_memory_bytes", return_value=1024)
|
|
||||||
@patch("app.collector._detect_cpu_count", return_value=8)
|
|
||||||
def test_collect_metadata_uses_config_fields(
|
|
||||||
self,
|
|
||||||
mock_cpu,
|
|
||||||
mock_memory,
|
|
||||||
mock_gpu,
|
|
||||||
mock_ip,
|
|
||||||
):
|
|
||||||
config = AgentConfig(
|
|
||||||
hostname="dev-user-001-pod",
|
|
||||||
environment="prod",
|
|
||||||
user="ops",
|
|
||||||
instance="node-1",
|
|
||||||
node_file="/tmp/node.json",
|
|
||||||
version="1.0.0",
|
|
||||||
master_endpoint="http://master.local",
|
|
||||||
report_interval_seconds=60,
|
|
||||||
health_dir="/tmp/health",
|
|
||||||
)
|
|
||||||
|
|
||||||
from app.collector import collect_metadata
|
|
||||||
|
|
||||||
metadata = collect_metadata(config)
|
|
||||||
|
|
||||||
self.assertEqual(metadata["env"], "prod")
|
|
||||||
self.assertEqual(metadata["user"], "ops")
|
|
||||||
self.assertEqual(metadata["instance"], "node-1")
|
|
||||||
self.assertEqual(metadata["hostname"], "dev-user-001-pod")
|
|
||||||
self.assertEqual(metadata["ip"], "127.0.0.1")
|
|
||||||
self.assertEqual(metadata["cpu_number"], 8)
|
|
||||||
self.assertEqual(metadata["memory_in_bytes"], 1024)
|
|
||||||
self.assertEqual(metadata["gpu_number"], 0)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
||||||
@ -1,31 +0,0 @@
|
|||||||
# Alertmanager
|
|
||||||
|
|
||||||
## 构建
|
|
||||||
1. 首先设置构建和部署的环境变量, 在项目根目录下执行:
|
|
||||||
```bash
|
|
||||||
cp src/alert/tests/.env.example src/alert/tests/.env
|
|
||||||
```
|
|
||||||
|
|
||||||
然后找到复制出来的.env文件,修改环境变量。
|
|
||||||
|
|
||||||
2. 使用脚本构建,在项目根目录下执行:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
bash src/alert/alertmanager/build/build.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
构建成功后,会在项目根目录下生成argus-alertmanager-latest.tar
|
|
||||||
|
|
||||||
## 部署
|
|
||||||
|
|
||||||
提供docker-compose部署。在src/alert/tests目录下
|
|
||||||
```bash
|
|
||||||
docker-compose up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
## 动态配置
|
|
||||||
配置文件放在`/private/argus/alert/alertmanager/alertmanager.yml`下,修改alertmanager.yml后,调用`http://alertmanager.alert.argus.com:9093/-/reload`接口(POST)可以重新加载配置.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:9093/-/reload
|
|
||||||
```
|
|
||||||
@ -1,96 +0,0 @@
|
|||||||
# 基于 Ubuntu 24.04
|
|
||||||
FROM ubuntu:24.04
|
|
||||||
|
|
||||||
# 切换到 root 用户
|
|
||||||
USER root
|
|
||||||
|
|
||||||
# 安装必要依赖
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \
|
|
||||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# 设置 Alertmanager 版本(与本地离线包保持一致)
|
|
||||||
ARG ALERTMANAGER_VERSION=0.28.1
|
|
||||||
|
|
||||||
# 使用仓库内预置的离线包构建(无需联网)
|
|
||||||
COPY src/alert/alertmanager/build/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz /tmp/
|
|
||||||
RUN tar xvf /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz -C /tmp && \
|
|
||||||
mv /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
|
|
||||||
rm -f /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
|
|
||||||
|
|
||||||
ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
|
||||||
|
|
||||||
ARG ARGUS_BUILD_UID=2133
|
|
||||||
ARG ARGUS_BUILD_GID=2015
|
|
||||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
|
|
||||||
ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
|
||||||
|
|
||||||
RUN mkdir -p /usr/share/alertmanager && \
|
|
||||||
mkdir -p ${ALERTMANAGER_BASE_PATH} && \
|
|
||||||
mkdir -p /private/argus/etc && \
|
|
||||||
rm -rf /alertmanager && \
|
|
||||||
ln -s ${ALERTMANAGER_BASE_PATH} /alertmanager
|
|
||||||
|
|
||||||
# 创建 alertmanager 用户(可自定义 UID/GID)
|
|
||||||
# 创建 alertmanager 用户组
|
|
||||||
RUN set -eux; \
|
|
||||||
# 确保目标 GID 存在;若已被占用,直接使用该 GID(组名不限)\
|
|
||||||
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
|
|
||||||
groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
fi; \
|
|
||||||
# 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户
|
|
||||||
if ! id alertmanager >/dev/null 2>&1; then \
|
|
||||||
if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
|
||||||
# UID 已占用,则创建同名用户但不指定 UID(避免冲突),仅保证 user 存在
|
|
||||||
useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
else \
|
|
||||||
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
fi; \
|
|
||||||
else \
|
|
||||||
usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true
|
|
||||||
|
|
||||||
# 配置内网 apt 源 (如果指定了内网选项)
|
|
||||||
# Declare the build argument so that `--build-arg USE_INTRANET=...` (passed by
# the compose file) is actually visible to the RUN instructions below; without
# this ARG the variable is always empty and the intranet branches never run.
ARG USE_INTRANET=false
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
|
||||||
echo "Configuring intranet apt sources..." && \
|
|
||||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
|
||||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
|
||||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
|
||||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
# 配置部署时使用的 apt 源
|
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
|
||||||
echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 创建 supervisor 日志目录
|
|
||||||
RUN mkdir -p /var/log/supervisor
|
|
||||||
|
|
||||||
# 复制 supervisor 配置文件
|
|
||||||
COPY src/alert/alertmanager/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
|
||||||
|
|
||||||
# 复制启动脚本
|
|
||||||
COPY src/alert/alertmanager/build/start-am-supervised.sh /usr/local/bin/start-am-supervised.sh
|
|
||||||
RUN chmod +x /usr/local/bin/start-am-supervised.sh
|
|
||||||
|
|
||||||
# 复制 Alertmanager 配置文件
|
|
||||||
COPY src/alert/alertmanager/build/alertmanager.yml /etc/alertmanager/alertmanager.yml
|
|
||||||
RUN chmod +x /etc/alertmanager/alertmanager.yml
|
|
||||||
# COPY src/alert/alertmanager/build/alertmanager.yml ${ALERTMANAGER_BASE_PATH}/alertmanager.yml
|
|
||||||
|
|
||||||
# 复制 DNS 监控脚本
|
|
||||||
COPY src/alert/alertmanager/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
|
|
||||||
RUN chmod +x /usr/local/bin/dns-monitor.sh
|
|
||||||
|
|
||||||
# 保持 root 用户,由 supervisor 控制 user 切换
|
|
||||||
USER root
|
|
||||||
|
|
||||||
# 暴露端口(Alertmanager 默认端口 9093)
|
|
||||||
EXPOSE 9093
|
|
||||||
|
|
||||||
# 使用 supervisor 作为入口点
|
|
||||||
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
|
||||||
Binary file not shown.
@ -1,19 +0,0 @@
|
|||||||
global:
|
|
||||||
resolve_timeout: 5m
|
|
||||||
|
|
||||||
route:
|
|
||||||
group_by: ['alertname', 'instance'] # 分组:相同 alertname + instance 的告警合并
|
|
||||||
group_wait: 30s # 第一个告警后,等 30s 看是否有同组告警一起发
|
|
||||||
group_interval: 5m # 同组告警变化后,至少 5 分钟再发一次
|
|
||||||
repeat_interval: 3h # 相同告警,3 小时重复提醒一次
|
|
||||||
receiver: 'null'
|
|
||||||
|
|
||||||
receivers:
|
|
||||||
- name: 'null'
|
|
||||||
|
|
||||||
inhibit_rules:
|
|
||||||
- source_match:
|
|
||||||
severity: 'critical' # critical 告警存在时
|
|
||||||
target_match:
|
|
||||||
severity: 'warning' # 抑制相同 instance 的 warning 告警
|
|
||||||
equal: ['instance']
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -euo pipefail
|
|
||||||
docker pull ubuntu:24.04
|
|
||||||
|
|
||||||
source src/alert/tests/.env
|
|
||||||
|
|
||||||
docker build \
|
|
||||||
--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
|
||||||
--build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
|
|
||||||
-f src/alert/alertmanager/build/Dockerfile \
|
|
||||||
-t argus-alertmanager:latest .
|
|
||||||
|
|
||||||
docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest
|
|
||||||
@ -1,68 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# DNS监控脚本 - 每10秒检查dns.conf是否有变化
|
|
||||||
# 如果有变化则执行update-dns.sh脚本
|
|
||||||
|
|
||||||
DNS_CONF="/private/argus/etc/dns.conf"
|
|
||||||
DNS_BACKUP="/tmp/dns.conf.backup"
|
|
||||||
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
|
|
||||||
LOG_FILE="/var/log/supervisor/dns-monitor.log"
|
|
||||||
|
|
||||||
# 确保日志文件存在
|
|
||||||
touch "$LOG_FILE"
|
|
||||||
|
|
||||||
log_message() {
|
|
||||||
echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
|
|
||||||
}
|
|
||||||
|
|
||||||
log_message "DNS监控脚本启动"
|
|
||||||
|
|
||||||
while true; do
|
|
||||||
if [ -f "$DNS_CONF" ]; then
|
|
||||||
if [ -f "$DNS_BACKUP" ]; then
|
|
||||||
# 比较文件内容
|
|
||||||
if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
|
|
||||||
log_message "检测到DNS配置变化"
|
|
||||||
|
|
||||||
# 更新备份文件
|
|
||||||
cp "$DNS_CONF" "$DNS_BACKUP"
|
|
||||||
|
|
||||||
# 执行更新脚本
|
|
||||||
if [ -x "$UPDATE_SCRIPT" ]; then
|
|
||||||
log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
|
|
||||||
"$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
log_message "DNS更新脚本执行成功"
|
|
||||||
else
|
|
||||||
log_message "DNS更新脚本执行失败"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
|
|
||||||
# 第一次检测到配置文件,执行更新脚本
|
|
||||||
if [ -x "$UPDATE_SCRIPT" ]; then
|
|
||||||
log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
|
|
||||||
"$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
log_message "DNS更新脚本执行成功"
|
|
||||||
|
|
||||||
# 第一次运行,创建备份并执行更新
|
|
||||||
cp "$DNS_CONF" "$DNS_BACKUP"
|
|
||||||
log_message "创建DNS配置备份文件"
|
|
||||||
|
|
||||||
else
|
|
||||||
log_message "DNS更新脚本执行失败"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_message "警告: DNS配置文件不存在: $DNS_CONF"
|
|
||||||
fi
|
|
||||||
|
|
||||||
sleep 10
|
|
||||||
done
|
|
||||||
@ -1,22 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# 下载 Alertmanager 离线安装包到本目录,用于 Docker 构建时 COPY
|
|
||||||
# 用法:
|
|
||||||
# ./fetch-dist.sh [version]
|
|
||||||
# 示例:
|
|
||||||
# ./fetch-dist.sh 0.28.1
|
|
||||||
|
|
||||||
VER="${1:-0.28.1}"
|
|
||||||
OUT="alertmanager-${VER}.linux-amd64.tar.gz"
|
|
||||||
URL="https://github.com/prometheus/alertmanager/releases/download/v${VER}/${OUT}"
|
|
||||||
|
|
||||||
if [[ -f "$OUT" ]]; then
|
|
||||||
echo "[INFO] $OUT already exists, skip download"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[INFO] Downloading $URL"
|
|
||||||
curl -fL --retry 3 --connect-timeout 10 -o "$OUT" "$URL"
|
|
||||||
echo "[OK] Saved to $(pwd)/$OUT"
|
|
||||||
|
|
||||||
@ -1,25 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
echo "[INFO] Starting Alertmanager under supervisor..."
|
|
||||||
|
|
||||||
ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanager}
|
|
||||||
|
|
||||||
echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}"
|
|
||||||
|
|
||||||
# 使用容器内的 /etc/alertmanager/alertmanager.yml 作为配置文件,避免写入挂载卷导致的权限问题
|
|
||||||
echo "[INFO] Using /etc/alertmanager/alertmanager.yml as configuration"
|
|
||||||
|
|
||||||
|
|
||||||
# 记录容器 IP 地址
|
|
||||||
DOMAIN=alertmanager.alert.argus.com
|
|
||||||
IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')
|
|
||||||
echo "current IP: ${IP}"
|
|
||||||
echo "${IP}" > /private/argus/etc/${DOMAIN}
|
|
||||||
chmod 755 /private/argus/etc/${DOMAIN}
|
|
||||||
|
|
||||||
|
|
||||||
echo "[INFO] Starting Alertmanager process..."
|
|
||||||
|
|
||||||
# 启动 Alertmanager 主进程
|
|
||||||
exec /usr/local/alertmanager/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager --cluster.listen-address=""
|
|
||||||
@ -1,39 +0,0 @@
|
|||||||
[supervisord]
|
|
||||||
nodaemon=true
|
|
||||||
logfile=/var/log/supervisor/supervisord.log
|
|
||||||
pidfile=/var/run/supervisord.pid
|
|
||||||
user=root
|
|
||||||
|
|
||||||
[program:alertmanager]
|
|
||||||
command=/usr/local/bin/start-am-supervised.sh
|
|
||||||
user=alertmanager
|
|
||||||
stdout_logfile=/var/log/supervisor/alertmanager.log
|
|
||||||
stderr_logfile=/var/log/supervisor/alertmanager_error.log
|
|
||||||
autorestart=true
|
|
||||||
startretries=3
|
|
||||||
startsecs=10
|
|
||||||
stopwaitsecs=20
|
|
||||||
killasgroup=true
|
|
||||||
stopasgroup=true
|
|
||||||
|
|
||||||
[program:dns-monitor]
|
|
||||||
command=/usr/local/bin/dns-monitor.sh
|
|
||||||
user=root
|
|
||||||
stdout_logfile=/var/log/supervisor/dns-monitor.log
|
|
||||||
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
|
|
||||||
autorestart=true
|
|
||||||
startretries=3
|
|
||||||
startsecs=5
|
|
||||||
stopwaitsecs=10
|
|
||||||
killasgroup=true
|
|
||||||
stopasgroup=true
|
|
||||||
|
|
||||||
[unix_http_server]
|
|
||||||
file=/var/run/supervisor.sock
|
|
||||||
chmod=0700
|
|
||||||
|
|
||||||
[supervisorctl]
|
|
||||||
serverurl=unix:///var/run/supervisor.sock
|
|
||||||
|
|
||||||
[rpcinterface:supervisor]
|
|
||||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
|
||||||
@ -1,60 +0,0 @@
|
|||||||
# 告警配置
|
|
||||||
|
|
||||||
> 参考:[自定义Prometheus告警规则](https://yunlzheng.gitbook.io/prometheus-book/parti-prometheus-ji-chu/alert/prometheus-alert-rule)
|
|
||||||
|
|
||||||
在Prometheus中配置告警有两个步骤:
|
|
||||||
|
|
||||||
1. 写告警规则文件(rules文件)
|
|
||||||
2. 在prometheus.yml里加载规则,并配置Alertmanager
|
|
||||||
|
|
||||||
## 1. 编写告警规则文件
|
|
||||||
告警规则如下:
|
|
||||||
```yml
|
|
||||||
groups:
|
|
||||||
- name: example-rules
|
|
||||||
interval: 30s # 每30秒评估一次
|
|
||||||
rules:
|
|
||||||
- alert: InstanceDown
|
|
||||||
expr: up == 0
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "实例 {{ $labels.instance }} 已宕机"
|
|
||||||
description: "{{ $labels.instance }} 在 {{ $labels.job }} 中无响应超过 1 分钟。"
|
|
||||||
|
|
||||||
- alert: HighCpuUsage
|
|
||||||
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "CPU 使用率过高"
|
|
||||||
description: "实例 {{ $labels.instance }} CPU 使用率超过 80% 持续 5 分钟。"
|
|
||||||
```
|
|
||||||
|
|
||||||
其中:
|
|
||||||
|
|
||||||
- `alert`:告警规则的名称。
|
|
||||||
- `expr`:基于PromQL表达式告警触发条件,用于计算是否有时间序列满足该条件。
|
|
||||||
- `for`:评估等待时间,可选参数。用于表示只有当触发条件持续一段时间后才发送告警。在等待期间新产生告警的状态为pending。
|
|
||||||
- `labels`:自定义标签,允许用户指定要附加到告警上的一组附加标签,可以在Alertmanager中做路由和分组。
|
|
||||||
- `annotations`:用于指定一组附加信息,比如用于描述告警详细信息的文字等,annotations的内容在告警产生时会一同作为参数发送到Alertmanager。可以提供告警摘要和详细信息。
|
|
||||||
|
|
||||||
## 2. prometheus.yml里引用
|
|
||||||
在prometheus.yml中加上`rule_files`和`alerting`:
|
|
||||||
|
|
||||||
```yml
|
|
||||||
global:
|
|
||||||
[ evaluation_interval: <duration> | default = 1m ]
|
|
||||||
|
|
||||||
rule_files:
|
|
||||||
[ - <filepath_glob> ... ]
|
|
||||||
|
|
||||||
alerting:
|
|
||||||
alertmanagers:
|
|
||||||
- static_configs:
|
|
||||||
- targets:
|
|
||||||
- "alertmanager.alert.argus.com:9093" # Alertmanager 地址
|
|
||||||
|
|
||||||
```
|
|
||||||
@ -1,37 +0,0 @@
|
|||||||
groups:
|
|
||||||
- name: example-rules
|
|
||||||
interval: 30s # 每30秒评估一次
|
|
||||||
rules:
|
|
||||||
- alert: InstanceDown
|
|
||||||
expr: up == 0
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "实例 {{ $labels.instance }} 已宕机"
|
|
||||||
description: "{{ $labels.instance }} 在 {{ $labels.job }} 中无响应超过 1 分钟。"
|
|
||||||
|
|
||||||
- alert: HighCpuUsage
|
|
||||||
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "CPU 使用率过高"
|
|
||||||
description: "实例 {{ $labels.instance }} CPU 使用率超过 80% 持续 5 分钟。"
|
|
||||||
- alert: HighMemoryUsage
|
|
||||||
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "内存使用率过高"
|
|
||||||
description: "实例 {{ $labels.instance }} 内存使用率超过 80% 持续 5 分钟。"
|
|
||||||
- alert: DiskSpaceLow
|
|
||||||
expr: (node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{fstype!~"tmpfs|overlay"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} * 100 > 90
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "磁盘空间不足"
|
|
||||||
description: "实例 {{ $labels.instance }} 磁盘空间不足超过 90% 持续 10 分钟。"
|
|
||||||
@ -1,5 +0,0 @@
|
|||||||
DATA_ROOT=/home/argus/tmp/private/argus
|
|
||||||
ARGUS_BUILD_UID=1048
|
|
||||||
ARGUS_BUILD_GID=1048
|
|
||||||
|
|
||||||
USE_INTRANET=false
|
|
||||||
@ -1,5 +0,0 @@
|
|||||||
DATA_ROOT=/home/argus/tmp/private/argus
|
|
||||||
ARGUS_BUILD_UID=1048
|
|
||||||
ARGUS_BUILD_GID=1048
|
|
||||||
|
|
||||||
USE_INTRANET=false
|
|
||||||
@ -1,37 +0,0 @@
|
|||||||
services:
|
|
||||||
alertmanager:
|
|
||||||
build:
|
|
||||||
context: ../../../
|
|
||||||
dockerfile: src/alert/alertmanager/build/Dockerfile
|
|
||||||
args:
|
|
||||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
|
||||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
|
||||||
USE_INTRANET: ${USE_INTRANET:-false}
|
|
||||||
image: argus-alertmanager:latest
|
|
||||||
container_name: argus-alertmanager
|
|
||||||
environment:
|
|
||||||
- ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
|
||||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
|
||||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
|
||||||
ports:
|
|
||||||
- "${ARGUS_PORT:-9093}:9093"
|
|
||||||
volumes:
|
|
||||||
- ${DATA_ROOT:-./data}/alert/alertmanager:/private/argus/alert/alertmanager
|
|
||||||
- ${DATA_ROOT:-./data}/etc:/private/argus/etc
|
|
||||||
networks:
|
|
||||||
- argus-debug-net
|
|
||||||
restart: unless-stopped
|
|
||||||
logging:
|
|
||||||
driver: "json-file"
|
|
||||||
options:
|
|
||||||
max-size: "10m"
|
|
||||||
max-file: "3"
|
|
||||||
|
|
||||||
networks:
|
|
||||||
argus-debug-net:
|
|
||||||
driver: bridge
|
|
||||||
name: argus-debug-net
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
alertmanager_data:
|
|
||||||
driver: local
|
|
||||||
@ -1,113 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# verify_alertmanager.sh
|
|
||||||
# 用于部署后验证 Prometheus 与 Alertmanager 通信链路是否正常
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
#=============================
|
|
||||||
# 基础配置
|
|
||||||
#=============================
|
|
||||||
PROM_URL="${PROM_URL:-http://prom.metric.argus.com:9090}"
|
|
||||||
ALERT_URL="${ALERT_URL:-http://alertmanager.alert.argus.com:9093}"
|
|
||||||
# TODO: 根据实际部署环境调整规则目录
|
|
||||||
DATA_ROOT="${DATA_ROOT:-/private/argus}"
|
|
||||||
# Directory the temporary test rule is copied into (under DATA_ROOT).
# A shell assignment must not have spaces around '='; with spaces bash tries
# to run a command named RULE_DIR and the script aborts under `set -e`.
RULE_DIR="$DATA_ROOT/metric/prometheus/rules"
|
|
||||||
TMP_RULE="/tmp/test_rule.yml"
|
|
||||||
|
|
||||||
#=============================
|
|
||||||
# 辅助函数
|
|
||||||
#=============================
|
|
||||||
GREEN="\033[32m"; RED="\033[31m"; YELLOW="\033[33m"; RESET="\033[0m"
|
|
||||||
|
|
||||||
# Print an informational message to stdout, prefixed with a yellow [INFO] tag.
# Globals:   YELLOW, RESET (read)
# Arguments: $1 - message text
log_info() {
  local message="$1"
  echo -e "${YELLOW}[INFO]${RESET} ${message}"
}
|
|
||||||
# Print a success message to stdout, prefixed with a green [OK] tag.
# Globals:   GREEN, RESET (read)
# Arguments: $1 - message text
log_success() {
  local message="$1"
  echo -e "${GREEN}[OK]${RESET} ${message}"
}
|
|
||||||
# Print an error message prefixed with a red [ERROR] tag.
# The message goes to stderr so diagnostics are not mixed into captured or
# piped stdout.
# Globals:   RED, RESET (read)
# Arguments: $1 - message text
log_error() { echo -e "${RED}[ERROR]${RESET} $1" >&2; }
|
|
||||||
|
|
||||||
# Report a fatal error through log_error, then terminate with exit status 1.
# Arguments: $1 - message text
fail_exit() {
  local reason="$1"
  log_error "${reason}"
  exit 1
}
|
|
||||||
|
|
||||||
#=============================
|
|
||||||
# Step 1: 检查 Alertmanager 是否可访问
|
|
||||||
#=============================
|
|
||||||
log_info "检查 Alertmanager 状态..."
|
|
||||||
if curl -sSf "${ALERT_URL}/api/v2/status" >/dev/null 2>&1; then
|
|
||||||
log_success "Alertmanager 服务正常 (${ALERT_URL})"
|
|
||||||
else
|
|
||||||
fail_exit "无法访问 Alertmanager,请检查端口映射与容器状态。"
|
|
||||||
fi
|
|
||||||
|
|
||||||
#=============================
|
|
||||||
# Step 2: 手动发送测试告警
|
|
||||||
#=============================
|
|
||||||
log_info "发送手动测试告警..."
# Build the JSON payload first so the timestamp substitution stays inside a
# quoted context instead of being spliced unquoted into the curl argument.
manual_alert_payload=$(cat <<EOF
[
  {
    "labels": {
      "alertname": "ManualTestAlert",
      "severity": "info"
    },
    "annotations": {
      "summary": "This is a test alert from deploy verification"
    },
    "startsAt": "$(date -Iseconds)"
  }
]
EOF
)

# -f makes curl return non-zero on HTTP 4xx/5xx. A failed POST is reported and
# aborts the verification; previously it was skipped silently and the script
# could still end with the overall success message.
if curl -sSf -XPOST "${ALERT_URL}/api/v2/alerts" \
  -H "Content-Type: application/json" \
  -d "${manual_alert_payload}" >/dev/null; then
  log_success "测试告警已成功发送到 Alertmanager"
else
  fail_exit "向 Alertmanager 发送测试告警失败,请检查 ${ALERT_URL}/api/v2/alerts 是否可用。"
fi
|
|
||||||
|
|
||||||
#=============================
|
|
||||||
# Step 3: 检查 Prometheus 配置中是否包含 Alertmanager
|
|
||||||
#=============================
|
|
||||||
log_info "检查 Prometheus 是否配置了 Alertmanager..."
|
|
||||||
if curl -s "${PROM_URL}/api/v1/status/config" | grep -q "alertmanagers"; then
|
|
||||||
log_success "Prometheus 已配置 Alertmanager 目标"
|
|
||||||
else
|
|
||||||
fail_exit "Prometheus 未配置 Alertmanager,请检查 prometheus.yml"
|
|
||||||
fi
|
|
||||||
|
|
||||||
#=============================
# Step 4: create and load a temporary test alert rule
#=============================
# Remove the temporary rule files and ask Prometheus to reload (best effort).
# Installed as an EXIT trap so the always-firing test rule (expr: vector(1))
# is not left behind when a later step aborts the script.
cleanup_test_rule() {
  rm -f "${RULE_DIR}/test_rule.yml" "${TMP_RULE}"
  curl -sf -X POST "${PROM_URL}/-/reload" >/dev/null 2>&1 || true
}
trap cleanup_test_rule EXIT

log_info "创建临时测试规则 ${TMP_RULE} ..."
cat <<EOF > "${TMP_RULE}"
groups:
  - name: deploy-verify-group
    rules:
      - alert: DeployVerifyAlert
        expr: vector(1)
        labels:
          severity: warning
        annotations:
          summary: "Deployment verification alert"
EOF

mkdir -p "${RULE_DIR}"
cp "${TMP_RULE}" "${RULE_DIR}/test_rule.yml"

log_info "重载 Prometheus 以加载新规则..."
# -f: an HTTP error response from the reload endpoint counts as a failure
# instead of being reported as a successful reload.
if curl -sf -X POST "${PROM_URL}/-/reload" >/dev/null; then
  log_success "Prometheus 已重载规则"
else
  fail_exit "Prometheus reload 失败,请检查 API 可访问性。"
fi

#=============================
# Step 5: wait, then verify that Alertmanager received the alert
#=============================
log_info "等待告警触发 (约5秒)..."
sleep 5

if curl -s "${ALERT_URL}/api/v2/alerts" | grep -q "DeployVerifyAlert"; then
  log_success "Prometheus → Alertmanager 告警链路验证成功"
else
  fail_exit "未在 Alertmanager 中检测到 DeployVerifyAlert,请检查网络或配置。"
fi

#=============================
# Step 6: remove the test rule
#=============================
log_info "清理临时测试规则..."
rm -f "${RULE_DIR}/test_rule.yml" "${TMP_RULE}"
# Cleanup is handled explicitly from here on; drop the trap so the reload is
# not issued a second time on exit.
trap - EXIT

if curl -sf -X POST "${PROM_URL}/-/reload" >/dev/null; then
  log_success "Prometheus 已清理验证规则"
else
  log_error "Prometheus reload 清理失败,请手动确认。"
fi

log_success "部署验证全部通过!Prometheus ↔ Alertmanager 通信正常。"
|
|
||||||
2
src/bind/.gitignore
vendored
2
src/bind/.gitignore
vendored
@ -1,2 +0,0 @@
|
|||||||
|
|
||||||
images/
|
|
||||||
@ -1,90 +0,0 @@
|
|||||||
FROM ubuntu:22.04
|
|
||||||
|
|
||||||
# Set timezone and avoid interactive prompts
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
ENV TZ=Asia/Shanghai
|
|
||||||
|
|
||||||
# 设置构建参数
|
|
||||||
ARG USE_INTRANET=false
|
|
||||||
ARG ARGUS_BUILD_UID=2133
|
|
||||||
ARG ARGUS_BUILD_GID=2015
|
|
||||||
|
|
||||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
|
||||||
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
|
||||||
|
|
||||||
# 配置内网 apt 源 (如果指定了内网选项)
|
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
|
||||||
echo "Configuring intranet apt sources..." && \
|
|
||||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
|
||||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
|
||||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
|
||||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Update package list and install required packages
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y \
|
|
||||||
bind9 \
|
|
||||||
bind9utils \
|
|
||||||
dnsutils \
|
|
||||||
bind9-doc \
|
|
||||||
supervisor \
|
|
||||||
net-tools \
|
|
||||||
inetutils-ping \
|
|
||||||
vim \
|
|
||||||
&& apt-get clean \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# 调整 bind 用户与用户组 ID 以匹配宿主机配置
|
|
||||||
RUN set -eux; \
|
|
||||||
current_gid="$(getent group bind | awk -F: '{print $3}')"; \
|
|
||||||
if [ -z "$current_gid" ]; then \
|
|
||||||
groupadd -g "${ARGUS_BUILD_GID}" bind; \
|
|
||||||
elif [ "$current_gid" != "${ARGUS_BUILD_GID}" ]; then \
|
|
||||||
groupmod -g "${ARGUS_BUILD_GID}" bind; \
|
|
||||||
fi; \
|
|
||||||
if id bind >/dev/null 2>&1; then \
|
|
||||||
current_uid="$(id -u bind)"; \
|
|
||||||
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ]; then \
|
|
||||||
usermod -u "${ARGUS_BUILD_UID}" bind; \
|
|
||||||
fi; \
|
|
||||||
else \
|
|
||||||
useradd -m -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" bind; \
|
|
||||||
fi; \
|
|
||||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /var/cache/bind /var/lib/bind
|
|
||||||
|
|
||||||
# 配置部署时使用的apt源
|
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
|
||||||
echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create supervisor configuration directory
|
|
||||||
RUN mkdir -p /etc/supervisor/conf.d
|
|
||||||
|
|
||||||
# Copy supervisor configuration
|
|
||||||
COPY src/bind/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
|
||||||
|
|
||||||
# Copy BIND9 configuration files
|
|
||||||
COPY src/bind/build/named.conf.local /etc/bind/named.conf.local
|
|
||||||
COPY src/bind/build/db.argus.com /etc/bind/db.argus.com
|
|
||||||
|
|
||||||
# Copy startup and reload scripts
|
|
||||||
COPY src/bind/build/startup.sh /usr/local/bin/startup.sh
|
|
||||||
COPY src/bind/build/reload-bind9.sh /usr/local/bin/reload-bind9.sh
|
|
||||||
COPY src/bind/build/argus_dns_sync.sh /usr/local/bin/argus_dns_sync.sh
|
|
||||||
COPY src/bind/build/update-dns.sh /usr/local/bin/update-dns.sh
|
|
||||||
|
|
||||||
# Make scripts executable
|
|
||||||
RUN chmod +x /usr/local/bin/startup.sh /usr/local/bin/reload-bind9.sh /usr/local/bin/argus_dns_sync.sh /usr/local/bin/update-dns.sh
|
|
||||||
|
|
||||||
# Set proper ownership for BIND9 files
|
|
||||||
RUN chown bind:bind /etc/bind/named.conf.local /etc/bind/db.argus.com
|
|
||||||
|
|
||||||
# Expose DNS port
|
|
||||||
EXPOSE 53/tcp 53/udp
|
|
||||||
|
|
||||||
# Use root user as requested
|
|
||||||
USER root
|
|
||||||
|
|
||||||
# Start with startup script
|
|
||||||
CMD ["/usr/local/bin/startup.sh"]
|
|
||||||
@ -1,106 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
WATCH_DIR="/private/argus/etc"
|
|
||||||
ZONE_DB="/private/argus/bind/db.argus.com"
|
|
||||||
LOCKFILE="/var/lock/argus_dns_sync.lock"
|
|
||||||
BACKUP_DIR="/private/argus/bind/.backup"
|
|
||||||
SLEEP_SECONDS=10
|
|
||||||
RELOAD_SCRIPT="/usr/local/bin/reload-bind9.sh" # 这里放你已有脚本的路径
|
|
||||||
|
|
||||||
mkdir -p "$(dirname "$LOCKFILE")" "$BACKUP_DIR"
|
|
||||||
BACKUP_UID="${ARGUS_BUILD_UID:-2133}"
|
|
||||||
BACKUP_GID="${ARGUS_BUILD_GID:-2015}"
|
|
||||||
chown -R "$BACKUP_UID:$BACKUP_GID" "$BACKUP_DIR" 2>/dev/null || true
|
|
||||||
|
|
||||||
#######################################
# Check whether a string is a dotted-quad IPv4 address.
# Each octet must be a plain decimal number in 0..255. Octets with a leading
# zero (e.g. "010", "08") are rejected: bash arithmetic would read them as
# octal ("010" becomes 8, "08" is an evaluation error), so the accepted text
# and the checked value would disagree.
# Arguments: $1 - candidate address
# Returns:   0 if valid, 1 otherwise
#######################################
is_ipv4() {
  local ip="$1"
  local octet
  local -a octets

  [[ "$ip" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]] || return 1

  IFS='.' read -r -a octets <<<"$ip"
  for octet in "${octets[@]}"; do
    # Only the single digit "0" may start with a zero.
    [[ "$octet" == "0" || "$octet" != 0* ]] || return 1
    # 10# forces base-10 evaluation regardless of the digits.
    (( 10#$octet <= 255 )) || return 1
  done
  return 0
}
|
|
||||||
|
|
||||||
#######################################
# Print the address currently stored for an A record in the zone file.
# The owner name is compared literally (field equality), the same way
# upsert_record matches records, so a name containing dots such as
# "prom.metric" is not treated as a regular expression.
# Globals:   ZONE_DB (read)
# Arguments: $1 - record owner name
# Outputs:   address of the first "<name> IN A <addr>" line; nothing if the
#            name has no such record
#######################################
get_current_ip() {
  local name="$1"
  awk -v n="$name" '
    $1 == n && $2 == "IN" && $3 == "A" && $4 ~ /^[0-9.]+$/ {
      print $4
      exit
    }
  ' "$ZONE_DB"
}
|
|
||||||
|
|
||||||
#######################################
# Add or update the A record for a name in the zone file.
# A timestamped backup of the zone file is taken only when the file is about
# to be modified; an unchanged record leaves both the zone file and the
# backup directory untouched.
# Globals:   ZONE_DB, BACKUP_DIR, BACKUP_UID, BACKUP_GID (read)
# Arguments: $1 - record owner name
#            $2 - new IPv4 address
# Outputs:   one [ADD] / [UPDATE] / [SKIP] line on stdout; errors on stderr
# Returns:   0 if the zone file was changed, 1 otherwise
#######################################
upsert_record() {
  local name="$1"
  local new_ip="$2"
  local cur_ip ts backup

  cur_ip="$(get_current_ip "$name" || true)"

  if [[ -n "$cur_ip" && "$cur_ip" == "$new_ip" ]]; then
    echo "[SKIP] ${name} unchanged (${new_ip})"
    return 1
  fi

  # Snapshot the zone file before touching it.
  ts="$(date +%Y%m%d-%H%M%S)"
  backup="$BACKUP_DIR/db.argus.com.$ts.bak"
  cp -a "$ZONE_DB" "$backup"
  # Best effort: the ownership change may not be permitted in every setup.
  chown "$BACKUP_UID:$BACKUP_GID" "$backup" 2>/dev/null || true

  if [[ -z "$cur_ip" ]]; then
    # Ensure the file ends with a newline before appending the new record.
    if [[ -s "$ZONE_DB" ]] && [[ $(tail -c1 "$ZONE_DB" | wc -l) -eq 0 ]]; then
      echo "" >> "$ZONE_DB"
    fi
    printf "%-20s IN A %s\n" "$name" "$new_ip" >> "$ZONE_DB"
    echo "[ADD] ${name} -> ${new_ip}"
    return 0
  fi

  # Rewrite matching records into a temp file, then move it into place.
  # Failures are checked explicitly: this function is called from an `if`
  # condition, where `set -e` does not apply.
  if ! awk -v n="$name" -v ip="$new_ip" '
    {
      if ($1==n && $2=="IN" && $3=="A") {
        printf "%-20s IN A %s\n", n, ip
      } else {
        print
      }
    }
  ' "$ZONE_DB" > "${ZONE_DB}.tmp" || ! mv "${ZONE_DB}.tmp" "$ZONE_DB"; then
    rm -f "${ZONE_DB}.tmp"
    echo "[ERROR] failed to rewrite ${ZONE_DB} for ${name}" >&2
    return 1
  fi

  echo "[UPDATE] ${name}: ${cur_ip} -> ${new_ip}"
  return 0
}
|
|
||||||
|
|
||||||
while true; do
|
|
||||||
exec 9>"$LOCKFILE"
|
|
||||||
if flock -n 9; then
|
|
||||||
shopt -s nullglob
|
|
||||||
NEED_RELOAD=0
|
|
||||||
|
|
||||||
for f in "$WATCH_DIR"/*.argus.com; do
|
|
||||||
base="$(basename "$f")"
|
|
||||||
name="${base%.argus.com}"
|
|
||||||
ip="$(grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' "$f" | tail -n1 || true)"
|
|
||||||
|
|
||||||
if [[ -z "$ip" ]] || ! is_ipv4 "$ip"; then
|
|
||||||
echo "[WARN] $f 未找到有效 IPv4,跳过"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if upsert_record "$name" "$ip"; then
|
|
||||||
NEED_RELOAD=1
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [[ $NEED_RELOAD -eq 1 ]]; then
|
|
||||||
echo "[INFO] 检测到 db.argus.com 变更,执行 reload-bind9.sh"
|
|
||||||
bash "$RELOAD_SCRIPT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
flock -u 9
|
|
||||||
else
|
|
||||||
echo "[INFO] 已有同步任务在运行,跳过本轮"
|
|
||||||
fi
|
|
||||||
|
|
||||||
sleep "$SLEEP_SECONDS"
|
|
||||||
done
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
$TTL 604800
|
|
||||||
@ IN SOA ns1.argus.com. admin.argus.com. (
|
|
||||||
2 ; Serial
|
|
||||||
604800 ; Refresh
|
|
||||||
86400 ; Retry
|
|
||||||
2419200 ; Expire
|
|
||||||
604800 ) ; Negative Cache TTL
|
|
||||||
|
|
||||||
; 定义 DNS 服务器
|
|
||||||
@ IN NS ns1.argus.com.
|
|
||||||
|
|
||||||
; 定义 ns1 主机
|
|
||||||
ns1 IN A 127.0.0.1
|
|
||||||
|
|
||||||
; 定义 web 指向 12.4.5.6
|
|
||||||
web IN A 12.4.5.6
|
|
||||||
@ -1,71 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# DNS监控脚本 - 每10秒检查dns.conf是否有变化
|
|
||||||
# 如果有变化则执行update-dns.sh脚本
|
|
||||||
|
|
||||||
DNS_CONF="/private/argus/etc/dns.conf"
|
|
||||||
DNS_BACKUP="/tmp/dns.conf.backup"
|
|
||||||
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
|
|
||||||
LOG_FILE="/var/log/supervisor/dns-monitor.log"
|
|
||||||
|
|
||||||
# 确保日志文件存在
|
|
||||||
touch "$LOG_FILE"
|
|
||||||
|
|
||||||
log_message() {
|
|
||||||
echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
|
|
||||||
}
|
|
||||||
|
|
||||||
log_message "DNS监控脚本启动"
|
|
||||||
|
|
||||||
log_message "删除DNS备份文件(如果存在)"
|
|
||||||
rm -f $DNS_BACKUP
|
|
||||||
|
|
||||||
while true; do
|
|
||||||
if [ -f "$DNS_CONF" ]; then
|
|
||||||
if [ -f "$DNS_BACKUP" ]; then
|
|
||||||
# 比较文件内容
|
|
||||||
if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
|
|
||||||
log_message "检测到DNS配置变化"
|
|
||||||
|
|
||||||
# 更新备份文件
|
|
||||||
cp "$DNS_CONF" "$DNS_BACKUP"
|
|
||||||
|
|
||||||
# 执行更新脚本
|
|
||||||
if [ -x "$UPDATE_SCRIPT" ]; then
|
|
||||||
log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
|
|
||||||
"$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
log_message "DNS更新脚本执行成功"
|
|
||||||
else
|
|
||||||
log_message "DNS更新脚本执行失败"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
|
|
||||||
# 第一次检测到配置文件,执行更新脚本
|
|
||||||
if [ -x "$UPDATE_SCRIPT" ]; then
|
|
||||||
log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
|
|
||||||
"$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
log_message "DNS更新脚本执行成功"
|
|
||||||
|
|
||||||
# 第一次运行,创建备份并执行更新
|
|
||||||
cp "$DNS_CONF" "$DNS_BACKUP"
|
|
||||||
log_message "创建DNS配置备份文件"
|
|
||||||
|
|
||||||
else
|
|
||||||
log_message "DNS更新脚本执行失败"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_message "警告: DNS配置文件不存在: $DNS_CONF"
|
|
||||||
fi
|
|
||||||
|
|
||||||
sleep 10
|
|
||||||
done
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
zone "argus.com" {
|
|
||||||
type master;
|
|
||||||
file "/etc/bind/db.argus.com";
|
|
||||||
};
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
echo "Reloading BIND9 configuration..."
|
|
||||||
|
|
||||||
# Check if configuration files are valid
|
|
||||||
echo "Checking named.conf.local syntax..."
|
|
||||||
if ! named-checkconf /etc/bind/named.conf.local; then
|
|
||||||
echo "ERROR: named.conf.local has syntax errors!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Checking zone file syntax..."
|
|
||||||
if ! named-checkzone argus.com /etc/bind/db.argus.com; then
|
|
||||||
echo "ERROR: db.argus.com has syntax errors!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Reload BIND9 via supervisor
|
|
||||||
echo "Reloading BIND9 service..."
|
|
||||||
supervisorctl restart bind9
|
|
||||||
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo "BIND9 reloaded successfully!"
|
|
||||||
else
|
|
||||||
echo "ERROR: Failed to reload BIND9!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
@ -1,42 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Set /private permissions to 777 as requested
|
|
||||||
chmod 777 /private 2>/dev/null || true
|
|
||||||
|
|
||||||
# Create persistent directories for BIND9 configs and DNS sync
|
|
||||||
mkdir -p /private/argus/bind
|
|
||||||
mkdir -p /private/argus/etc
|
|
||||||
chown bind:bind /private/argus 2>/dev/null || true
|
|
||||||
chown -R bind:bind /private/argus/bind /private/argus/etc
|
|
||||||
|
|
||||||
# Copy configuration files to persistent storage if they don't exist
|
|
||||||
if [ ! -f /private/argus/bind/named.conf.local ]; then
|
|
||||||
cp /etc/bind/named.conf.local /private/argus/bind/named.conf.local
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -f /private/argus/bind/db.argus.com ]; then
|
|
||||||
cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Copy update-dns.sh to /private/argus/etc/
|
|
||||||
cp /usr/local/bin/update-dns.sh /private/argus/etc/update-dns.sh
|
|
||||||
chown bind:bind /private/argus/etc/update-dns.sh
|
|
||||||
chmod a+x /private/argus/etc/update-dns.sh
|
|
||||||
|
|
||||||
# Create symlinks to use persistent configs
|
|
||||||
ln -sf /private/argus/bind/named.conf.local /etc/bind/named.conf.local
|
|
||||||
ln -sf /private/argus/bind/db.argus.com /etc/bind/db.argus.com
|
|
||||||
|
|
||||||
# Set proper ownership
|
|
||||||
chown bind:bind /private/argus/bind/named.conf.local /private/argus/bind/db.argus.com
|
|
||||||
|
|
||||||
# Record the address reported for eth0 in dns.conf.
# The file is only rewritten when an address was actually found, so a failed
# detection does not replace the previous content with a blank line.
IP="$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}')"
echo "current IP: ${IP}"
if [ -n "${IP}" ]; then
  echo "${IP}" > /private/argus/etc/dns.conf
else
  echo "WARNING: could not determine eth0 IP address; dns.conf not updated" >&2
fi
|
|
||||||
|
|
||||||
# Create supervisor log directory
|
|
||||||
mkdir -p /var/log/supervisor
|
|
||||||
|
|
||||||
# Start supervisor
|
|
||||||
exec /usr/bin/supervisord -c /etc/supervisor/conf.d/supervisord.conf
|
|
||||||
@ -1,37 +0,0 @@
|
|||||||
[unix_http_server]
|
|
||||||
file=/var/run/supervisor.sock
|
|
||||||
chmod=0700
|
|
||||||
|
|
||||||
[supervisord]
|
|
||||||
nodaemon=true
|
|
||||||
user=root
|
|
||||||
logfile=/var/log/supervisor/supervisord.log
|
|
||||||
pidfile=/var/run/supervisord.pid
|
|
||||||
|
|
||||||
[rpcinterface:supervisor]
|
|
||||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
|
||||||
|
|
||||||
[supervisorctl]
|
|
||||||
serverurl=unix:///var/run/supervisor.sock
|
|
||||||
|
|
||||||
[program:bind9]
|
|
||||||
command=/usr/sbin/named -g -c /etc/bind/named.conf -u bind
|
|
||||||
user=bind
|
|
||||||
autostart=true
|
|
||||||
autorestart=true
|
|
||||||
stderr_logfile=/var/log/supervisor/bind9.err.log
|
|
||||||
stdout_logfile=/var/log/supervisor/bind9.out.log
|
|
||||||
priority=10
|
|
||||||
|
|
||||||
[program:argus-dns-sync]
|
|
||||||
command=/usr/local/bin/argus_dns_sync.sh
|
|
||||||
autostart=true
|
|
||||||
autorestart=true
|
|
||||||
startsecs=3
|
|
||||||
stopsignal=TERM
|
|
||||||
user=root
|
|
||||||
stdout_logfile=/var/log/argus_dns_sync.out.log
|
|
||||||
stderr_logfile=/var/log/argus_dns_sync.err.log
|
|
||||||
; 根据环境调整环境变量(可选)
|
|
||||||
; environment=RNDC_RELOAD="yes"
|
|
||||||
|
|
||||||
@ -1,31 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# update-dns.sh
|
|
||||||
# 从 /private/argus/etc/dns.conf 读取 IP,写入 /etc/resolv.conf
|
|
||||||
|
|
||||||
DNS_CONF="/private/argus/etc/dns.conf"
|
|
||||||
RESOLV_CONF="/etc/resolv.conf"
|
|
||||||
|
|
||||||
# 检查配置文件是否存在
|
|
||||||
if [ ! -f "$DNS_CONF" ]; then
|
|
||||||
echo "配置文件不存在: $DNS_CONF" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Build the resolv.conf content: one "nameserver" line for every line of
# $DNS_CONF that is neither empty nor a comment.
{
  # `|| [ -n "$ip" ]` keeps a final line that has no trailing newline.
  while IFS= read -r ip || [ -n "$ip" ]; do
    case "$ip" in
      \#*) continue ;;
      "") continue ;;
    esac
    echo "nameserver $ip"
  done < "$DNS_CONF"
} > "$RESOLV_CONF".tmp

# Do not replace resolv.conf with an empty file when no address was found.
if [ ! -s "$RESOLV_CONF".tmp ]; then
  echo "未在 $DNS_CONF 中找到任何 nameserver,保持 $RESOLV_CONF 不变" >&2
  rm -f "$RESOLV_CONF".tmp
  exit 1
fi

# Overwrite the file content in place (cat > file) instead of renaming the
# temp file over it.
# NOTE(review): presumably because /etc/resolv.conf may be a mount point that
# cannot be replaced by rename - confirm.
cat "$RESOLV_CONF".tmp > "$RESOLV_CONF"
rm -f "$RESOLV_CONF".tmp
|
|
||||||
|
|
||||||
echo "已更新 $RESOLV_CONF"
|
|
||||||
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
services:
|
|
||||||
bind9:
|
|
||||||
image: argus-bind9:latest
|
|
||||||
container_name: argus-bind9-test
|
|
||||||
ports:
|
|
||||||
- "${HOST_DNS_PORT:-1053}:53/tcp"
|
|
||||||
- "${HOST_DNS_PORT:-1053}:53/udp"
|
|
||||||
volumes:
|
|
||||||
- ./private:/private
|
|
||||||
restart: unless-stopped
|
|
||||||
networks:
|
|
||||||
- bind-test-network
|
|
||||||
|
|
||||||
networks:
|
|
||||||
bind-test-network:
|
|
||||||
driver: bridge
|
|
||||||
@ -1,118 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# End-to-end test for BIND9 DNS server
|
|
||||||
# This script runs all tests in sequence to validate the complete functionality
|
|
||||||
# Usage: ./00_e2e_test.sh
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
|
|
||||||
|
|
||||||
export HOST_DNS_PORT
|
|
||||||
|
|
||||||
echo "=========================================="
|
|
||||||
echo "BIND9 DNS Server End-to-End Test Suite"
|
|
||||||
echo "=========================================="
|
|
||||||
|
|
||||||
# Track test results
|
|
||||||
total_tests=0
|
|
||||||
passed_tests=0
|
|
||||||
failed_tests=0
|
|
||||||
|
|
||||||
#######################################
# Run one test script from $SCRIPT_DIR and record its outcome.
# Counters are updated with plain assignments: ((counter++)) returns status 1
# when the counter is 0, which aborts the script under `set -e` unless every
# caller appends `|| true`.
# Globals:   SCRIPT_DIR (read)
#            total_tests, passed_tests, failed_tests (updated)
# Arguments: $1 - step label, e.g. "TEST-01"
#            $2 - script file name inside $SCRIPT_DIR
#            $3 - human-readable description
# Returns:   0 if the script exists and exits successfully, 1 otherwise
#######################################
run_test_step() {
  local step_name="$1"
  local script_name="$2"
  local description="$3"
  local script_path="$SCRIPT_DIR/$script_name"

  echo ""
  echo "[$step_name] $description"
  printf '=%.0s' {1..50}
  printf '\n'

  total_tests=$((total_tests + 1))

  if [ ! -f "$script_path" ]; then
    echo "✗ Test script not found: $script_name"
    failed_tests=$((failed_tests + 1))
    return 1
  fi

  # Make sure script is executable
  chmod +x "$script_path"

  # Run the test
  echo "Executing: $script_path"
  if "$script_path"; then
    echo "✓ $step_name completed successfully"
    passed_tests=$((passed_tests + 1))
    return 0
  else
    echo "✗ $step_name failed"
    failed_tests=$((failed_tests + 1))
    return 1
  fi
}
|
|
||||||
|
|
||||||
# Cleanup any previous test environment (but preserve the Docker image)
|
|
||||||
echo ""
|
|
||||||
echo "[SETUP] Cleaning up any previous test environment..."
|
|
||||||
if [ -f "$SCRIPT_DIR/05_cleanup.sh" ]; then
|
|
||||||
chmod +x "$SCRIPT_DIR/05_cleanup.sh"
|
|
||||||
"$SCRIPT_DIR/05_cleanup.sh" || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Starting BIND9 DNS server end-to-end test sequence..."
|
|
||||||
|
|
||||||
# Test sequence
|
|
||||||
run_test_step "TEST-01" "01_start_container.sh" "Start BIND9 container" || true
|
|
||||||
|
|
||||||
run_test_step "TEST-02" "02_dig_test.sh" "Initial DNS resolution test" || true
|
|
||||||
|
|
||||||
run_test_step "TEST-03" "03_reload_test.sh" "Configuration reload with IP modification" || true
|
|
||||||
|
|
||||||
run_test_step "TEST-03.5" "03.5_dns_sync_test.sh" "DNS auto-sync functionality test" || true
|
|
||||||
|
|
||||||
run_test_step "TEST-04" "04_persistence_test.sh" "Configuration persistence after restart" || true
|
|
||||||
|
|
||||||
# Final cleanup (but preserve logs for review)
|
|
||||||
echo ""
|
|
||||||
echo "[CLEANUP] Cleaning up test environment..."
|
|
||||||
run_test_step "CLEANUP" "05_cleanup.sh" "Clean up containers and networks" || true
|
|
||||||
|
|
||||||
# Test summary
|
|
||||||
echo ""
|
|
||||||
echo "=========================================="
|
|
||||||
echo "TEST SUMMARY"
|
|
||||||
echo "=========================================="
|
|
||||||
echo "Total tests: $total_tests"
|
|
||||||
echo "Passed: $passed_tests"
|
|
||||||
echo "Failed: $failed_tests"
|
|
||||||
|
|
||||||
if [ $failed_tests -eq 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "✅ ALL TESTS PASSED!"
|
|
||||||
echo ""
|
|
||||||
echo "BIND9 DNS server functionality validated:"
|
|
||||||
echo " ✓ Container startup and basic functionality"
|
|
||||||
echo " ✓ DNS resolution for configured domains"
|
|
||||||
echo " ✓ Configuration modification and reload"
|
|
||||||
echo " ✓ DNS auto-sync from IP files"
|
|
||||||
echo " ✓ Configuration persistence across restarts"
|
|
||||||
echo " ✓ Cleanup and resource management"
|
|
||||||
echo ""
|
|
||||||
echo "The BIND9 DNS server is ready for production use."
|
|
||||||
exit 0
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "❌ SOME TESTS FAILED!"
|
|
||||||
echo ""
|
|
||||||
echo "Please review the test output above to identify and fix issues."
|
|
||||||
echo "You may need to:"
|
|
||||||
echo " - Check Docker installation and permissions"
|
|
||||||
echo " - Verify network connectivity"
|
|
||||||
echo " - Review BIND9 configuration files"
|
|
||||||
echo " - Check system resources and port availability"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
@ -1,42 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Start BIND9 test container
|
|
||||||
# Usage: ./01_start_container.sh
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
TEST_DIR="$(dirname "$SCRIPT_DIR")"
|
|
||||||
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
|
|
||||||
|
|
||||||
export HOST_DNS_PORT
|
|
||||||
|
|
||||||
cd "$TEST_DIR"
|
|
||||||
|
|
||||||
echo "Starting BIND9 test container..."
|
|
||||||
|
|
||||||
# Ensure private directory exists with proper permissions
|
|
||||||
mkdir -p private/argus/bind
|
|
||||||
mkdir -p private/argus/etc
|
|
||||||
chmod 777 private
|
|
||||||
|
|
||||||
# Start the container
|
|
||||||
docker compose up -d
|
|
||||||
|
|
||||||
echo "Waiting for container to be ready..."
|
|
||||||
sleep 5
|
|
||||||
|
|
||||||
# Check if container is running
|
|
||||||
if docker compose ps | grep -q "Up"; then
|
|
||||||
echo "✓ Container started successfully"
|
|
||||||
echo "Container status:"
|
|
||||||
docker compose ps
|
|
||||||
else
|
|
||||||
echo "✗ Failed to start container"
|
|
||||||
docker compose logs
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "BIND9 test environment is ready!"
|
|
||||||
echo "DNS server listening on localhost:${HOST_DNS_PORT}"
|
|
||||||
@ -1,75 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Test DNS resolution using dig
|
|
||||||
# Usage: ./02_dig_test.sh
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
HOST_DNS_PORT="${HOST_DNS_PORT:-1053}"
|
|
||||||
|
|
||||||
echo "Testing DNS resolution with dig..."
|
|
||||||
echo "Using DNS server localhost:${HOST_DNS_PORT}"
|
|
||||||
|
|
||||||
#######################################
# Query <hostname>.argus.com on the DNS server under test and compare the
# answer with the expected address.
# Globals:   HOST_DNS_PORT (read)
# Arguments: $1 - host label (without the .argus.com suffix)
#            $2 - expected IPv4 address
#            $3 - human-readable description
# Returns:   0 if the answer equals the expected address, 1 otherwise
#######################################
test_dns_query() {
  local hostname="$1"
  local expected_ip="$2"
  local description="$3"
  local result

  echo ""
  echo "Testing: $description"
  echo "Query: $hostname.argus.com"
  echo "Expected IP: $expected_ip"

  # Check dig's exit status directly, so that anything dig printed before
  # failing is not compared as if it were an answer.
  if ! result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null); then
    echo "✗ DNS query failed"
    return 1
  fi

  if [ "$result" = "$expected_ip" ]; then
    echo "✓ DNS query successful: $result"
    return 0
  else
    echo "✗ DNS query returned unexpected result: $result"
    return 1
  fi
}
|
|
||||||
|
|
||||||
# Check if dig is available
|
|
||||||
if ! command -v dig &> /dev/null; then
|
|
||||||
echo "Installing dig (dnsutils)..."
|
|
||||||
apt-get update && apt-get install -y dnsutils
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if container is running
|
|
||||||
if ! docker compose ps | grep -q "Up"; then
|
|
||||||
echo "Error: BIND9 container is not running"
|
|
||||||
echo "Please start the container first with: ./01_start_container.sh"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "=== DNS Resolution Tests ==="
|
|
||||||
|
|
||||||
# Test cases based on current configuration
failed_tests=0

# The counter is bumped with an assignment instead of ((failed_tests++)):
# the post-increment evaluates to 0 on the first failure, returns status 1,
# and under `set -e` would end the script before the remaining tests and the
# summary run.

# Test ns1.argus.com -> 127.0.0.1
if ! test_dns_query "ns1" "127.0.0.1" "Name server resolution"; then
  failed_tests=$((failed_tests + 1))
fi

# Test web.argus.com -> 12.4.5.6
if ! test_dns_query "web" "12.4.5.6" "Web server resolution"; then
  failed_tests=$((failed_tests + 1))
fi

echo ""
echo "=== Test Summary ==="
if [ "$failed_tests" -eq 0 ]; then
  echo "✓ All DNS tests passed!"
  exit 0
else
  echo "✗ $failed_tests test(s) failed"
  exit 1
fi
|
|
||||||
@ -1,259 +0,0 @@
|
|||||||
#!/bin/bash

# Exercises the DNS auto-sync feature implemented by argus_dns_sync.sh:
# IP files are written into the container and the resulting DNS records are
# checked with dig.
# Usage: ./03.5_dns_sync_test.sh

set -e

# Host port on which the DNS server is queried; defaults to 1053 when the
# caller did not provide a value.
: "${HOST_DNS_PORT:=1053}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(dirname "$SCRIPT_DIR")"

printf '%s\n' "=== DNS Auto-Sync Functionality Test ==="
printf '%s\n' "Using DNS server localhost:${HOST_DNS_PORT}"

# Refuse to continue unless `docker compose ps` lists something as "Up".
case "$(docker compose ps)" in
  *Up*)
    ;;
  *)
    echo "Error: BIND9 container is not running"
    echo "Please start the container first with: ./01_start_container.sh"
    exit 1
    ;;
esac

# dig performs every DNS lookup below; install dnsutils when it is missing.
if [ -z "$(command -v dig 2>/dev/null)" ]; then
  echo "Installing dig (dnsutils)..."
  apt-get update && apt-get install -y dnsutils
fi
|
|
||||||
|
|
||||||
#######################################
# Resolve <hostname>.argus.com against localhost and compare with an
# expected address.
# Globals:   HOST_DNS_PORT (read) - port passed to dig
# Arguments: $1 - host label (without the .argus.com suffix)
#            $2 - expected output of `dig +short`
#            $3 - human-readable description printed before the query
# Outputs:   progress and the verdict on stdout
# Returns:   0 when the answer equals $2, 1 otherwise (including dig failure)
#######################################
test_dns_query() {
  local hostname="$1"
  local expected_ip="$2"
  local description="$3"
  # Kept local so the lookup does not overwrite a caller's `result` variable.
  local result

  echo "Testing: $description"
  echo "Query: $hostname.argus.com -> Expected: $expected_ip"

  # Wait a moment for DNS cache
  sleep 2

  # Detect failure through dig's exit status. Appending a marker with
  # `|| echo` would glue it onto whatever dig already wrote to stdout.
  if ! result=$(dig @localhost -p "$HOST_DNS_PORT" "$hostname".argus.com A +short 2>/dev/null); then
    result="QUERY_FAILED"
  fi

  if [ "$result" = "$expected_ip" ]; then
    echo "✓ $result"
    return 0
  fi

  echo "✗ Got: $result, Expected: $expected_ip"
  return 1
}
|
|
||||||
|
|
||||||
#######################################
# Poll the bind9 container until the sync lock file disappears.
# While /var/lock/argus_dns_sync.lock exists the sync is reported as running.
# A failing `docker compose exec` is indistinguishable from an absent lock and
# is therefore also reported as "Sync completed".
# Arguments: $1 - optional timeout in seconds (default 15)
# Outputs:   progress messages on stdout
# Returns:   always 0; a timeout only produces a warning (best effort)
#######################################
wait_for_sync() {
  local timeout="${1:-15}"
  local poll_interval=2
  local elapsed=0

  echo "Waiting for DNS sync to complete (max ${timeout}s)..."

  while [ "$elapsed" -lt "$timeout" ]; do
    if docker compose exec bind9 test -f /var/lock/argus_dns_sync.lock; then
      echo "Sync process is running..."
    else
      echo "Sync completed"
      sleep 2 # Extra wait for DNS propagation
      return 0
    fi
    sleep "$poll_interval"
    elapsed=$((elapsed + poll_interval))
  done

  echo "Warning: Sync may still be running after ${timeout}s"
  return 0
}
|
|
||||||
|
|
||||||
echo ""
echo "Step 1: Preparing test environment..."

# Ensure required directories exist
docker compose exec bind9 mkdir -p /private/argus/etc
docker compose exec bind9 mkdir -p /private/argus/bind/.backup

# Backup original configuration if it exists. The copy stays best-effort
# (a failure does not stop the test), but it is now reported, not hidden.
if docker compose exec bind9 test -f /private/argus/bind/db.argus.com; then
  docker compose exec bind9 cp /private/argus/bind/db.argus.com /private/argus/bind/db.argus.com.backup.test \
    || echo "Warning: could not back up /private/argus/bind/db.argus.com" >&2
fi

# Ensure initial configuration is available (may already be symlinked)
if ! docker compose exec bind9 test -f /private/argus/bind/db.argus.com; then
  docker compose exec bind9 cp /etc/bind/db.argus.com /private/argus/bind/db.argus.com
fi

echo "✓ Test environment prepared"

echo ""
echo "Step 2: Testing initial DNS configuration..."

# Get current IP for web.argus.com (may have been changed by previous tests).
# Failure is taken from dig's exit status: a marker appended with `|| echo`
# would be glued onto any text dig already wrote to stdout, so the
# comparison with "UNKNOWN" below would not match and the check would pass.
if ! current_web_ip=$(dig @localhost -p "$HOST_DNS_PORT" web.argus.com A +short 2>/dev/null); then
  current_web_ip="UNKNOWN"
fi
echo "Current web.argus.com IP: $current_web_ip"

# Test that DNS is working (regardless of specific IP)
if [ "$current_web_ip" = "UNKNOWN" ] || [ -z "$current_web_ip" ]; then
  echo "DNS resolution not working for web.argus.com"
  exit 1
fi

echo "✓ DNS resolution is working"
|
|
||||||
|
|
||||||
echo ""
echo "Step 3: Creating IP files for auto-sync..."

# Announce and write one IP file into /private/argus/etc inside the bind9
# container.
#   $1 - file name (the record's fully qualified name)
#   $2 - IP address shown in the announcement
#   $3 - text written into the file
write_ip_file() {
  local record_file="$1"
  local announced_ip="$2"
  local file_body="$3"

  echo "Creating ${record_file} with IP ${announced_ip}"
  docker compose exec bind9 bash -c "echo \"${file_body}\" > /private/argus/etc/${record_file}"
}

# The three files use different layouts: a bare address, and an address
# embedded in surrounding text.
write_ip_file "test1.argus.com" "10.0.0.100" "10.0.0.100"
write_ip_file "test2.argus.com" "10.0.0.200" "test2 service running on 10.0.0.200"
write_ip_file "api.argus.com" "192.168.1.50" "API server: 192.168.1.50 port 8080"

echo "✓ IP files created"

echo ""
echo "Step 4: Checking DNS sync process..."

# Launch argus_dns_sync.sh detached only when pgrep finds no such process in
# the container.
if ! docker compose exec bind9 pgrep -f argus_dns_sync.sh > /dev/null; then
  echo "Starting DNS sync process manually..."
  docker compose exec -d bind9 /usr/local/bin/argus_dns_sync.sh
  echo "✓ DNS sync process started manually"
else
  echo "✓ DNS sync process already running (via supervisord)"
fi

# Give the first sync cycle time to finish
wait_for_sync
|
|
||||||
|
|
||||||
echo ""
echo "Step 5: Testing auto-synced DNS records..."

# Number of failed checks; decides the exit status in the summary.
# The counter is bumped with an arithmetic assignment: ((failed_tests++))
# returns status 1 when the counter is 0, which under `set -e` terminates
# the script at the first failure, before cleanup and summary can run.
failed_tests=0

# Test new DNS records created by auto-sync
if ! test_dns_query "test1" "10.0.0.100" "Auto-synced test1.argus.com"; then
  failed_tests=$((failed_tests + 1))
fi

if ! test_dns_query "test2" "10.0.0.200" "Auto-synced test2.argus.com"; then
  failed_tests=$((failed_tests + 1))
fi

if ! test_dns_query "api" "192.168.1.50" "Auto-synced api.argus.com"; then
  failed_tests=$((failed_tests + 1))
fi

# Verify original records still work (use current IP from earlier)
if ! test_dns_query "web" "$current_web_ip" "Original web.argus.com still working"; then
  failed_tests=$((failed_tests + 1))
fi

if ! test_dns_query "ns1" "127.0.0.1" "Original ns1.argus.com still working"; then
  failed_tests=$((failed_tests + 1))
fi
|
|
||||||
|
|
||||||
echo ""
echo "Step 6: Testing IP update functionality..."

# Update an existing IP file
echo "Updating test1.argus.com IP from 10.0.0.100 to 10.0.0.150"
docker compose exec bind9 bash -c 'echo "10.0.0.150" > /private/argus/etc/test1.argus.com'

# Wait for sync
wait_for_sync

# Test updated record. The failure counter uses an arithmetic assignment
# because ((failed_tests++)) returns status 1 when the counter is 0 and
# would abort the script under `set -e`.
if ! test_dns_query "test1" "10.0.0.150" "Updated test1.argus.com IP"; then
  failed_tests=$((failed_tests + 1))
fi
|
|
||||||
|
|
||||||
echo ""
echo "Step 7: Testing invalid IP handling..."

# Create file with invalid IP
echo "Creating invalid.argus.com with invalid IP"
docker compose exec bind9 bash -c 'echo "this is not an IP address" > /private/argus/etc/invalid.argus.com'

# Wait for sync (should skip invalid IP)
wait_for_sync

# Verify invalid record was not added (should fail to resolve).
# Failure is taken from dig's exit status so that text dig wrote to stdout
# before failing is not mistaken for a resolved record.
if ! result=$(dig @localhost -p "$HOST_DNS_PORT" invalid.argus.com A +short 2>/dev/null); then
  result="NO_RESULT"
fi

if [ "$result" = "NO_RESULT" ] || [ -z "$result" ]; then
  echo "✓ Invalid IP correctly ignored"
else
  echo "✗ Invalid IP was processed: $result"
  # Arithmetic assignment: ((failed_tests++)) returns status 1 when the
  # counter is 0 and would abort the script under `set -e`.
  failed_tests=$((failed_tests + 1))
fi
|
|
||||||
|
|
||||||
echo ""
echo "Step 8: Verifying backup functionality..."

# Count the entries in the backup directory inside the container.
# wc prints 0 when the listing is empty, so no extra fallback is needed.
backup_count=$(docker compose exec bind9 ls -1 /private/argus/bind/.backup/ | wc -l)

if [ "$backup_count" -gt 0 ]; then
  echo "✓ Configuration backups created ($backup_count files)"
  # Show the last line of the long listing
  docker compose exec bind9 ls -la /private/argus/bind/.backup/ | tail -1
else
  echo "✗ No backup files found"
  # Arithmetic assignment: ((failed_tests++)) returns status 1 when the
  # counter is 0 and would abort the script under `set -e`.
  failed_tests=$((failed_tests + 1))
fi
|
|
||||||
|
|
||||||
echo ""
echo "Step 9: Cleanup..."

# The sync process is left alone; the message below tells the user why.
echo "Note: DNS sync process will continue running (managed by supervisord)"

# Remove every IP file this test created
for ip_file in test1.argus.com test2.argus.com api.argus.com invalid.argus.com; do
  docker compose exec bind9 rm -f "/private/argus/etc/${ip_file}"
done

# Put the saved zone file back when a backup exists. Each step runs only if
# the previous one succeeded, and no failure here aborts the script.
if docker compose exec bind9 test -f /private/argus/bind/db.argus.com.backup.test; then
  if docker compose exec bind9 cp /private/argus/bind/db.argus.com.backup.test /private/argus/bind/db.argus.com; then
    docker compose exec bind9 rm /private/argus/bind/db.argus.com.backup.test || true
  fi
fi

# Have BIND9 pick up the restored configuration
docker compose exec bind9 /usr/local/bin/reload-bind9.sh

echo "✓ Cleanup completed"
|
|
||||||
|
|
||||||
# Final report: exit 0 with the list of validated features when nothing
# failed, otherwise exit 1 with troubleshooting hints.
printf '%s\n' "" "=== DNS Auto-Sync Test Summary ==="

if [ "$failed_tests" -eq 0 ]; then
  printf '%s\n' \
    "✅ All DNS auto-sync tests passed!" \
    "" \
    "Validated functionality:" \
    " ✓ Automatic DNS record creation from IP files" \
    " ✓ IP address extraction from various file formats" \
    " ✓ Dynamic DNS record updates" \
    " ✓ Invalid IP address handling" \
    " ✓ Configuration backup mechanism" \
    " ✓ Preservation of existing DNS records" \
    "" \
    "The DNS auto-sync functionality is working correctly!"
  exit 0
fi

printf '%s\n' \
  "❌ $failed_tests DNS auto-sync test(s) failed!" \
  "" \
  "Please check:" \
  " - argus_dns_sync.sh script configuration" \
  " - File permissions in /private/argus/etc/" \
  " - BIND9 reload functionality" \
  " - Network connectivity and DNS resolution"
exit 1
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user