dev_1.0.0_sundp_2 优化Argus-metric模块的e2e部署测试流程 (#27)

Co-authored-by: sundapeng.sdp <sundapeng@hashdata.cn>
Reviewed-on: #27
Reviewed-by: yuyr <yuyr@zgclab.edu.cn>
Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
This commit is contained in:
sundapeng 2025-10-17 17:15:55 +08:00
parent 31ccb0b1b8
commit 1a768bc837
36 changed files with 1805 additions and 695 deletions

View File

@ -4,6 +4,15 @@
set -e set -e
# PID-file guard: prevent concurrent runs of the health-check script.
PIDFILE="/var/run/check_health.pid"
# If a PID file exists and the recorded process is still alive, skip this run.
# Quote the command substitution so an empty/garbled PID file cannot word-split.
if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
    echo "健康检查脚本已在运行中,跳过本次执行" >&2
    exit 0
fi
echo $$ > "$PIDFILE"
# Single-quoted so "$PIDFILE" is expanded (and properly quoted) when the
# trap fires at exit, not when the trap is installed.
trap 'rm -f "$PIDFILE"' EXIT
# 获取脚本所在目录 # 获取脚本所在目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log" HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log"

View File

@ -200,22 +200,22 @@ parse_version_info() {
VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/')
BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/')
# 解析 artifact_list # 解析 artifact_list(跳过字段名本身)
grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -v '"artifact_list"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/')
version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/')
echo "$component:$version" >> "$TEMP_DIR/components.txt" echo "$component:$version" >> "$TEMP_DIR/components.txt"
done done
# 解析 checksums # 解析 checksums(跳过字段名本身)
grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -v '"checksums"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/')
checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/')
echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt" echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt"
done done
# 解析 install_order # 解析 install_order(跳过字段名本身,只取数组元素)
grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+"' | while read line; do grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -v '"install_order"' | grep -E '^\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/')
echo "$component" >> "$TEMP_DIR/install_order.txt" echo "$component" >> "$TEMP_DIR/install_order.txt"
done done
@ -317,6 +317,30 @@ create_install_dirs() {
log_success "安装目录创建完成: $INSTALL_DIR" log_success "安装目录创建完成: $INSTALL_DIR"
} }
# 获取系统版本
# Detects the Ubuntu release and prints the deps-directory key
# ("ubuntu20" / "ubuntu22") on stdout.
# Returns: 1 when /etc/os-release is missing; 0 otherwise.
# Unknown releases fall back to "ubuntu22" with a warning.
get_system_version() {
    if [[ ! -f /etc/os-release ]]; then
        log_error "无法检测操作系统版本"
        return 1
    fi

    # Read VERSION_ID in a subshell: sourcing /etc/os-release directly would
    # leak every os-release variable into this script — including VERSION,
    # which would clobber the installer's own $VERSION.
    local version_id
    version_id=$(source /etc/os-release && echo "${VERSION_ID:-}")

    # 提取主版本号
    case "$version_id" in
        "20.04")
            echo "ubuntu20"
            ;;
        "22.04")
            echo "ubuntu22"
            ;;
        *)
            log_warning "未识别的Ubuntu版本: ${version_id:-未知}尝试使用ubuntu22"
            echo "ubuntu22"
            ;;
    esac
}
# 安装系统依赖包 # 安装系统依赖包
install_system_deps() { install_system_deps() {
log_info "检查系统依赖包..." log_info "检查系统依赖包..."
@ -330,21 +354,43 @@ install_system_deps() {
return 0 return 0
fi fi
# 检查是否有tar.gz文件 # 获取系统版本对应的依赖目录
local system_version=$(get_system_version)
local version_deps_dir="$deps_dir/$system_version"
log_info "检测到系统版本: $system_version"
# 检查版本特定的依赖目录是否存在
if [[ ! -d "$version_deps_dir" ]]; then
log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir"
# 回退到旧的逻辑检查根deps目录
local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l) local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l)
if [[ $deps_count -eq 0 ]]; then if [[ $deps_count -eq 0 ]]; then
log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装" log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装"
return 0 return 0
fi fi
version_deps_dir="$deps_dir"
else
# 检查版本目录中是否有tar.gz文件
local deps_count=$(find "$version_deps_dir" -name "*.tar.gz" | wc -l)
if [[ $deps_count -eq 0 ]]; then
log_info "$system_version 版本目录中没有 tar.gz 文件,跳过系统依赖包安装"
return 0
fi
fi
log_info "找到 $deps_count 个系统依赖包,开始安装..." log_info "找到 $system_version 版本的依赖包,开始安装..."
# 创建临时目录用于解压依赖包 # 创建临时目录用于解压依赖包
local deps_temp_dir="$TEMP_DIR/deps" local deps_temp_dir="${TEMP_DIR:-/tmp}/deps"
mkdir -p "$deps_temp_dir" mkdir -p "$deps_temp_dir"
# 定义要检查的核心依赖
local CORE_DEPS=(jq cron curl)
local FAILED_DEPS=()
# 处理每个tar.gz文件 # 处理每个tar.gz文件
find "$deps_dir" -name "*.tar.gz" | while read tar_file; do find "$version_deps_dir" -name "*.tar.gz" | while read tar_file; do
local tar_basename=$(basename "$tar_file") local tar_basename=$(basename "$tar_file")
local extract_name="${tar_basename%.tar.gz}" local extract_name="${tar_basename%.tar.gz}"
@ -362,40 +408,61 @@ install_system_deps() {
fi fi
# 进入解压目录查找deb包 # 进入解压目录查找deb包
cd "$extract_dir" cd "$extract_dir" || continue
local deb_count=$(find . -name "*.deb" | wc -l) local deb_files=(*.deb)
if [[ ${#deb_files[@]} -gt 0 ]]; then
log_info " 找到 ${#deb_files[@]} 个 deb 包,开始安装..."
if [[ $deb_count -gt 0 ]]; then for deb in "${deb_files[@]}"; do
log_info " 找到 $deb_count 个 deb 包,开始安装..." local pkg_name
pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null)
# 1. 先尝试安装所有deb包 # 如果已安装,则跳过
log_info " 第1步批量安装deb包..." if dpkg -s "$pkg_name" &>/dev/null; then
if dpkg -i *.deb 2>/dev/null; then log_success " $pkg_name 已安装,跳过"
log_success " 所有deb包安装成功" continue
fi
# 尝试安装
log_info " 安装 $pkg_name..."
if DEBIAN_FRONTEND=noninteractive dpkg -i "$deb" &>/dev/null; then
log_success " $pkg_name 安装成功"
else else
log_warning " 部分deb包安装失败可能存在依赖问题" log_warning " $pkg_name 安装失败,尝试修复依赖..."
if DEBIAN_FRONTEND=noninteractive apt-get install -f -y &>/dev/null; then
# 2. 使用apt-get修复依赖 if dpkg -s "$pkg_name" &>/dev/null; then
log_info " 第2步修复依赖关系..." log_success " $pkg_name 修复安装成功"
if apt-get install -f -y; then
log_success " 依赖关系修复完成"
else else
log_error " 依赖关系修复失败" log_error " $pkg_name 仍未安装成功"
# 继续处理其他包,不退出 FAILED_DEPS+=("$pkg_name")
fi
else
log_error " $pkg_name 自动修复失败"
FAILED_DEPS+=("$pkg_name")
fi fi
fi fi
done
else else
log_info " $tar_basename 中没有找到deb包跳过" log_info " $tar_basename 中没有找到deb包跳过"
fi fi
# 返回到依赖临时目录 # 返回到依赖临时目录
cd "$deps_temp_dir" cd "$deps_temp_dir" || continue
done done
# 检查并启动 cron 服务 # 检查并启动 cron 服务
start_cron_service start_cron_service
log_success "系统依赖包安装完成" # 总结安装结果
if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then
log_error "以下系统依赖未能成功安装,安装终止,请手动安装后重试:"
for f in "${FAILED_DEPS[@]}"; do
echo " - $f"
done
exit 1
else
log_success "系统依赖包安装完成,全部就绪"
fi
} }
# 启动 cron 服务 # 启动 cron 服务
@ -637,6 +704,18 @@ EOF
log_success "安装记录已创建: $install_record_file" log_success "安装记录已创建: $install_record_file"
} }
# 检查cron任务是否已存在
# $1 - literal substring to look for (e.g. "check_health.sh")
# $2 - path to the crontab dump file to search
# Returns: 0 when the task is present, 1 otherwise.
check_cron_task_exists() {
    local task_pattern="$1"
    local temp_cron="$2"

    # -F treats the pattern as a fixed string, so the "." in script names
    # cannot act as a regex wildcard and match unrelated lines;
    # "--" guards against patterns that start with a dash.
    grep -Fq -- "$task_pattern" "$temp_cron"
}
# 设置健康检查定时任务 # 设置健康检查定时任务
setup_health_check_cron() { setup_health_check_cron() {
log_info "设置健康检查定时任务..." log_info "设置健康检查定时任务..."
@ -661,7 +740,7 @@ setup_health_check_cron() {
crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron"
# 检查并删除旧的健康检查任务 # 检查并删除旧的健康检查任务
if grep -q "check_health.sh" "$temp_cron"; then if check_cron_task_exists "check_health.sh" "$temp_cron"; then
log_info "发现旧的健康检查定时任务,正在更新..." log_info "发现旧的健康检查定时任务,正在更新..."
# 删除所有包含check_health.sh的行 # 删除所有包含check_health.sh的行
grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new" grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new"
@ -716,7 +795,7 @@ setup_dns_sync_cron() {
crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron"
# 检查并删除旧的 DNS 同步任务 # 检查并删除旧的 DNS 同步任务
if grep -q "sync_dns.sh" "$temp_cron"; then if check_cron_task_exists "sync_dns.sh" "$temp_cron"; then
log_info "发现旧的 DNS 同步定时任务,正在更新..." log_info "发现旧的 DNS 同步定时任务,正在更新..."
# 删除所有包含sync_dns.sh的行 # 删除所有包含sync_dns.sh的行
grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new" grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new"
@ -724,16 +803,15 @@ setup_dns_sync_cron() {
log_info "旧的 DNS 同步定时任务已删除" log_info "旧的 DNS 同步定时任务已删除"
fi fi
# 添加新的定时任务(每30秒执行一次) # 添加新的定时任务(每1分钟执行一次)
# 直接使用版本目录中的 DNS 同步脚本 # 直接使用版本目录中的 DNS 同步脚本
echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron" echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron"
echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron"
echo "* * * * * sleep 30; $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron"
# 安装新的crontab # 安装新的crontab
if crontab "$temp_cron"; then if crontab "$temp_cron"; then
log_success "DNS 同步定时任务设置成功" log_success "DNS 同步定时任务设置成功"
log_info " 执行频率: 每30秒" log_info " 执行频率: 每1分钟"
log_info " 日志文件: $INSTALL_DIR/.dns_sync.log" log_info " 日志文件: $INSTALL_DIR/.dns_sync.log"
log_info " 查看定时任务: crontab -l" log_info " 查看定时任务: crontab -l"
log_info " 删除定时任务: crontab -e" log_info " 删除定时任务: crontab -e"
@ -771,7 +849,7 @@ setup_version_check_cron() {
crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron"
# 检查是否已存在版本校验定时任务 # 检查是否已存在版本校验定时任务
if grep -q "check_version.sh" "$temp_cron"; then if check_cron_task_exists "check_version.sh" "$temp_cron"; then
log_info "发现旧的版本校验定时任务,正在更新..." log_info "发现旧的版本校验定时任务,正在更新..."
# 删除所有包含check_version.sh的行 # 删除所有包含check_version.sh的行
grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new" grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new"
@ -824,7 +902,7 @@ setup_restart_cron() {
crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron"
# 检查是否已存在自动重启定时任务 # 检查是否已存在自动重启定时任务
if grep -q "restart_unhealthy.sh" "$temp_cron"; then if check_cron_task_exists "restart_unhealthy.sh" "$temp_cron"; then
log_info "发现旧的自动重启定时任务,正在更新..." log_info "发现旧的自动重启定时任务,正在更新..."
# 删除所有包含restart_unhealthy.sh的行 # 删除所有包含restart_unhealthy.sh的行
grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new" grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new"
@ -885,9 +963,9 @@ main() {
check_system check_system
find_version_file find_version_file
create_install_dirs create_install_dirs
install_system_deps
parse_version_info parse_version_info
verify_checksums verify_checksums
install_system_deps
install_components install_components
copy_config_files copy_config_files
create_install_record create_install_record
@ -895,6 +973,20 @@ main() {
setup_dns_sync_cron setup_dns_sync_cron
setup_version_check_cron setup_version_check_cron
setup_restart_cron setup_restart_cron
# 注释掉立即执行健康检查避免与cron任务重复执行
# log_info "立即执行一次健康检查..."
# local check_health_script="$INSTALL_DIR/check_health.sh"
# if [[ -f "$check_health_script" ]]; then
# if "$check_health_script" >> "$INSTALL_DIR/.health_check.log" 2>&1; then
# log_success "健康检查执行完成"
# else
# log_warning "健康检查执行失败,请检查日志: $INSTALL_DIR/.health_check.log"
# fi
# else
# log_warning "健康检查脚本不存在: $check_health_script"
# fi
show_install_info show_install_info
} }

View File

@ -29,26 +29,68 @@ log_error() {
show_help() { show_help() {
echo "Argus-Metric Artifact 发布脚本" echo "Argus-Metric Artifact 发布脚本"
echo echo
echo "用法: $0 <版本号>" echo "用法: $0 <版本号> [选项]"
echo echo
echo "参数:" echo "参数:"
echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本"
echo echo
echo "选项:"
echo " --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)"
echo " --owner <uid:gid> 指定文件所有者 (默认: 2133:2015)"
echo " -h, --help 显示此帮助信息"
echo
echo "示例:" echo "示例:"
echo " $0 1.20.0 # 发布 1.20.0 版本" echo " $0 1.20.0 # 使用默认配置发布"
echo " $0 1.20.0 --output-dir /tmp/publish # 指定输出目录"
echo " $0 1.20.0 --owner 1000:1000 # 指定文件所有者"
echo " $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者"
echo echo
} }
# 检查参数 # 默认配置
if [[ $# -ne 1 ]]; then DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/"
DEFAULT_OWNER="2133:2015"
# 解析参数
# Positional: <版本号> (exactly one). Options: --output-dir <路径>, --owner <uid:gid>.
VERSION=""
PUBLISH_DIR="$DEFAULT_PUBLISH_DIR"
OWNER="$DEFAULT_OWNER"

while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            show_help
            exit 0
            ;;
        --output-dir)
            # Fail fast when the option value is missing, instead of letting
            # "shift 2" misbehave when the option is the last argument.
            if [[ -z "${2:-}" ]]; then
                log_error "--output-dir 需要一个参数"
                exit 1
            fi
            PUBLISH_DIR="$2"
            shift 2
            ;;
        --owner)
            if [[ -z "${2:-}" ]]; then
                log_error "--owner 需要一个参数"
                exit 1
            fi
            OWNER="$2"
            shift 2
            ;;
        *)
            # First bare word is the version; any further bare word is an error.
            if [[ -z "$VERSION" ]]; then
                VERSION="$1"
                shift
            else
                log_error "未知参数: $1"
                show_help
                exit 1
            fi
            ;;
    esac
done
# 检查版本号是否提供
if [[ -z "$VERSION" ]]; then
log_error "请提供版本号参数" log_error "请提供版本号参数"
show_help show_help
exit 1 exit 1
fi fi
VERSION="$1"
ARTIFACT_DIR="artifact/$VERSION" ARTIFACT_DIR="artifact/$VERSION"
PUBLISH_DIR="/Users/sundapeng/Project/nlp/aiops/client-plugins/all-in-one/publish/"
# 检查版本目录是否存在 # 检查版本目录是否存在
if [[ ! -d "$ARTIFACT_DIR" ]]; then if [[ ! -d "$ARTIFACT_DIR" ]]; then
@ -57,10 +99,12 @@ if [[ ! -d "$ARTIFACT_DIR" ]]; then
fi fi
log_info "开始发布版本: $VERSION" log_info "开始发布版本: $VERSION"
log_info "输出目录: $PUBLISH_DIR"
log_info "文件所有者: $OWNER"
# 确保发布目录存在 # 确保发布目录存在
log_info "确保发布目录存在: $PUBLISH_DIR" log_info "确保发布目录存在: $PUBLISH_DIR"
mkdir -p "$PUBLISH_DIR" sudo mkdir -p "$PUBLISH_DIR"
# 创建临时目录用于打包 # 创建临时目录用于打包
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
@ -164,20 +208,26 @@ fi
TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz" TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz"
log_info "创建发布包: $TAR_NAME" log_info "创建发布包: $TAR_NAME"
cd "$TEMP_PACKAGE_DIR" cd "$TEMP_PACKAGE_DIR"
tar -czf "$PUBLISH_DIR/$TAR_NAME" * sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" *
cd - > /dev/null cd - > /dev/null
# 设置文件所有者
log_info "设置文件所有者为: $OWNER"
sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
# 清理临时目录 # 清理临时目录
rm -rf "$TEMP_PACKAGE_DIR" rm -rf "$TEMP_PACKAGE_DIR"
# 更新 LATEST_VERSION 文件 # 更新 LATEST_VERSION 文件
log_info "更新 LATEST_VERSION 文件..." log_info "更新 LATEST_VERSION 文件..."
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION" echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null
sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制) # 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制)
if [[ -f "config/dns.conf" ]]; then if [[ -f "config/dns.conf" ]]; then
log_info "复制 DNS 配置文件到发布目录根目录..." log_info "复制 DNS 配置文件到发布目录根目录..."
cp "config/dns.conf" "$PUBLISH_DIR/" sudo cp "config/dns.conf" "$PUBLISH_DIR/"
sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf"
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf" log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
else else
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制" log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
@ -186,7 +236,8 @@ fi
# 复制 setup.sh 到发布目录 # 复制 setup.sh 到发布目录
if [[ -f "scripts/setup.sh" ]]; then if [[ -f "scripts/setup.sh" ]]; then
log_info "复制 setup.sh 到发布目录..." log_info "复制 setup.sh 到发布目录..."
cp "scripts/setup.sh" "$PUBLISH_DIR/" sudo cp "scripts/setup.sh" "$PUBLISH_DIR/"
sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh"
fi fi
# 显示发布结果 # 显示发布结果

View File

@ -2,6 +2,15 @@
# 此脚本会检查各组件的健康状态,并重启不健康的组件 # 此脚本会检查各组件的健康状态,并重启不健康的组件
# PID-file guard: prevent concurrent runs of the auto-restart script.
PIDFILE="/var/run/restart_unhealthy.pid"
# If a PID file exists and the recorded process is still alive, skip this run.
# Quote the command substitution so an empty/garbled PID file cannot word-split.
if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
    echo "自动重启脚本已在运行中,跳过本次执行" >&2
    exit 0
fi
echo $$ > "$PIDFILE"
# Single-quoted so "$PIDFILE" is expanded (and properly quoted) when the
# trap fires at exit, not when the trap is installed.
trap 'rm -f "$PIDFILE"' EXIT
# 获取脚本所在目录 # 获取脚本所在目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"

View File

@ -1,244 +1,143 @@
#!/bin/bash #!/bin/bash
# DNS 同步脚本
# 比较 FTP 根目录的 dns.conf 和本地的 dns.conf如果有变化则同步到 /etc/resolv.conf
set -e set -e
# 颜色定义 # 颜色
RED='\033[0;31m' RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 日志函数 - 输出到 stderr 避免影响函数返回值 # 日志函数
log_info() { log_info() { echo -e "${BLUE}[INFO]${NC} $1" >&2; }
echo -e "${BLUE}[INFO]${NC} $1" >&2 log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" >&2; }
} log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1" >&2; }
log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1" >&2
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# 获取脚本所在目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOCAL_DNS_CONF="/opt/argus-metric/dns.conf" LOCAL_DNS_CONF="/opt/argus-metric/dns.conf"
REMOTE_DNS_CONF_URL=""
RESOLV_CONF="/etc/resolv.conf" RESOLV_CONF="/etc/resolv.conf"
ALT_RESOLV_CONF="/run/resolv.conf"
LOG_FILE="/opt/argus-metric/.dns_sync.log" LOG_FILE="/opt/argus-metric/.dns_sync.log"
REMOTE_DNS_CONF_URL=""
# 从环境变量或配置文件获取 FTP 服务器信息 # 获取 FTP 配置
get_ftp_config() { get_ftp_config() {
# 优先从环境变量获取配置
log_info "获取 FTP 配置信息..." log_info "获取 FTP 配置信息..."
# 如果环境变量中没有设置,则尝试从配置文件读取
if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then
local config_file="$SCRIPT_DIR/config.env" [[ -f "$SCRIPT_DIR/config.env" ]] && source "$SCRIPT_DIR/config.env"
if [[ -f "$config_file" ]]; then
log_info "从配置文件读取 FTP 配置: $config_file"
source "$config_file"
fi fi
else
log_info "使用环境变量中的 FTP 配置"
fi
# 设置默认值(如果环境变量和配置文件都没有设置)
FTP_SERVER="${FTP_SERVER:-localhost}" FTP_SERVER="${FTP_SERVER:-localhost}"
FTP_USER="${FTP_USER:-ftpuser}" FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
# 构建远程 DNS 配置文件 URL
REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf" REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf"
log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}"
} }
# 下载远程 DNS 配置文件 # 下载远程 dns.conf
download_remote_dns_conf() { download_remote_dns_conf() {
local temp_file="/tmp/dns.conf.remote.$$" local tmp="/tmp/dns.remote.$$"
log_info "从 FTP 服务器下载 DNS 配置文件..."
log_info "远程地址: $REMOTE_DNS_CONF_URL"
log_info "FTP 服务器: $FTP_SERVER"
log_info "FTP 用户: $FTP_USER"
# 先测试 FTP 连接
log_info "测试 FTP 连接..." log_info "测试 FTP 连接..."
if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null; then
log_success "FTP 服务器连接成功" log_error "无法连接到 FTP 服务器: $FTP_SERVER"; return 1
else
log_error "无法连接到 FTP 服务器: $FTP_SERVER"
log_error "请检查:"
log_error " 1. FTP 服务器是否运行"
log_error " 2. 网络连接是否正常"
log_error " 3. 服务器地址是否正确"
return 1
fi fi
if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$tmp" 2>/dev/null; then
# 测试 dns.conf 文件是否存在 log_error "下载 dns.conf 失败"; rm -f "$tmp"; return 1
log_info "检查远程 dns.conf 文件是否存在..."
if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/dns.conf" >/dev/null 2>&1; then
log_success "远程 dns.conf 文件存在"
else
log_error "远程 dns.conf 文件不存在或无法访问"
log_error "请检查 FTP 服务器根目录下是否有 dns.conf 文件"
return 1
fi
# 尝试下载文件
log_info "开始下载 dns.conf 文件..."
if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$temp_file" 2>/dev/null; then
log_success "远程 DNS 配置文件下载成功"
echo "$temp_file"
else
log_error "下载 dns.conf 文件失败"
log_error "尝试手动测试命令:"
log_error " curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_SERVER}/dns.conf"
rm -f "$temp_file"
return 1
fi fi
echo "$tmp"
} }
# 比较两个文件是否相同 # 文件比较
compare_files() { compare_files() { diff -q "$1" "$2" >/dev/null 2>&1; }
local file1="$1"
local file2="$2"
if [[ ! -f "$file1" || ! -f "$file2" ]]; then # 从 dns.conf 提取有效 IP
return 1 get_dns_ips() {
fi grep -Eo '^[0-9]{1,3}(\.[0-9]{1,3}){3}$' "$1" | sort -u
# 使用 diff 比较文件内容
if diff -q "$file1" "$file2" >/dev/null 2>&1; then
return 0 # 文件相同
else
return 1 # 文件不同
fi
} }
# 将 DNS 配置追加到 /etc/resolv.conf # 安全更新 resolv.conf保留符号链接
update_resolv_conf() { update_resolv_conf() {
local dns_conf_file="$1" local dns_conf="$1"
local dns_ips
mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
[[ ${#dns_ips[@]} -eq 0 ]] && { log_warning "未检测到有效 DNS"; return; }
log_info "更新 /etc/resolv.conf 文件..." local target_file="$RESOLV_CONF"
if [[ ! -w "$RESOLV_CONF" ]]; then
# 备份原始文件 log_warning "/etc/resolv.conf 不可写,使用兜底路径 $ALT_RESOLV_CONF"
if [[ -f "$RESOLV_CONF" ]]; then target_file="$ALT_RESOLV_CONF"
cp "$RESOLV_CONF" "${RESOLV_CONF}.backup.$(date +%Y%m%d_%H%M%S)"
log_info "已备份原始 resolv.conf 文件"
fi fi
# 读取 DNS 配置文件并追加到 resolv.conf local temp="/tmp/resolv.new.$$"
while IFS= read -r line; do cp "$target_file" "${target_file}.backup.$(date +%Y%m%d_%H%M%S)" 2>/dev/null || true
# 跳过空行和注释行 log_info "更新 DNS 配置文件: $target_file"
[[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
# 验证是否为有效的 IP 地址 # 写入新的 nameserver 行
if [[ "$line" =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then for ip in "${dns_ips[@]}"; do
# 检查是否已存在相同的 nameserver 行 echo "nameserver $ip"
if ! grep -q "nameserver $line" "$RESOLV_CONF" 2>/dev/null; then done >"$temp"
echo "nameserver $line" >> "$RESOLV_CONF"
log_info "添加 DNS 服务器: $line" # 追加原内容(去掉重复 nameserver
grep -v '^nameserver' "$target_file" >>"$temp" 2>/dev/null || true
awk '!a[$0]++' "$temp" >"${temp}.uniq"
# ⚙️ 使用 cat 原地覆盖,避免 mv 引发 “设备忙”
if cat "${temp}.uniq" >"$target_file" 2>/dev/null; then
chmod 644 "$target_file"
log_success "DNS 更新完成: ${dns_ips[*]}"
else else
log_info "DNS 服务器已存在,跳过: $line" log_error "无法写入 $target_file,可能被系统锁定"
fi fi
else
log_warning "跳过无效的 DNS 地址: $line"
fi
done < "$dns_conf_file"
# 设置文件权限 rm -f "$temp" "${temp}.uniq"
chmod 644 "$RESOLV_CONF"
log_success "/etc/resolv.conf 文件更新完成"
} }
# 记录同步日志 # 检查 resolv.conf 是否包含 dns.conf 内容
log_sync() { ensure_dns_in_resolv() {
local message="$1" local dns_conf="$1"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S') local dns_ips
echo "[$timestamp] $message" >> "$LOG_FILE" mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
[[ ${#dns_ips[@]} -eq 0 ]] && return
for ip in "${dns_ips[@]}"; do
if ! grep -q "nameserver $ip" "$RESOLV_CONF" 2>/dev/null; then
log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复"
update_resolv_conf "$dns_conf"
return
fi
done
log_info "/etc/resolv.conf 已包含所有 DNS"
} }
# 主函数 log_sync() { echo "[$(date '+%F %T')] $1" >>"$LOG_FILE"; }
main() { main() {
log_info "开始 DNS 同步检查..." log_info "开始 DNS 同步检查..."
log_sync "DNS 同步检查开始" mkdir -p /opt/argus-metric
# 确保系统目录存在
mkdir -p "/opt/argus-metric"
# 获取 FTP 配置
get_ftp_config get_ftp_config
local remote_file
# 检查本地 DNS 配置文件是否存在 if ! remote_file=$(download_remote_dns_conf); then
if [[ ! -f "$LOCAL_DNS_CONF" ]]; then log_error "下载失败"; log_sync "同步失败"; exit 1
log_warning "本地 DNS 配置文件不存在: $LOCAL_DNS_CONF"
log_warning "将下载远程配置文件并更新系统 DNS 设置"
# 下载远程配置文件
if remote_file=$(download_remote_dns_conf); then
# 复制到本地
cp "$remote_file" "$LOCAL_DNS_CONF"
log_success "远程 DNS 配置文件已保存到本地"
# 更新 resolv.conf
update_resolv_conf "$LOCAL_DNS_CONF"
log_sync "首次同步完成DNS 配置已更新"
# 清理临时文件
rm -f "$remote_file"
else
log_error "无法下载远程 DNS 配置文件,同步失败"
log_sync "同步失败:无法下载远程配置文件"
exit 1
fi fi
else
log_info "本地 DNS 配置文件存在: $LOCAL_DNS_CONF"
# 下载远程配置文件进行比较 if [[ ! -f "$LOCAL_DNS_CONF" ]]; then
if remote_file=$(download_remote_dns_conf); then log_info "本地 dns.conf 不存在,初始化..."
# 比较文件 cp "$remote_file" "$LOCAL_DNS_CONF"
if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then update_resolv_conf "$LOCAL_DNS_CONF"
log_info "DNS 配置文件无变化,无需更新" log_sync "首次同步完成"
log_sync "DNS 配置文件无变化" else
else if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then
log_info "检测到 DNS 配置文件有变化,开始同步..." log_info "dns.conf 无变化"
log_sync "检测到 DNS 配置文件变化,开始同步" ensure_dns_in_resolv "$LOCAL_DNS_CONF"
log_sync "dns.conf 无变化,执行兜底检查"
# 更新本地配置文件 else
log_info "检测到 DNS 配置更新"
cp "$remote_file" "$LOCAL_DNS_CONF" cp "$remote_file" "$LOCAL_DNS_CONF"
log_success "本地 DNS 配置文件已更新"
# 更新 resolv.conf
update_resolv_conf "$LOCAL_DNS_CONF" update_resolv_conf "$LOCAL_DNS_CONF"
log_sync "DNS 配置同步完成" log_sync "DNS 配置同步完成"
fi fi
fi
# 清理临时文件
rm -f "$remote_file" rm -f "$remote_file"
else log_success "DNS 同步流程完成"
log_error "无法下载远程 DNS 配置文件,跳过本次同步"
log_sync "同步失败:无法下载远程配置文件"
exit 1
fi
fi
log_success "DNS 同步检查完成"
log_sync "DNS 同步检查完成"
} }
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@" main "$@"
fi fi

View File

@ -31,25 +31,26 @@ RUN mkdir -p /var/log/supervisor
ENV FTP_BASE_PATH=/private/argus/ftp ENV FTP_BASE_PATH=/private/argus/ftp
# 设置域名环境变量 # 设置域名环境变量
ENV DOMAIN=prom.ftp.argus.com ENV DOMAIN=ftp.metric.argus.com
# 设置FTP用户密码环境变量 # 设置FTP用户密码环境变量
ENV FTP_PASSWORD=ZGClab1234! ENV FTP_PASSWORD=ZGClab1234!
# 设置用户和组ID环境变量 # 设置用户和组ID环境变量
ARG FTP_UID=2133 ARG ARGUS_BUILD_UID=2133
ARG FTP_GID=2015 ARG ARGUS_BUILD_GID=2015
ENV FTP_UID=${FTP_UID}
ENV FTP_GID=${FTP_GID} ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 创建FTP用户和目录结构 # 创建FTP用户和目录结构
RUN groupadd -g ${FTP_GID} ftpuser && \ RUN groupadd -g ${ARGUS_BUILD_GID} ftpuser && \
useradd -u ${FTP_UID} -g ${FTP_GID} -d ${FTP_BASE_PATH}/share -s /bin/bash ftpuser && \ useradd -u ${ARGUS_BUILD_UID} -g ${ARGUS_BUILD_GID} -d ${FTP_BASE_PATH}/share -s /bin/bash ftpuser && \
mkdir -p ${FTP_BASE_PATH}/share \ mkdir -p ${FTP_BASE_PATH}/share \
&& mkdir -p /private/argus/etc \ && mkdir -p /private/argus/etc \
&& mkdir -p /var/log/vsftpd \ && mkdir -p /var/log/vsftpd \
&& mkdir -p /var/run/vsftpd/empty \ && chown -R ftpuser:ftpuser ${FTP_BASE_PATH} \
&& chown -R ftpuser:ftpuser ${FTP_BASE_PATH} && mkdir -p /var/run/vsftpd/empty
# 创建vsftpd配置目录和用户列表文件 # 创建vsftpd配置目录和用户列表文件
RUN mkdir -p /etc/vsftpd && \ RUN mkdir -p /etc/vsftpd && \

View File

@ -32,6 +32,9 @@ IP=$(ifconfig eth0 | awk '/inet /{print $2}' || hostname -i)
echo "current IP: ${IP}" echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/${DOMAIN} echo "${IP}" > /private/argus/etc/${DOMAIN}
# Record this container's IP under the domain name and hand ownership to the build user.
chown ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID} /private/argus/etc/${DOMAIN}
# NOTE(review): the file holds an IP address (plain data), so the executable
# bit looks unintended — confirm whether "chmod 644" was meant here.
chmod +x /private/argus/etc/${DOMAIN}
# 启动vsftpd # 启动vsftpd
echo "[INFO] Starting vsftpd..." echo "[INFO] Starting vsftpd..."
exec /usr/sbin/vsftpd /tmp/vsftpd.conf exec /usr/sbin/vsftpd /tmp/vsftpd.conf

View File

@ -17,30 +17,31 @@ RUN mkdir -p /var/log/supervisor
ENV GRAFANA_BASE_PATH=/private/argus/metric/grafana ENV GRAFANA_BASE_PATH=/private/argus/metric/grafana
# 设置用户和组ID环境变量 # 设置用户和组ID环境变量
ARG GRAFANA_UID=2133 ARG ARGUS_BUILD_UID=2133
ARG GRAFANA_GID=2015 ARG ARGUS_BUILD_GID=2015
ENV GRAFANA_UID=${GRAFANA_UID}
ENV GRAFANA_GID=${GRAFANA_GID} ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 创建基本目录结构 # 创建基本目录结构
RUN mkdir -p /private/argus/etc \ RUN mkdir -p /private/argus/etc \
&& mkdir -p /private/argus/metric/grafana/data \ && mkdir -p ${GRAFANA_BASE_PATH}/data \
&& mkdir -p /private/argus/metric/grafana/logs \ && mkdir -p ${GRAFANA_BASE_PATH}/logs \
&& mkdir -p /private/argus/metric/grafana/plugins \ && mkdir -p ${GRAFANA_BASE_PATH}/plugins \
&& mkdir -p /private/argus/metric/grafana/provisioning/datasources \ && mkdir -p ${GRAFANA_BASE_PATH}/provisioning/datasources \
&& mkdir -p /private/argus/metric/grafana/provisioning/dashboards \ && mkdir -p ${GRAFANA_BASE_PATH}/provisioning/dashboards \
&& mkdir -p /private/argus/metric/grafana/data/sessions \ && mkdir -p ${GRAFANA_BASE_PATH}/data/sessions \
&& mkdir -p /private/argus/metric/grafana/data/dashboards \ && mkdir -p ${GRAFANA_BASE_PATH}/data/dashboards \
&& mkdir -p /private/argus/metric/grafana/config \ && mkdir -p ${GRAFANA_BASE_PATH}/config \
&& mkdir -p /etc/grafana \ && mkdir -p /etc/grafana \
&& mkdir -p /var/lib/grafana \ && mkdir -p /var/lib/grafana \
&& mkdir -p /var/log/grafana && mkdir -p /var/log/grafana
# 修改 Grafana 用户 UID/GID 并授权 # 修改 Grafana 用户 UID/GID 并授权
RUN deluser grafana && \ RUN deluser grafana && \
addgroup -g ${GRAFANA_GID} grafana && \ addgroup -g ${ARGUS_BUILD_GID} grafana && \
adduser -u ${GRAFANA_UID} -G grafana -s /bin/sh -D grafana && \ adduser -u ${ARGUS_BUILD_UID} -G grafana -s /bin/sh -D grafana && \
chown -R grafana:grafana /var/lib/grafana /etc/grafana /var/log/grafana /private/argus chown -R grafana:grafana /var/lib/grafana /etc/grafana /var/log/grafana ${GRAFANA_BASE_PATH}
# 复制配置文件到容器内临时位置 # 复制配置文件到容器内临时位置
COPY grafana.ini /tmp/grafana.ini COPY grafana.ini /tmp/grafana.ini

View File

@ -0,0 +1,570 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 3,
"links": [],
"panels": [
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode='idle'}[5m])))",
"refId": "A"
}
],
"title": "CPU 平均利用率(%",
"type": "stat"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 8,
"y": 0
},
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg(1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes)) * 100",
"refId": "A"
}
],
"title": "内存平均利用率(%",
"type": "stat"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 16,
"y": 0
},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "count(count by(hostname) (up{job='node'} == 1))",
"refId": "A"
}
],
"title": "节点在线数",
"type": "stat"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 5
},
"id": 6,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg by (hostname) (DCGM_FI_DEV_GPU_UTIL)",
"refId": "A"
}
],
"title": "GPU 平均利用率 (%)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 5
},
"id": 12,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "round(avg(DCGM_FI_DEV_FB_USED{job='dcgm'}/(DCGM_FI_DEV_FB_USED{job='dcgm'} + DCGM_FI_DEV_FB_FREE{job='dcgm'})) * 100)",
"refId": "A"
}
],
"title": "显存平均利用率 (%)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 5
},
"id": 7,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg by (hostname) (DCGM_FI_DEV_GPU_TEMP)",
"refId": "A"
}
],
"title": "GPU 温度 (℃)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 300,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "orange",
"value": 200
},
{
"color": "red",
"value": 300
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 5
},
"id": 8,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": true,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "avg by (hostname) (DCGM_FI_DEV_POWER_USAGE)",
"refId": "A"
}
],
"title": "GPU 平均实时功耗 (W)",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"custom": {
"align": "center",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"decimals": 1,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 12,
"w": 24,
"x": 0,
"y": 11
},
"id": 11,
"options": {
"cellHeight": "sm",
"cellLinks": [
{
"title": "跳转至节点详情",
"url": "http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics?orgId=1&refresh=15s&var-hostname=${__data.fields.hostname}"
}
],
"footer": {
"countRows": false,
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "GPU 使用率"
}
]
},
"pluginVersion": "11.1.0",
"targets": [
{
"expr": "up{job=\"dcgm\"} + on(hostname) group_left(ip, node_id) up{job=\"dcgm\"}*0",
"format": "table",
"instant": true,
"refId": "node_info"
},
{
"expr": "round(100 - avg by(hostname)(rate(node_cpu_seconds_total{job=\"node\",mode=\"idle\"}[5m])) * 100, 0.1)",
"format": "table",
"instant": true,
"refId": "CPU"
},
{
"expr": "round((1 - avg by(hostname)(node_memory_MemAvailable_bytes{job=\"node\"} / node_memory_MemTotal_bytes{job=\"node\"})) * 100, 0.1)",
"format": "table",
"instant": true,
"refId": "MEM"
},
{
"expr": "round(avg by(hostname)(DCGM_FI_DEV_GPU_UTIL{job=\"dcgm\"}), 0.1)",
"format": "table",
"instant": true,
"refId": "GPU_UTIL"
},
{
"expr": "round(avg by(hostname)(DCGM_FI_DEV_FB_USED{job=\"dcgm\"} / (DCGM_FI_DEV_FB_USED{job=\"dcgm\"} + DCGM_FI_DEV_FB_FREE{job=\"dcgm\"}) * 100), 0.1)",
"format": "table",
"instant": true,
"refId": "GPU_MEM"
}
],
"title": "节点列表CPU / 内存 / GPU",
"transformations": [
{
"id": "seriesToColumns",
"options": {
"byField": "hostname"
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value #node_info": true,
"hostname_1": true,
"hostname_2": true,
"hostname_3": true,
"instance": true,
"ip_1": true,
"job": true,
"node_id_1": true
},
"indexByName": {
"CPU 使用率": 3,
"GPU 使用率": 5,
"GPU 显存占用": 6,
"IP 地址": 1,
"主机名": 0,
"内存使用率": 4,
"节点 ID": 2
},
"renameByName": {
"Value #CPU": "CPU 使用率",
"Value #GPU_MEM": "GPU 显存占用",
"Value #GPU_UTIL": "GPU 使用率",
"Value #MEM": "内存使用率",
"hostname": "主机名",
"ip": "IP 地址",
"node_id": "节点 ID",
"user_id": "用户ID"
}
}
}
],
"type": "table"
}
],
"refresh": "5s",
"schemaVersion": 39,
"tags": [
"cluster",
"gpu",
"system"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Cluster Dashboard",
"uid": "cluster-dashboard",
"version": 34,
"weekStart": ""
}

View File

@ -9,6 +9,7 @@ DOMAIN=grafana.metric.argus.com
IP=$(ifconfig | awk '/inet / && $2 != "127.0.0.1" {print $2; exit}') IP=$(ifconfig | awk '/inet / && $2 != "127.0.0.1" {print $2; exit}')
echo "current IP: ${IP}" echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/${DOMAIN} echo "${IP}" > /private/argus/etc/${DOMAIN}
chmod +x /private/argus/etc/${DOMAIN}
# 确保必要目录存在(权限已在 Dockerfile 中设置) # 确保必要目录存在(权限已在 Dockerfile 中设置)
mkdir -p /private/argus/metric/grafana/data mkdir -p /private/argus/metric/grafana/data
@ -27,7 +28,6 @@ mkdir -p /var/lib/grafana
if [ -f "/tmp/grafana.ini" ]; then if [ -f "/tmp/grafana.ini" ]; then
echo "[INFO] Copying grafana.ini to /private/argus/metric/grafana/config/" echo "[INFO] Copying grafana.ini to /private/argus/metric/grafana/config/"
cp /tmp/grafana.ini /private/argus/metric/grafana/config/grafana.ini cp /tmp/grafana.ini /private/argus/metric/grafana/config/grafana.ini
chown grafana:grafana /private/argus/metric/grafana/config/grafana.ini
echo "[INFO] Grafana configuration copied successfully" echo "[INFO] Grafana configuration copied successfully"
fi fi
@ -47,12 +47,9 @@ fi
if [ -f "/tmp/datasources.yml" ]; then if [ -f "/tmp/datasources.yml" ]; then
echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/" echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/"
cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml
chown grafana:grafana /private/argus/metric/grafana/provisioning/datasources/datasources.yml
echo "[INFO] Datasource configuration copied successfully" echo "[INFO] Datasource configuration copied successfully"
elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then
echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources" echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources"
# 确保数据源配置目录权限正确
chown -R grafana:grafana /private/argus/metric/grafana/provisioning/datasources
elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then
echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources" echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources"
# 确保数据源配置目录权限正确 # 确保数据源配置目录权限正确
@ -65,7 +62,6 @@ fi
if [ -f "/tmp/dashboards.yml" ]; then if [ -f "/tmp/dashboards.yml" ]; then
echo "[INFO] Copying dashboard configuration to /private/argus/metric/grafana/provisioning/dashboards/" echo "[INFO] Copying dashboard configuration to /private/argus/metric/grafana/provisioning/dashboards/"
cp /tmp/dashboards.yml /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml cp /tmp/dashboards.yml /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml
chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml
echo "[INFO] Dashboard configuration copied successfully" echo "[INFO] Dashboard configuration copied successfully"
fi fi
@ -73,13 +69,9 @@ fi
if [ -f "/tmp/default_dashboard.json" ]; then if [ -f "/tmp/default_dashboard.json" ]; then
echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/" echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/"
cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json
chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json
echo "[INFO] Default dashboard copied successfully" echo "[INFO] Default dashboard copied successfully"
fi fi
# 确保所有配置目录权限正确
chown -R grafana:grafana /private/argus/metric/grafana/provisioning/
# 启动 Grafana # 启动 Grafana
if [ -n "$CONFIG_FILE" ]; then if [ -n "$CONFIG_FILE" ]; then
echo "[INFO] Starting Grafana with custom configuration..." echo "[INFO] Starting Grafana with custom configuration..."

View File

@ -48,11 +48,11 @@ RUN mkdir -p /var/log/supervisor
ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
# 设置用户和组ID环境变量 # 设置用户和组ID环境变量
ARG PROMETHEUS_UID=2133 ARG ARGUS_BUILD_UID=2133
ARG PROMETHEUS_GID=2015 ARG ARGUS_BUILD_GID=2015
ENV PROMETHEUS_UID=${PROMETHEUS_UID}
ENV PROMETHEUS_GID=${PROMETHEUS_GID}
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 创建目录结构 # 创建目录结构
RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \ RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \
&& mkdir -p ${PROMETHEUS_BASE_PATH}/targets \ && mkdir -p ${PROMETHEUS_BASE_PATH}/targets \
@ -61,11 +61,11 @@ RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \
&& ln -s ${PROMETHEUS_BASE_PATH} /prometheus && ln -s ${PROMETHEUS_BASE_PATH} /prometheus
# 修改 Prometheus 用户 UID/GID 并授权 # 修改 Prometheus 用户 UID/GID 并授权
RUN usermod -u ${PROMETHEUS_UID} nobody && \ RUN usermod -u ${ARGUS_BUILD_UID} nobody && \
groupmod -g ${PROMETHEUS_GID} nogroup && \ groupmod -g ${ARGUS_BUILD_GID} nogroup && \
chown -h nobody:nogroup /prometheus && \ chown -h nobody:nogroup /prometheus && \
chown -R nobody:nogroup /private/argus/metric /etc/prometheus && \ chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} && \
chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} chown -R nobody:nogroup /etc/prometheus
# supervisor 配置 # supervisor 配置
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

View File

@ -7,7 +7,7 @@ global:
alerting: alerting:
alertmanagers: alertmanagers:
- static_configs: - static_configs:
- targets: [] - targets: ["alertmanager.alert.argus.com:9093"]
# 规则目录 # 规则目录
rule_files: rule_files:

View File

@ -17,6 +17,7 @@ sed "s|\${PROMETHEUS_BASE_PATH}|${PROMETHEUS_BASE_PATH}|g" \
IP=$(ifconfig eth0 | awk '/inet /{print $2}') IP=$(ifconfig eth0 | awk '/inet /{print $2}')
echo "current IP: ${IP}" echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/${DOMAIN} echo "${IP}" > /private/argus/etc/${DOMAIN}
chmod +x /private/argus/etc/${DOMAIN}
exec /bin/prometheus \ exec /bin/prometheus \
--config.file=${PROMETHEUS_BASE_PATH}/prometheus.yml \ --config.file=${PROMETHEUS_BASE_PATH}/prometheus.yml \

View File

@ -1,7 +1,7 @@
.env .env
data/ data/
images-cache/ images-cache/
private-test-node/
*.tar *.tar
*.log *.log
.DS_Store .DS_Store

View File

@ -1,171 +1,97 @@
# E2E Test - Argus Metric 部署测试 # E2E Test - Argus Metric 部署测试
## 概述 ## 1. 概述
本项目用于对 Argus Metric 模块进行端到端E2E部署测试。 本项目用于对 Argus Metric 模块进行端到端E2E部署测试。
通过一键脚本可快速搭建 Prometheus、FTP、Grafana 等服务,验证 Metric 模块的完整部署与运行流程。 通过一键脚本可快速搭建 Prometheus、FTP、Grafana 等服务,验证 Metric 模块的完整部署与运行流程。
## 拉取完整项目,进入 metric.tests 目录 功能包括:
- 自动启动所需服务和测试节点
- 发布安装包到 FTP
- CPU/GPU 节点客户端安装测试
- 验证安装结果与服务可用性
- 支持环境清理和分步调试
## 2. 前置条件
在开始部署和测试之前,请确保完成以下准备工作:
### 2.1 检查 all-in-one-full 客户端安装包
确认客户端安装包目录是否存在:
```bash ```bash
git clone https://git.nasp.fit/NASP/argus.git {$PROJECT_ROOT}/argus/src/metric/client-plugins/all-in-one-full
cd {$PROJECT_ROOT}/argus/src/metric/tests
``` ```
本项目依赖完整的 all-in-one-full 安装包,其中包含大量二进制文件、依赖包和测试制品,由于体积较大,无法直接上传到 Git 仓库。**请联系项目管理员获取最新版本的完整框架。**
## 一键构建与部署 Prometheus / FTP / Grafana ### 2.2 配置环境变量
### 1. 修改环境变量文件 查看配置文件是否存在,如不存在,则复制示例配置文件并根据实际环境修改:
将示例配置文件复制为 .env 并根据实际情况修改:
```bash ```bash
cd {$PROJECT_ROOT}/argus/src/metric/tests
cp env.example .env cp env.example .env
``` ```
.env 文件用于指定构建UID:GID、FTP 配置、版本号等信息,确保各脚本运行时可以正确访问资源。
### 2. 一键启动服务 ### 2.3 离线镜像准备
- 步骤1在**在线服务器**执行以下脚本,会拉取和构建所需的 Docker 镜像:
``` bash
cd {$PROJECT_ROOT}/argus/src/metric/tests
bash scripts/01_start_services.sh
bash scripts/save-images.sh
```
- 步骤2镜像将被保存到 metric.tests.images-cache 目录中,用于离线迁移和后续导入。
- 步骤3若目标服务器无法联网可将该目录拷贝到离线服务器并执行
``` bash
cd {$PROJECT_ROOT}/argus/src/metric/tests
bash scripts/load-images.sh
```
- 即可导入镜像并执行下面的QuickStart或分步操作。
执行以下命令完成环境初始化、镜像构建与服务启动: ## 3. QuickStart
执行完整的端到端测试流程:
```bash ```bash
sudo bash start-all.sh bash scripts/00_e2e_test.sh
``` ```
该脚本将自动完成: 该脚本将自动执行以下步骤:
- 初始化目录结构(如 /private/argus/metric 1. 启动所有服务Prometheus、FTP、Grafana、测试节点
- 构建各服务 Docker 镜像 2. 发布安装包到 FTP 服务
- 启动 Prometheus、FTP、Grafana 容器 3. 在 CPU 测试节点上安装客户端
4. 在 GPU 测试节点上安装客户端
5. 验证安装结果
6. 清理测试环境
### 3. 检查容器日志 ## 4. 分步执行
可手动验证容器运行状态: | 步骤 | 脚本 | 功能描述 |
|--------------|-------------------------------------------|--------------------------------------------------------|
| 启动基础服务 | bash scripts/01_start_services.sh | 构建 Docker 镜像、创建持久化目录、启动容器服务 |
| 发布安装包 | bash scripts/02_publish_artifact.sh | 自动递增版本号、打包安装制品、发布到 FTP |
| CPU 节点安装 | bash scripts/03_test_node_install.sh | 在 CPU 节点下载安装程序并执行安装 |
| GPU 节点安装 | bash scripts/04_test_gpu_node_install.sh | 在 GPU 节点下载安装程序并执行安装 |
| 验证安装 | bash scripts/05_verify_install.sh | 检查监控端口、端口连通性及服务可用性 |
| 清理环境 | bash scripts/06_cleanup.sh | 停止并清理所有测试容器及环境 |
``` bash ## 5. 查看监控采集数据及展示面板
docker logs argus-metric-ftp
docker logs argus-metric-grafana
docker logs argus-metric-prometheus
```
如日志输出中无 ERROR 或 supervisor 报错信息,则表示服务启动正常。
## 客户端安装包打包与分发
> **前置说明**:完整的 `all-in-one` 安装包打包分发框架因包含大量二进制文件和依赖包,无法上传至 Git 仓库。请先联系项目管理员获取最新的 `all-in-one` 完整框架,再执行后续操作。
打包后服务端会将安装包发布至 FTP 共享目录,默认路径为:
``` bash
$DATA_ROOT/ftp/share
```
发布后的文件权限与 FTP 目录账户保持一致。
### 1. 递增版本号
``` bash
bash scripts/version-manager.sh bump minor
```
该脚本会自动更新版本号(如 1.101.0 → 1.102.0)。
### 2. 打包安装制品
``` bash
bash scripts/package_artifact.sh
```
执行后会在输出目录中生成压缩包或安装脚本。
### 3. 发布制品至 FTP
``` bash
sudo bash scripts/publish_artifact.sh $VERSION --output-dir $OUTPUT_DIR --owner $UID:$GID
```
参数说明:
参数 说明
$VERSION 发布版本号(如 1.102.0
$OUTPUT_DIR 输出目录(默认 /private/argus/ftp/share
$UID:$GID 文件属主用户ID:组ID
示例:
``` bash
sudo bash scripts/publish_artifact.sh 1.102.0 --output-dir /private/argus/ftp/share --owner 2133:2015
```
更多详情可参考 client-plugins/all-in-one/README.md。
## 客户端安装(通过 FTP
客户端下载与安装步骤如下:
``` bash
curl -u ${USER}:${PASSWD} ftp://${FTP_SERVER}/setup.sh -o setup.sh
chmod +x setup.sh
sudo bash setup.sh --server ${FTP_SERVER} --user ${USER} --password ${PASSWD} --port ${PORT}
```
参数说明:
参数 说明
$FTP_SERVER 服务器 IP 地址
$USER 默认 ftpuser
$PASSWD 默认 ZGClab1234!
$PORT FTP 服务端口(需与 .env 保持一致)
示例:
``` bash
curl -u ftpuser:ZGClab1234! ftp://10.211.55.4/setup.sh -o setup.sh
chmod +x setup.sh
sudo bash setup.sh --server 10.211.55.4 --user ftpuser --password 'ZGClab1234!' --port 2122
```
更多细节可参考 client-plugins/all-in-one/README.md。
## 模拟 Argus-Master 配置下发
可通过手动写入 nodes.json 文件模拟 Argus-Master 对 Argus-Metric 的配置下发:
``` json
[
{
"node_id": "A1",
"user_id": "sundapeng",
"ip": "10.211.55.4",
"hostname": "dev-sundapeng-nsche-wohen-pod-0",
"labels": ["label-a", "label-b"]
}
]
```
路径:
``` bash
${DATA_ROOT}/prometheus/nodes.json
```
Argus-Metric 中的 prometheus 模块会自动解析该文件,并将其拆分生成目标配置:
``` bash
${DATA_ROOT}/prometheus/targets/
```
## Grafana 手动配置(如未自动接入 Prometheus
如 Grafana 未自动导入 Prometheus 数据源,可手动执行以下操作:
1. 添加数据源
- 进入 Grafana → Data sources
- 选择 Add data source → Prometheus
- URL 填写http://prometheus:9090(Docker 内部 DNS 地址)
2. 导入测试 Dashboard
- 打开 Grafana → Dashboards → Import
- 上传或粘贴 test_grafana_dashboard.json
## 查看监控数据
Prometheus 访问以下地址查看节点活性: Prometheus 访问以下地址查看节点活性:
``` bash ``` bash
http://127.0.0.1:9091/targets http://127.0.0.1:9090/targets
``` ```
Grafana 访问以下地址查看监控大屏: Grafana 访问以下地址查看监控大屏:
``` bash ``` bash
http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics
``` ```
PS: 如果 Grafana 未自动导入 Prometheus 数据源,可手动执行以下操作:
1. 添加数据源
- 进入 Grafana → Data sources
- 选择 Add data source → Prometheus
- URL 填写http://prom.metric.argus.com:9090
2. 导入测试 Dashboard
- 打开 Grafana → Dashboards → Import
- 上传或粘贴 test_grafana_dashboard.json

View File

@ -0,0 +1,39 @@
# GPU test-node image: based on the official NVIDIA CUDA runtime image so
# that nvidia-smi / CUDA libraries are available when the container is run
# with the NVIDIA runtime.
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
# Container timezone (applied below via zoneinfo symlink)
ENV TZ=Asia/Shanghai
RUN apt-get update -qq && \
apt-get install -y -qq \
tzdata \
curl \
wget \
gnupg2 \
software-properties-common \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Apply the timezone selected by $TZ
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
WORKDIR /app
# Startup script that verifies GPU availability at runtime; nvidia-smi is
# only usable when the container is started with GPU support enabled
# (e.g. `runtime: nvidia` / `--gpus all`), so the check happens here and
# not at build time.
COPY <<EOF /app/start.sh
#!/bin/bash
echo "检查GPU环境..."
if command -v nvidia-smi &> /dev/null; then
nvidia-smi
echo "GPU环境正常"
else
echo "警告: nvidia-smi 命令不可用请确保容器运行时启用了GPU支持"
fi
exec "\$@"
EOF
RUN chmod +x /app/start.sh
CMD ["/app/start.sh", "/bin/bash"]

View File

@ -0,0 +1,6 @@
# CPU test-node image: minimal Ubuntu base with timezone data only; the
# metric client is installed at test time via the FTP-served setup.sh.
FROM ubuntu:22.04
RUN apt-get update -qq && \
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq tzdata && \
rm -rf /var/lib/apt/lists/*
# Container timezone (tzdata installed above)
ENV TZ=Asia/Shanghai

View File

@ -1,29 +1,39 @@
networks:
default:
name: argus-debug-net
external: true
services: services:
ftp: ftp:
build: build:
context: ../ftp/build context: ../ftp/build
dockerfile: Dockerfile dockerfile: Dockerfile
args: args:
FTP_UID: ${FTP_UID:-2133} ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
FTP_GID: ${FTP_GID:-2015} ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
USE_INTRANET: ${USE_INTRANET:-false}
image: argus-metric-ftp:latest image: argus-metric-ftp:latest
container_name: argus-ftp container_name: argus-ftp
restart: unless-stopped restart: unless-stopped
environment: environment:
- TZ=Asia/Shanghai
- FTP_BASE_PATH=/private/argus/ftp - FTP_BASE_PATH=/private/argus/ftp
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- DOMAIN=${FTP_DOMAIN:-prom.ftp.argus.com} - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- FTP_UID=${FTP_UID:-2133} - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- FTP_GID=${FTP_GID:-2015} - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports: ports:
- "${FTP_PORT:-21}:21" - "${FTP_PORT:-21}:21"
- "${FTP_DATA_PORT:-20}:20" - "${FTP_DATA_PORT:-20}:20"
- "21100-21110:21100-21110" - "21100-21110:21100-21110"
volumes: volumes:
- ${DATA_ROOT:-./data}/ftp:/private/argus/ftp - ${DATA_ROOT:-/private}/argus/metric/ftp:/private/argus/ftp
- ${DATA_ROOT:-./data}/etc:/private/argus/etc - ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks: networks:
- argus-network default:
ipv4_address: 172.30.0.40
logging: logging:
driver: "json-file" driver: "json-file"
options: options:
@ -35,23 +45,27 @@ services:
context: ../prometheus/build context: ../prometheus/build
dockerfile: Dockerfile dockerfile: Dockerfile
args: args:
PROMETHEUS_UID: ${PROMETHEUS_UID:-2133} ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
PROMETHEUS_GID: ${PROMETHEUS_GID:-2015} ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
USE_INTRANET: ${USE_INTRANET:-false} USE_INTRANET: ${USE_INTRANET:-false}
image: argus-metric-prometheus:latest image: argus-metric-prometheus:latest
container_name: argus-prometheus container_name: argus-prometheus
restart: unless-stopped restart: unless-stopped
environment: environment:
- TZ=Asia/Shanghai
- PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
- PROMETHEUS_UID=${PROMETHEUS_UID:-2133} - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- PROMETHEUS_GID=${PROMETHEUS_GID:-2015} - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports: ports:
- "${PROMETHEUS_PORT:-9090}:9090" - "${PROMETHEUS_PORT:-9090}:9090"
volumes: volumes:
- ${DATA_ROOT:-./data}/prometheus:/private/argus/metric/prometheus - ${DATA_ROOT:-/private}/argus/metric/prometheus:/private/argus/metric/prometheus
- ${DATA_ROOT:-./data}/etc:/private/argus/etc - ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks: networks:
- argus-network default:
ipv4_address: 172.30.0.41
logging: logging:
driver: "json-file" driver: "json-file"
options: options:
@ -63,25 +77,29 @@ services:
context: ../grafana/build context: ../grafana/build
dockerfile: Dockerfile dockerfile: Dockerfile
args: args:
GRAFANA_UID: ${GRAFANA_UID:-2133} ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
GRAFANA_GID: ${GRAFANA_GID:-2015} ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
image: argus-metric-grafana:latest image: argus-metric-grafana:latest
container_name: argus-grafana container_name: argus-grafana
restart: unless-stopped restart: unless-stopped
environment: environment:
- TZ=Asia/Shanghai
- GRAFANA_BASE_PATH=/private/argus/metric/grafana - GRAFANA_BASE_PATH=/private/argus/metric/grafana
- GRAFANA_UID=${GRAFANA_UID:-2133} - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- GRAFANA_GID=${GRAFANA_GID:-2015} - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- GF_SERVER_HTTP_PORT=3000 - GF_SERVER_HTTP_PORT=3000
- GF_LOG_LEVEL=warn - GF_LOG_LEVEL=warn
- GF_LOG_MODE=console - GF_LOG_MODE=console
ports: ports:
- "${GRAFANA_PORT:-3000}:3000" - "${GRAFANA_PORT:-3000}:3000"
volumes: volumes:
- ${DATA_ROOT:-./data}/grafana:/private/argus/metric/grafana - ${DATA_ROOT:-/private}/argus/metric/grafana:/private/argus/metric/grafana
- ${DATA_ROOT:-./data}/etc:/private/argus/etc - ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks: networks:
- argus-network default:
ipv4_address: 172.30.0.42
depends_on: depends_on:
- prometheus - prometheus
logging: logging:
@ -90,16 +108,78 @@ services:
max-size: "10m" max-size: "10m"
max-file: "3" max-file: "3"
networks: test-node:
argus-network: build:
driver: bridge context: ./client-test-node/build
name: argus-network dockerfile: Dockerfile
image: argus-metric-test-node:latest
container_name: argus-metric-test-node
hostname: test-metric-node-001
restart: unless-stopped
privileged: true
depends_on:
- ftp
- prometheus
environment:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- FTP_SERVER=${FTP_SERVER:-172.30.0.40}
- FTP_USER=${FTP_USER:-ftpuser}
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- FTP_PORT=${FTP_PORT:-21}
volumes: volumes:
ftp_data: - ${DATA_ROOT:-/private}/argus/agent:/private/argus/agent
driver: local - /etc/localtime:/etc/localtime:ro
prometheus_data: - /etc/timezone:/etc/timezone:ro
driver: local command: sleep infinity
grafana_data: networks:
driver: local default:
ipv4_address: 172.30.0.50
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
test-gpu-node:
build:
context: ./client-test-gpu-node/build
dockerfile: Dockerfile
image: argus-metric-test-gpu-node:latest
container_name: argus-metric-test-gpu-node
hostname: test-metric-gpu-node-001
restart: unless-stopped
privileged: true
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities:
- gpu
depends_on:
- ftp
- prometheus
environment:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
- GPU_MODE=gpu
volumes:
- ${DATA_ROOT:-/private}/argus/agent:/private/argus/agent
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
command: sleep infinity
networks:
default:
ipv4_address: 172.30.0.51
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"

View File

@ -1,19 +1,15 @@
# 用户和组配置 # 统一用户和组配置
FTP_UID=2133 ARGUS_BUILD_UID=1048
FTP_GID=2015 ARGUS_BUILD_GID=1048
PROMETHEUS_UID=2133
PROMETHEUS_GID=2015
GRAFANA_UID=2133
GRAFANA_GID=2015
# 数据根目录 # 数据根目录
DATA_ROOT=/private/argus DATA_ROOT=/private
# FTP 配置 # FTP 配置
FTP_PORT=2122 FTP_PORT=21
FTP_DATA_PORT=2022 FTP_DATA_PORT=20
FTP_PASSWORD=ZGClab1234! FTP_PASSWORD=ZGClab1234!
FTP_DOMAIN=prom.ftp.argus.com FTP_DOMAIN=ftp.metric.argus.com
# Prometheus 配置 # Prometheus 配置
PROMETHEUS_PORT=9090 PROMETHEUS_PORT=9090

View File

@ -1,90 +0,0 @@
#!/bin/bash
# 初始化目录脚本
# 用于创建所有必要的数据目录并设置正确的权限
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# 加载 .env 文件(如果存在)
if [ -f .env ]; then
echo "加载 .env 配置文件..."
source .env
fi
# 默认配置
FTP_UID=${FTP_UID:-2133}
FTP_GID=${FTP_GID:-2015}
PROMETHEUS_UID=${PROMETHEUS_UID:-2133}
PROMETHEUS_GID=${PROMETHEUS_GID:-2015}
GRAFANA_UID=${GRAFANA_UID:-2133}
GRAFANA_GID=${GRAFANA_GID:-2015}
DATA_ROOT=${DATA_ROOT:-./data}
echo "开始初始化目录结构..."
echo "数据目录: ${DATA_ROOT}"
echo ""
# 创建 FTP 目录
echo "创建 FTP 目录..."
sudo mkdir -p ${DATA_ROOT}/ftp/share
sudo chown -R ${FTP_UID}:${FTP_GID} ${DATA_ROOT}/ftp
sudo chmod -R 755 ${DATA_ROOT}/ftp
# 创建 Prometheus 目录
echo "创建 Prometheus 目录..."
sudo mkdir -p ${DATA_ROOT}/prometheus/{data,rules,targets}
# 创建默认的 targets 文件(先创建文件再改权限)
if [ ! -f "${DATA_ROOT}/prometheus/targets/node_exporter.json" ]; then
echo "创建默认 node_exporter targets..."
echo '[
{
"targets": [],
"labels": {
"job": "node"
}
}
]' | sudo tee ${DATA_ROOT}/prometheus/targets/node_exporter.json > /dev/null
fi
if [ ! -f "${DATA_ROOT}/prometheus/targets/dcgm_exporter.json" ]; then
echo "创建默认 dcgm_exporter targets..."
echo '[
{
"targets": [],
"labels": {
"job": "dcgm"
}
}
]' | sudo tee ${DATA_ROOT}/prometheus/targets/dcgm_exporter.json > /dev/null
fi
# 统一设置 Prometheus 目录权限
sudo chown -R ${PROMETHEUS_UID}:${PROMETHEUS_GID} ${DATA_ROOT}/prometheus
sudo chmod -R 755 ${DATA_ROOT}/prometheus
# 创建 Grafana 目录
echo "创建 Grafana 目录..."
sudo mkdir -p ${DATA_ROOT}/grafana/{data,logs,plugins,provisioning/datasources,provisioning/dashboards,data/sessions,data/dashboards,config}
sudo chown -R ${GRAFANA_UID}:${GRAFANA_GID} ${DATA_ROOT}/grafana
sudo chmod -R 755 ${DATA_ROOT}/grafana
# 创建公共配置目录
sudo mkdir -p ${DATA_ROOT}/etc
sudo chown -R ${FTP_UID}:${FTP_GID} ${DATA_ROOT}/etc
sudo chmod -R 755 ${DATA_ROOT}/etc
echo "目录初始化完成!"
echo ""
echo "目录结构:"
echo " ${DATA_ROOT}/"
echo " ├── ftp/ (UID:${FTP_UID}, GID:${FTP_GID})"
echo " ├── prometheus/ (UID:${PROMETHEUS_UID}, GID:${PROMETHEUS_GID})"
echo " ├── grafana/ (UID:${GRAFANA_UID}, GID:${GRAFANA_GID})"
echo " └── etc/ (UID:${FTP_UID}, GID:${FTP_GID})"
echo ""
echo "您现在可以运行 'docker-compose up -d' 来启动所有服务"

View File

@ -0,0 +1,20 @@
#!/bin/bash
# E2E orchestrator for the Argus Metric module: runs every test stage in
# order. `set -e` aborts the whole run as soon as any stage fails.
set -e
# Resolve an absolute path so the stage scripts are found regardless of
# the caller's working directory (consistent with the other 0x_*.sh
# scripts, which all use the cd+pwd idiom).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
echo "=========================================="
echo "Argus Metric E2E Test"
echo "=========================================="
bash "$SCRIPT_DIR/01_start_services.sh"      # build images + start services
bash "$SCRIPT_DIR/02_publish_artifact.sh"    # bump version + publish to FTP
bash "$SCRIPT_DIR/03_test_node_install.sh"   # install client on CPU node
bash "$SCRIPT_DIR/04_test_gpu_node_install.sh"  # install client on GPU node
bash "$SCRIPT_DIR/05_verify_install.sh"      # verify exporter ports
bash "$SCRIPT_DIR/06_cleanup.sh"             # tear everything down
echo "=========================================="
echo "E2E 测试完成"
echo "=========================================="

View File

@ -0,0 +1,27 @@
#!/bin/bash
# Stage 01: build images (optionally forced with --rebuild/-r), start all
# services via common/start-all.sh, then verify every expected container
# is actually running.
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Parse arguments
REBUILD_FLAG=""
if [[ "${1:-}" == "--rebuild" || "${1:-}" == "-r" ]]; then
  REBUILD_FLAG="--rebuild"
  echo "[01] 启用强制重新构建模式"
fi
echo "[01] 启动所有服务..."
# Intentionally unquoted: an empty flag must expand to no argument at all.
bash "$SCRIPT_DIR/common/start-all.sh" $REBUILD_FLAG
echo "[01] 等待服务就绪..."
sleep 5
echo "[01] 检查服务状态..."
# Match container names exactly. `docker ps | grep name` can false-match
# on the image column (e.g. image "argus-metric-ftp" vs container
# "argus-ftp") or on name substrings; `--format '{{.Names}}'` plus
# `grep -qx` checks the running container name precisely.
for container in argus-ftp argus-prometheus argus-grafana \
                 argus-metric-test-node argus-metric-test-gpu-node; do
  if docker ps --format '{{.Names}}' | grep -qx "$container"; then
    echo "[01] 容器运行中: $container"
  else
    echo "[01] 错误: 容器未运行: $container" >&2
    exit 1
  fi
done
echo "[01] 基础服务已启动"

View File

@ -0,0 +1,60 @@
#!/bin/bash
# Stage 02: bump the artifact version, build the all-in-one-full install
# package, and publish it to the FTP share with correct ownership and
# permissions.
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
PLUGIN_DIR="$(cd "$SCRIPT_DIR/../../client-plugins/all-in-one-full" && pwd)"
# Load .env (optional overrides: DATA_ROOT, ARGUS_BUILD_UID/GID, ...)
if [ -f "$TEST_DIR/.env" ]; then
  source "$TEST_DIR/.env"
fi
# Prefer the live container's actual bind-mount path for /private/argus/ftp;
# fall back to the configured DATA_ROOT when the container is not running.
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
  FTP_MOUNT=$(docker inspect argus-ftp --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}')
  OUTPUT_DIR="${FTP_MOUNT}/share"
  echo "[02] 容器挂载: $OUTPUT_DIR"
else
  OUTPUT_DIR="${DATA_ROOT:-$TEST_DIR/data}/ftp/share"
  echo "[02] 默认路径: $OUTPUT_DIR"
fi
OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}"
cd "$PLUGIN_DIR"
echo "[02] 递增版本号..."
bash scripts/version-manager.sh bump minor
VERSION_FILE="config/VERSION"
if [ ! -f "$VERSION_FILE" ]; then
  echo "[02] 错误: 未找到 $VERSION_FILE"
  exit 1
fi
# Read the version directly (no useless `cat`); fail fast if it is empty
# so we never publish an unnamed artifact.
VERSION=$(tr -d '[:space:]' < "$VERSION_FILE")
if [ -z "$VERSION" ]; then
  echo "[02] 错误: $VERSION_FILE 为空"
  exit 1
fi
echo "[02] 新版本: $VERSION"
echo "[02] 构建安装包..."
bash scripts/package_artifact.sh --force
echo "[02] 发布到 FTP: $OUTPUT_DIR"
sudo bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER"
echo "[02] 设置文件权限..."
# Ownership must match the FTP account so clients can download the files.
sudo chown -R "$OWNER" "$OUTPUT_DIR"
# Batch chmod with `-exec ... +` (one process per batch, not per file):
# directories 755, regular files 644, shell scripts executable 755.
sudo find "$OUTPUT_DIR" -type d -exec chmod 755 {} +
sudo find "$OUTPUT_DIR" -type f -exec chmod 644 {} +
sudo find "$OUTPUT_DIR" -type f -name "*.sh" -exec chmod 755 {} +
echo "[02] 权限设置完成 (UID:GID=$OWNER, dirs=755, files=644, scripts=755)"
echo "[02] 发布完成,验证文件..."
ls -lh "$OUTPUT_DIR"
echo "[02] 完成"

View File

@ -0,0 +1,33 @@
#!/bin/bash
# Stage 03: install the metric client inside the CPU test node by
# downloading setup.sh from the FTP service and executing it.
set -e
FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"
FTP_HOST="${FTP_SERVER}"
echo "[03] 进入测试节点执行安装..."
echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
# NOTE: the script body below is double-quoted, so all ${...} expansions
# are resolved on the HOST before the script runs inside the container.
docker exec argus-metric-test-node bash -c "
set -e
if ! command -v curl &>/dev/null; then
echo '[03] curl 未安装,正在安装...'
apt-get update && apt-get install -y curl
fi
cd /tmp
echo '[03] 下载 setup.sh...'
# Quote the credentials so special characters in the password (e.g. '!')
# survive word-splitting/globbing in the container's shell.
curl -u '${FTP_USER}:${FTP_PASSWORD}' ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
echo '[03] 执行安装...'
chmod +x setup.sh
bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
echo '[03] 安装完成'
"
echo "[03] 完成"

View File

@ -0,0 +1,33 @@
#!/bin/bash
# Stage 04: install the metric client inside the GPU test node by
# downloading setup.sh from the FTP service and executing it.
# (Log prefix fixed from the copy-pasted "[03]" to "[04]" — this is the
# GPU stage, not the CPU stage.)
set -e
FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"
FTP_HOST="${FTP_SERVER}"
echo "[04] 进入测试节点执行安装..."
echo "[04] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
# NOTE: the script body below is double-quoted, so all ${...} expansions
# are resolved on the HOST before the script runs inside the container.
docker exec argus-metric-test-gpu-node bash -c "
set -e
if ! command -v curl &>/dev/null; then
echo '[04] curl 未安装,正在安装...'
apt-get update && apt-get install -y curl
fi
cd /tmp
echo '[04] 下载 setup.sh...'
# Quote the credentials so special characters in the password (e.g. '!')
# survive word-splitting/globbing in the container's shell.
curl -u '${FTP_USER}:${FTP_PASSWORD}' ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
echo '[04] 执行安装...'
chmod +x setup.sh
bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
echo '[04] 安装完成'
"
echo "[04] 完成"

View File

@ -0,0 +1,96 @@
#!/bin/bash
# Verify the client installation by checking that the exporter ports
# (9100 node_exporter, 9400 dcgm_exporter, 2020 fluent-bit) are listening
# and reachable inside the CPU test node.
set -e
echo "[04] 验证安装结果 - 检查监控端口..."
echo "=========================================="
# Ensure the test-node container is actually running before probing it.
if ! docker ps --format '{{.Names}}' | grep -q '^argus-metric-test-node$'; then
echo "错误: 容器 argus-metric-test-node 未运行"
exit 1
fi
ERRORS=0
# ==================== [1] Check listening ports ====================
echo ""
echo "[1] 检查监听端口..."
echo "----------------------------------------"
# Probe with whichever tool exists in the container (netstat > ss > lsof);
# inner exit codes: 0 = ports found or no tool available, 1 = ports missing.
# Output is captured so the failure marker can be grepped afterwards.
CHECK_RESULT=$(docker exec argus-metric-test-node bash -c '
if command -v netstat >/dev/null 2>&1; then
echo "使用 netstat 检查端口:"
if netstat -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
echo "✓ 找到监控端口"
exit 0
else
echo "✗ 未找到监控端口 (9100/9400/2020)"
exit 1
fi
elif command -v ss >/dev/null 2>&1; then
echo "使用 ss 检查端口:"
if ss -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
echo "✓ 找到监控端口"
exit 0
else
echo "✗ 未找到监控端口 (9100/9400/2020)"
exit 1
fi
elif command -v lsof >/dev/null 2>&1; then
echo "使用 lsof 检查端口:"
if lsof -i :9100 -i :9400 -i :2020 2>/dev/null | grep LISTEN; then
echo "✓ 找到监控端口"
exit 0
else
echo "✗ 未找到监控端口 (9100/9400/2020)"
exit 1
fi
else
echo "? 没有可用的端口检查工具 (netstat/ss/lsof),跳过此检查"
exit 0
fi
')
echo "$CHECK_RESULT"
# Only an explicit failure marker counts as an error (inner exit 1);
# "no tool available" exits 0 and is not counted as a failure.
if echo "$CHECK_RESULT" | grep -q "✗ 未找到监控端口"; then
ERRORS=$((ERRORS + 1))
fi
# ==================== [2] Port connectivity ====================
echo ""
echo "[2] 测试端口连通性..."
echo "----------------------------------------"
# Try /metrics first, then the root path; the inner script's exit status
# is the number of unreachable ports, folded into ERRORS on failure.
docker exec argus-metric-test-node bash -c '
if command -v curl >/dev/null 2>&1; then
FAILED=0
for port in 9100 9400 2020; do
echo -n "端口 $port: "
if curl -s --connect-timeout 2 "http://localhost:$port/metrics" > /dev/null 2>&1; then
echo "✓ 可访问 (/metrics)"
elif curl -s --connect-timeout 2 "http://localhost:$port/" > /dev/null 2>&1; then
echo "✓ 可访问 (根路径)"
else
echo "✗ 不可访问"
FAILED=$((FAILED + 1))
fi
done
exit $FAILED
else
echo "? curl 不可用,跳过连通性测试"
exit 0
fi
' || ERRORS=$((ERRORS + 1))
echo ""
echo "=========================================="
# Summarize: exit non-zero (fails the E2E run) if any check failed.
if [ $ERRORS -eq 0 ]; then
echo "✓ [04] 验证完成 - 所有端口检查通过"
else
echo "✗ [04] 验证失败 - 发现 $ERRORS 个问题"
echo ""
echo "调试建议:"
echo " 1. 进入容器检查: docker exec -it argus-metric-test-node bash"
echo " 2. 查看进程: docker exec argus-metric-test-node ps aux"
echo " 3. 查看日志: docker exec argus-metric-test-node cat /tmp/argus_install.log"
exit 1
fi
echo "=========================================="

View File

@ -0,0 +1,11 @@
#!/bin/bash
# [05] Tear down the e2e test environment by delegating to common/stop-all.sh.
set -e

# Resolve the directory containing this script, independent of the caller's CWD.
script_root="$(cd -- "$(dirname -- "$0")" && pwd)"

echo "[05] 清理环境..."
# Best-effort cleanup: never fail the pipeline if the stop script errors out.
bash "$script_root/common/stop-all.sh" || true
echo "[05] 清理完成"

View File

@ -6,7 +6,8 @@
set -e set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR" TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"
echo "==========================================" echo "=========================================="
echo " 路径检查脚本" echo " 路径检查脚本"
@ -18,15 +19,15 @@ echo ""
# 检查配置文件 # 检查配置文件
echo "检查配置文件..." echo "检查配置文件..."
if [ -f "$SCRIPT_DIR/docker-compose.yml" ]; then if [ -f "$TEST_DIR/docker-compose.yml" ]; then
echo " ✓ docker-compose.yml 存在" echo " ✓ docker-compose.yml 存在"
else else
echo " ✗ docker-compose.yml 不存在" echo " ✗ docker-compose.yml 不存在"
fi fi
if [ -f "$SCRIPT_DIR/.env" ]; then if [ -f "$TEST_DIR/.env" ]; then
echo " ✓ .env 存在" echo " ✓ .env 存在"
elif [ -f "$SCRIPT_DIR/env.example" ]; then elif [ -f "$TEST_DIR/env.example" ]; then
echo " ⚠ .env 不存在,但 env.example 存在" echo " ⚠ .env 不存在,但 env.example 存在"
else else
echo " ✗ .env 和 env.example 都不存在" echo " ✗ .env 和 env.example 都不存在"

View File

@ -0,0 +1,61 @@
#!/bin/bash
# 初始化目录脚本
# Creates the host-side data directory tree used by the e2e metric stack
# (FTP, Prometheus, Grafana) and chowns it to the build UID/GID.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"

# Load .env overrides if present (may define ARGUS_BUILD_UID/GID, DATA_ROOT).
if [ -f .env ]; then
    echo "加载 .env 配置文件..."
    source .env
fi

# Defaults when not supplied by the environment or .env.
ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
DATA_ROOT=${DATA_ROOT:-/private}

echo "开始初始化目录结构..."
echo "数据根目录: ${DATA_ROOT}"
echo "统一 UID: ${ARGUS_BUILD_UID}"
echo "统一 GID: ${ARGUS_BUILD_GID}"

# ${DATA_ROOT} is quoted everywhere so a root path containing spaces cannot
# word-split; brace expansion stays outside the quotes so it still expands.
echo "创建基础目录结构..."
sudo mkdir -p "${DATA_ROOT}/argus/metric"
sudo mkdir -p "${DATA_ROOT}/argus/etc"
sudo mkdir -p "${DATA_ROOT}/argus/agent"

# FTP shared upload/download area.
echo "创建 FTP 目录..."
sudo mkdir -p "${DATA_ROOT}/argus/metric/ftp/share"

# Prometheus TSDB data, alert rules and file-sd targets.
echo "创建 Prometheus 目录..."
sudo mkdir -p "${DATA_ROOT}/argus/metric/prometheus"/{data,rules,targets}

# Grafana data, logs, plugins and provisioning layout.
echo "创建 Grafana 目录..."
sudo mkdir -p "${DATA_ROOT}/argus/metric/grafana"/{data,logs,plugins,provisioning/datasources,provisioning/dashboards,data/sessions,data/dashboards,config}

# Apply ownership and permissions in a single pass over the metric tree.
echo "设置目录权限..."
sudo chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "${DATA_ROOT}/argus/metric"
sudo chmod -R 755 "${DATA_ROOT}/argus/metric"

echo "目录初始化完成!"
echo ""
echo "目录结构:"
echo "  ${DATA_ROOT}/"
echo "  ├── argus/ (UID:${ARGUS_BUILD_UID}, GID:${ARGUS_BUILD_GID})"
echo "  │   ├── metric/"
echo "  │   │   ├── ftp/"
echo "  │   │   ├── prometheus/"
echo "  │   │   └── grafana/"
echo ""
echo "您现在可以运行 'docker-compose up -d' 来启动所有服务"

View File

@ -6,7 +6,8 @@
set -e set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR" TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"
# 检测 docker-compose 命令 # 检测 docker-compose 命令
if command -v docker-compose &> /dev/null; then if command -v docker-compose &> /dev/null; then
@ -19,7 +20,7 @@ else
fi fi
# 镜像缓存目录 # 镜像缓存目录
IMAGE_CACHE_DIR="./images-cache" IMAGE_CACHE_DIR="$TEST_DIR/images-cache"
mkdir -p "$IMAGE_CACHE_DIR" mkdir -p "$IMAGE_CACHE_DIR"
# 定义镜像列表 # 定义镜像列表

View File

@ -6,13 +6,20 @@
set -e set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR" TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"
# 解析参数
FORCE_REBUILD=false
if [[ "$1" == "--rebuild" ]]; then
FORCE_REBUILD=true
fi
echo "==========================================" echo "=========================================="
echo " Argus Metrics 一键启动脚本" echo " Argus Metrics 一键启动脚本"
echo "==========================================" echo "=========================================="
echo "" echo ""
echo "当前工作目录: $SCRIPT_DIR" echo "当前工作目录: $TEST_DIR"
echo "" echo ""
# 检查 Docker 和 Docker Compose # 检查 Docker 和 Docker Compose
@ -21,19 +28,13 @@ if ! command -v docker &> /dev/null; then
exit 1 exit 1
fi fi
# 检测 docker-compose 命令(兼容新旧版本) # 检查 docker compose 命令
COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml" if ! docker compose version &> /dev/null 2>&1; then
if command -v docker-compose &> /dev/null; then echo "错误: 未找到 docker compose 命令,请确保 Docker Compose V2 已安装"
DOCKER_COMPOSE="docker-compose -f $COMPOSE_FILE"
echo "使用: docker-compose"
elif docker compose version &> /dev/null 2>&1; then
DOCKER_COMPOSE="docker compose -f $COMPOSE_FILE"
echo "使用: docker compose"
else
echo "错误: 未找到 docker-compose 或 docker compose 命令"
exit 1 exit 1
fi fi
echo "Compose 文件: $COMPOSE_FILE" echo "使用: docker compose"
echo "Compose 文件: $TEST_DIR/docker-compose.yml"
echo "" echo ""
# 检查必要的构建目录 # 检查必要的构建目录
@ -42,6 +43,8 @@ BUILD_DIRS=(
"../ftp/build" "../ftp/build"
"../prometheus/build" "../prometheus/build"
"../grafana/build" "../grafana/build"
"client-test-node/build"
"client-test-gpu-node/build"
) )
for dir in "${BUILD_DIRS[@]}"; do for dir in "${BUILD_DIRS[@]}"; do
@ -65,6 +68,18 @@ fi
# 加载环境变量 # 加载环境变量
source .env source .env
# 检查并创建 Docker 网络
echo "检查 Docker 网络..."
NETWORK_NAME="argus-debug-net"
if docker network inspect "$NETWORK_NAME" >/dev/null 2>&1; then
echo "网络 $NETWORK_NAME 已存在"
else
echo "创建网络 $NETWORK_NAME..."
docker network create --driver bridge --subnet 172.30.0.0/16 "$NETWORK_NAME"
echo "网络创建成功"
fi
echo ""
echo "1. 初始化目录结构..." echo "1. 初始化目录结构..."
bash "$SCRIPT_DIR/init-directories.sh" bash "$SCRIPT_DIR/init-directories.sh"
@ -72,8 +87,8 @@ echo ""
echo "2. 准备 Docker 镜像..." echo "2. 准备 Docker 镜像..."
# 检查镜像是否存在 # 检查镜像是否存在
IMAGE_CACHE_DIR="./images-cache" IMAGE_CACHE_DIR="$TEST_DIR/images-cache"
IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest") IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest")
all_images_exist=true all_images_exist=true
for image in "${IMAGES[@]}"; do for image in "${IMAGES[@]}"; do
@ -83,7 +98,12 @@ for image in "${IMAGES[@]}"; do
fi fi
done done
if $all_images_exist; then if $FORCE_REBUILD; then
echo "强制重新构建镜像(--rebuild 模式)..."
cd "$TEST_DIR"
docker compose build --no-cache
echo "镜像重新构建完成"
elif $all_images_exist; then
echo "所有镜像已存在,跳过构建" echo "所有镜像已存在,跳过构建"
else else
echo "检测到缺失镜像,尝试从缓存加载..." echo "检测到缺失镜像,尝试从缓存加载..."
@ -104,6 +124,12 @@ else
"argus-metric-grafana:latest") "argus-metric-grafana:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar" cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar"
;; ;;
"argus-metric-test-node:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-test-node.tar"
;;
"argus-metric-test-gpu-node:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar"
;;
esac esac
if [ -f "$cache_file" ]; then if [ -f "$cache_file" ]; then
@ -128,8 +154,8 @@ else
echo "" echo ""
echo "部分镜像缺失,开始构建..." echo "部分镜像缺失,开始构建..."
echo "工作目录: $(pwd)" echo "工作目录: $(pwd)"
cd "$SCRIPT_DIR" cd "$TEST_DIR"
$DOCKER_COMPOSE build docker compose build --no-cache
# 询问是否保存镜像 # 询问是否保存镜像
echo "" echo ""
@ -149,6 +175,12 @@ else
"argus-metric-grafana:latest") "argus-metric-grafana:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar" docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar"
;; ;;
"argus-metric-test-node:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-test-node.tar" "$image" && echo " 已保存: argus-test-node.tar"
;;
"argus-metric-test-gpu-node:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" "$image" && echo " 已保存: argus-test-gpu-node.tar"
;;
esac esac
done done
echo "镜像已保存到: $IMAGE_CACHE_DIR/" echo "镜像已保存到: $IMAGE_CACHE_DIR/"
@ -160,40 +192,12 @@ else
fi fi
echo "" echo ""
echo "3. 启动服务..." echo "3. 启动基础服务..."
cd "$SCRIPT_DIR" cd "$TEST_DIR"
$DOCKER_COMPOSE up -d # 启动除GPU节点外的所有服务
docker compose up -d ftp prometheus grafana test-node test-gpu-node
echo "" echo ""
echo "4. 等待服务启动..." echo "4. 等待服务启动..."
sleep 5 sleep 5
echo ""
echo "5. 检查服务状态..."
cd "$SCRIPT_DIR"
$DOCKER_COMPOSE ps
echo ""
echo "=========================================="
echo " 服务启动完成!"
echo "=========================================="
echo ""
echo "服务访问地址:"
echo " - FTP: ftp://localhost:${FTP_PORT:-21}"
echo " 用户名: ftpuser"
echo " 密码: ${FTP_PASSWORD:-ZGClab1234!}"
echo ""
echo " - Prometheus: http://localhost:${PROMETHEUS_PORT:-9090}"
echo ""
echo " - Grafana: http://localhost:${GRAFANA_PORT:-3000}"
echo " 用户名: admin"
echo " 密码: admin"
echo ""
echo "常用命令:"
echo " 查看日志: $DOCKER_COMPOSE logs -f [service]"
echo " 停止服务: $DOCKER_COMPOSE stop"
echo " 重启服务: $DOCKER_COMPOSE restart"
echo " 停止并删除: $DOCKER_COMPOSE down"
echo " 停止并删除卷: $DOCKER_COMPOSE down -v"
echo ""

View File

@ -0,0 +1,50 @@
#!/bin/bash
# 停止所有服务脚本
# Stops the docker-compose stack; when run from a terminal it additionally
# offers to remove the containers and, after that, the data volumes.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"

# Require the Compose V2 plugin.
if ! docker compose version &> /dev/null 2>&1; then
    echo "错误: 未找到 docker compose 命令,请确保 Docker Compose V2 已安装"
    exit 1
fi

echo "=========================================="
echo "  停止 Argus Metrics 服务"
echo "=========================================="
echo ""
echo "使用: docker compose"
echo "Compose 文件: $TEST_DIR/docker-compose.yml"
echo ""

# Anything running? `ps -q` prints container IDs, empty string when none.
if [ "$(docker compose ps -q)" ]; then
    echo "停止所有服务..."
    docker compose stop
    echo ""
    # Prompt only when stdin is a TTY: in non-interactive runs (e.g. the
    # e2e [05] cleanup step) `read` would fail and, under `set -e`, abort
    # the whole script before printing anything useful.
    if [ -t 0 ]; then
        read -p "是否要删除容器? (y/N): " -n 1 -r
        echo
        if [[ $REPLY =~ ^[Yy]$ ]]; then
            docker compose down
            echo "容器已删除"
            read -p "是否要删除数据卷? (y/N): " -n 1 -r
            echo
            if [[ $REPLY =~ ^[Yy]$ ]]; then
                docker compose down -v
                echo "数据卷已删除"
            fi
        fi
    fi
else
    echo "没有运行的服务"
fi

echo ""
echo "完成!"

View File

@ -0,0 +1,85 @@
#!/bin/bash
# 镜像加载脚本
# 用于从 tar 文件加载 Docker 镜像
#
# Loads every *.tar archive found in the input directory (default:
# <tests>/images-cache, overridable via $1) with `docker load` and prints a
# summary; exits non-zero if any archive fails to load.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
INPUT_DIR="${1:-$TEST_DIR/images-cache}"

echo "=========================================="
echo "  Docker 镜像加载脚本"
echo "=========================================="
echo ""
echo "输入目录: $INPUT_DIR"
echo ""

# 检查输入目录是否存在
if [ ! -d "$INPUT_DIR" ]; then
    echo "错误: 目录不存在: $INPUT_DIR"
    exit 1
fi

total=0
success=0
failed=0

# NUL-delimited find + mapfile instead of word-splitting $(find ...), so
# archive paths containing spaces are handled correctly (bash 4.4+).
mapfile -d '' -t tar_files < <(find "$INPUT_DIR" -name "*.tar" -type f -print0 2>/dev/null | sort -z)

if [ ${#tar_files[@]} -eq 0 ]; then
    echo "错误: 在目录 $INPUT_DIR 中未找到任何 .tar 文件"
    exit 1
fi

echo "找到 ${#tar_files[@]} 个镜像文件:"
for tar_file in "${tar_files[@]}"; do
    echo "  - $(basename "$tar_file")"
done
echo ""

for tar_file in "${tar_files[@]}"; do
    total=$((total + 1))
    tar_filename=$(basename "$tar_file")

    echo "[$total] 处理: $tar_filename"

    # Always (re)load; do not bother checking whether the image already exists.
    echo "  加载镜像..."
    if docker load -i "$tar_file"; then
        echo "  加载成功: $tar_filename"
        success=$((success + 1))
    else
        echo "  加载失败: $tar_filename"
        failed=$((failed + 1))
    fi
    echo ""
done

echo "=========================================="
echo "  加载完成"
echo "=========================================="
echo ""
echo "统计:"
echo "  总计: $total"
echo "  成功: $success"
echo "  失败: $failed"
echo ""

# 显示当前所有镜像
echo "当前所有镜像:"
docker images
echo ""

if [ $failed -gt 0 ]; then
    echo "部分镜像加载失败,请检查!"
    exit 1
fi

if [ $success -gt 0 ]; then
    echo "镜像加载成功!"
fi

View File

@ -0,0 +1,94 @@
#!/bin/bash
# 镜像保存脚本
# 用于保存 Docker 镜像到 tar 文件,便于离线部署
#
# Saves each known stack image into the output directory (default:
# <tests>/images-cache, overridable via $1); exits non-zero if any image is
# missing or fails to save.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
OUTPUT_DIR="${1:-$TEST_DIR/images-cache}"

echo "=========================================="
echo "  Docker 镜像保存脚本"
echo "=========================================="
echo ""
echo "输出目录: $OUTPUT_DIR"
echo ""

# 创建输出目录
mkdir -p "$OUTPUT_DIR"

# 定义镜像名称(与 docker-compose.yml 保持一致)
declare -A IMAGES=(
    ["argus-metric-ftp:latest"]="argus-ftp.tar"
    ["argus-metric-prometheus:latest"]="argus-prometheus.tar"
    ["argus-metric-grafana:latest"]="argus-grafana.tar"
    ["argus-metric-test-node:latest"]="argus-test-node.tar"
    ["argus-metric-test-gpu-node:latest"]="argus-test-gpu-node.tar"
)

# Bash associative-array key order is unspecified, which made the [N]
# numbering and the summary order vary between runs; iterate a sorted key
# list instead so the output is deterministic.
mapfile -t sorted_images < <(printf '%s\n' "${!IMAGES[@]}" | sort)

# 检查镜像是否存在并保存
total=0
success=0
failed=0

for image in "${sorted_images[@]}"; do
    total=$((total + 1))
    output_file="${OUTPUT_DIR}/${IMAGES[$image]}"

    echo "[$total] 检查镜像: $image"

    if docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
        echo "  ✓ 镜像存在,开始保存..."
        if docker save -o "$output_file" "$image"; then
            file_size=$(ls -lh "$output_file" | awk '{print $5}')
            echo "  ✓ 保存成功: ${IMAGES[$image]} ($file_size)"
            success=$((success + 1))
        else
            echo "  ✗ 保存失败: $image"
            failed=$((failed + 1))
        fi
    else
        echo "  ✗ 镜像不存在,请先构建镜像"
        failed=$((failed + 1))
    fi
    echo ""
done

echo "=========================================="
echo "  保存完成"
echo "=========================================="
echo ""
echo "统计:"
echo "  总计: $total"
echo "  成功: $success"
echo "  失败: $failed"
echo ""
echo "输出目录: $OUTPUT_DIR"
echo ""

if [ $success -gt 0 ]; then
    echo "已保存的文件:"
    ls -lh "$OUTPUT_DIR"/*.tar 2>/dev/null || true
    echo ""
    echo "文件列表:"
    for image in "${sorted_images[@]}"; do
        output_file="${OUTPUT_DIR}/${IMAGES[$image]}"
        if [ -f "$output_file" ]; then
            file_size=$(ls -lh "$output_file" | awk '{print $5}')
            echo "  - ${IMAGES[$image]} ($file_size)"
        fi
    done
fi

echo ""
echo "使用说明:"
echo "1. 将 images-cache 目录复制到目标服务器的 ~/argus/src/metric/tests/ 下"
echo "2. 在目标服务器运行: bash scripts/common/start-all.sh"
echo ""

if [ $failed -gt 0 ]; then
    exit 1
fi

View File

@ -1,51 +0,0 @@
#!/bin/bash
# Stop-all-services script: stops the compose stack and interactively offers
# to remove containers and then data volumes.
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# Detect the compose command (compatible with both the legacy standalone
# `docker-compose` binary and the Compose V2 `docker compose` plugin).
COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml"
if command -v docker-compose &> /dev/null; then
DOCKER_COMPOSE="docker-compose -f $COMPOSE_FILE"
elif docker compose version &> /dev/null 2>&1; then
DOCKER_COMPOSE="docker compose -f $COMPOSE_FILE"
else
echo "错误: 未找到 docker-compose 或 docker compose 命令"
exit 1
fi
echo "=========================================="
echo " 停止 Argus Metrics 服务"
echo "=========================================="
echo ""
# Any running containers? `ps -q` prints container IDs, empty when none.
if [ "$($DOCKER_COMPOSE ps -q)" ]; then
echo "停止所有服务..."
$DOCKER_COMPOSE stop
echo ""
# Interactive confirmation before the destructive steps; NOTE(review):
# `read` fails when stdin is not a TTY, which under `set -e` aborts here.
read -p "是否要删除容器? (y/N): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
$DOCKER_COMPOSE down
echo "容器已删除"
read -p "是否要删除数据卷? (y/N): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
$DOCKER_COMPOSE down -v
echo "数据卷已删除"
fi
fi
else
echo "没有运行的服务"
fi
echo ""
echo "完成!"