diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh b/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh index 991cc9f..6b3c866 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh @@ -4,6 +4,15 @@ set -e +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/check_health.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "健康检查脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + # 获取脚本所在目录 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log" diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh index ba9ade2..13f091c 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh @@ -200,22 +200,22 @@ parse_version_info() { VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') - # 解析 artifact_list - grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + # 解析 artifact_list(跳过字段名本身) + grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -v '"artifact_list"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') echo "$component:$version" >> "$TEMP_DIR/components.txt" done - # 解析 checksums - grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + # 解析 checksums(跳过字段名本身) + grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -v '"checksums"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt" done - # 解析 install_order - grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+"' | while read line; do + # 解析 install_order(跳过字段名本身,只取数组元素) + grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -v '"install_order"' | grep -E '^\s*"[^"]+"' | while read line; do component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') echo "$component" >> "$TEMP_DIR/install_order.txt" done @@ -317,85 +317,152 @@ create_install_dirs() { log_success "安装目录创建完成: $INSTALL_DIR" } +# 获取系统版本 +get_system_version() { + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + return 1 + fi + + source /etc/os-release + + # 提取主版本号 + case "$VERSION_ID" in + "20.04") + echo "ubuntu20" + ;; + "22.04") + echo "ubuntu22" + ;; + *) + log_warning "未识别的Ubuntu版本: $VERSION_ID,尝试使用ubuntu22" + echo "ubuntu22" + ;; + esac +} + # 安装系统依赖包 install_system_deps() { log_info "检查系统依赖包..." - + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" local deps_dir="$script_dir/deps" - + # 检查deps目录是否存在 if [[ ! -d "$deps_dir" ]]; then log_info "deps 目录不存在,跳过系统依赖包安装" return 0 fi + + # 获取系统版本对应的依赖目录 + local system_version=$(get_system_version) + local version_deps_dir="$deps_dir/$system_version" - # 检查是否有tar.gz文件 - local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l) - if [[ $deps_count -eq 0 ]]; then - log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装" - return 0 + log_info "检测到系统版本: $system_version" + + # 检查版本特定的依赖目录是否存在 + if [[ ! -d "$version_deps_dir" ]]; then + log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir" + # 回退到旧的逻辑,检查根deps目录 + local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l) + if [[ $deps_count -eq 0 ]]; then + log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装" + return 0 + fi + version_deps_dir="$deps_dir" + else + # 检查版本目录中是否有tar.gz文件 + local deps_count=$(find "$version_deps_dir" -name "*.tar.gz" | wc -l) + if [[ $deps_count -eq 0 ]]; then + log_info "$system_version 版本目录中没有 tar.gz 文件,跳过系统依赖包安装" + return 0 + fi fi - - log_info "找到 $deps_count 个系统依赖包,开始安装..." - + + log_info "找到 $system_version 版本的依赖包,开始安装..." + # 创建临时目录用于解压依赖包 - local deps_temp_dir="$TEMP_DIR/deps" + local deps_temp_dir="${TEMP_DIR:-/tmp}/deps" mkdir -p "$deps_temp_dir" - + + # 定义要检查的核心依赖 + local CORE_DEPS=(jq cron curl) + local FAILED_DEPS=() + # 处理每个tar.gz文件 - find "$deps_dir" -name "*.tar.gz" | while read tar_file; do + find "$version_deps_dir" -name "*.tar.gz" | while read tar_file; do local tar_basename=$(basename "$tar_file") local extract_name="${tar_basename%.tar.gz}" - + log_info "处理依赖包: $tar_basename" - + # 解压到临时目录 local extract_dir="$deps_temp_dir/$extract_name" mkdir -p "$extract_dir" - + if tar -xzf "$tar_file" -C "$extract_dir" 2>/dev/null; then log_success " $tar_basename 解压完成" else log_error " $tar_basename 解压失败" continue fi - + # 进入解压目录,查找deb包 - cd "$extract_dir" - local deb_count=$(find . -name "*.deb" | wc -l) - - if [[ $deb_count -gt 0 ]]; then - log_info " 找到 $deb_count 个 deb 包,开始安装..." - - # 1. 先尝试安装所有deb包 - log_info " 第1步:批量安装deb包..." - if dpkg -i *.deb 2>/dev/null; then - log_success " 所有deb包安装成功" - else - log_warning " 部分deb包安装失败,可能存在依赖问题" - - # 2. 使用apt-get修复依赖 - log_info " 第2步:修复依赖关系..." - if apt-get install -f -y; then - log_success " 依赖关系修复完成" - else - log_error " 依赖关系修复失败" - # 继续处理其他包,不退出 + cd "$extract_dir" || continue + local deb_files=(*.deb) + if [[ ${#deb_files[@]} -gt 0 ]]; then + log_info " 找到 ${#deb_files[@]} 个 deb 包,开始安装..." + + for deb in "${deb_files[@]}"; do + local pkg_name + pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null) + + # 如果已安装,则跳过 + if dpkg -s "$pkg_name" &>/dev/null; then + log_success " $pkg_name 已安装,跳过" + continue fi - fi + + # 尝试安装 + log_info " 安装 $pkg_name..." + if DEBIAN_FRONTEND=noninteractive dpkg -i "$deb" &>/dev/null; then + log_success " $pkg_name 安装成功" + else + log_warning " $pkg_name 安装失败,尝试修复依赖..." + if DEBIAN_FRONTEND=noninteractive apt-get install -f -y &>/dev/null; then + if dpkg -s "$pkg_name" &>/dev/null; then + log_success " $pkg_name 修复安装成功" + else + log_error " $pkg_name 仍未安装成功" + FAILED_DEPS+=("$pkg_name") + fi + else + log_error " $pkg_name 自动修复失败" + FAILED_DEPS+=("$pkg_name") + fi + fi + done else log_info " $tar_basename 中没有找到deb包,跳过" fi - + # 返回到依赖临时目录 - cd "$deps_temp_dir" + cd "$deps_temp_dir" || continue done - + # 检查并启动 cron 服务 start_cron_service - - log_success "系统依赖包安装完成" + + # 总结安装结果 + if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then + log_error "以下系统依赖未能成功安装,安装终止,请手动安装后重试:" + for f in "${FAILED_DEPS[@]}"; do + echo " - $f" + done + exit 1 + else + log_success "系统依赖包安装完成,全部就绪" + fi } # 启动 cron 服务 @@ -637,6 +704,18 @@ EOF log_success "安装记录已创建: $install_record_file" } +# 检查cron任务是否已存在 +check_cron_task_exists() { + local task_pattern="$1" + local temp_cron="$2" + + if grep -q "$task_pattern" "$temp_cron"; then + return 0 # 任务已存在 + else + return 1 # 任务不存在 + fi +} + # 设置健康检查定时任务 setup_health_check_cron() { log_info "设置健康检查定时任务..." @@ -661,7 +740,7 @@ setup_health_check_cron() { crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" # 检查并删除旧的健康检查任务 - if grep -q "check_health.sh" "$temp_cron"; then + if check_cron_task_exists "check_health.sh" "$temp_cron"; then log_info "发现旧的健康检查定时任务,正在更新..." # 删除所有包含check_health.sh的行 grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new" @@ -716,7 +795,7 @@ setup_dns_sync_cron() { crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" # 检查并删除旧的 DNS 同步任务 - if grep -q "sync_dns.sh" "$temp_cron"; then + if check_cron_task_exists "sync_dns.sh" "$temp_cron"; then log_info "发现旧的 DNS 同步定时任务,正在更新..." # 删除所有包含sync_dns.sh的行 grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new" @@ -724,16 +803,15 @@ setup_dns_sync_cron() { log_info "旧的 DNS 同步定时任务已删除" fi - # 添加新的定时任务(每30秒执行一次) + # 添加新的定时任务(每1分钟执行一次) # 直接使用版本目录中的 DNS 同步脚本 echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron" echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" - echo "* * * * * sleep 30; $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" # 安装新的crontab if crontab "$temp_cron"; then log_success "DNS 同步定时任务设置成功" - log_info " 执行频率: 每30秒" + log_info " 执行频率: 每1分钟" log_info " 日志文件: $INSTALL_DIR/.dns_sync.log" log_info " 查看定时任务: crontab -l" log_info " 删除定时任务: crontab -e" @@ -771,7 +849,7 @@ setup_version_check_cron() { crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" # 检查是否已存在版本校验定时任务 - if grep -q "check_version.sh" "$temp_cron"; then + if check_cron_task_exists "check_version.sh" "$temp_cron"; then log_info "发现旧的版本校验定时任务,正在更新..." # 删除所有包含check_version.sh的行 grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new" @@ -824,7 +902,7 @@ setup_restart_cron() { crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" # 检查是否已存在自动重启定时任务 - if grep -q "restart_unhealthy.sh" "$temp_cron"; then + if check_cron_task_exists "restart_unhealthy.sh" "$temp_cron"; then log_info "发现旧的自动重启定时任务,正在更新..." # 删除所有包含restart_unhealthy.sh的行 grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new" @@ -885,9 +963,9 @@ main() { check_system find_version_file create_install_dirs - parse_version_info + install_system_deps + parse_version_info verify_checksums - install_system_deps install_components copy_config_files create_install_record @@ -895,6 +973,20 @@ main() { setup_dns_sync_cron setup_version_check_cron setup_restart_cron + + # 注释掉立即执行健康检查,避免与cron任务重复执行 + # log_info "立即执行一次健康检查..." + # local check_health_script="$INSTALL_DIR/check_health.sh" + # if [[ -f "$check_health_script" ]]; then + # if "$check_health_script" >> "$INSTALL_DIR/.health_check.log" 2>&1; then + # log_success "健康检查执行完成" + # else + # log_warning "健康检查执行失败,请检查日志: $INSTALL_DIR/.health_check.log" + # fi + # else + # log_warning "健康检查脚本不存在: $check_health_script" + # fi + show_install_info } diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh index 7b93099..2f16b19 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh @@ -29,26 +29,68 @@ log_error() { show_help() { echo "Argus-Metric Artifact 发布脚本" echo - echo "用法: $0 <版本号>" + echo "用法: $0 <版本号> [选项]" echo echo "参数:" - echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" + echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" + echo + echo "选项:" + echo " --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)" + echo " --owner 指定文件所有者 (默认: 2133:2015)" + echo " -h, --help 显示此帮助信息" echo echo "示例:" - echo " $0 1.20.0 # 发布 1.20.0 版本" + echo " $0 1.20.0 # 使用默认配置发布" + echo " $0 1.20.0 --output-dir /tmp/publish # 指定输出目录" + echo " $0 1.20.0 --owner 1000:1000 # 指定文件所有者" + echo " $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者" echo } -# 检查参数 -if [[ $# -ne 1 ]]; then +# 默认配置 +DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/" +DEFAULT_OWNER="2133:2015" + +# 解析参数 +VERSION="" +PUBLISH_DIR="$DEFAULT_PUBLISH_DIR" +OWNER="$DEFAULT_OWNER" + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + --output-dir) + PUBLISH_DIR="$2" + shift 2 + ;; + --owner) + OWNER="$2" + shift 2 + ;; + *) + if [[ -z "$VERSION" ]]; then + VERSION="$1" + shift + else + log_error "未知参数: $1" + show_help + exit 1 + fi + ;; + esac +done + +# 检查版本号是否提供 +if [[ -z "$VERSION" ]]; then log_error "请提供版本号参数" show_help exit 1 fi -VERSION="$1" ARTIFACT_DIR="artifact/$VERSION" -PUBLISH_DIR="/Users/sundapeng/Project/nlp/aiops/client-plugins/all-in-one/publish/" # 检查版本目录是否存在 if [[ ! -d "$ARTIFACT_DIR" ]]; then @@ -57,10 +99,12 @@ if [[ ! -d "$ARTIFACT_DIR" ]]; then fi log_info "开始发布版本: $VERSION" +log_info "输出目录: $PUBLISH_DIR" +log_info "文件所有者: $OWNER" # 确保发布目录存在 log_info "确保发布目录存在: $PUBLISH_DIR" -mkdir -p "$PUBLISH_DIR" +sudo mkdir -p "$PUBLISH_DIR" # 创建临时目录用于打包 TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" @@ -164,20 +208,26 @@ fi TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz" log_info "创建发布包: $TAR_NAME" cd "$TEMP_PACKAGE_DIR" -tar -czf "$PUBLISH_DIR/$TAR_NAME" * +sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" * cd - > /dev/null +# 设置文件所有者 +log_info "设置文件所有者为: $OWNER" +sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME" + # 清理临时目录 rm -rf "$TEMP_PACKAGE_DIR" # 更新 LATEST_VERSION 文件 log_info "更新 LATEST_VERSION 文件..." -echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION" +echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null +sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION" # 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制) if [[ -f "config/dns.conf" ]]; then log_info "复制 DNS 配置文件到发布目录根目录..." - cp "config/dns.conf" "$PUBLISH_DIR/" + sudo cp "config/dns.conf" "$PUBLISH_DIR/" + sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf" log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf" else log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制" @@ -186,7 +236,8 @@ fi # 复制 setup.sh 到发布目录 if [[ -f "scripts/setup.sh" ]]; then log_info "复制 setup.sh 到发布目录..." - cp "scripts/setup.sh" "$PUBLISH_DIR/" + sudo cp "scripts/setup.sh" "$PUBLISH_DIR/" + sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh" fi # 显示发布结果 diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh b/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh index 7e54693..cd2065b 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh @@ -2,6 +2,15 @@ # 此脚本会检查各组件的健康状态,并重启不健康的组件 +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/restart_unhealthy.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "自动重启脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + # 获取脚本所在目录 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh b/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh index b11ce37..ba8a84c 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh @@ -1,257 +1,143 @@ #!/bin/bash - -# DNS 同步脚本 -# 比较 FTP 根目录的 dns.conf 和本地的 dns.conf,如果有变化则同步到 /etc/resolv.conf - set -e -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color +# 颜色 +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' -# 日志函数 - 输出到 stderr 避免影响函数返回值 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" >&2 -} +# 日志函数 +log_info() { echo -e "${BLUE}[INFO]${NC} $1" >&2; } +log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" >&2; } +log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1" >&2; } +log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; } -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" >&2 -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" >&2 -} - -# 获取脚本所在目录 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" LOCAL_DNS_CONF="/opt/argus-metric/dns.conf" -REMOTE_DNS_CONF_URL="" RESOLV_CONF="/etc/resolv.conf" +ALT_RESOLV_CONF="/run/resolv.conf" LOG_FILE="/opt/argus-metric/.dns_sync.log" +REMOTE_DNS_CONF_URL="" -# 从环境变量或配置文件获取 FTP 服务器信息 +# 获取 FTP 配置 get_ftp_config() { - # 优先从环境变量获取配置 log_info "获取 FTP 配置信息..." - - # 如果环境变量中没有设置,则尝试从配置文件读取 if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then - local config_file="$SCRIPT_DIR/config.env" - if [[ -f "$config_file" ]]; then - log_info "从配置文件读取 FTP 配置: $config_file" - source "$config_file" - fi - else - log_info "使用环境变量中的 FTP 配置" + [[ -f "$SCRIPT_DIR/config.env" ]] && source "$SCRIPT_DIR/config.env" fi - - # 设置默认值(如果环境变量和配置文件都没有设置) FTP_SERVER="${FTP_SERVER:-localhost}" FTP_USER="${FTP_USER:-ftpuser}" FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" - - # 构建远程 DNS 配置文件 URL REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf" - - log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}" } -# 下载远程 DNS 配置文件 +# 下载远程 dns.conf download_remote_dns_conf() { - local temp_file="/tmp/dns.conf.remote.$$" - - log_info "从 FTP 服务器下载 DNS 配置文件..." - log_info "远程地址: $REMOTE_DNS_CONF_URL" - log_info "FTP 服务器: $FTP_SERVER" - log_info "FTP 用户: $FTP_USER" - - # 先测试 FTP 连接 + local tmp="/tmp/dns.remote.$$" log_info "测试 FTP 连接..." - if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then - log_success "FTP 服务器连接成功" - else - log_error "无法连接到 FTP 服务器: $FTP_SERVER" - log_error "请检查:" - log_error " 1. FTP 服务器是否运行" - log_error " 2. 网络连接是否正常" - log_error " 3. 服务器地址是否正确" - return 1 + if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null; then + log_error "无法连接到 FTP 服务器: $FTP_SERVER"; return 1 fi - - # 测试 dns.conf 文件是否存在 - log_info "检查远程 dns.conf 文件是否存在..." - if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/dns.conf" >/dev/null 2>&1; then - log_success "远程 dns.conf 文件存在" - else - log_error "远程 dns.conf 文件不存在或无法访问" - log_error "请检查 FTP 服务器根目录下是否有 dns.conf 文件" - return 1 - fi - - # 尝试下载文件 - log_info "开始下载 dns.conf 文件..." - if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$temp_file" 2>/dev/null; then - log_success "远程 DNS 配置文件下载成功" - echo "$temp_file" - else - log_error "下载 dns.conf 文件失败" - log_error "尝试手动测试命令:" - log_error " curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_SERVER}/dns.conf" - rm -f "$temp_file" - return 1 + if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$tmp" 2>/dev/null; then + log_error "下载 dns.conf 失败"; rm -f "$tmp"; return 1 fi + echo "$tmp" } -# 比较两个文件是否相同 -compare_files() { - local file1="$1" - local file2="$2" - - if [[ ! -f "$file1" || ! -f "$file2" ]]; then - return 1 - fi - - # 使用 diff 比较文件内容 - if diff -q "$file1" "$file2" >/dev/null 2>&1; then - return 0 # 文件相同 - else - return 1 # 文件不同 - fi +# 文件比较 +compare_files() { diff -q "$1" "$2" >/dev/null 2>&1; } + +# 从 dns.conf 提取有效 IP +get_dns_ips() { + grep -Eo '^[0-9]{1,3}(\.[0-9]{1,3}){3}$' "$1" | sort -u } -# 将 DNS 配置添加到 /etc/resolv.conf 开头 +# 安全更新 resolv.conf(保留符号链接) update_resolv_conf() { - local dns_conf_file="$1" - - log_info "更新 /etc/resolv.conf 文件..." + local dns_conf="$1" + local dns_ips + mapfile -t dns_ips < <(get_dns_ips "$dns_conf") + [[ ${#dns_ips[@]} -eq 0 ]] && { log_warning "未检测到有效 DNS"; return; } - # 创建临时文件 - local temp_resolv="/tmp/resolv.conf.$$" - - # 将 dns.conf 内容转换为 nameserver 添加到临时文件开头 - while IFS= read -r line; do - # 跳过空行和注释 - [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue - # 验证 IP 格式 - if [[ "$line" =~ ^[0-9]{1,3}(\.[0-9]{1,3}){3}$ ]]; then - echo "nameserver $line" >> "$temp_resolv" - log_info "添加 DNS 到临时文件: $line" - else - log_warning "跳过无效 DNS: $line" - fi - done < "$dns_conf_file" - - # 将原 resolv.conf 内容追加到临时文件后面 - if [[ -f "$RESOLV_CONF" ]]; then - cat "$RESOLV_CONF" >> "$temp_resolv" + local target_file="$RESOLV_CONF" + if [[ ! -w "$RESOLV_CONF" ]]; then + log_warning "/etc/resolv.conf 不可写,使用兜底路径 $ALT_RESOLV_CONF" + target_file="$ALT_RESOLV_CONF" fi - # 判断是否是 root - if [[ $(id -u) -eq 0 ]]; then - # root 直接写入 - tee "$RESOLV_CONF" < "$temp_resolv" >/dev/null - chmod 644 "$RESOLV_CONF" + local temp="/tmp/resolv.new.$$" + cp "$target_file" "${target_file}.backup.$(date +%Y%m%d_%H%M%S)" 2>/dev/null || true + log_info "更新 DNS 配置文件: $target_file" + + # 写入新的 nameserver 行 + for ip in "${dns_ips[@]}"; do + echo "nameserver $ip" + done >"$temp" + + # 追加原内容(去掉重复 nameserver) + grep -v '^nameserver' "$target_file" >>"$temp" 2>/dev/null || true + awk '!a[$0]++' "$temp" >"${temp}.uniq" + + # ⚙️ 使用 cat 原地覆盖,避免 mv 引发 “设备忙” + if cat "${temp}.uniq" >"$target_file" 2>/dev/null; then + chmod 644 "$target_file" + log_success "DNS 更新完成: ${dns_ips[*]}" else - # 非 root 尝试使用 sudo - if command -v sudo >/dev/null 2>&1; then - sudo tee "$RESOLV_CONF" < "$temp_resolv" >/dev/null - sudo chmod 644 "$RESOLV_CONF" - else - log_error "非 root 用户且系统未安装 sudo,无法更新 /etc/resolv.conf" - rm -f "$temp_resolv" - exit 1 - fi + log_error "无法写入 $target_file,可能被系统锁定" fi - # 清理临时文件 - rm -f "$temp_resolv" - - log_success "/etc/resolv.conf 已更新" + rm -f "$temp" "${temp}.uniq" } -# 记录同步日志 -log_sync() { - local message="$1" - local timestamp=$(date '+%Y-%m-%d %H:%M:%S') - echo "[$timestamp] $message" >> "$LOG_FILE" +# 检查 resolv.conf 是否包含 dns.conf 内容 +ensure_dns_in_resolv() { + local dns_conf="$1" + local dns_ips + mapfile -t dns_ips < <(get_dns_ips "$dns_conf") + [[ ${#dns_ips[@]} -eq 0 ]] && return + + for ip in "${dns_ips[@]}"; do + if ! grep -q "nameserver $ip" "$RESOLV_CONF" 2>/dev/null; then + log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复" + update_resolv_conf "$dns_conf" + return + fi + done + log_info "/etc/resolv.conf 已包含所有 DNS" } -# 主函数 +log_sync() { echo "[$(date '+%F %T')] $1" >>"$LOG_FILE"; } + main() { log_info "开始 DNS 同步检查..." - log_sync "DNS 同步检查开始" - - # 确保系统目录存在 - mkdir -p "/opt/argus-metric" - - # 获取 FTP 配置 + mkdir -p /opt/argus-metric + get_ftp_config - - # 检查本地 DNS 配置文件是否存在 + local remote_file + if ! remote_file=$(download_remote_dns_conf); then + log_error "下载失败"; log_sync "同步失败"; exit 1 + fi + if [[ ! -f "$LOCAL_DNS_CONF" ]]; then - log_warning "本地 DNS 配置文件不存在: $LOCAL_DNS_CONF" - log_warning "将下载远程配置文件并更新系统 DNS 设置" - - # 下载远程配置文件 - if remote_file=$(download_remote_dns_conf); then - # 复制到本地 - cp "$remote_file" "$LOCAL_DNS_CONF" - log_success "远程 DNS 配置文件已保存到本地" - - # 更新 resolv.conf - update_resolv_conf "$LOCAL_DNS_CONF" - log_sync "首次同步完成,DNS 配置已更新" - - # 清理临时文件 - rm -f "$remote_file" - else - log_error "无法下载远程 DNS 配置文件,同步失败" - log_sync "同步失败:无法下载远程配置文件" - exit 1 - fi + log_info "本地 dns.conf 不存在,初始化..." + cp "$remote_file" "$LOCAL_DNS_CONF" + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "首次同步完成" else - log_info "本地 DNS 配置文件存在: $LOCAL_DNS_CONF" - - # 下载远程配置文件进行比较 - if remote_file=$(download_remote_dns_conf); then - # 比较文件 - if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then - log_info "DNS 配置文件无变化,无需更新" - log_sync "DNS 配置文件无变化" - else - log_info "检测到 DNS 配置文件有变化,开始同步..." - log_sync "检测到 DNS 配置文件变化,开始同步" - - # 更新本地配置文件 - cp "$remote_file" "$LOCAL_DNS_CONF" - log_success "本地 DNS 配置文件已更新" - - # 更新 resolv.conf - update_resolv_conf "$LOCAL_DNS_CONF" - log_sync "DNS 配置同步完成" - fi - - # 清理临时文件 - rm -f "$remote_file" + if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then + log_info "dns.conf 无变化" + ensure_dns_in_resolv "$LOCAL_DNS_CONF" + log_sync "dns.conf 无变化,执行兜底检查" else - log_error "无法下载远程 DNS 配置文件,跳过本次同步" - log_sync "同步失败:无法下载远程配置文件" - exit 1 + log_info "检测到 DNS 配置更新" + cp "$remote_file" "$LOCAL_DNS_CONF" + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "DNS 配置同步完成" fi fi - - log_success "DNS 同步检查完成" - log_sync "DNS 同步检查完成" + + rm -f "$remote_file" + log_success "DNS 同步流程完成" } -# 脚本入口 if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then main "$@" fi diff --git a/src/metric/tests/client-test-gpu-node/build/Dockerfile b/src/metric/tests/client-test-gpu-node/build/Dockerfile new file mode 100644 index 0000000..8a64a87 --- /dev/null +++ b/src/metric/tests/client-test-gpu-node/build/Dockerfile @@ -0,0 +1,39 @@ +# 使用NVIDIA官方CUDA基础镜像 +FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +# 设置时区 +ENV TZ=Asia/Shanghai + +RUN apt-get update -qq && \ + apt-get install -y -qq \ + tzdata \ + curl \ + wget \ + gnupg2 \ + software-properties-common \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# 配置时区 +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +WORKDIR /app + +# 创建启动脚本,在运行时验证GPU +COPY < /dev/null; then + nvidia-smi + echo "GPU环境正常" +else + echo "警告: nvidia-smi 命令不可用,请确保容器运行时启用了GPU支持" +fi +exec "\$@" +EOF + +RUN chmod +x /app/start.sh + +CMD ["/app/start.sh", "/bin/bash"] diff --git a/src/metric/tests/docker-compose.yml b/src/metric/tests/docker-compose.yml index 0a88552..d05853b 100644 --- a/src/metric/tests/docker-compose.yml +++ b/src/metric/tests/docker-compose.yml @@ -142,3 +142,44 @@ services: max-size: "10m" max-file: "3" + test-gpu-node: + build: + context: ./client-test-gpu-node/build + dockerfile: Dockerfile + image: argus-metric-test-gpu-node:latest + container_name: argus-metric-test-gpu-node + hostname: test-metric-gpu-node-001 + restart: unless-stopped + privileged: true + runtime: nvidia + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: + - gpu + depends_on: + - ftp + - prometheus + environment: + - TZ=Asia/Shanghai + - DEBIAN_FRONTEND=noninteractive + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - GPU_MODE=gpu + volumes: + - ${DATA_ROOT:-/private}/argus/agent:/private/argus/agent + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + command: sleep infinity + networks: + default: + ipv4_address: 172.30.0.51 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + diff --git a/src/metric/tests/scripts/01_start_services.sh b/src/metric/tests/scripts/01_start_services.sh index 7012224..01e587f 100755 --- a/src/metric/tests/scripts/01_start_services.sh +++ b/src/metric/tests/scripts/01_start_services.sh @@ -21,6 +21,7 @@ docker ps | grep argus-ftp docker ps | grep argus-prometheus docker ps | grep argus-grafana docker ps | grep argus-metric-test-node +docker ps | grep argus-metric-test-gpu-node -echo "[01] 所有服务已启动" +echo "[01] 基础服务已启动" diff --git a/src/metric/tests/scripts/02_publish_artifact.sh b/src/metric/tests/scripts/02_publish_artifact.sh index 15dcf72..658d9dd 100755 --- a/src/metric/tests/scripts/02_publish_artifact.sh +++ b/src/metric/tests/scripts/02_publish_artifact.sh @@ -20,7 +20,7 @@ else echo "[02] 默认路径: $OUTPUT_DIR" fi -OWNER="${FTP_UID:-2133}:${FTP_GID:-2015}" +OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}" cd "$PLUGIN_DIR" diff --git a/src/metric/tests/scripts/04_test_gpu_node_install.sh b/src/metric/tests/scripts/04_test_gpu_node_install.sh new file mode 100755 index 0000000..ce1d19a --- /dev/null +++ b/src/metric/tests/scripts/04_test_gpu_node_install.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +FTP_SERVER="${FTP_SERVER:-172.30.0.40}" +FTP_USER="${FTP_USER:-ftpuser}" +FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" +FTP_PORT="${FTP_PORT:-21}" + +FTP_HOST="${FTP_SERVER}" + +echo "[03] 进入测试节点执行安装..." +echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}" + +docker exec argus-metric-test-gpu-node bash -c " +set -e + +if ! command -v curl &>/dev/null; then + echo '[03] curl 未安装,正在安装...' + apt-get update && apt-get install -y curl +fi + +cd /tmp +echo '[03] 下载 setup.sh...' +curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh + +echo '[03] 执行安装...' +chmod +x setup.sh +bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT} + +echo '[03] 安装完成' +" + +echo "[03] 完成" diff --git a/src/metric/tests/scripts/04_verify_install.sh b/src/metric/tests/scripts/05_verify_install.sh similarity index 100% rename from src/metric/tests/scripts/04_verify_install.sh rename to src/metric/tests/scripts/05_verify_install.sh diff --git a/src/metric/tests/scripts/05_cleanup.sh b/src/metric/tests/scripts/06_cleanup.sh similarity index 100% rename from src/metric/tests/scripts/05_cleanup.sh rename to src/metric/tests/scripts/06_cleanup.sh diff --git a/src/metric/tests/scripts/common/start-all.sh b/src/metric/tests/scripts/common/start-all.sh index c68a4d5..5521367 100755 --- a/src/metric/tests/scripts/common/start-all.sh +++ b/src/metric/tests/scripts/common/start-all.sh @@ -44,6 +44,7 @@ BUILD_DIRS=( "../prometheus/build" "../grafana/build" "client-test-node/build" + "client-test-gpu-node/build" ) for dir in "${BUILD_DIRS[@]}"; do @@ -87,7 +88,7 @@ echo "2. 准备 Docker 镜像..." # 检查镜像是否存在 IMAGE_CACHE_DIR="$TEST_DIR/images-cache" -IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest") +IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest") all_images_exist=true for image in "${IMAGES[@]}"; do @@ -126,6 +127,9 @@ else "argus-metric-test-node:latest") cache_file="${IMAGE_CACHE_DIR}/argus-test-node.tar" ;; + "argus-metric-test-gpu-node:latest") + cache_file="${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" + ;; esac if [ -f "$cache_file" ]; then @@ -174,6 +178,9 @@ else "argus-metric-test-node:latest") docker save -o "${IMAGE_CACHE_DIR}/argus-test-node.tar" "$image" && echo " 已保存: argus-test-node.tar" ;; + "argus-metric-test-gpu-node:latest") + docker save -o "${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" "$image" && echo " 已保存: argus-test-gpu-node.tar" + ;; esac done echo "镜像已保存到: $IMAGE_CACHE_DIR/" @@ -185,40 +192,12 @@ else fi echo "" -echo "3. 启动服务..." +echo "3. 启动基础服务..." cd "$TEST_DIR" -docker compose up -d +# 启动除GPU节点外的所有服务 +docker compose up -d ftp prometheus grafana test-node test-gpu-node echo "" echo "4. 等待服务启动..." sleep 5 -echo "" -echo "5. 检查服务状态..." -cd "$TEST_DIR" -docker compose ps - -echo "" -echo "==========================================" -echo " 服务启动完成!" -echo "==========================================" -echo "" -echo "服务访问地址:" -echo " - FTP: ftp://localhost:${FTP_PORT:-21}" -echo " 用户名: ftpuser" -echo " 密码: ${FTP_PASSWORD:-ZGClab1234!}" -echo "" -echo " - Prometheus: http://localhost:${PROMETHEUS_PORT:-9090}" -echo "" -echo " - Grafana: http://localhost:${GRAFANA_PORT:-3000}" -echo " 用户名: admin" -echo " 密码: admin" -echo "" -echo "常用命令:" -echo " 查看日志: docker compose logs -f [service]" -echo " 停止服务: docker compose stop" -echo " 重启服务: docker compose restart" -echo " 停止并删除: docker compose down" -echo " 停止并删除卷: docker compose down -v" -echo "" - diff --git a/src/metric/tests/scripts/save-images.sh b/src/metric/tests/scripts/save-images.sh index 4b62510..9851718 100755 --- a/src/metric/tests/scripts/save-images.sh +++ b/src/metric/tests/scripts/save-images.sh @@ -24,7 +24,8 @@ declare -A IMAGES=( ["argus-metric-ftp:latest"]="argus-ftp.tar" ["argus-metric-prometheus:latest"]="argus-prometheus.tar" ["argus-metric-grafana:latest"]="argus-grafana.tar" - ["ubuntu:22.04"]="test-node.tar" + ["argus-metric-test-node:latest"]="argus-test-node.tar" + ["argus-metric-test-gpu-node:latest"]="argus-test-gpu-node.tar" ) # 检查镜像是否存在并保存