diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh b/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh index 991cc9f..6b3c866 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh @@ -4,6 +4,15 @@ set -e +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/check_health.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "健康检查脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + # 获取脚本所在目录 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log" diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh index ba9ade2..13f091c 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh @@ -200,22 +200,22 @@ parse_version_info() { VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') - # 解析 artifact_list - grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + # 解析 artifact_list(跳过字段名本身) + grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -v '"artifact_list"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') echo "$component:$version" >> "$TEMP_DIR/components.txt" done - # 解析 checksums - grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + # 解析 checksums(跳过字段名本身) + grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -v '"checksums"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt" done - # 解析 install_order - grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+"' | while read line; do + # 解析 install_order(跳过字段名本身,只取数组元素) + grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -v '"install_order"' | grep -E '^\s*"[^"]+"' | while read line; do component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') echo "$component" >> "$TEMP_DIR/install_order.txt" done @@ -317,85 +317,152 @@ create_install_dirs() { log_success "安装目录创建完成: $INSTALL_DIR" } +# 获取系统版本 +get_system_version() { + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + return 1 + fi + + source /etc/os-release + + # 提取主版本号 + case "$VERSION_ID" in + "20.04") + echo "ubuntu20" + ;; + "22.04") + echo "ubuntu22" + ;; + *) + log_warning "未识别的Ubuntu版本: $VERSION_ID,尝试使用ubuntu22" + echo "ubuntu22" + ;; + esac +} + # 安装系统依赖包 install_system_deps() { log_info "检查系统依赖包..." - + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" local deps_dir="$script_dir/deps" - + # 检查deps目录是否存在 if [[ ! 
-d "$deps_dir" ]]; then log_info "deps 目录不存在,跳过系统依赖包安装" return 0 fi + + # 获取系统版本对应的依赖目录 + local system_version=$(get_system_version) + local version_deps_dir="$deps_dir/$system_version" - # 检查是否有tar.gz文件 - local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l) - if [[ $deps_count -eq 0 ]]; then - log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装" - return 0 + log_info "检测到系统版本: $system_version" + + # 检查版本特定的依赖目录是否存在 + if [[ ! -d "$version_deps_dir" ]]; then + log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir" + # 回退到旧的逻辑,检查根deps目录 + local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l) + if [[ $deps_count -eq 0 ]]; then + log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装" + return 0 + fi + version_deps_dir="$deps_dir" + else + # 检查版本目录中是否有tar.gz文件 + local deps_count=$(find "$version_deps_dir" -name "*.tar.gz" | wc -l) + if [[ $deps_count -eq 0 ]]; then + log_info "$system_version 版本目录中没有 tar.gz 文件,跳过系统依赖包安装" + return 0 + fi fi - - log_info "找到 $deps_count 个系统依赖包,开始安装..." - + + log_info "找到 $system_version 版本的依赖包,开始安装..." + # 创建临时目录用于解压依赖包 - local deps_temp_dir="$TEMP_DIR/deps" + local deps_temp_dir="${TEMP_DIR:-/tmp}/deps" mkdir -p "$deps_temp_dir" - + + # 定义要检查的核心依赖 + local CORE_DEPS=(jq cron curl) + local FAILED_DEPS=() + # 处理每个tar.gz文件 - find "$deps_dir" -name "*.tar.gz" | while read tar_file; do + find "$version_deps_dir" -name "*.tar.gz" | while read tar_file; do local tar_basename=$(basename "$tar_file") local extract_name="${tar_basename%.tar.gz}" - + log_info "处理依赖包: $tar_basename" - + # 解压到临时目录 local extract_dir="$deps_temp_dir/$extract_name" mkdir -p "$extract_dir" - + if tar -xzf "$tar_file" -C "$extract_dir" 2>/dev/null; then log_success " $tar_basename 解压完成" else log_error " $tar_basename 解压失败" continue fi - + # 进入解压目录,查找deb包 - cd "$extract_dir" - local deb_count=$(find . -name "*.deb" | wc -l) - - if [[ $deb_count -gt 0 ]]; then - log_info " 找到 $deb_count 个 deb 包,开始安装..." - - # 1. 先尝试安装所有deb包 - log_info " 第1步:批量安装deb包..." - if dpkg -i *.deb 2>/dev/null; then - log_success " 所有deb包安装成功" - else - log_warning " 部分deb包安装失败,可能存在依赖问题" - - # 2. 使用apt-get修复依赖 - log_info " 第2步:修复依赖关系..." - if apt-get install -f -y; then - log_success " 依赖关系修复完成" - else - log_error " 依赖关系修复失败" - # 继续处理其他包,不退出 + cd "$extract_dir" || continue + local deb_files=(*.deb) + if [[ ${#deb_files[@]} -gt 0 ]]; then + log_info " 找到 ${#deb_files[@]} 个 deb 包,开始安装..." + + for deb in "${deb_files[@]}"; do + local pkg_name + pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null) + + # 如果已安装,则跳过 + if dpkg -s "$pkg_name" &>/dev/null; then + log_success " $pkg_name 已安装,跳过" + continue fi - fi + + # 尝试安装 + log_info " 安装 $pkg_name..." + if DEBIAN_FRONTEND=noninteractive dpkg -i "$deb" &>/dev/null; then + log_success " $pkg_name 安装成功" + else + log_warning " $pkg_name 安装失败,尝试修复依赖..." 
+ if DEBIAN_FRONTEND=noninteractive apt-get install -f -y &>/dev/null; then + if dpkg -s "$pkg_name" &>/dev/null; then + log_success " $pkg_name 修复安装成功" + else + log_error " $pkg_name 仍未安装成功" + FAILED_DEPS+=("$pkg_name") + fi + else + log_error " $pkg_name 自动修复失败" + FAILED_DEPS+=("$pkg_name") + fi + fi + done else log_info " $tar_basename 中没有找到deb包,跳过" fi - + # 返回到依赖临时目录 - cd "$deps_temp_dir" + cd "$deps_temp_dir" || continue done - + # 检查并启动 cron 服务 start_cron_service - - log_success "系统依赖包安装完成" + + # 总结安装结果 + if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then + log_error "以下系统依赖未能成功安装,安装终止,请手动安装后重试:" + for f in "${FAILED_DEPS[@]}"; do + echo " - $f" + done + exit 1 + else + log_success "系统依赖包安装完成,全部就绪" + fi } # 启动 cron 服务 @@ -637,6 +704,18 @@ EOF log_success "安装记录已创建: $install_record_file" } +# 检查cron任务是否已存在 +check_cron_task_exists() { + local task_pattern="$1" + local temp_cron="$2" + + if grep -q "$task_pattern" "$temp_cron"; then + return 0 # 任务已存在 + else + return 1 # 任务不存在 + fi +} + # 设置健康检查定时任务 setup_health_check_cron() { log_info "设置健康检查定时任务..." @@ -661,7 +740,7 @@ setup_health_check_cron() { crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" # 检查并删除旧的健康检查任务 - if grep -q "check_health.sh" "$temp_cron"; then + if check_cron_task_exists "check_health.sh" "$temp_cron"; then log_info "发现旧的健康检查定时任务,正在更新..." # 删除所有包含check_health.sh的行 grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new" @@ -716,7 +795,7 @@ setup_dns_sync_cron() { crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" # 检查并删除旧的 DNS 同步任务 - if grep -q "sync_dns.sh" "$temp_cron"; then + if check_cron_task_exists "sync_dns.sh" "$temp_cron"; then log_info "发现旧的 DNS 同步定时任务,正在更新..." # 删除所有包含sync_dns.sh的行 grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new" @@ -724,16 +803,15 @@ setup_dns_sync_cron() { log_info "旧的 DNS 同步定时任务已删除" fi - # 添加新的定时任务(每30秒执行一次) + # 添加新的定时任务(每1分钟执行一次) # 直接使用版本目录中的 DNS 同步脚本 echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron" echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" - echo "* * * * * sleep 30; $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" # 安装新的crontab if crontab "$temp_cron"; then log_success "DNS 同步定时任务设置成功" - log_info " 执行频率: 每30秒" + log_info " 执行频率: 每1分钟" log_info " 日志文件: $INSTALL_DIR/.dns_sync.log" log_info " 查看定时任务: crontab -l" log_info " 删除定时任务: crontab -e" @@ -771,7 +849,7 @@ setup_version_check_cron() { crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" # 检查是否已存在版本校验定时任务 - if grep -q "check_version.sh" "$temp_cron"; then + if check_cron_task_exists "check_version.sh" "$temp_cron"; then log_info "发现旧的版本校验定时任务,正在更新..." # 删除所有包含check_version.sh的行 grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new" @@ -824,7 +902,7 @@ setup_restart_cron() { crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" # 检查是否已存在自动重启定时任务 - if grep -q "restart_unhealthy.sh" "$temp_cron"; then + if check_cron_task_exists "restart_unhealthy.sh" "$temp_cron"; then log_info "发现旧的自动重启定时任务,正在更新..." # 删除所有包含restart_unhealthy.sh的行 grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new" @@ -885,9 +963,9 @@ main() { check_system find_version_file create_install_dirs - parse_version_info + install_system_deps + parse_version_info verify_checksums - install_system_deps install_components copy_config_files create_install_record @@ -895,6 +973,20 @@ main() { setup_dns_sync_cron setup_version_check_cron setup_restart_cron + + # 注释掉立即执行健康检查,避免与cron任务重复执行 + # log_info "立即执行一次健康检查..." 
+ # local check_health_script="$INSTALL_DIR/check_health.sh" + # if [[ -f "$check_health_script" ]]; then + # if "$check_health_script" >> "$INSTALL_DIR/.health_check.log" 2>&1; then + # log_success "健康检查执行完成" + # else + # log_warning "健康检查执行失败,请检查日志: $INSTALL_DIR/.health_check.log" + # fi + # else + # log_warning "健康检查脚本不存在: $check_health_script" + # fi + show_install_info } diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh index 7b93099..2f16b19 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh @@ -29,26 +29,68 @@ log_error() { show_help() { echo "Argus-Metric Artifact 发布脚本" echo - echo "用法: $0 <版本号>" + echo "用法: $0 <版本号> [选项]" echo echo "参数:" - echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" + echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" + echo + echo "选项:" + echo " --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)" + echo " --owner 指定文件所有者 (默认: 2133:2015)" + echo " -h, --help 显示此帮助信息" echo echo "示例:" - echo " $0 1.20.0 # 发布 1.20.0 版本" + echo " $0 1.20.0 # 使用默认配置发布" + echo " $0 1.20.0 --output-dir /tmp/publish # 指定输出目录" + echo " $0 1.20.0 --owner 1000:1000 # 指定文件所有者" + echo " $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者" echo } -# 检查参数 -if [[ $# -ne 1 ]]; then +# 默认配置 +DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/" +DEFAULT_OWNER="2133:2015" + +# 解析参数 +VERSION="" +PUBLISH_DIR="$DEFAULT_PUBLISH_DIR" +OWNER="$DEFAULT_OWNER" + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + --output-dir) + PUBLISH_DIR="$2" + shift 2 + ;; + --owner) + OWNER="$2" + shift 2 + ;; + *) + if [[ -z "$VERSION" ]]; then + VERSION="$1" + shift + else + log_error "未知参数: $1" + show_help + exit 1 + fi + ;; + esac +done + +# 检查版本号是否提供 +if [[ -z "$VERSION" ]]; then log_error "请提供版本号参数" show_help exit 1 fi -VERSION="$1" ARTIFACT_DIR="artifact/$VERSION" -PUBLISH_DIR="/Users/sundapeng/Project/nlp/aiops/client-plugins/all-in-one/publish/" # 检查版本目录是否存在 if [[ ! -d "$ARTIFACT_DIR" ]]; then @@ -57,10 +99,12 @@ if [[ ! -d "$ARTIFACT_DIR" ]]; then fi log_info "开始发布版本: $VERSION" +log_info "输出目录: $PUBLISH_DIR" +log_info "文件所有者: $OWNER" # 确保发布目录存在 log_info "确保发布目录存在: $PUBLISH_DIR" -mkdir -p "$PUBLISH_DIR" +sudo mkdir -p "$PUBLISH_DIR" # 创建临时目录用于打包 TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" @@ -164,20 +208,26 @@ fi TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz" log_info "创建发布包: $TAR_NAME" cd "$TEMP_PACKAGE_DIR" -tar -czf "$PUBLISH_DIR/$TAR_NAME" * +sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" * cd - > /dev/null +# 设置文件所有者 +log_info "设置文件所有者为: $OWNER" +sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME" + # 清理临时目录 rm -rf "$TEMP_PACKAGE_DIR" # 更新 LATEST_VERSION 文件 log_info "更新 LATEST_VERSION 文件..." -echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION" +echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null +sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION" # 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制) if [[ -f "config/dns.conf" ]]; then log_info "复制 DNS 配置文件到发布目录根目录..." - cp "config/dns.conf" "$PUBLISH_DIR/" + sudo cp "config/dns.conf" "$PUBLISH_DIR/" + sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf" log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf" else log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制" @@ -186,7 +236,8 @@ fi # 复制 setup.sh 到发布目录 if [[ -f "scripts/setup.sh" ]]; then log_info "复制 setup.sh 到发布目录..." 
- cp "scripts/setup.sh" "$PUBLISH_DIR/" + sudo cp "scripts/setup.sh" "$PUBLISH_DIR/" + sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh" fi # 显示发布结果 diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh b/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh index 7e54693..cd2065b 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh @@ -2,6 +2,15 @@ # 此脚本会检查各组件的健康状态,并重启不健康的组件 +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/restart_unhealthy.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "自动重启脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + # 获取脚本所在目录 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh b/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh index 9e05f24..ba8a84c 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh @@ -1,244 +1,143 @@ #!/bin/bash - -# DNS 同步脚本 -# 比较 FTP 根目录的 dns.conf 和本地的 dns.conf,如果有变化则同步到 /etc/resolv.conf - set -e -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color +# 颜色 +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' -# 日志函数 - 输出到 stderr 避免影响函数返回值 -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" >&2 -} +# 日志函数 +log_info() { echo -e "${BLUE}[INFO]${NC} $1" >&2; } +log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" >&2; } +log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1" >&2; } +log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; } -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 -} - -log_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" >&2 -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" >&2 -} - -# 获取脚本所在目录 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" LOCAL_DNS_CONF="/opt/argus-metric/dns.conf" -REMOTE_DNS_CONF_URL="" RESOLV_CONF="/etc/resolv.conf" +ALT_RESOLV_CONF="/run/resolv.conf" LOG_FILE="/opt/argus-metric/.dns_sync.log" +REMOTE_DNS_CONF_URL="" -# 从环境变量或配置文件获取 FTP 服务器信息 +# 获取 FTP 配置 get_ftp_config() { - # 优先从环境变量获取配置 log_info "获取 FTP 配置信息..." - - # 如果环境变量中没有设置,则尝试从配置文件读取 if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then - local config_file="$SCRIPT_DIR/config.env" - if [[ -f "$config_file" ]]; then - log_info "从配置文件读取 FTP 配置: $config_file" - source "$config_file" - fi - else - log_info "使用环境变量中的 FTP 配置" + [[ -f "$SCRIPT_DIR/config.env" ]] && source "$SCRIPT_DIR/config.env" fi - - # 设置默认值(如果环境变量和配置文件都没有设置) FTP_SERVER="${FTP_SERVER:-localhost}" FTP_USER="${FTP_USER:-ftpuser}" FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" - - # 构建远程 DNS 配置文件 URL REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf" - - log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}" } -# 下载远程 DNS 配置文件 +# 下载远程 dns.conf download_remote_dns_conf() { - local temp_file="/tmp/dns.conf.remote.$$" - - log_info "从 FTP 服务器下载 DNS 配置文件..." - log_info "远程地址: $REMOTE_DNS_CONF_URL" - log_info "FTP 服务器: $FTP_SERVER" - log_info "FTP 用户: $FTP_USER" - - # 先测试 FTP 连接 + local tmp="/tmp/dns.remote.$$" log_info "测试 FTP 连接..." 
- if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then - log_success "FTP 服务器连接成功" - else - log_error "无法连接到 FTP 服务器: $FTP_SERVER" - log_error "请检查:" - log_error " 1. FTP 服务器是否运行" - log_error " 2. 网络连接是否正常" - log_error " 3. 服务器地址是否正确" - return 1 + if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null; then + log_error "无法连接到 FTP 服务器: $FTP_SERVER"; return 1 fi - - # 测试 dns.conf 文件是否存在 - log_info "检查远程 dns.conf 文件是否存在..." - if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/dns.conf" >/dev/null 2>&1; then - log_success "远程 dns.conf 文件存在" - else - log_error "远程 dns.conf 文件不存在或无法访问" - log_error "请检查 FTP 服务器根目录下是否有 dns.conf 文件" - return 1 - fi - - # 尝试下载文件 - log_info "开始下载 dns.conf 文件..." - if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$temp_file" 2>/dev/null; then - log_success "远程 DNS 配置文件下载成功" - echo "$temp_file" - else - log_error "下载 dns.conf 文件失败" - log_error "尝试手动测试命令:" - log_error " curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_SERVER}/dns.conf" - rm -f "$temp_file" - return 1 + if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$tmp" 2>/dev/null; then + log_error "下载 dns.conf 失败"; rm -f "$tmp"; return 1 fi + echo "$tmp" } -# 比较两个文件是否相同 -compare_files() { - local file1="$1" - local file2="$2" - - if [[ ! -f "$file1" || ! -f "$file2" ]]; then - return 1 - fi - - # 使用 diff 比较文件内容 - if diff -q "$file1" "$file2" >/dev/null 2>&1; then - return 0 # 文件相同 - else - return 1 # 文件不同 - fi +# 文件比较 +compare_files() { diff -q "$1" "$2" >/dev/null 2>&1; } + +# 从 dns.conf 提取有效 IP +get_dns_ips() { + grep -Eo '^[0-9]{1,3}(\.[0-9]{1,3}){3}$' "$1" | sort -u } -# 将 DNS 配置追加到 /etc/resolv.conf +# 安全更新 resolv.conf(保留符号链接) update_resolv_conf() { - local dns_conf_file="$1" - - log_info "更新 /etc/resolv.conf 文件..." - - # 备份原始文件 - if [[ -f "$RESOLV_CONF" ]]; then - cp "$RESOLV_CONF" "${RESOLV_CONF}.backup.$(date +%Y%m%d_%H%M%S)" - log_info "已备份原始 resolv.conf 文件" + local dns_conf="$1" + local dns_ips + mapfile -t dns_ips < <(get_dns_ips "$dns_conf") + [[ ${#dns_ips[@]} -eq 0 ]] && { log_warning "未检测到有效 DNS"; return; } + + local target_file="$RESOLV_CONF" + if [[ ! -w "$RESOLV_CONF" ]]; then + log_warning "/etc/resolv.conf 不可写,使用兜底路径 $ALT_RESOLV_CONF" + target_file="$ALT_RESOLV_CONF" fi - - # 读取 DNS 配置文件并追加到 resolv.conf - while IFS= read -r line; do - # 跳过空行和注释行 - [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue - - # 验证是否为有效的 IP 地址 - if [[ "$line" =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then - # 检查是否已存在相同的 nameserver 行 - if ! 
grep -q "nameserver $line" "$RESOLV_CONF" 2>/dev/null; then - echo "nameserver $line" >> "$RESOLV_CONF" - log_info "添加 DNS 服务器: $line" - else - log_info "DNS 服务器已存在,跳过: $line" - fi - else - log_warning "跳过无效的 DNS 地址: $line" + + local temp="/tmp/resolv.new.$$" + cp "$target_file" "${target_file}.backup.$(date +%Y%m%d_%H%M%S)" 2>/dev/null || true + log_info "更新 DNS 配置文件: $target_file" + + # 写入新的 nameserver 行 + for ip in "${dns_ips[@]}"; do + echo "nameserver $ip" + done >"$temp" + + # 追加原内容(去掉重复 nameserver) + grep -v '^nameserver' "$target_file" >>"$temp" 2>/dev/null || true + awk '!a[$0]++' "$temp" >"${temp}.uniq" + + # ⚙️ 使用 cat 原地覆盖,避免 mv 引发 “设备忙” + if cat "${temp}.uniq" >"$target_file" 2>/dev/null; then + chmod 644 "$target_file" + log_success "DNS 更新完成: ${dns_ips[*]}" + else + log_error "无法写入 $target_file,可能被系统锁定" + fi + + rm -f "$temp" "${temp}.uniq" +} + +# 检查 resolv.conf 是否包含 dns.conf 内容 +ensure_dns_in_resolv() { + local dns_conf="$1" + local dns_ips + mapfile -t dns_ips < <(get_dns_ips "$dns_conf") + [[ ${#dns_ips[@]} -eq 0 ]] && return + + for ip in "${dns_ips[@]}"; do + if ! grep -q "nameserver $ip" "$RESOLV_CONF" 2>/dev/null; then + log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复" + update_resolv_conf "$dns_conf" + return fi - done < "$dns_conf_file" - - # 设置文件权限 - chmod 644 "$RESOLV_CONF" - - log_success "/etc/resolv.conf 文件更新完成" + done + log_info "/etc/resolv.conf 已包含所有 DNS" } -# 记录同步日志 -log_sync() { - local message="$1" - local timestamp=$(date '+%Y-%m-%d %H:%M:%S') - echo "[$timestamp] $message" >> "$LOG_FILE" -} +log_sync() { echo "[$(date '+%F %T')] $1" >>"$LOG_FILE"; } -# 主函数 main() { log_info "开始 DNS 同步检查..." - log_sync "DNS 同步检查开始" - - # 确保系统目录存在 - mkdir -p "/opt/argus-metric" - - # 获取 FTP 配置 + mkdir -p /opt/argus-metric + get_ftp_config - - # 检查本地 DNS 配置文件是否存在 + local remote_file + if ! remote_file=$(download_remote_dns_conf); then + log_error "下载失败"; log_sync "同步失败"; exit 1 + fi + if [[ ! -f "$LOCAL_DNS_CONF" ]]; then - log_warning "本地 DNS 配置文件不存在: $LOCAL_DNS_CONF" - log_warning "将下载远程配置文件并更新系统 DNS 设置" - - # 下载远程配置文件 - if remote_file=$(download_remote_dns_conf); then - # 复制到本地 - cp "$remote_file" "$LOCAL_DNS_CONF" - log_success "远程 DNS 配置文件已保存到本地" - - # 更新 resolv.conf - update_resolv_conf "$LOCAL_DNS_CONF" - log_sync "首次同步完成,DNS 配置已更新" - - # 清理临时文件 - rm -f "$remote_file" - else - log_error "无法下载远程 DNS 配置文件,同步失败" - log_sync "同步失败:无法下载远程配置文件" - exit 1 - fi + log_info "本地 dns.conf 不存在,初始化..." + cp "$remote_file" "$LOCAL_DNS_CONF" + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "首次同步完成" else - log_info "本地 DNS 配置文件存在: $LOCAL_DNS_CONF" - - # 下载远程配置文件进行比较 - if remote_file=$(download_remote_dns_conf); then - # 比较文件 - if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then - log_info "DNS 配置文件无变化,无需更新" - log_sync "DNS 配置文件无变化" - else - log_info "检测到 DNS 配置文件有变化,开始同步..." 
- log_sync "检测到 DNS 配置文件变化,开始同步" - - # 更新本地配置文件 - cp "$remote_file" "$LOCAL_DNS_CONF" - log_success "本地 DNS 配置文件已更新" - - # 更新 resolv.conf - update_resolv_conf "$LOCAL_DNS_CONF" - log_sync "DNS 配置同步完成" - fi - - # 清理临时文件 - rm -f "$remote_file" + if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then + log_info "dns.conf 无变化" + ensure_dns_in_resolv "$LOCAL_DNS_CONF" + log_sync "dns.conf 无变化,执行兜底检查" else - log_error "无法下载远程 DNS 配置文件,跳过本次同步" - log_sync "同步失败:无法下载远程配置文件" - exit 1 + log_info "检测到 DNS 配置更新" + cp "$remote_file" "$LOCAL_DNS_CONF" + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "DNS 配置同步完成" fi fi - - log_success "DNS 同步检查完成" - log_sync "DNS 同步检查完成" + + rm -f "$remote_file" + log_success "DNS 同步流程完成" } -# 脚本入口 if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then main "$@" fi diff --git a/src/metric/ftp/build/Dockerfile b/src/metric/ftp/build/Dockerfile index b691843..5d11e10 100644 --- a/src/metric/ftp/build/Dockerfile +++ b/src/metric/ftp/build/Dockerfile @@ -31,25 +31,26 @@ RUN mkdir -p /var/log/supervisor ENV FTP_BASE_PATH=/private/argus/ftp # 设置域名环境变量 -ENV DOMAIN=prom.ftp.argus.com +ENV DOMAIN=ftp.metric.argus.com # 设置FTP用户密码环境变量 ENV FTP_PASSWORD=ZGClab1234! # 设置用户和组ID环境变量 -ARG FTP_UID=2133 -ARG FTP_GID=2015 -ENV FTP_UID=${FTP_UID} -ENV FTP_GID=${FTP_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 + +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + ARGUS_BUILD_GID=${ARGUS_BUILD_GID} # 创建FTP用户和目录结构 -RUN groupadd -g ${FTP_GID} ftpuser && \ - useradd -u ${FTP_UID} -g ${FTP_GID} -d ${FTP_BASE_PATH}/share -s /bin/bash ftpuser && \ +RUN groupadd -g ${ARGUS_BUILD_GID} ftpuser && \ + useradd -u ${ARGUS_BUILD_UID} -g ${ARGUS_BUILD_GID} -d ${FTP_BASE_PATH}/share -s /bin/bash ftpuser && \ mkdir -p ${FTP_BASE_PATH}/share \ && mkdir -p /private/argus/etc \ && mkdir -p /var/log/vsftpd \ - && mkdir -p /var/run/vsftpd/empty \ - && chown -R ftpuser:ftpuser ${FTP_BASE_PATH} + && chown -R ftpuser:ftpuser ${FTP_BASE_PATH} \ + && mkdir -p /var/run/vsftpd/empty # 创建vsftpd配置目录和用户列表文件 RUN mkdir -p /etc/vsftpd && \ diff --git a/src/metric/ftp/build/start-ftp-supervised.sh b/src/metric/ftp/build/start-ftp-supervised.sh index fb0a213..57d0e6d 100644 --- a/src/metric/ftp/build/start-ftp-supervised.sh +++ b/src/metric/ftp/build/start-ftp-supervised.sh @@ -32,6 +32,9 @@ IP=$(ifconfig eth0 | awk '/inet /{print $2}' || hostname -i) echo "current IP: ${IP}" echo "${IP}" > /private/argus/etc/${DOMAIN} +chown ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID} /private/argus/etc/${DOMAIN} +chmod +x /private/argus/etc/${DOMAIN} + # 启动vsftpd echo "[INFO] Starting vsftpd..." 
exec /usr/sbin/vsftpd /tmp/vsftpd.conf diff --git a/src/metric/grafana/build/Dockerfile b/src/metric/grafana/build/Dockerfile index 82ba4fa..c9212dc 100644 --- a/src/metric/grafana/build/Dockerfile +++ b/src/metric/grafana/build/Dockerfile @@ -17,30 +17,31 @@ RUN mkdir -p /var/log/supervisor ENV GRAFANA_BASE_PATH=/private/argus/metric/grafana # 设置用户和组ID环境变量 -ARG GRAFANA_UID=2133 -ARG GRAFANA_GID=2015 -ENV GRAFANA_UID=${GRAFANA_UID} -ENV GRAFANA_GID=${GRAFANA_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 + +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + ARGUS_BUILD_GID=${ARGUS_BUILD_GID} # 创建基本目录结构 RUN mkdir -p /private/argus/etc \ - && mkdir -p /private/argus/metric/grafana/data \ - && mkdir -p /private/argus/metric/grafana/logs \ - && mkdir -p /private/argus/metric/grafana/plugins \ - && mkdir -p /private/argus/metric/grafana/provisioning/datasources \ - && mkdir -p /private/argus/metric/grafana/provisioning/dashboards \ - && mkdir -p /private/argus/metric/grafana/data/sessions \ - && mkdir -p /private/argus/metric/grafana/data/dashboards \ - && mkdir -p /private/argus/metric/grafana/config \ + && mkdir -p ${GRAFANA_BASE_PATH}/data \ + && mkdir -p ${GRAFANA_BASE_PATH}/logs \ + && mkdir -p ${GRAFANA_BASE_PATH}/plugins \ + && mkdir -p ${GRAFANA_BASE_PATH}/provisioning/datasources \ + && mkdir -p ${GRAFANA_BASE_PATH}/provisioning/dashboards \ + && mkdir -p ${GRAFANA_BASE_PATH}/data/sessions \ + && mkdir -p ${GRAFANA_BASE_PATH}/data/dashboards \ + && mkdir -p ${GRAFANA_BASE_PATH}/config \ && mkdir -p /etc/grafana \ && mkdir -p /var/lib/grafana \ && mkdir -p /var/log/grafana # 修改 Grafana 用户 UID/GID 并授权 RUN deluser grafana && \ - addgroup -g ${GRAFANA_GID} grafana && \ - adduser -u ${GRAFANA_UID} -G grafana -s /bin/sh -D grafana && \ - chown -R grafana:grafana /var/lib/grafana /etc/grafana /var/log/grafana /private/argus + addgroup -g ${ARGUS_BUILD_GID} grafana && \ + adduser -u ${ARGUS_BUILD_UID} -G grafana -s /bin/sh -D grafana && \ + chown -R grafana:grafana /var/lib/grafana /etc/grafana /var/log/grafana ${GRAFANA_BASE_PATH} # 复制配置文件到容器内临时位置 COPY grafana.ini /tmp/grafana.ini diff --git a/src/metric/grafana/build/dashboards/default_cluster_dashboard.json b/src/metric/grafana/build/dashboards/default_cluster_dashboard.json new file mode 100644 index 0000000..06ef418 --- /dev/null +++ b/src/metric/grafana/build/dashboards/default_cluster_dashboard.json @@ -0,0 +1,570 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode='idle'}[5m])))", + 
"refId": "A" + } + ], + "title": "CPU 平均利用率(%)", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg(1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes)) * 100", + "refId": "A" + } + ], + "title": "内存平均利用率(%)", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "count(count by(hostname) (up{job='node'} == 1))", + "refId": "A" + } + ], + "title": "节点在线数", + "type": "stat" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg by (hostname) (DCGM_FI_DEV_GPU_UTIL)", + "refId": "A" + } + ], + "title": "GPU 平均利用率 (%)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 5 + }, + "id": 12, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "round(avg(DCGM_FI_DEV_FB_USED{job='dcgm'}/(DCGM_FI_DEV_FB_USED{job='dcgm'} + DCGM_FI_DEV_FB_FREE{job='dcgm'})) * 100)", + "refId": "A" + } + ], + "title": "显存平均利用率 (%)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + 
"min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 5 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg by (hostname) (DCGM_FI_DEV_GPU_TEMP)", + "refId": "A" + } + ], + "title": "GPU 温度 (℃)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 300, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "orange", + "value": 200 + }, + { + "color": "red", + "value": 300 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "avg by (hostname) (DCGM_FI_DEV_POWER_USAGE)", + "refId": "A" + } + ], + "title": "GPU 平均实时功耗 (W)", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": { + "align": "center", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 11, + "options": { + "cellHeight": "sm", + "cellLinks": [ + { + "title": "跳转至节点详情", + "url": "http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics?orgId=1&refresh=15s&var-hostname=${__data.fields.hostname}" + } + ], + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "GPU 使用率" + } + ] + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "expr": "up{job=\"dcgm\"} + on(hostname) group_left(ip, node_id) up{job=\"dcgm\"}*0", + "format": "table", + "instant": true, + "refId": "node_info" + }, + { + "expr": "round(100 - avg by(hostname)(rate(node_cpu_seconds_total{job=\"node\",mode=\"idle\"}[5m])) * 100, 0.1)", + "format": "table", + "instant": true, + "refId": "CPU" + }, + { + "expr": "round((1 - avg by(hostname)(node_memory_MemAvailable_bytes{job=\"node\"} / node_memory_MemTotal_bytes{job=\"node\"})) * 100, 0.1)", + "format": "table", + "instant": true, + "refId": "MEM" + }, + { + "expr": "round(avg by(hostname)(DCGM_FI_DEV_GPU_UTIL{job=\"dcgm\"}), 0.1)", + "format": "table", + "instant": true, + "refId": "GPU_UTIL" + }, + { + "expr": "round(avg by(hostname)(DCGM_FI_DEV_FB_USED{job=\"dcgm\"} / (DCGM_FI_DEV_FB_USED{job=\"dcgm\"} + DCGM_FI_DEV_FB_FREE{job=\"dcgm\"}) * 100), 0.1)", + "format": "table", + "instant": true, + "refId": "GPU_MEM" + } + ], + "title": "节点列表(CPU / 内存 / GPU)", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "hostname" + } + }, + { + "id": 
"organize", + "options": { + "excludeByName": { + "Time": true, + "Value #node_info": true, + "hostname_1": true, + "hostname_2": true, + "hostname_3": true, + "instance": true, + "ip_1": true, + "job": true, + "node_id_1": true + }, + "indexByName": { + "CPU 使用率": 3, + "GPU 使用率": 5, + "GPU 显存占用": 6, + "IP 地址": 1, + "主机名": 0, + "内存使用率": 4, + "节点 ID": 2 + }, + "renameByName": { + "Value #CPU": "CPU 使用率", + "Value #GPU_MEM": "GPU 显存占用", + "Value #GPU_UTIL": "GPU 使用率", + "Value #MEM": "内存使用率", + "hostname": "主机名", + "ip": "IP 地址", + "node_id": "节点 ID", + "user_id": "用户ID" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [ + "cluster", + "gpu", + "system" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Cluster Dashboard", + "uid": "cluster-dashboard", + "version": 34, + "weekStart": "" +} \ No newline at end of file diff --git a/src/metric/grafana/build/start-grafana-supervised.sh b/src/metric/grafana/build/start-grafana-supervised.sh index 5d4b8a1..95bb267 100644 --- a/src/metric/grafana/build/start-grafana-supervised.sh +++ b/src/metric/grafana/build/start-grafana-supervised.sh @@ -9,6 +9,7 @@ DOMAIN=grafana.metric.argus.com IP=$(ifconfig | awk '/inet / && $2 != "127.0.0.1" {print $2; exit}') echo "current IP: ${IP}" echo "${IP}" > /private/argus/etc/${DOMAIN} +chmod +x /private/argus/etc/${DOMAIN} # 确保必要目录存在(权限已在 Dockerfile 中设置) mkdir -p /private/argus/metric/grafana/data @@ -27,7 +28,6 @@ mkdir -p /var/lib/grafana if [ -f "/tmp/grafana.ini" ]; then echo "[INFO] Copying grafana.ini to /private/argus/metric/grafana/config/" cp /tmp/grafana.ini /private/argus/metric/grafana/config/grafana.ini - chown grafana:grafana /private/argus/metric/grafana/config/grafana.ini echo "[INFO] Grafana configuration copied successfully" fi @@ -47,12 +47,9 @@ fi if [ -f "/tmp/datasources.yml" ]; then echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/" cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml - chown grafana:grafana /private/argus/metric/grafana/provisioning/datasources/datasources.yml echo "[INFO] Datasource configuration copied successfully" elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources" - # 确保数据源配置目录权限正确 - chown -R grafana:grafana /private/argus/metric/grafana/provisioning/datasources elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources" # 确保数据源配置目录权限正确 @@ -65,7 +62,6 @@ fi if [ -f "/tmp/dashboards.yml" ]; then echo "[INFO] Copying dashboard configuration to /private/argus/metric/grafana/provisioning/dashboards/" cp /tmp/dashboards.yml /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml - chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml echo "[INFO] Dashboard configuration copied successfully" fi @@ -73,13 +69,9 @@ fi if [ -f "/tmp/default_dashboard.json" ]; then echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/" cp /tmp/default_dashboard.json 
/private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json - chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json echo "[INFO] Default dashboard copied successfully" fi -# 确保所有配置目录权限正确 -chown -R grafana:grafana /private/argus/metric/grafana/provisioning/ - # 启动 Grafana if [ -n "$CONFIG_FILE" ]; then echo "[INFO] Starting Grafana with custom configuration..." diff --git a/src/metric/prometheus/build/Dockerfile b/src/metric/prometheus/build/Dockerfile index e2195a8..9e609f3 100755 --- a/src/metric/prometheus/build/Dockerfile +++ b/src/metric/prometheus/build/Dockerfile @@ -48,11 +48,11 @@ RUN mkdir -p /var/log/supervisor ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus # 设置用户和组ID环境变量 -ARG PROMETHEUS_UID=2133 -ARG PROMETHEUS_GID=2015 -ENV PROMETHEUS_UID=${PROMETHEUS_UID} -ENV PROMETHEUS_GID=${PROMETHEUS_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + ARGUS_BUILD_GID=${ARGUS_BUILD_GID} # 创建目录结构 RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \ && mkdir -p ${PROMETHEUS_BASE_PATH}/targets \ @@ -61,11 +61,11 @@ RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \ && ln -s ${PROMETHEUS_BASE_PATH} /prometheus # 修改 Prometheus 用户 UID/GID 并授权 -RUN usermod -u ${PROMETHEUS_UID} nobody && \ - groupmod -g ${PROMETHEUS_GID} nogroup && \ +RUN usermod -u ${ARGUS_BUILD_UID} nobody && \ + groupmod -g ${ARGUS_BUILD_GID} nogroup && \ chown -h nobody:nogroup /prometheus && \ - chown -R nobody:nogroup /private/argus/metric /etc/prometheus && \ - chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} + chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} && \ + chown -R nobody:nogroup /etc/prometheus # supervisor 配置 COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf diff --git a/src/metric/prometheus/build/prometheus.yml b/src/metric/prometheus/build/prometheus.yml index e3e4403..f813127 100755 --- a/src/metric/prometheus/build/prometheus.yml +++ b/src/metric/prometheus/build/prometheus.yml @@ -7,7 +7,7 @@ global: alerting: alertmanagers: - static_configs: - - targets: [] + - targets: ["alertmanager.alert.argus.com:9093"] # 规则目录 rule_files: diff --git a/src/metric/prometheus/build/start-prometheus-supervised.sh b/src/metric/prometheus/build/start-prometheus-supervised.sh index 75d9a39..2233a9a 100755 --- a/src/metric/prometheus/build/start-prometheus-supervised.sh +++ b/src/metric/prometheus/build/start-prometheus-supervised.sh @@ -17,6 +17,7 @@ sed "s|\${PROMETHEUS_BASE_PATH}|${PROMETHEUS_BASE_PATH}|g" \ IP=$(ifconfig eth0 | awk '/inet /{print $2}') echo "current IP: ${IP}" echo "${IP}" > /private/argus/etc/${DOMAIN} +chmod +x /private/argus/etc/${DOMAIN} exec /bin/prometheus \ --config.file=${PROMETHEUS_BASE_PATH}/prometheus.yml \ diff --git a/src/metric/tests/.gitignore b/src/metric/tests/.gitignore index b73f619..62f84ef 100644 --- a/src/metric/tests/.gitignore +++ b/src/metric/tests/.gitignore @@ -1,7 +1,7 @@ .env data/ images-cache/ +private-test-node/ *.tar *.log .DS_Store - diff --git a/src/metric/tests/README.md b/src/metric/tests/README.md index 2898e45..a0bccbd 100644 --- a/src/metric/tests/README.md +++ b/src/metric/tests/README.md @@ -1,171 +1,97 @@ # E2E Test - Argus Metric 部署测试 -## 概述 +## 1. 
概述 本项目用于对 Argus Metric 模块进行端到端(E2E)部署测试。 通过一键脚本可快速搭建 Prometheus、FTP、Grafana 等服务,验证 Metric 模块的完整部署与运行流程。 -## 拉取完整项目,进入 metric.tests 目录 +功能包括: -``` bash -git clone https://git.nasp.fit/NASP/argus.git +- 自动启动所需服务和测试节点 +- 发布安装包到 FTP +- CPU/GPU 节点客户端安装测试 +- 验证安装结果与服务可用性 +- 支持环境清理和分步调试 -cd {$PROJECT_ROOT}/argus/src/metric/tests +## 2. 前置条件 + +在开始部署和测试之前,请确保完成以下准备工作: + +### 2.1 检查 all-in-one-full 客户端安装包 +确认客户端安装包目录是否存在: +```bash +{$PROJECT_ROOT}/argus/src/metric/client-plugins/all-in-one-full ``` +本项目依赖完整的 all-in-one-full 安装包,其中包含大量二进制文件、依赖包和测试制品,由于体积较大,无法直接上传到 Git 仓库。**请联系项目管理员获取最新版本的完整框架。** -## 一键构建与部署 Prometheus / FTP / Grafana -### 1. 修改环境变量文件 - -将示例配置文件复制为 .env 并根据实际情况修改: - -``` bash +### 2.2 配置环境变量 +查看配置文件是否存在,如不存在,则复制示例配置文件并根据实际环境修改: +```bash +cd {$PROJECT_ROOT}/argus/src/metric/tests cp env.example .env ``` +.env 文件用于指定构建UID:GID、FTP 配置、版本号等信息,确保各脚本运行时可以正确访问资源。 -### 2. 一键启动服务 +### 2.3 离线镜像准备 + - 步骤1:在**在线服务器**执行以下脚本,会拉取和构建所需的 Docker 镜像: + ``` bash + cd {$PROJECT_ROOT}/argus/src/metric/tests + bash scripts/01_start_services.sh + bash scripts/save-images.sh + ``` + - 步骤2:镜像将被保存到 metric.tests.images-cache 目录中,用于离线迁移和后续导入。 + - 步骤3:若目标服务器无法联网,可将该目录拷贝到离线服务器,并执行: + ``` bash + cd {$PROJECT_ROOT}/argus/src/metric/tests + bash scripts/load-images.sh + ``` + - 即可导入镜像并执行下面的QuickStart或分步操作。 -执行以下命令完成环境初始化、镜像构建与服务启动: +## 3. QuickStart -``` bash -sudo bash start-all.sh +执行完整的端到端测试流程: + +```bash +bash scripts/00_e2e_test.sh ``` -该脚本将自动完成: -- 初始化目录结构(如 /private/argus/metric) -- 构建各服务 Docker 镜像 -- 启动 Prometheus、FTP、Grafana 容器 +该脚本将自动执行以下步骤: +1. 启动所有服务(Prometheus、FTP、Grafana、测试节点) +2. 发布安装包到 FTP 服务 +3. 在 CPU 测试节点上安装客户端 +4. 在 GPU 测试节点上安装客户端 +5. 验证安装结果 +6. 清理测试环境 -### 3. 检查容器日志 +## 4. 分步执行 -可手动验证容器运行状态: +| 步骤 | 脚本 | 功能描述 | +|--------------|-------------------------------------------|--------------------------------------------------------| +| 启动基础服务 | bash scripts/01_start_services.sh | 构建 Docker 镜像、创建持久化目录、启动容器服务 | +| 发布安装包 | bash scripts/02_publish_artifact.sh | 自动递增版本号、打包安装制品、发布到 FTP | +| CPU 节点安装 | bash scripts/03_test_node_install.sh | 在 CPU 节点下载安装程序并执行安装 | +| GPU 节点安装 | bash scripts/04_test_gpu_node_install.sh | 在 GPU 节点下载安装程序并执行安装 | +| 验证安装 | bash scripts/05_verify_install.sh | 检查监控端口、端口连通性及服务可用性 | +| 清理环境 | bash scripts/06_cleanup.sh | 停止并清理所有测试容器及环境 | -``` bash -docker logs argus-metric-ftp -docker logs argus-metric-grafana -docker logs argus-metric-prometheus -``` +## 5. 查看监控采集数据及展示面板 -如日志输出中无 ERROR 或 supervisor 报错信息,则表示服务启动正常。 - -## 客户端安装包打包与分发 - -> **前置说明**:完整的 `all-in-one` 安装包打包分发框架因包含大量二进制文件和依赖包,无法上传至 Git 仓库。请先联系项目管理员获取最新的 `all-in-one` 完整框架,再执行后续操作。 - -打包后服务端会将安装包发布至 FTP 共享目录,默认路径为: - -``` bash -$DATA_ROOT/ftp/share -``` - -发布后的文件权限与 FTP 目录账户保持一致。 - -### 1. 递增版本号 -``` bash -bash scripts/version-manager.sh bump minor -``` -该脚本会自动更新版本号(如 1.101.0 → 1.102.0)。 - -### 2. 打包安装制品 -``` bash -bash scripts/package_artifact.sh -``` -执行后会在输出目录中生成压缩包或安装脚本。 - -### 3. 
发布制品至 FTP -``` bash -sudo bash scripts/publish_artifact.sh $VERSION --output-dir $OUTPUT_DIR --owner $UID:$GID -``` - -参数说明: - -参数 说明 -$VERSION 发布版本号(如 1.102.0) -$OUTPUT_DIR 输出目录(默认 /private/argus/ftp/share) -$UID:$GID 文件属主(用户ID:组ID) - -示例: - -``` bash -sudo bash scripts/publish_artifact.sh 1.102.0 --output-dir /private/argus/ftp/share --owner 2133:2015 -``` - -更多详情可参考 client-plugins/all-in-one/README.md。 - -## 客户端安装(通过 FTP) - -客户端下载与安装步骤如下: - -``` bash -curl -u ${USER}:${PASSWD} ftp://${FTP_SERVER}/setup.sh -o setup.sh -chmod +x setup.sh -sudo bash setup.sh --server ${FTP_SERVER} --user ${USER} --password ${PASSWD} --port ${PORT} -``` - -参数说明: - -参数 说明 -$FTP_SERVER 服务器 IP 地址 -$USER 默认 ftpuser -$PASSWD 默认 ZGClab1234! -$PORT FTP 服务端口(需与 .env 保持一致) - -示例: - -``` bash -curl -u ftpuser:ZGClab1234! ftp://10.211.55.4/setup.sh -o setup.sh -chmod +x setup.sh -sudo bash setup.sh --server 10.211.55.4 --user ftpuser --password 'ZGClab1234!' --port 2122 -``` - -更多细节可参考 client-plugins/all-in-one/README.md。 - -## 模拟 Argus-Master 配置下发 - -可通过手动写入 nodes.json 文件模拟 Argus-Master 对 Argus-Metric 的配置下发: - -``` json -[ - { - "node_id": "A1", - "user_id": "sundapeng", - "ip": "10.211.55.4", - "hostname": "dev-sundapeng-nsche-wohen-pod-0", - "labels": ["label-a", "label-b"] - } -] -``` - -路径: - -``` bash -${DATA_ROOT}/prometheus/nodes.json -``` - -Argus-Metric 中的 prometheus 模块会自动解析该文件,并将其拆分生成目标配置: - -``` bash -${DATA_ROOT}/prometheus/targets/ -``` - -## Grafana 手动配置(如未自动接入 Prometheus) - -如 Grafana 未自动导入 Prometheus 数据源,可手动执行以下操作: - -1. 添加数据源 -- 进入 Grafana → Data sources -- 选择 Add data source → Prometheus -- URL 填写:http://prometheus:9090(Docker 内部 DNS 地址) - -2. 导入测试 Dashboard -- 打开 Grafana → Dashboards → Import -- 上传或粘贴 test_grafana_dashboard.json - -## 查看监控数据 Prometheus 访问以下地址查看节点活性: ``` bash -http://127.0.0.1:9091/targets +http://127.0.0.1:9090/targets ``` Grafana 访问以下地址查看监控大屏: ``` bash http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics -``` \ No newline at end of file +``` + +PS: 如果 Grafana 未自动导入 Prometheus 数据源,可手动执行以下操作: + +1. 添加数据源 +- 进入 Grafana → Data sources +- 选择 Add data source → Prometheus +- URL 填写:http://prom.metric.argus.com:9090 + +2. 
导入测试 Dashboard +- 打开 Grafana → Dashboards → Import +- 上传或粘贴 test_grafana_dashboard.json \ No newline at end of file diff --git a/src/metric/tests/client-test-gpu-node/build/Dockerfile b/src/metric/tests/client-test-gpu-node/build/Dockerfile new file mode 100644 index 0000000..8a64a87 --- /dev/null +++ b/src/metric/tests/client-test-gpu-node/build/Dockerfile @@ -0,0 +1,39 @@ +# 使用NVIDIA官方CUDA基础镜像 +FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +# 设置时区 +ENV TZ=Asia/Shanghai + +RUN apt-get update -qq && \ + apt-get install -y -qq \ + tzdata \ + curl \ + wget \ + gnupg2 \ + software-properties-common \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# 配置时区 +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +WORKDIR /app + +# 创建启动脚本,在运行时验证GPU +COPY < /dev/null; then + nvidia-smi + echo "GPU环境正常" +else + echo "警告: nvidia-smi 命令不可用,请确保容器运行时启用了GPU支持" +fi +exec "\$@" +EOF + +RUN chmod +x /app/start.sh + +CMD ["/app/start.sh", "/bin/bash"] diff --git a/src/metric/tests/client-test-node/build/Dockerfile b/src/metric/tests/client-test-node/build/Dockerfile new file mode 100644 index 0000000..e72dc1c --- /dev/null +++ b/src/metric/tests/client-test-node/build/Dockerfile @@ -0,0 +1,6 @@ +FROM ubuntu:22.04 +RUN apt-get update -qq && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq tzdata && \ + rm -rf /var/lib/apt/lists/* +ENV TZ=Asia/Shanghai + diff --git a/src/metric/tests/docker-compose.yml b/src/metric/tests/docker-compose.yml index 45ea0ac..d05853b 100644 --- a/src/metric/tests/docker-compose.yml +++ b/src/metric/tests/docker-compose.yml @@ -1,29 +1,39 @@ +networks: + default: + name: argus-debug-net + external: true + services: ftp: build: context: ../ftp/build dockerfile: Dockerfile args: - FTP_UID: ${FTP_UID:-2133} - FTP_GID: ${FTP_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} + USE_INTRANET: ${USE_INTRANET:-false} image: argus-metric-ftp:latest container_name: argus-ftp restart: unless-stopped environment: + - TZ=Asia/Shanghai - FTP_BASE_PATH=/private/argus/ftp - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} - - DOMAIN=${FTP_DOMAIN:-prom.ftp.argus.com} - - FTP_UID=${FTP_UID:-2133} - - FTP_GID=${FTP_GID:-2015} + - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "${FTP_PORT:-21}:21" - "${FTP_DATA_PORT:-20}:20" - "21100-21110:21100-21110" volumes: - - ${DATA_ROOT:-./data}/ftp:/private/argus/ftp - - ${DATA_ROOT:-./data}/etc:/private/argus/etc + - ${DATA_ROOT:-/private}/argus/metric/ftp:/private/argus/ftp + - ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro networks: - - argus-network + default: + ipv4_address: 172.30.0.40 logging: driver: "json-file" options: @@ -35,23 +45,27 @@ services: context: ../prometheus/build dockerfile: Dockerfile args: - PROMETHEUS_UID: ${PROMETHEUS_UID:-2133} - PROMETHEUS_GID: ${PROMETHEUS_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} USE_INTRANET: ${USE_INTRANET:-false} image: argus-metric-prometheus:latest container_name: argus-prometheus restart: unless-stopped environment: + - TZ=Asia/Shanghai - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus - - PROMETHEUS_UID=${PROMETHEUS_UID:-2133} - - PROMETHEUS_GID=${PROMETHEUS_GID:-2015} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - 
ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "${PROMETHEUS_PORT:-9090}:9090" volumes: - - ${DATA_ROOT:-./data}/prometheus:/private/argus/metric/prometheus - - ${DATA_ROOT:-./data}/etc:/private/argus/etc + - ${DATA_ROOT:-/private}/argus/metric/prometheus:/private/argus/metric/prometheus + - ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro networks: - - argus-network + default: + ipv4_address: 172.30.0.41 logging: driver: "json-file" options: @@ -63,25 +77,29 @@ services: context: ../grafana/build dockerfile: Dockerfile args: - GRAFANA_UID: ${GRAFANA_UID:-2133} - GRAFANA_GID: ${GRAFANA_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} image: argus-metric-grafana:latest container_name: argus-grafana restart: unless-stopped environment: + - TZ=Asia/Shanghai - GRAFANA_BASE_PATH=/private/argus/metric/grafana - - GRAFANA_UID=${GRAFANA_UID:-2133} - - GRAFANA_GID=${GRAFANA_GID:-2015} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} - GF_SERVER_HTTP_PORT=3000 - GF_LOG_LEVEL=warn - GF_LOG_MODE=console ports: - "${GRAFANA_PORT:-3000}:3000" volumes: - - ${DATA_ROOT:-./data}/grafana:/private/argus/metric/grafana - - ${DATA_ROOT:-./data}/etc:/private/argus/etc + - ${DATA_ROOT:-/private}/argus/metric/grafana:/private/argus/metric/grafana + - ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro networks: - - argus-network + default: + ipv4_address: 172.30.0.42 depends_on: - prometheus logging: @@ -90,16 +108,78 @@ services: max-size: "10m" max-file: "3" -networks: - argus-network: - driver: bridge - name: argus-network + test-node: + build: + context: ./client-test-node/build + dockerfile: Dockerfile + image: argus-metric-test-node:latest + container_name: argus-metric-test-node + hostname: test-metric-node-001 + restart: unless-stopped + privileged: true + depends_on: + - ftp + - prometheus + environment: + - TZ=Asia/Shanghai + - DEBIAN_FRONTEND=noninteractive + - FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} + - FTP_SERVER=${FTP_SERVER:-172.30.0.40} + - FTP_USER=${FTP_USER:-ftpuser} + - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} + - FTP_PORT=${FTP_PORT:-21} + volumes: + - ${DATA_ROOT:-/private}/argus/agent:/private/argus/agent + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + command: sleep infinity + networks: + default: + ipv4_address: 172.30.0.50 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" -volumes: - ftp_data: - driver: local - prometheus_data: - driver: local - grafana_data: - driver: local + test-gpu-node: + build: + context: ./client-test-gpu-node/build + dockerfile: Dockerfile + image: argus-metric-test-gpu-node:latest + container_name: argus-metric-test-gpu-node + hostname: test-metric-gpu-node-001 + restart: unless-stopped + privileged: true + runtime: nvidia + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: + - gpu + depends_on: + - ftp + - prometheus + environment: + - TZ=Asia/Shanghai + - DEBIAN_FRONTEND=noninteractive + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - GPU_MODE=gpu + volumes: + - ${DATA_ROOT:-/private}/argus/agent:/private/argus/agent + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + command: sleep infinity + networks: + default: + ipv4_address: 172.30.0.51 + logging: + driver: 
"json-file" + options: + max-size: "10m" + max-file: "3" diff --git a/src/metric/tests/env.example b/src/metric/tests/env.example index 9d72de2..afd491b 100644 --- a/src/metric/tests/env.example +++ b/src/metric/tests/env.example @@ -1,19 +1,15 @@ -# 用户和组配置 -FTP_UID=2133 -FTP_GID=2015 -PROMETHEUS_UID=2133 -PROMETHEUS_GID=2015 -GRAFANA_UID=2133 -GRAFANA_GID=2015 +# 统一用户和组配置 +ARGUS_BUILD_UID=1048 +ARGUS_BUILD_GID=1048 # 数据根目录 -DATA_ROOT=/private/argus +DATA_ROOT=/private # FTP 配置 -FTP_PORT=2122 -FTP_DATA_PORT=2022 +FTP_PORT=21 +FTP_DATA_PORT=20 FTP_PASSWORD=ZGClab1234! -FTP_DOMAIN=prom.ftp.argus.com +FTP_DOMAIN=ftp.metric.argus.com # Prometheus 配置 PROMETHEUS_PORT=9090 diff --git a/src/metric/tests/init-directories.sh b/src/metric/tests/init-directories.sh deleted file mode 100755 index df5d719..0000000 --- a/src/metric/tests/init-directories.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# 初始化目录脚本 -# 用于创建所有必要的数据目录并设置正确的权限 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -# 加载 .env 文件(如果存在) -if [ -f .env ]; then - echo "加载 .env 配置文件..." - source .env -fi - -# 默认配置 -FTP_UID=${FTP_UID:-2133} -FTP_GID=${FTP_GID:-2015} -PROMETHEUS_UID=${PROMETHEUS_UID:-2133} -PROMETHEUS_GID=${PROMETHEUS_GID:-2015} -GRAFANA_UID=${GRAFANA_UID:-2133} -GRAFANA_GID=${GRAFANA_GID:-2015} -DATA_ROOT=${DATA_ROOT:-./data} - -echo "开始初始化目录结构..." -echo "数据目录: ${DATA_ROOT}" -echo "" - -# 创建 FTP 目录 -echo "创建 FTP 目录..." -sudo mkdir -p ${DATA_ROOT}/ftp/share -sudo chown -R ${FTP_UID}:${FTP_GID} ${DATA_ROOT}/ftp -sudo chmod -R 755 ${DATA_ROOT}/ftp - -# 创建 Prometheus 目录 -echo "创建 Prometheus 目录..." -sudo mkdir -p ${DATA_ROOT}/prometheus/{data,rules,targets} - -# 创建默认的 targets 文件(先创建文件再改权限) -if [ ! -f "${DATA_ROOT}/prometheus/targets/node_exporter.json" ]; then - echo "创建默认 node_exporter targets..." - echo '[ - { - "targets": [], - "labels": { - "job": "node" - } - } -]' | sudo tee ${DATA_ROOT}/prometheus/targets/node_exporter.json > /dev/null -fi - -if [ ! -f "${DATA_ROOT}/prometheus/targets/dcgm_exporter.json" ]; then - echo "创建默认 dcgm_exporter targets..." - echo '[ - { - "targets": [], - "labels": { - "job": "dcgm" - } - } -]' | sudo tee ${DATA_ROOT}/prometheus/targets/dcgm_exporter.json > /dev/null -fi - -# 统一设置 Prometheus 目录权限 -sudo chown -R ${PROMETHEUS_UID}:${PROMETHEUS_GID} ${DATA_ROOT}/prometheus -sudo chmod -R 755 ${DATA_ROOT}/prometheus - -# 创建 Grafana 目录 -echo "创建 Grafana 目录..." -sudo mkdir -p ${DATA_ROOT}/grafana/{data,logs,plugins,provisioning/datasources,provisioning/dashboards,data/sessions,data/dashboards,config} -sudo chown -R ${GRAFANA_UID}:${GRAFANA_GID} ${DATA_ROOT}/grafana -sudo chmod -R 755 ${DATA_ROOT}/grafana - -# 创建公共配置目录 -sudo mkdir -p ${DATA_ROOT}/etc -sudo chown -R ${FTP_UID}:${FTP_GID} ${DATA_ROOT}/etc -sudo chmod -R 755 ${DATA_ROOT}/etc - -echo "目录初始化完成!" 
-echo "" -echo "目录结构:" -echo " ${DATA_ROOT}/" -echo " ├── ftp/ (UID:${FTP_UID}, GID:${FTP_GID})" -echo " ├── prometheus/ (UID:${PROMETHEUS_UID}, GID:${PROMETHEUS_GID})" -echo " ├── grafana/ (UID:${GRAFANA_UID}, GID:${GRAFANA_GID})" -echo " └── etc/ (UID:${FTP_UID}, GID:${FTP_GID})" -echo "" -echo "您现在可以运行 'docker-compose up -d' 来启动所有服务" - diff --git a/src/metric/tests/scripts/00_e2e_test.sh b/src/metric/tests/scripts/00_e2e_test.sh new file mode 100755 index 0000000..0c5a323 --- /dev/null +++ b/src/metric/tests/scripts/00_e2e_test.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(dirname "$0")" + +echo "==========================================" +echo "Argus Metric E2E Test" +echo "==========================================" + +bash "$SCRIPT_DIR/01_start_services.sh" +bash "$SCRIPT_DIR/02_publish_artifact.sh" +bash "$SCRIPT_DIR/03_test_node_install.sh" +bash "$SCRIPT_DIR/04_test_gpu_node_install.sh" +bash "$SCRIPT_DIR/05_verify_install.sh" +bash "$SCRIPT_DIR/06_cleanup.sh" + +echo "==========================================" +echo "E2E 测试完成" +echo "==========================================" + diff --git a/src/metric/tests/scripts/01_start_services.sh b/src/metric/tests/scripts/01_start_services.sh new file mode 100755 index 0000000..01e587f --- /dev/null +++ b/src/metric/tests/scripts/01_start_services.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# 解析参数 +REBUILD_FLAG="" +if [[ "$1" == "--rebuild" || "$1" == "-r" ]]; then + REBUILD_FLAG="--rebuild" + echo "[01] 启用强制重新构建模式" +fi + +echo "[01] 启动所有服务..." +bash "$SCRIPT_DIR/common/start-all.sh" $REBUILD_FLAG + +echo "[01] 等待服务就绪..." +sleep 5 + +echo "[01] 检查服务状态..." +docker ps | grep argus-ftp +docker ps | grep argus-prometheus +docker ps | grep argus-grafana +docker ps | grep argus-metric-test-node +docker ps | grep argus-metric-test-gpu-node + +echo "[01] 基础服务已启动" + diff --git a/src/metric/tests/scripts/02_publish_artifact.sh b/src/metric/tests/scripts/02_publish_artifact.sh new file mode 100755 index 0000000..658d9dd --- /dev/null +++ b/src/metric/tests/scripts/02_publish_artifact.sh @@ -0,0 +1,60 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +PLUGIN_DIR="$(cd "$SCRIPT_DIR/../../client-plugins/all-in-one-full" && pwd)" + +# 加载 .env +if [ -f "$TEST_DIR/.env" ]; then + source "$TEST_DIR/.env" +fi + +# 检测容器挂载目录 +if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then + FTP_MOUNT=$(docker inspect argus-ftp --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}') + OUTPUT_DIR="${FTP_MOUNT}/share" + echo "[02] 容器挂载: $OUTPUT_DIR" +else + OUTPUT_DIR="${DATA_ROOT:-$TEST_DIR/data}/ftp/share" + echo "[02] 默认路径: $OUTPUT_DIR" +fi + +OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}" + +cd "$PLUGIN_DIR" + +echo "[02] 递增版本号..." +bash scripts/version-manager.sh bump minor + +VERSION_FILE="config/VERSION" +if [ ! -f "$VERSION_FILE" ]; then + echo "[02] 错误: 未找到 $VERSION_FILE" + exit 1 +fi + +VERSION=$(cat "$VERSION_FILE" | tr -d '[:space:]') +echo "[02] 新版本: $VERSION" + +echo "[02] 构建安装包..." +bash scripts/package_artifact.sh --force + +echo "[02] 发布到 FTP: $OUTPUT_DIR" +sudo bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER" + +echo "[02] 设置文件权限..." 
+# 设置所有者 +sudo chown -R "$OWNER" "$OUTPUT_DIR" +# 设置目录权限为 755 (rwxr-xr-x) +sudo find "$OUTPUT_DIR" -type d -exec chmod 755 {} \; +# 设置文件权限为 644 (rw-r--r--) +sudo find "$OUTPUT_DIR" -type f -exec chmod 644 {} \; +# 特别处理 .sh 文件,给予执行权限 755 +sudo find "$OUTPUT_DIR" -type f -name "*.sh" -exec chmod 755 {} \; +echo "[02] 权限设置完成 (UID:GID=$OWNER, dirs=755, files=644, scripts=755)" + +echo "[02] 发布完成,验证文件..." +ls -lh "$OUTPUT_DIR" + +echo "[02] 完成" + diff --git a/src/metric/tests/scripts/03_test_node_install.sh b/src/metric/tests/scripts/03_test_node_install.sh new file mode 100755 index 0000000..af8200f --- /dev/null +++ b/src/metric/tests/scripts/03_test_node_install.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +FTP_SERVER="${FTP_SERVER:-172.30.0.40}" +FTP_USER="${FTP_USER:-ftpuser}" +FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" +FTP_PORT="${FTP_PORT:-21}" + +FTP_HOST="${FTP_SERVER}" + +echo "[03] 进入测试节点执行安装..." +echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}" + +docker exec argus-metric-test-node bash -c " +set -e + +if ! command -v curl &>/dev/null; then + echo '[03] curl 未安装,正在安装...' + apt-get update && apt-get install -y curl +fi + +cd /tmp +echo '[03] 下载 setup.sh...' +curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh + +echo '[03] 执行安装...' +chmod +x setup.sh +bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT} + +echo '[03] 安装完成' +" + +echo "[03] 完成" diff --git a/src/metric/tests/scripts/04_test_gpu_node_install.sh b/src/metric/tests/scripts/04_test_gpu_node_install.sh new file mode 100755 index 0000000..ce1d19a --- /dev/null +++ b/src/metric/tests/scripts/04_test_gpu_node_install.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +FTP_SERVER="${FTP_SERVER:-172.30.0.40}" +FTP_USER="${FTP_USER:-ftpuser}" +FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" +FTP_PORT="${FTP_PORT:-21}" + +FTP_HOST="${FTP_SERVER}" + +echo "[03] 进入测试节点执行安装..." +echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}" + +docker exec argus-metric-test-gpu-node bash -c " +set -e + +if ! command -v curl &>/dev/null; then + echo '[03] curl 未安装,正在安装...' + apt-get update && apt-get install -y curl +fi + +cd /tmp +echo '[03] 下载 setup.sh...' +curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh + +echo '[03] 执行安装...' +chmod +x setup.sh +bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT} + +echo '[03] 安装完成' +" + +echo "[03] 完成" diff --git a/src/metric/tests/scripts/05_verify_install.sh b/src/metric/tests/scripts/05_verify_install.sh new file mode 100755 index 0000000..5a33a05 --- /dev/null +++ b/src/metric/tests/scripts/05_verify_install.sh @@ -0,0 +1,96 @@ +#!/bin/bash +set -e + +echo "[04] 验证安装结果 - 检查监控端口..." +echo "==========================================" + +# 检查容器是否运行 +if ! docker ps --format '{{.Names}}' | grep -q '^argus-metric-test-node$'; then + echo "错误: 容器 argus-metric-test-node 未运行" + exit 1 +fi + +ERRORS=0 + +# ==================== 检查监听端口 ==================== +echo "" +echo "[1] 检查监听端口..." 
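# [Editor's sketch] The check below depends on netstat/ss/lsof being installed in
# the test container. A dependency-free fallback is bash's /dev/tcp pseudo-device,
# which needs nothing beyond bash itself; 9100, 9400 and 2020 are the default ports
# of node_exporter, dcgm-exporter and fluent-bit that this test expects:
docker exec argus-metric-test-node bash -c '
for port in 9100 9400 2020; do
  if (exec 3<>"/dev/tcp/127.0.0.1/$port") 2>/dev/null; then
    echo "port $port reachable"
  else
    echo "port $port not reachable"
  fi
done
'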
+echo "----------------------------------------" +CHECK_RESULT=$(docker exec argus-metric-test-node bash -c ' +if command -v netstat >/dev/null 2>&1; then + echo "使用 netstat 检查端口:" + if netstat -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then + echo "✓ 找到监控端口" + exit 0 + else + echo "✗ 未找到监控端口 (9100/9400/2020)" + exit 1 + fi +elif command -v ss >/dev/null 2>&1; then + echo "使用 ss 检查端口:" + if ss -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then + echo "✓ 找到监控端口" + exit 0 + else + echo "✗ 未找到监控端口 (9100/9400/2020)" + exit 1 + fi +elif command -v lsof >/dev/null 2>&1; then + echo "使用 lsof 检查端口:" + if lsof -i :9100 -i :9400 -i :2020 2>/dev/null | grep LISTEN; then + echo "✓ 找到监控端口" + exit 0 + else + echo "✗ 未找到监控端口 (9100/9400/2020)" + exit 1 + fi +else + echo "? 没有可用的端口检查工具 (netstat/ss/lsof),跳过此检查" + exit 0 +fi +') +echo "$CHECK_RESULT" +# 只有在明确失败时才计入错误(exit 1),没有工具(exit 0)不算错误 +if echo "$CHECK_RESULT" | grep -q "✗ 未找到监控端口"; then + ERRORS=$((ERRORS + 1)) +fi + +# ==================== 测试端口连通性 ==================== +echo "" +echo "[2] 测试端口连通性..." +echo "----------------------------------------" +docker exec argus-metric-test-node bash -c ' +if command -v curl >/dev/null 2>&1; then + FAILED=0 + for port in 9100 9400 2020; do + echo -n "端口 $port: " + if curl -s --connect-timeout 2 "http://localhost:$port/metrics" > /dev/null 2>&1; then + echo "✓ 可访问 (/metrics)" + elif curl -s --connect-timeout 2 "http://localhost:$port/" > /dev/null 2>&1; then + echo "✓ 可访问 (根路径)" + else + echo "✗ 不可访问" + FAILED=$((FAILED + 1)) + fi + done + exit $FAILED +else + echo "? curl 不可用,跳过连通性测试" + exit 0 +fi +' || ERRORS=$((ERRORS + 1)) + +echo "" +echo "==========================================" +if [ $ERRORS -eq 0 ]; then + echo "✓ [04] 验证完成 - 所有端口检查通过" +else + echo "✗ [04] 验证失败 - 发现 $ERRORS 个问题" + echo "" + echo "调试建议:" + echo " 1. 进入容器检查: docker exec -it argus-metric-test-node bash" + echo " 2. 查看进程: docker exec argus-metric-test-node ps aux" + echo " 3. 查看日志: docker exec argus-metric-test-node cat /tmp/argus_install.log" + exit 1 +fi +echo "==========================================" diff --git a/src/metric/tests/scripts/06_cleanup.sh b/src/metric/tests/scripts/06_cleanup.sh new file mode 100755 index 0000000..c7c93d3 --- /dev/null +++ b/src/metric/tests/scripts/06_cleanup.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +echo "[05] 清理环境..." + +bash "$SCRIPT_DIR/common/stop-all.sh" || true + +echo "[05] 清理完成" + diff --git a/src/metric/tests/check-paths.sh b/src/metric/tests/scripts/common/check-paths.sh similarity index 92% rename from src/metric/tests/check-paths.sh rename to src/metric/tests/scripts/common/check-paths.sh index bc93897..71ec5c1 100755 --- a/src/metric/tests/check-paths.sh +++ b/src/metric/tests/scripts/common/check-paths.sh @@ -6,7 +6,8 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" +TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +cd "$TEST_DIR" echo "==========================================" echo " 路径检查脚本" @@ -18,15 +19,15 @@ echo "" # 检查配置文件 echo "检查配置文件..." 
-if [ -f "$SCRIPT_DIR/docker-compose.yml" ]; then +if [ -f "$TEST_DIR/docker-compose.yml" ]; then echo " ✓ docker-compose.yml 存在" else echo " ✗ docker-compose.yml 不存在" fi -if [ -f "$SCRIPT_DIR/.env" ]; then +if [ -f "$TEST_DIR/.env" ]; then echo " ✓ .env 存在" -elif [ -f "$SCRIPT_DIR/env.example" ]; then +elif [ -f "$TEST_DIR/env.example" ]; then echo " ⚠ .env 不存在,但 env.example 存在" else echo " ✗ .env 和 env.example 都不存在" diff --git a/src/metric/tests/scripts/common/init-directories.sh b/src/metric/tests/scripts/common/init-directories.sh new file mode 100755 index 0000000..a8bab51 --- /dev/null +++ b/src/metric/tests/scripts/common/init-directories.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# 初始化目录脚本 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +cd "$TEST_DIR" + +# 加载 .env 文件(如果存在) +if [ -f .env ]; then + echo "加载 .env 配置文件..." + source .env +fi + +# 默认配置 +ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} +ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} +DATA_ROOT=${DATA_ROOT:-/private} + +echo "开始初始化目录结构..." +echo "数据根目录: ${DATA_ROOT}" +echo "统一 UID: ${ARGUS_BUILD_UID}" +echo "统一 GID: ${ARGUS_BUILD_GID}" + +# 创建基础目录结构 +echo "创建基础目录结构..." +sudo mkdir -p ${DATA_ROOT}/argus/metric +sudo mkdir -p ${DATA_ROOT}/argus/etc +sudo mkdir -p ${DATA_ROOT}/argus/agent + +# 创建 FTP 目录 +echo "创建 FTP 目录..." +sudo mkdir -p ${DATA_ROOT}/argus/metric/ftp/share + +# 创建 Prometheus 目录 +echo "创建 Prometheus 目录..." +sudo mkdir -p ${DATA_ROOT}/argus/metric/prometheus/{data,rules,targets} + +# 创建 Grafana 目录 +echo "创建 Grafana 目录..." +sudo mkdir -p ${DATA_ROOT}/argus/metric/grafana/{data,logs,plugins,provisioning/datasources,provisioning/dashboards,data/sessions,data/dashboards,config} + +# 统一设置所有目录权限 +echo "设置目录权限..." +sudo chown -R ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID} ${DATA_ROOT}/argus/metric +sudo chmod -R 755 ${DATA_ROOT}/argus/metric + +echo "目录初始化完成!" +echo "" +echo "目录结构:" +echo " ${DATA_ROOT}/" +echo " ├── argus/ (UID:${ARGUS_BUILD_UID}, GID:${ARGUS_BUILD_GID})" +echo " │ ├── metric/" +echo " │ │ ├── ftp/" +echo " │ │ ├── prometheus/" +echo " │ │ └── grafana/" +echo "" +echo "您现在可以运行 'docker-compose up -d' 来启动所有服务" + diff --git a/src/metric/tests/init-environment.sh b/src/metric/tests/scripts/common/init-environment.sh similarity index 100% rename from src/metric/tests/init-environment.sh rename to src/metric/tests/scripts/common/init-environment.sh diff --git a/src/metric/tests/manage-images.sh b/src/metric/tests/scripts/common/manage-images.sh similarity index 98% rename from src/metric/tests/manage-images.sh rename to src/metric/tests/scripts/common/manage-images.sh index b47150b..8524a5d 100755 --- a/src/metric/tests/manage-images.sh +++ b/src/metric/tests/scripts/common/manage-images.sh @@ -6,7 +6,8 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" +TEST_DIR="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +cd "$TEST_DIR" # 检测 docker-compose 命令 if command -v docker-compose &> /dev/null; then @@ -19,7 +20,7 @@ else fi # 镜像缓存目录 -IMAGE_CACHE_DIR="./images-cache" +IMAGE_CACHE_DIR="$TEST_DIR/images-cache" mkdir -p "$IMAGE_CACHE_DIR" # 定义镜像列表 diff --git a/src/metric/tests/start-all.sh b/src/metric/tests/scripts/common/start-all.sh similarity index 66% rename from src/metric/tests/start-all.sh rename to src/metric/tests/scripts/common/start-all.sh index b0ceb72..5521367 100755 --- a/src/metric/tests/start-all.sh +++ b/src/metric/tests/scripts/common/start-all.sh @@ -6,13 +6,20 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" +TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +cd "$TEST_DIR" + +# 解析参数 +FORCE_REBUILD=false +if [[ "$1" == "--rebuild" ]]; then + FORCE_REBUILD=true +fi echo "==========================================" echo " Argus Metrics 一键启动脚本" echo "==========================================" echo "" -echo "当前工作目录: $SCRIPT_DIR" +echo "当前工作目录: $TEST_DIR" echo "" # 检查 Docker 和 Docker Compose @@ -21,19 +28,13 @@ if ! command -v docker &> /dev/null; then exit 1 fi -# 检测 docker-compose 命令(兼容新旧版本) -COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml" -if command -v docker-compose &> /dev/null; then - DOCKER_COMPOSE="docker-compose -f $COMPOSE_FILE" - echo "使用: docker-compose" -elif docker compose version &> /dev/null 2>&1; then - DOCKER_COMPOSE="docker compose -f $COMPOSE_FILE" - echo "使用: docker compose" -else - echo "错误: 未找到 docker-compose 或 docker compose 命令" +# 检查 docker compose 命令 +if ! docker compose version &> /dev/null 2>&1; then + echo "错误: 未找到 docker compose 命令,请确保 Docker Compose V2 已安装" exit 1 fi -echo "Compose 文件: $COMPOSE_FILE" +echo "使用: docker compose" +echo "Compose 文件: $TEST_DIR/docker-compose.yml" echo "" # 检查必要的构建目录 @@ -42,6 +43,8 @@ BUILD_DIRS=( "../ftp/build" "../prometheus/build" "../grafana/build" + "client-test-node/build" + "client-test-gpu-node/build" ) for dir in "${BUILD_DIRS[@]}"; do @@ -65,6 +68,18 @@ fi # 加载环境变量 source .env +# 检查并创建 Docker 网络 +echo "检查 Docker 网络..." +NETWORK_NAME="argus-debug-net" +if docker network inspect "$NETWORK_NAME" >/dev/null 2>&1; then + echo "网络 $NETWORK_NAME 已存在" +else + echo "创建网络 $NETWORK_NAME..." + docker network create --driver bridge --subnet 172.30.0.0/16 "$NETWORK_NAME" + echo "网络创建成功" +fi +echo "" + echo "1. 初始化目录结构..." bash "$SCRIPT_DIR/init-directories.sh" @@ -72,8 +87,8 @@ echo "" echo "2. 准备 Docker 镜像..." # 检查镜像是否存在 -IMAGE_CACHE_DIR="./images-cache" -IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest") +IMAGE_CACHE_DIR="$TEST_DIR/images-cache" +IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest") all_images_exist=true for image in "${IMAGES[@]}"; do @@ -83,7 +98,12 @@ for image in "${IMAGES[@]}"; do fi done -if $all_images_exist; then +if $FORCE_REBUILD; then + echo "强制重新构建镜像(--rebuild 模式)..." + cd "$TEST_DIR" + docker compose build --no-cache + echo "镜像重新构建完成" +elif $all_images_exist; then echo "所有镜像已存在,跳过构建" else echo "检测到缺失镜像,尝试从缓存加载..." @@ -104,6 +124,12 @@ else "argus-metric-grafana:latest") cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar" ;; + "argus-metric-test-node:latest") + cache_file="${IMAGE_CACHE_DIR}/argus-test-node.tar" + ;; + "argus-metric-test-gpu-node:latest") + cache_file="${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" + ;; esac if [ -f "$cache_file" ]; then @@ -128,8 +154,8 @@ else echo "" echo "部分镜像缺失,开始构建..." 
echo "工作目录: $(pwd)" - cd "$SCRIPT_DIR" - $DOCKER_COMPOSE build + cd "$TEST_DIR" + docker compose build --no-cache # 询问是否保存镜像 echo "" @@ -149,6 +175,12 @@ else "argus-metric-grafana:latest") docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar" ;; + "argus-metric-test-node:latest") + docker save -o "${IMAGE_CACHE_DIR}/argus-test-node.tar" "$image" && echo " 已保存: argus-test-node.tar" + ;; + "argus-metric-test-gpu-node:latest") + docker save -o "${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" "$image" && echo " 已保存: argus-test-gpu-node.tar" + ;; esac done echo "镜像已保存到: $IMAGE_CACHE_DIR/" @@ -160,40 +192,12 @@ else fi echo "" -echo "3. 启动服务..." -cd "$SCRIPT_DIR" -$DOCKER_COMPOSE up -d +echo "3. 启动基础服务..." +cd "$TEST_DIR" +# 启动除GPU节点外的所有服务 +docker compose up -d ftp prometheus grafana test-node test-gpu-node echo "" echo "4. 等待服务启动..." sleep 5 -echo "" -echo "5. 检查服务状态..." -cd "$SCRIPT_DIR" -$DOCKER_COMPOSE ps - -echo "" -echo "==========================================" -echo " 服务启动完成!" -echo "==========================================" -echo "" -echo "服务访问地址:" -echo " - FTP: ftp://localhost:${FTP_PORT:-21}" -echo " 用户名: ftpuser" -echo " 密码: ${FTP_PASSWORD:-ZGClab1234!}" -echo "" -echo " - Prometheus: http://localhost:${PROMETHEUS_PORT:-9090}" -echo "" -echo " - Grafana: http://localhost:${GRAFANA_PORT:-3000}" -echo " 用户名: admin" -echo " 密码: admin" -echo "" -echo "常用命令:" -echo " 查看日志: $DOCKER_COMPOSE logs -f [service]" -echo " 停止服务: $DOCKER_COMPOSE stop" -echo " 重启服务: $DOCKER_COMPOSE restart" -echo " 停止并删除: $DOCKER_COMPOSE down" -echo " 停止并删除卷: $DOCKER_COMPOSE down -v" -echo "" - diff --git a/src/metric/tests/scripts/common/stop-all.sh b/src/metric/tests/scripts/common/stop-all.sh new file mode 100755 index 0000000..233eb83 --- /dev/null +++ b/src/metric/tests/scripts/common/stop-all.sh @@ -0,0 +1,50 @@ + #!/bin/bash + + # 停止所有服务脚本 + + set -e + + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" + cd "$TEST_DIR" + + # 检查 docker compose 命令 + if ! docker compose version &> /dev/null 2>&1; then + echo "错误: 未找到 docker compose 命令,请确保 Docker Compose V2 已安装" + exit 1 + fi + + echo "==========================================" + echo " 停止 Argus Metrics 服务" + echo "==========================================" + echo "" + echo "使用: docker compose" + echo "Compose 文件: $TEST_DIR/docker-compose.yml" + echo "" + + # 检查是否有运行的容器 + if [ "$(docker compose ps -q)" ]; then + echo "停止所有服务..." + docker compose stop + + echo "" + read -p "是否要删除容器? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + docker compose down + echo "容器已删除" + + read -p "是否要删除数据卷? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + docker compose down -v + echo "数据卷已删除" + fi + fi + else + echo "没有运行的服务" + fi + + echo "" + echo "完成!" + diff --git a/src/metric/tests/scripts/load-images.sh b/src/metric/tests/scripts/load-images.sh new file mode 100755 index 0000000..27d6ddc --- /dev/null +++ b/src/metric/tests/scripts/load-images.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# 镜像加载脚本 +# 用于从 tar 文件加载 Docker 镜像 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +INPUT_DIR="${1:-$TEST_DIR/images-cache}" + +echo "==========================================" +echo " Docker 镜像加载脚本" +echo "==========================================" +echo "" +echo "输入目录: $INPUT_DIR" +echo "" + +# 检查输入目录是否存在 +if [ ! 
-d "$INPUT_DIR" ]; then + echo "错误: 目录不存在: $INPUT_DIR" + exit 1 +fi + +# 查找所有tar文件并加载 +total=0 +success=0 +failed=0 + +# 查找目录下所有.tar文件 +tar_files=($(find "$INPUT_DIR" -name "*.tar" -type f 2>/dev/null | sort)) + +if [ ${#tar_files[@]} -eq 0 ]; then + echo "错误: 在目录 $INPUT_DIR 中未找到任何 .tar 文件" + exit 1 +fi + +echo "找到 ${#tar_files[@]} 个镜像文件:" +for tar_file in "${tar_files[@]}"; do + echo " - $(basename "$tar_file")" +done +echo "" + +for tar_file in "${tar_files[@]}"; do + total=$((total + 1)) + tar_filename=$(basename "$tar_file") + + echo "[$total] 处理: $tar_filename" + + # 强制加载,不检查镜像是否已存在 + echo " 加载镜像..." + if docker load -i "$tar_file"; then + echo " 加载成功: $tar_filename" + success=$((success + 1)) + else + echo " 加载失败: $tar_filename" + failed=$((failed + 1)) + fi + echo "" +done + +echo "==========================================" +echo " 加载完成" +echo "==========================================" +echo "" +echo "统计:" +echo " 总计: $total" +echo " 成功: $success" +echo " 失败: $failed" +echo "" + +# 显示当前所有镜像 +echo "当前所有镜像:" +docker images +echo "" + +if [ $failed -gt 0 ]; then + echo "部分镜像加载失败,请检查!" + exit 1 +fi + +if [ $success -gt 0 ]; then + echo "镜像加载成功!" +fi + diff --git a/src/metric/tests/scripts/save-images.sh b/src/metric/tests/scripts/save-images.sh new file mode 100755 index 0000000..9851718 --- /dev/null +++ b/src/metric/tests/scripts/save-images.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# 镜像保存脚本 +# 用于保存 Docker 镜像到 tar 文件,便于离线部署 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +OUTPUT_DIR="${1:-$TEST_DIR/images-cache}" + +echo "==========================================" +echo " Docker 镜像保存脚本" +echo "==========================================" +echo "" +echo "输出目录: $OUTPUT_DIR" +echo "" + +# 创建输出目录 +mkdir -p "$OUTPUT_DIR" + +# 定义镜像名称(与 docker-compose.yml 保持一致) +declare -A IMAGES=( + ["argus-metric-ftp:latest"]="argus-ftp.tar" + ["argus-metric-prometheus:latest"]="argus-prometheus.tar" + ["argus-metric-grafana:latest"]="argus-grafana.tar" + ["argus-metric-test-node:latest"]="argus-test-node.tar" + ["argus-metric-test-gpu-node:latest"]="argus-test-gpu-node.tar" +) + +# 检查镜像是否存在并保存 +total=0 +success=0 +failed=0 + +for image in "${!IMAGES[@]}"; do + total=$((total + 1)) + output_file="${OUTPUT_DIR}/${IMAGES[$image]}" + + echo "[$total] 检查镜像: $image" + + if docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then + echo " ✓ 镜像存在,开始保存..." + if docker save -o "$output_file" "$image"; then + file_size=$(ls -lh "$output_file" | awk '{print $5}') + echo " ✓ 保存成功: ${IMAGES[$image]} ($file_size)" + success=$((success + 1)) + else + echo " ✗ 保存失败: $image" + failed=$((failed + 1)) + fi + else + echo " ✗ 镜像不存在,请先构建镜像" + failed=$((failed + 1)) + fi + echo "" +done + +echo "==========================================" +echo " 保存完成" +echo "==========================================" +echo "" +echo "统计:" +echo " 总计: $total" +echo " 成功: $success" +echo " 失败: $failed" +echo "" +echo "输出目录: $OUTPUT_DIR" +echo "" + +if [ $success -gt 0 ]; then + echo "已保存的文件:" + ls -lh "$OUTPUT_DIR"/*.tar 2>/dev/null || true + echo "" + echo "文件列表:" + for image in "${!IMAGES[@]}"; do + output_file="${OUTPUT_DIR}/${IMAGES[$image]}" + if [ -f "$output_file" ]; then + file_size=$(ls -lh "$output_file" | awk '{print $5}') + echo " - ${IMAGES[$image]} ($file_size)" + fi + done +fi + +echo "" +echo "使用说明:" +echo "1. 将 images-cache 目录复制到目标服务器的 ~/argus/src/metric/tests/ 下" +echo "2. 
在目标服务器运行: bash scripts/common/start-all.sh" +echo "" + +if [ $failed -gt 0 ]; then + exit 1 +fi + diff --git a/src/metric/tests/stop-all.sh b/src/metric/tests/stop-all.sh deleted file mode 100755 index 6886160..0000000 --- a/src/metric/tests/stop-all.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# 停止所有服务脚本 - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -# 检测 docker-compose 命令(兼容新旧版本) -COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml" -if command -v docker-compose &> /dev/null; then - DOCKER_COMPOSE="docker-compose -f $COMPOSE_FILE" -elif docker compose version &> /dev/null 2>&1; then - DOCKER_COMPOSE="docker compose -f $COMPOSE_FILE" -else - echo "错误: 未找到 docker-compose 或 docker compose 命令" - exit 1 -fi - -echo "==========================================" -echo " 停止 Argus Metrics 服务" -echo "==========================================" -echo "" - -# 检查是否有运行的容器 -if [ "$($DOCKER_COMPOSE ps -q)" ]; then - echo "停止所有服务..." - $DOCKER_COMPOSE stop - - echo "" - read -p "是否要删除容器? (y/N): " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - $DOCKER_COMPOSE down - echo "容器已删除" - - read -p "是否要删除数据卷? (y/N): " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - $DOCKER_COMPOSE down -v - echo "数据卷已删除" - fi - fi -else - echo "没有运行的服务" -fi - -echo "" -echo "完成!" -
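# [Editor's note] Taken together, the relocated scripts give the test suite a single
# entry point plus individually runnable steps. A minimal usage sketch, assuming the
# layout introduced by this patch (paths relative to the repository root):
cd src/metric/tests
bash scripts/00_e2e_test.sh                 # full chain: start, publish, install, verify, cleanup
# or drive individual steps, e.g. force an image rebuild before re-testing:
bash scripts/01_start_services.sh --rebuild
bash scripts/05_verify_install.sh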