diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..15e6b91 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +src/metric/client-plugins/all-in-one-full/plugins/*/bin/* filter=lfs diff=lfs merge=lfs -text diff --git a/src/metric/.gitignore b/src/metric/.gitignore index 43f5e6d..50cf728 100644 --- a/src/metric/.gitignore +++ b/src/metric/.gitignore @@ -4,4 +4,4 @@ /client-plugins/demo-all-in-one/publish/ /client-plugins/demo-all-in-one/checklist /client-plugins/demo-all-in-one/VERSION -/client-plugins/all-in-one-full/ +/client-plugins/all-in-one-full/artifact/ diff --git a/src/metric/client-plugins/all-in-one-full/README.md b/src/metric/client-plugins/all-in-one-full/README.md new file mode 100644 index 0000000..da8f84e --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/README.md @@ -0,0 +1,59 @@ +# 客户侧组件安装包构建、发布流程 + +## 第一步:配置版本和组件 + +首先搞定配置文件: + +1. 把 `config/.checklist.example` 重命名成 `config/checklist` +2. 把 `config/.VERSION.example` 重命名成 `config/VERSION` + +### checklist 文件格式 +``` +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +dcgm-exporter-installer /path/to/dcgm-exporter-installer 1.1.0 +node-exporter-installer /path/to/node-exporter-installer 1.1.0 +``` + +### VERSION 文件 +设置需要发布的版本号,比如 `1.29.0` + +> 建议用 `version-manager.sh` 来管理版本 + +## 第二步:构建安装包 + +直接跑脚本: +```bash +./package_artifact.sh +``` + +构建完的东西会放在 `artifact/` 目录下,按版本分文件夹。 + +如果版本已经存在了,想要覆盖重新构建: +```bash +./package_artifact.sh --force +``` + +构建完可以手工测试安装包。 + +## 第三步:发布安装包 + +用这个脚本发布: +```bash +./publish_artifact.sh +``` + +发布后的内容在 `publish/` 目录里,包含: +- 压缩版本的安装包 +- 一键安装的bash脚本 + +## 第四步:部署到FTP服务器 + +把发布的内容上传到FTP服务器,客户端就可以通过一键命令安装: + +```bash +curl -fsSL http://your-ftp-server/install.sh | sh - + +curl -fsSL "ftp://ftpuser:{PASSWD}@10.211.55.4/share/setup.sh" | sudo bash -s -- --server 10.211.55.4 --user ftpuser --password {PASSWD} +``` + +这样客户就能直接从FTP服务器下载并安装组件了。 \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-full/config/.VERSION.example
b/src/metric/client-plugins/all-in-one-full/config/.VERSION.example new file mode 100644 index 0000000..5e57fb8 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/.VERSION.example @@ -0,0 +1 @@ +1.29.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/.checklist.example b/src/metric/client-plugins/all-in-one-full/config/.checklist.example new file mode 100644 index 0000000..89cf322 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/.checklist.example @@ -0,0 +1,3 @@ +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +dcgm-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/dcgm-exporter-installer 1.1.0 +node-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/node-exporter-installer 1.1.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/VERSION b/src/metric/client-plugins/all-in-one-full/config/VERSION new file mode 100644 index 0000000..034552a --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/VERSION @@ -0,0 +1 @@ +1.30.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/checklist b/src/metric/client-plugins/all-in-one-full/config/checklist new file mode 100644 index 0000000..e97d45e --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/checklist @@ -0,0 +1,5 @@ +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +argus-agent plugins/argus-agent 1.0.0 +node-exporter plugins/node-exporter 1.0.0 +dcgm-exporter plugins/dcgm-exporter 1.0.0 +fluent-bit plugins/fluent-bit 1.0.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/config.env b/src/metric/client-plugins/all-in-one-full/config/config.env new file mode 100644 index 0000000..b5bea3c --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/config.env @@ -0,0 +1,14 @@ +# Elasticsearch +ES_HOST=es.log.argus.com +ES_PORT=9200 + +# Argus-Agent +# 连接master服务 +MASTER_ENDPOINT=master.argus.com:3000 +# 上报状态间隔描述 +REPORT_INTERVAL_SECONDS=5 + +# FTP +FTP_SERVER=172.31.0.40 +FTP_USER=ftpuser 
+FTP_PASSWORD=ZGClab1234! diff --git a/src/metric/client-plugins/all-in-one-full/config/config.env.example b/src/metric/client-plugins/all-in-one-full/config/config.env.example new file mode 100644 index 0000000..8871dfe --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/config.env.example @@ -0,0 +1,8 @@ +# Argus Metric 配置文件示例 +# 复制此文件为 config.env 并根据需要修改配置 + +# 连接master服务 +MASTER_ENDPOINT=master.argus.com:3000 + +# 上报状态间隔描述(秒) +REPORT_INTERVAL_SECONDS=60 diff --git a/src/metric/client-plugins/all-in-one-full/config/dns.conf b/src/metric/client-plugins/all-in-one-full/config/dns.conf new file mode 100644 index 0000000..5a9c316 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/dns.conf @@ -0,0 +1 @@ +172.31.0.2 diff --git a/src/metric/client-plugins/all-in-one-full/deps/cron-offline.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/cron-offline.tar.gz new file mode 100644 index 0000000..77104f7 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/cron-offline.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz new file mode 100644 index 0000000..27f4ccc Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz new file mode 100755 index 0000000..376a089 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz new file mode 100755 index 0000000..5c4fcc8 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz differ diff --git 
a/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz new file mode 100755 index 0000000..a322155 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz new file mode 100755 index 0000000..702f63f Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz new file mode 100755 index 0000000..3237287 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz new file mode 100755 index 0000000..b50273f Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/README.md b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/README.md new file mode 100644 index 0000000..4e9e690 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/README.md @@ -0,0 +1,94 @@ +# Argus Agent 插件 + +这是 Argus Agent 的安装和管理插件,提供了完整的安装、卸载、健康检查功能。 + +## 文件结构 + +``` +argus-agent/ +├── bin/ +│ └── argus-agent # Argus Agent 二进制文件 +├── config/ # 配置文件目录 +├── install.sh # 安装脚本 +├── uninstall.sh # 卸载脚本 +├── check_health.sh # 健康检查脚本 +├── package.sh # 打包脚本 +└── README.md # 说明文档 +``` + +## 使用方法 + +### 安装 + +```bash +sudo ./install.sh +``` + +安装脚本会: +- 检查系统要求 +- 停止可能运行的服务 +- 安装二进制文件到 `/usr/local/bin/argus-agent` +- 创建 `argus-agent` 用户 +- 创建配置和数据目录 +- 启动服务并记录 PID + +### 卸载 + +```bash +sudo 
# Report the health of the argus-agent process as one JSON object on stdout.
#
# Lookup order:
#   1. If the install record exists, read .components."argus-agent".pid from
#      it (jq when available, otherwise a text fallback) and check that a
#      process with that PID exists.
#   2. If the record is missing, look for a process whose name is exactly
#      "argus-agent".
#
# Environment:
#   ARGUS_INSTALL_RECORD  optional override of the install-record path
#                         (default: /opt/argus-metric/current/.install_record)
# Outputs:
#   {"name": "argus-agent", "status": "health"|"unhealth", "reason": "..."}
# Exit status:
#   Terminates the shell with 0 when healthy and 1 otherwise.
check_health() {
    local name="argus-agent"
    local status="unhealth"
    local reason=""
    local install_record="${ARGUS_INSTALL_RECORD:-/opt/argus-metric/current/.install_record}"
    local pid=""
    local rc=1

    if [[ -f "$install_record" ]]; then
        if command -v jq &> /dev/null; then
            pid=$(jq -r '.components."argus-agent".pid // empty' "$install_record" 2>/dev/null || true)
        else
            # No jq: pull the first "pid" value that follows the component key.
            # Accepts both a quoted ("123") and a bare (123) number.
            pid=$(grep -A 10 '"argus-agent"' "$install_record" 2>/dev/null \
                | grep -m 1 '"pid"' \
                | sed -E 's/.*"pid"[[:space:]]*:[[:space:]]*"?([0-9]+)"?.*/\1/' || true)
        fi

        if [[ "$pid" =~ ^[0-9]+$ ]]; then
            # kill -0 fails with EPERM when an unprivileged user probes a
            # root-owned agent, so also accept the /proc entry as proof of life.
            if kill -0 "$pid" 2>/dev/null || [[ -e "/proc/$pid" ]]; then
                status="health"
                reason="进程运行正常 (PID: $pid)"
                rc=0
            else
                reason="安装记录中的 PID $pid 进程不存在"
            fi
        else
            reason="安装记录文件中未找到有效的 argus-agent PID"
        fi
    else
        # Match on the exact process name: `pgrep -f` would also match this
        # script when it is invoked through a path containing "argus-agent".
        pid=$(pgrep -x "argus-agent" 2>/dev/null | head -n 1 || true)
        if [[ -n "$pid" ]]; then
            status="health"
            reason="发现 argus-agent 进程运行 (PID: $pid),但未找到安装记录"
            rc=0
        else
            reason="未找到 argus-agent 进程,且安装记录文件不存在"
        fi
    fi

    printf '{"name": "%s", "status": "%s", "reason": "%s"}\n' "$name" "$status" "$reason"
    exit "$rc"
}
# Print the installer's usage text to stdout.
show_help() {
    local -a usage_lines=(
        "Argus Agent 安装脚本"
        ""
        "用法: $0 [选项]"
        ""
        "选项:"
        " --help 显示此帮助信息"
        ""
        "示例:"
        " $0 # 安装 Argus Agent"
        ""
    )
    printf '%s\n' "${usage_lines[@]}"
}

# Command-line handling: --help/-h prints usage and exits, any other
# "--" option is rejected, and every other argument is taken as the
# installation directory (the last one wins).
INSTALL_DIR=""
for arg in "$@"; do
    if [[ "$arg" == "--help" || "$arg" == "-h" ]]; then
        show_help
        exit 0
    elif [[ "$arg" == --* ]]; then
        log_error "未知参数: $arg"
        show_help
        exit 1
    else
        INSTALL_DIR="$arg"
    fi
done

# Abort unless the script is running with an effective UID of 0.
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi
    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0"
    exit 1
}

# Log the detected distribution and CPU architecture. Exits when
# /etc/os-release is missing; unknown distributions and architectures
# only produce a warning.
check_system() {
    log_info "检查系统要求..."

    if [[ ! -f /etc/os-release ]]; then
        log_error "无法检测操作系统版本"
        exit 1
    fi

    # os-release is sourced into the current shell; NAME, VERSION and ID
    # below come from that file.
    source /etc/os-release
    log_info "检测到操作系统: $NAME $VERSION"

    case "$ID" in
        ubuntu | debian | centos | rhel | fedora) ;;
        *)
            log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整"
            ;;
    esac

    local machine
    machine=$(uname -m) || true
    log_info "系统架构: $machine"

    case "$machine" in
        x86_64 | amd64) ;;
        *)
            log_warning "当前架构为 $machine,argus-agent 主要支持 x86_64/amd64"
            ;;
    esac
}
# Install the argus-agent binary to /usr/local/bin/argus-agent.
#
# The binary is looked up as bin/argus-agent relative to the current
# directory and, if it is not there, relative to the directory that holds
# this script, so the installer also works when started from another cwd.
# Any running agent is stopped first (stop_existing_service), then the
# function waits up to 10 seconds for remaining processes to disappear.
# The new binary is staged as "<target>.new" and renamed into place.
# Exits with status 1 when the binary cannot be found or copied.
install_argus_agent() {
    log_info "安装 Argus Agent..."
    local binary_file="bin/argus-agent"
    local install_dir="/usr/local/bin"
    local target_file="$install_dir/argus-agent"
    local staged_file="${target_file}.new"
    local script_dir=""
    local remaining_pids=""
    local timeout=10

    if [[ ! -f "$binary_file" ]]; then
        script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd) || script_dir=""
        if [[ -n "$script_dir" && -f "$script_dir/bin/argus-agent" ]]; then
            binary_file="$script_dir/bin/argus-agent"
        else
            log_error "找不到 Argus Agent 二进制文件: $binary_file"
            exit 1
        fi
    fi

    stop_existing_service

    while (( timeout > 0 )); do
        remaining_pids=$(pgrep -x argus-agent | grep -vw "$$" || true)
        if [[ -z "$remaining_pids" ]]; then
            break
        fi
        # A zombie never goes away by waiting, so do not burn the timeout on it.
        if ps -eo pid,stat,comm | grep -E 'argus-agent' | grep -q 'Z'; then
            log_warning "检测到僵尸 argus-agent,跳过等待"
            break
        fi
        log_warning "等待 argus-agent 完全退出... ($timeout)"
        sleep 1
        # Plain assignment: unlike ((timeout--)) it can never return non-zero
        # and trip `set -e`.
        timeout=$((timeout - 1))
    done

    # Stage then rename so a partially written binary is never visible under
    # the final name; remove the staged copy if any step fails.
    if ! { cp "$binary_file" "$staged_file" \
        && chmod +x "$staged_file" \
        && mv -f "$staged_file" "$target_file"; }; then
        rm -f "$staged_file"
        log_error "Argus Agent 二进制文件安装失败: $target_file"
        exit 1
    fi
    log_success "Argus Agent 二进制文件安装完成"
}


# Create the argus-agent account (no home directory, /bin/false shell)
# unless it already exists.
create_user() {
    log_info "创建 argus-agent 用户..."

    if id "argus-agent" &>/dev/null; then
        log_info "用户 argus-agent 已存在"
    else
        useradd --no-create-home --shell /bin/false argus-agent
        log_success "用户 argus-agent 创建完成"
    fi
}
# Start /usr/local/bin/argus-agent detached from the terminal.
#
# stdout/stderr go to /var/log/argus-agent.log and the PID is written to
# /var/run/argus-agent.pid. The process is considered started when it is
# still alive two seconds after launch.
# Returns: 0 when the agent is running, 1 when it died during start-up
#          (the last 10 log lines are printed and the PID file is removed).
start_argus_agent() {
    log_info "启动 Argus Agent 服务..."
    local binary_path="/usr/local/bin/argus-agent"
    local log_file="/var/log/argus-agent.log"
    local pid_file="/var/run/argus-agent.pid"
    local pid

    rm -f "$pid_file"

    log_info "正在启动 Argus Agent..."
    setsid "$binary_path" > "$log_file" 2>&1 < /dev/null &
    pid=$!
    echo "$pid" > "$pid_file"
    sleep 2

    if kill -0 "$pid" 2>/dev/null; then
        log_success "Argus Agent 服务启动成功 (PID: $pid)"
        return 0
    fi

    log_error "Argus Agent 启动失败"
    if [[ -f "$log_file" ]]; then
        tail -n 10 "$log_file"
    fi
    rm -f "$pid_file"
    # Report the failure so the caller does not announce a successful install.
    return 1
}


# Update the recorded PID of argus-agent in the install record.
#
# Arguments:
#   $1 - new PID (stored as a JSON string)
#   $2 - installation base directory (default: /opt/argus-metric/current)
# Returns:
#   0 when the record was updated, when the record file does not exist yet,
#     or when jq is unavailable (a warning is logged);
#   1 when the current PID cannot be read or the record cannot be rewritten.
update_install_record() {
    local pid="$1"
    local install_base_dir="${2:-/opt/argus-metric/current}"
    local install_record="$install_base_dir/.install_record"
    local tmp_record="$install_record.tmp"
    local current_pid=""

    # No record yet: nothing to update here.
    if [[ ! -f "$install_record" ]]; then
        log_info "安装记录文件不存在,将由主安装脚本创建"
        return 0
    fi

    if ! command -v jq &> /dev/null; then
        log_warning "jq 命令不可用,无法更新安装记录文件"
        return 0
    fi

    current_pid=$(jq -r '.components."argus-agent".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
    if [[ -z "$current_pid" ]]; then
        log_warning "无法读取当前 PID,跳过更新"
        return 1
    fi

    # Only the pid field changes; it stays a string and all other fields are kept.
    if jq --arg new_pid "$pid" '.components."argus-agent".pid = $new_pid' "$install_record" > "$tmp_record" \
        && mv "$tmp_record" "$install_record"; then
        log_info "PID 已更新: $current_pid -> $pid"
    else
        rm -f "$tmp_record"
        log_warning "安装记录更新失败: $install_record"
        return 1
    fi
}
#!/bin/bash
#
# Build a distributable tarball of the Argus Agent plugin.
#
# Run from the plugin directory (the one containing install.sh). Produces
# argus-agent-<YYYYmmdd-HHMMSS>.tar.gz in that directory; the archive unpacks
# to a single top-level directory of the same name. Exits with status 1 when
# a required file is missing.

set -e

# Colour definitions
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

CURRENT_DIR=$(pwd)
PACKAGE_NAME="argus-agent-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"
TEMP_DIR=""

# Remove the staging directory on every exit path, including failures.
cleanup() {
    if [[ -n "$TEMP_DIR" ]]; then
        rm -rf -- "$TEMP_DIR"
    fi
}
trap cleanup EXIT

log_info "开始打包 Argus Agent 安装包..."

# Verify that everything the installer needs is present.
log_info "检查必要文件..."

required_files=(
    "install.sh"
    "uninstall.sh"
    "bin/argus-agent"
    "check_health.sh"
)

missing_files=()
for file in "${required_files[@]}"; do
    if [[ ! -f "$file" ]]; then
        missing_files+=("$file")
    fi
done

if [[ ${#missing_files[@]} -gt 0 ]]; then
    echo "缺少以下文件:"
    for file in "${missing_files[@]}"; do
        echo " - $file"
    done
    exit 1
fi

log_success "所有必要文件检查完成"

# Stage a copy of the plugin under the package name.
TEMP_DIR=$(mktemp -d)
log_info "创建临时目录: $TEMP_DIR"

cp -r . "$TEMP_DIR/$PACKAGE_NAME"

# Packages from earlier runs live in this directory too; keep them out of the
# new archive so packages do not nest inside each other.
rm -f -- "$TEMP_DIR/$PACKAGE_NAME"/argus-agent-*.tar.gz

# Archive straight into the original directory; -C avoids changing the
# script's working directory into the staging area.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$CURRENT_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

# Report the result.
log_success "打包完成!"
echo
echo "安装包文件: $PACKAGE_FILE"
echo "文件大小: $(du -h "$CURRENT_DIR/$PACKAGE_FILE" | cut -f1)"
echo
echo "使用方法:"
echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
echo "2. 解压: tar -xzf $PACKAGE_FILE"
echo "3. 进入目录: cd $PACKAGE_NAME"
echo "4. 运行安装: sudo ./install.sh"
echo
echo "注意: 请确保所有必要文件都存在"
# Delete the installed argus-agent executable(s), logging each removal, or
# log that nothing was found.
remove_binary() {
    log_info "删除 Argus Agent 二进制文件..."

    local -a candidates=(
        "/usr/local/bin/argus-agent"
    )
    local removed=0
    local path

    for path in "${candidates[@]}"; do
        [[ -f "$path" ]] || continue
        rm -f "$path"
        log_success "二进制文件已删除: $path"
        removed=$((removed + 1))
    done

    if (( removed == 0 )); then
        log_info "二进制文件不存在"
    fi
}

# Remove the /etc/argus-agent configuration directory if it exists.
remove_config() {
    log_info "删除配置文件..."

    local target="/etc/argus-agent"

    if [[ ! -d "$target" ]]; then
        log_info "配置目录不存在"
        return
    fi

    rm -rf "$target"
    log_success "配置目录已删除"
}

# Remove the /var/lib/argus-agent data directory if it exists.
remove_data_dir() {
    log_info "删除数据目录..."

    local target="/var/lib/argus-agent"

    if [[ ! -d "$target" ]]; then
        log_info "数据目录不存在"
        return
    fi

    rm -rf "$target"
    log_success "数据目录已删除"
}
# Delete the log file written by the install script.
cleanup_logs() {
    log_info "清理日志文件..."

    rm -f /var/log/argus-agent.log

    log_success "日志文件已清理"
}

# Remove the "argus-agent" entry from the install record.
#
# Arguments:
#   $1 - path of the install record
#        (default: /opt/argus-metric/current/.install_record)
# Best effort: always returns 0. Success is only logged when the record was
# really rewritten; on a jq or rename failure the record is left untouched,
# the temporary file is removed and a warning is logged.
cleanup_install_record() {
    log_info "清理安装记录..."

    local install_record="${1:-/opt/argus-metric/current/.install_record}"
    local tmp_record="$install_record.tmp"

    if [[ ! -f "$install_record" ]]; then
        log_info "安装记录文件不存在"
        return 0
    fi

    if ! command -v jq &> /dev/null; then
        log_warning "jq 命令不可用,无法清理安装记录"
        return 0
    fi

    if jq 'del(.components."argus-agent")' "$install_record" > "$tmp_record" 2>/dev/null \
        && mv "$tmp_record" "$install_record"; then
        log_success "安装记录已更新"
    else
        rm -f "$tmp_record"
        log_warning "安装记录更新失败: $install_record"
    fi
    return 0
}

# Print a summary of what the uninstall removed and what it kept.
show_uninstall_info() {
    log_success "Argus Agent 卸载完成!"
    echo
    echo "已删除的内容:"
    echo " - 二进制文件: /usr/local/bin/argus-agent"
    echo " - 配置目录: /etc/argus-agent"
    echo " - 数据目录: /var/lib/argus-agent"
    echo " - 相关日志文件"
    echo
    echo "注意:"
    echo " - argus-agent 用户已保留(系统用户,可能被其他服务使用)"
    echo " - 如需完全清理,请手动检查并删除相关文件"
    echo
}
# Fetch a URL with curl and print only the 3-digit HTTP status code.
# Prints "000" when the request fails or no valid code is produced.
_dcgm_http_code() {
    local code=""
    # curl itself prints 000 via -w when the connection fails, so its output is
    # kept and only normalised; appending another "000" would yield "000000".
    code=$(curl -s -o /dev/null --connect-timeout 3 --max-time 10 \
        -w "%{http_code}" "$1" 2>/dev/null) || true
    if [[ ! "$code" =~ ^[0-9]{3}$ ]]; then
        code="000"
    fi
    printf '%s' "$code"
}

# Report the health of DCGM Exporter as one JSON object on stdout.
#
# The exporter is healthy when both the root URL and <root>/metrics answer
# with HTTP 200. Each request is bounded by a connect and a total timeout so
# the check cannot hang.
#
# Environment:
#   DCGM_EXPORTER_URL  optional override of the root URL
#                      (default: http://localhost:9400)
# Outputs:
#   {"name": "dcgm-exporter", "status": "health"|"unhealth", "reason": "..."}
# Exit status:
#   Terminates the shell with 0 when healthy and 1 otherwise.
check_health() {
    local url="${DCGM_EXPORTER_URL:-http://localhost:9400}"
    local metrics_url="$url/metrics"
    local name="dcgm-exporter"
    local status="unhealth"
    local reason=""
    local rc=1
    local http_code=""
    local metrics_code=""

    if ! command -v curl &> /dev/null; then
        reason="curl 命令不可用,无法进行健康检查"
    else
        http_code=$(_dcgm_http_code "$url")
        if [[ "$http_code" != "200" ]]; then
            reason="HTTP 服务异常 (HTTP $http_code),请检查 DCGM Exporter 是否正在运行在端口 9400"
        else
            metrics_code=$(_dcgm_http_code "$metrics_url")
            if [[ "$metrics_code" == "200" ]]; then
                status="health"
                reason="success"
                rc=0
            else
                reason="Metrics 端点异常 (HTTP $metrics_code)"
            fi
        fi
    fi

    printf '{"name": "%s", "status": "%s", "reason": "%s"}\n' "$name" "$status" "$reason"
    exit "$rc"
}
+DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + +# PCIE +DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. +DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + +# Utilization (the sample period varies depending on the product) +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). +DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). + +# ECC +# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. 
+# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information. 
These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +# DCGM_FI_NVML_VERSION, label, NVML Version +# DCGM_FI_DEV_BRAND, label, Device Brand +# DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh new file mode 100755 index 0000000..7c97d6b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh @@ -0,0 +1,365 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! 
-f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."dcgm-exporter".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."dcgm-exporter".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID 已更新: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 显示帮助信息 +show_help() { + echo "DCGM Exporter 安装脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 安装 DCGM Exporter" + echo +} + +# 解析命令行参数 +INSTALL_DIR="" +for arg in "$@"; do + case $arg in + --help|-h) + show_help + exit 0 + ;; + *) + # 如果参数不是以--开头,则认为是安装目录 + if [[ ! "$arg" =~ ^-- ]]; then + INSTALL_DIR="$arg" + else + log_error "未知参数: $arg" + show_help + exit 1 + fi + ;; + esac +done + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查是否为 Ubuntu/Debian + if [[ "$ID" != "ubuntu" && "$ID" != "debian" ]]; then + log_warning "此脚本主要针对 Ubuntu/Debian 系统,其他系统可能需要调整" + fi + + # 检查 NVIDIA GPU + if ! command -v nvidia-smi &> /dev/null; then + log_warning "未检测到 nvidia-smi,请确保已安装 NVIDIA 驱动" + else + log_success "检测到 NVIDIA GPU" + nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -1 + fi +} + +# 安装 DCGM 依赖 +install_dcgm_dependency() { + log_info "安装 DCGM 依赖..." + + local deb_file="bin/datacenter-gpu-manager_3.3.9_amd64.deb" + + if [[ ! 
-f "$deb_file" ]]; then + log_error "找不到 DCGM 依赖文件: $deb_file" + exit 1 + fi + + # 安装 deb 包 + dpkg -i "$deb_file" || { + log_warning "dpkg 安装失败,尝试使用 apt 修复依赖..." + apt-get update + apt-get install -f -y + dpkg -i "$deb_file" + } + + log_success "DCGM 依赖安装完成" +} + +# 检查 DCGM 服务状态 +check_dcgm_service() { + log_info "检查 DCGM 服务状态..." + + # 检查 DCGM 服务是否在运行 + if systemctl is-active --quiet dcgm 2>/dev/null; then + log_success "DCGM 服务已在运行" + elif pgrep -f nv-hostengine > /dev/null; then + log_success "nv-hostengine 进程已在运行" + else + log_warning "DCGM 服务未运行,需要手动启动" + log_info "启动 DCGM 服务的方法:" + log_info " 1. 使用 systemd: sudo systemctl start dcgm" + log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &" + fi + + # 测试 DCGM 连接 + if systemctl is-active --quiet dcgm 2>/dev/null || pgrep -f nv-hostengine > /dev/null; then + log_info "测试 DCGM 连接..." + if dcgmi discovery -l > /dev/null 2>&1; then + log_success "DCGM 连接测试成功" + else + log_warning "DCGM 连接测试失败,请检查服务状态" + fi + fi +} + +# 停止可能运行的服务 +stop_existing_service() { + log_info "检查并停止可能运行的服务..." + + local pid_file="/var/run/dcgm-exporter.pid" + + # 检查并停止通过 PID 文件管理的服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "发现正在运行的 DCGM Exporter 服务 (PID: $pid),正在停止..." + kill "$pid" > /dev/null 2>&1 || true + sleep 2 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." + kill -9 "$pid" > /dev/null 2>&1 || true + fi + rm -f "$pid_file" + log_success "服务已停止" + else + log_warning "发现过期的 PID 文件,正在清理..." + rm -f "$pid_file" + fi + fi + + # 查找并停止所有 dcgm-exporter 进程(排除脚本自身) + local exporter_bin="/usr/local/bin/dcgm-exporter" + local pids=$(pgrep -f "$exporter_bin") + + if [[ -n "$pids" ]]; then + log_info "发现其他 dcgm-exporter 进程,正在停止..." + for pid in $pids; do + if [[ "$pid" != "$$" ]]; then + kill "$pid" > /dev/null 2>&1 || true + sleep 1 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程 $pid 未响应,强制终止..." 
+ kill -9 "$pid" > /dev/null 2>&1 || true + fi + fi + done + log_success "所有 dcgm-exporter 进程已停止" + fi +} + +# 安装 DCGM Exporter 二进制文件 +install_dcgm_exporter() { + log_info "安装 DCGM Exporter..." + + local binary_file="bin/dcgm-exporter" + local install_dir="/usr/local/bin" + + if [[ ! -f "$binary_file" ]]; then + log_error "找不到 DCGM Exporter 二进制文件: $binary_file" + exit 1 + fi + + # 停止可能运行的服务 + stop_existing_service + + # 复制二进制文件 + cp "$binary_file" "$install_dir/" + chmod +x "$install_dir/dcgm-exporter" + + log_success "DCGM Exporter 二进制文件安装完成" +} + +# 安装配置文件 +install_config() { + log_info "安装配置文件..." + + local config_dir="/etc/dcgm-exporter" + local config_file="config/default-counters.csv" + + # 创建配置目录 + mkdir -p "$config_dir" + + if [[ -f "$config_file" ]]; then + cp "$config_file" "$config_dir/" + log_success "配置文件安装完成" + else + log_warning "未找到配置文件,使用默认配置" + fi +} + +# 启动 DCGM Exporter 服务 +start_dcgm_exporter() { + log_info "启动 DCGM Exporter 服务..." + + local binary_path="/usr/local/bin/dcgm-exporter" + local log_file="/var/log/dcgm-exporter.log" + local pid_file="/var/run/dcgm-exporter.pid" + + # 检查服务是否已经在运行 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "DCGM Exporter 服务已在运行 (PID: $pid)" + return 0 + else + log_warning "发现过期的 PID 文件,正在清理..." + rm -f "$pid_file" + fi + fi + + # 检查端口是否被占用 + if netstat -tuln 2>/dev/null | grep -q ":9400 "; then + log_warning "端口 9400 已被占用,请检查是否有其他服务在运行" + return 1 + fi + + # 启动服务 + log_info "正在启动 DCGM Exporter..." + nohup "$binary_path" --address=:9400 > "$log_file" 2>&1 & + local pid=$! 
+ + # 保存 PID + echo "$pid" > "$pid_file" + + # 等待服务启动 + sleep 2 + + # 检查服务是否成功启动 + if kill -0 "$pid" 2>/dev/null; then + log_success "DCGM Exporter 服务启动成功 (PID: $pid)" + log_info "日志文件: $log_file" + log_info "PID 文件: $pid_file" + + # 更新安装记录 + update_install_record "$pid" "$INSTALL_DIR" + else + log_error "DCGM Exporter 服务启动失败" + rm -f "$pid_file" + return 1 + fi +} + + + +# 显示安装信息 +show_install_info() { + log_success "DCGM Exporter 安装完成!" + echo + echo "安装信息:" + echo " 二进制文件: /usr/local/bin/dcgm-exporter" + echo " 配置文件: /etc/dcgm-exporter/default-counters.csv" + echo " 默认端口: 9400" + echo + echo "使用方法:" + echo " 1. 启动 DCGM 服务:" + echo " sudo systemctl start dcgm" + echo " 或: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &" + echo " 2. 启动 DCGM Exporter:" + echo " /usr/local/bin/dcgm-exporter --address=:9400" + echo " 或: nohup /usr/local/bin/dcgm-exporter --address=:9400 &" + echo + echo "测试连接:" + echo " curl http://localhost:9400/metrics" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " DCGM Exporter 安装脚本 v1.0" + echo "==========================================" + echo + + check_root + check_system + + log_info "开始安装 DCGM Exporter..." 
+ + install_dcgm_dependency + check_dcgm_service + install_dcgm_exporter + install_config + start_dcgm_exporter + + show_install_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh new file mode 100755 index 0000000..103913f --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# 获取当前目录 +CURRENT_DIR=$(pwd) +PACKAGE_NAME="dcgm-exporter-$(date +%Y%m%d-%H%M%S)" +PACKAGE_FILE="${PACKAGE_NAME}.tar.gz" + +log_info "开始打包 DCGM Exporter 安装包..." + +# 检查必要文件 +log_info "检查必要文件..." + +required_files=( + "install.sh" + "uninstall.sh" + "bin/dcgm-exporter" + "bin/datacenter-gpu-manager_3.3.9_amd64.deb" + "check_health.sh" +) + +missing_files=() +for file in "${required_files[@]}"; do + if [[ ! -f "$file" ]]; then + missing_files+=("$file") + fi +done + +if [[ ${#missing_files[@]} -gt 0 ]]; then + echo "缺少以下文件:" + for file in "${missing_files[@]}"; do + echo " - $file" + done + exit 1 +fi + +log_success "所有必要文件检查完成" + +# 创建临时目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 复制文件到临时目录 +cp -r . "$TEMP_DIR/$PACKAGE_NAME" + +# 进入临时目录 +cd "$TEMP_DIR" + +# 创建压缩包 +log_info "创建压缩包: $PACKAGE_FILE" +tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME" + +# 移动压缩包到原目录 +mv "$PACKAGE_FILE" "$CURRENT_DIR/" + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 返回原目录 +cd "$CURRENT_DIR" + +# 显示结果 +log_success "打包完成!" +echo +echo "安装包文件: $PACKAGE_FILE" +echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)" +echo +echo "使用方法:" +echo "1. 将 $PACKAGE_FILE 传输到目标服务器" +echo "2. 解压: tar -xzf $PACKAGE_FILE" +echo "3. 进入目录: cd $PACKAGE_NAME" +echo "4. 
运行安装: sudo ./install.sh" +echo +echo "注意: 请确保 config/default-counters.csv 文件存在" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh new file mode 100755 index 0000000..816a8ae --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# DCGM Exporter 卸载脚本 +# 版本: 1.0 +# 作者: AIOps Team +# 日期: $(date +%Y-%m-%d) + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 停止运行中的进程 +stop_processes() { + log_info "停止 DCGM Exporter 进程..." + + local pid_file="/var/run/dcgm-exporter.pid" + local stopped=false + + # 首先尝试通过 PID 文件停止服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "通过 PID 文件停止服务 (PID: $pid)..." + kill "$pid" + sleep 3 + + # 检查进程是否已停止 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." + kill -9 "$pid" 2>/dev/null || true + fi + log_success "DCGM Exporter 进程已停止" + stopped=true + else + log_warning "PID 文件存在但进程已不存在,清理 PID 文件" + rm -f "$pid_file" + fi + fi + + # 查找并杀死所有 dcgm-exporter 进程 + local pids=$(pgrep -f "dcgm-exporter" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + log_info "发现 dcgm-exporter 进程,正在停止..." 
+ for pid in $pids; do + log_info "停止进程 PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有进程在运行,如果有则强制终止 + local remaining_pids=$(pgrep -f "dcgm-exporter" 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "进程未响应,强制终止..." + for pid in $remaining_pids; do + log_info "强制终止进程 PID: $pid" + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 最终检查 + if pgrep -f "dcgm-exporter" > /dev/null; then + log_error "无法停止所有 dcgm-exporter 进程" + else + log_success "所有 DCGM Exporter 进程已停止" + stopped=true + fi + else + log_info "DCGM Exporter 进程未运行" + fi + + # 清理 PID 文件 + rm -f "$pid_file" + + if [[ "$stopped" == "false" ]]; then + log_warning "未发现需要停止的 DCGM Exporter 进程" + fi +} + +# 删除二进制文件 +remove_binary() { + log_info "删除 DCGM Exporter 二进制文件..." + + local binary_file="/usr/local/bin/dcgm-exporter" + + if [[ -f "$binary_file" ]]; then + rm -f "$binary_file" + log_success "二进制文件已删除" + else + log_info "二进制文件不存在" + fi +} + +# 删除配置文件 +remove_config() { + log_info "删除配置文件..." + + local config_dir="/etc/dcgm-exporter" + + if [[ -d "$config_dir" ]]; then + rm -rf "$config_dir" + log_success "配置目录已删除" + else + log_info "配置目录不存在" + fi +} + +# 卸载 DCGM 依赖(可选) +remove_dcgm_dependency() { + log_info "检查 DCGM 依赖状态..." + + # 检查是否安装了 DCGM 包 + if dpkg -l | grep -q datacenter-gpu-manager; then + log_info "检测到 DCGM 依赖包已安装" + log_warning "DCGM 是系统级依赖,可能被其他应用程序使用" + log_info "为了系统稳定性,将保留 DCGM 依赖包" + log_info "如需手动卸载,请运行: sudo apt-get remove --purge datacenter-gpu-manager" + else + log_info "DCGM 依赖包未安装" + fi +} + +# 清理日志文件 +cleanup_logs() { + log_info "清理日志文件..." + + # 清理 journal 日志 + journalctl --vacuum-time=1s --quiet || true + + # 删除可能的日志文件 + rm -f /var/log/nv-hostengine.log + rm -f /var/log/dcgm-exporter.log + + log_success "日志文件已清理" +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "DCGM Exporter 卸载完成!" 
+ echo + echo "已删除的内容:" + echo " - 二进制文件: /usr/local/bin/dcgm-exporter" + echo " - 配置目录: /etc/dcgm-exporter" + echo " - 相关日志文件" + echo + echo "注意:" + echo " - DCGM 依赖包可能仍然存在" + echo " - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " DCGM Exporter 卸载脚本 v1.0" + echo "==========================================" + echo + + check_root + + log_warning "此操作将完全卸载 DCGM Exporter" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + log_info "开始卸载 DCGM Exporter..." + + stop_processes + remove_binary + remove_config + cleanup_logs + + # 询问是否卸载 DCGM 依赖 + remove_dcgm_dependency + + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/README.md b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/README.md new file mode 100644 index 0000000..ca8ce92 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/README.md @@ -0,0 +1,181 @@ +# Fluent Bit 安装包 + +这是一个 Fluent Bit 的自动化安装包,提供了完整的安装、卸载和健康检查功能。 + +## 目录结构 + +``` +fluent-bit-installer/ +├── install.sh # 安装脚本 +├── uninstall.sh # 卸载脚本 +├── package.sh # 打包脚本 +├── check_health.sh # 健康检查脚本 +├── bin/ +│ └── fluent-bit_3.1.9_amd64.deb # Fluent Bit 安装包 +└── config/ + ├── fluent-bit.conf # 主配置文件 + ├── inject_labels.lua # Lua 脚本 + ├── parsers.conf # 解析器配置 + ├── inputs.d/ # 输入配置目录 + │ ├── 10-train.conf + │ └── 20-infer.conf + └── outputs.d/ # 输出配置目录 + └── 10-es.conf +``` + +## 功能特性 + +- **自动化安装**: 一键安装 Fluent Bit 及其依赖 +- **配置管理**: 自动部署预配置的配置文件 +- **服务管理**: 自动启动和停止 Fluent Bit 服务 +- **健康检查**: 提供 JSON 格式的健康状态检查 +- **完整卸载**: 彻底清理所有相关文件和配置 +- **用户管理**: 自动创建专用的 fluent-bit 用户 + +## 使用方法 + +### 1. 打包安装包 + +```bash +./package.sh +``` + +这将创建一个带时间戳的压缩包,例如:`fluent-bit-installer-20250924-160954.tar.gz` + +### 2. 
安装 Fluent Bit + +```bash +# 解压安装包 +tar -xzf fluent-bit-installer-*.tar.gz +cd fluent-bit-installer-* + +# 运行安装脚本(需要 root 权限) +sudo ./install.sh +``` + +### 3. 健康检查 + +```bash +./check_health.sh +``` + +输出示例: +```json +{"name": "fluent-bit", "status": "health", "reason": "success"} +``` + +### 4. 卸载 Fluent Bit + +```bash +sudo ./uninstall.sh +``` + +## 安装后的文件位置 + +- **二进制文件**: `/opt/fluent-bit/bin/fluent-bit` +- **配置文件**: `/etc/fluent-bit/` +- **日志文件**: `/var/log/fluent-bit/` +- **缓冲区目录**: `/var/lib/fluent-bit/buffers/` +- **运行用户**: `fluent-bit` +- **HTTP 端口**: `2020` + +## 配置说明 + +### 主配置文件 + +主配置文件位于 `/etc/fluent-bit/fluent-bit.conf`,包含以下主要部分: + +- **SERVICE**: 服务配置,包括 HTTP 服务器设置 +- **INPUT**: 输入配置,通过 `inputs.d/` 目录管理 +- **FILTER**: 过滤器配置,包括解析器和标签注入 +- **OUTPUT**: 输出配置,通过 `outputs.d/` 目录管理 + +### 输入配置 + +- `10-train.conf`: 训练日志输入配置 +- `20-infer.conf`: 推理日志输入配置 + +### 输出配置 + +- `10-es.conf`: Elasticsearch 输出配置 + +## 服务管理 + +### 手动启动 + +```bash +/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf +``` + +### 后台启动 + +```bash +nohup /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf & +``` + +### 检查服务状态 + +```bash +# 检查进程 +ps aux | grep fluent-bit + +# 检查端口 +netstat -tuln | grep 2020 + +# 检查日志 +tail -f /var/log/fluent-bit/fluent-bit.log +``` + +## API 接口 + +Fluent Bit 提供 HTTP API 用于监控和管理: + +- **根路径**: `http://localhost:2020` +- **状态接口**: `http://localhost:2020/api/v1/status` +- **指标接口**: `http://localhost:2020/api/v1/metrics` + +## 故障排除 + +### 常见问题 + +1. **端口被占用** + - 检查端口 2020 是否被其他服务占用 + - 修改配置文件中的端口设置 + +2. **权限问题** + - 确保 fluent-bit 用户有足够的权限访问日志文件 + - 检查目录权限设置 + +3. 
**配置文件错误** + - 检查配置文件语法 + - 查看日志文件中的错误信息 + +### 日志查看 + +```bash +# 查看服务日志 +tail -f /var/log/fluent-bit/fluent-bit.log + +# 查看系统日志 +journalctl -u fluent-bit -f +``` + +## 系统要求 + +- **操作系统**: Ubuntu/Debian/CentOS/RHEL/Fedora +- **架构**: x86_64/amd64 +- **权限**: root 权限(用于安装和卸载) +- **依赖**: curl(用于健康检查) + +## 版本信息 + +- **Fluent Bit 版本**: 3.1.9 +- **安装包版本**: 1.0 +- **支持架构**: amd64 + +## 注意事项 + +1. 安装前请确保系统已更新 +2. 卸载时会保留 fluent-bit 用户(系统用户,可能被其他服务使用) +3. 配置文件包含环境变量,请根据实际环境调整 +4. 建议在生产环境使用前进行充分测试 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb new file mode 100644 index 0000000..f52cb53 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bdc163534a062c3addd705a65326800b4e362a0f54a891ed0bb8776556e2361 +size 42047204 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb new file mode 100644 index 0000000..e731f32 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4610f6aae2b19dcc326458aaa596d06f965d0a00abb36ea3317c7157a60fd1ce +size 152282 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb new file mode 100644 index 0000000..474abdc --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b137d89a463b671383b6eaec404a494c8bd630a4adb79fc059c3aa48af170dcb +size 51622 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh new file mode 100755 index 0000000..37f4090 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Fluent Bit 健康检查脚本 +# 输出 JSON 格式结果 + +set -e + +# 检查 Fluent Bit 健康状态 +check_health() { + local name="fluent-bit" + local status="unhealth" + local reason="" + local install_record="/opt/argus-metric/current/.install_record" + + # 首先尝试通过安装记录文件检查进程 + if [[ -f "$install_record" ]]; then + # 尝试使用jq解析JSON格式的安装记录文件 + local pid="" + if command -v jq &> /dev/null; then + pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "") + else + # 如果没有jq,使用简单的文本解析方法 + pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1) + fi + + if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then + if kill -0 "$pid" 2>/dev/null; then + # 进程存在且运行正常 + status="health" + reason="进程运行正常 (PID: $pid)" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="安装记录中的 PID $pid 进程不存在" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + reason="安装记录文件中未找到有效的 fluent-bit PID" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + # 如果安装记录文件不存在,尝试查找 fluent-bit 进程 + local pids=$(pgrep -f "fluent-bit" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + # 取第一个找到的 PID + local pid=$(echo "$pids" | head -1) + status="health" + reason="发现 fluent-bit 进程运行 (PID: $pid),但未找到安装记录" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="未找到 fluent-bit 进程,且安装记录文件不存在" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": 
\"$reason\"}" + exit 1 + fi + fi +} + +# 主函数 +main() { + check_health +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/fluent-bit.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/fluent-bit.conf new file mode 100644 index 0000000..95ed374 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/fluent-bit.conf @@ -0,0 +1,37 @@ +[SERVICE] + Daemon Off + Parsers_File parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + HTTP_Port 2020 + storage.path /buffers + storage.sync normal + storage.checksum on + storage.backlog.mem_limit 128M + # 备注:该镜像默认未开启 Hot Reload,修改配置后请重启容器。 + +@INCLUDE inputs.d/*.conf + +[FILTER] + Name parser + Match app.* + Key_Name log + Parser timestamp_parser + Reserve_Data On + Preserve_Key On + Unescape_Key On + +[FILTER] + Name record_modifier + Match * + Record cluster ${CLUSTER} + Record rack ${RACK} + Record host ${HOSTNAME} + +[FILTER] + Name lua + Match app.* + script inject_labels.lua + call add_labels + +@INCLUDE outputs.d/*.conf diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inject_labels.lua b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inject_labels.lua new file mode 100644 index 0000000..0d87f7a --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inject_labels.lua @@ -0,0 +1,15 @@ +function add_labels(tag, ts, record) + record["job_id"] = os.getenv("FB_JOB_ID") or record["job_id"] or "unknown" + record["user"] = os.getenv("FB_USER") or record["user"] or "unknown" + record["model"] = os.getenv("FB_MODEL") or record["model"] or "unknown" + record["gpu_id"] = os.getenv("FB_GPU_ID") or record["gpu_id"] or "na" + local p = record["log_path"] or "" + if string.find(p, "/logs/infer/") then + record["role"] = "infer" + elseif string.find(p, "/logs/train/") then + record["role"] = "train" 
+ else + record["role"] = record["role"] or "app" + end + return 1, ts, record +end diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/10-train.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/10-train.conf new file mode 100644 index 0000000..3ea9e25 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/10-train.conf @@ -0,0 +1,10 @@ +[INPUT] + Name tail + Path /logs/train/*.log + Tag app.train + Path_Key log_path + Refresh_Interval 5 + DB /buffers/train.db + Skip_Long_Lines On + storage.type filesystem + multiline.parser python,go,java diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/20-infer.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/20-infer.conf new file mode 100644 index 0000000..793e203 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/20-infer.conf @@ -0,0 +1,10 @@ +[INPUT] + Name tail + Path /logs/infer/*.log + Tag app.infer + Path_Key log_path + Refresh_Interval 5 + DB /buffers/infer.db + Skip_Long_Lines On + storage.type filesystem + multiline.parser python,go,java diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf new file mode 100644 index 0000000..f273270 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf @@ -0,0 +1,24 @@ +# 重要:使用 Logstash_Format + Logstash_Prefix,生成 train-*/infer-* 索引 +[OUTPUT] + Name es + Match app.train + Host ${ES_HOST:-localhost} + Port ${ES_PORT:-9200} + Logstash_Format On + Logstash_Prefix train + Replace_Dots On + Generate_ID On + Retry_Limit False + Suppress_Type_Name On + +[OUTPUT] + Name es + Match app.infer + Host ${ES_HOST:-localhost} + Port ${ES_PORT:-9200} + Logstash_Format On + 
Logstash_Prefix infer + Replace_Dots On + Generate_ID On + Retry_Limit False + Suppress_Type_Name On diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf new file mode 100644 index 0000000..d86fa06 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf @@ -0,0 +1,27 @@ +[MULTILINE_PARSER] + Name python + Type regex + Flush 2 + Rule "start_state" "/^\d{4}-\d{2}-\d{2}[\sT]/" "cont" + Rule "cont" "/^\s+|^Traceback|^\tat\s+/" "cont" + +[MULTILINE_PARSER] + Name go + Type regex + Flush 2 + Rule "start_state" "/^[0-9]{4}\/[0-9]{2}\/[0-9]{2}/" "cont" + Rule "cont" "/^\s+|^\t/" "cont" + +[MULTILINE_PARSER] + Name java + Type regex + Flush 2 + Rule "start_state" "/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/" "cont" + Rule "cont" "/^\s+at\s+|^\t.../" "cont" + +[PARSER] + Name timestamp_parser + Format regex + Regex ^(?\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?\w+)\s+(?.*)$ + Time_Key timestamp + Time_Format %Y-%m-%d %H:%M:%S diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh new file mode 100755 index 0000000..aef6e34 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh @@ -0,0 +1,291 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_info "Starting Fluent Bit installation..." 
+ +# 解析命令行参数 +INSTALL_DIR="${1:-/opt/argus-metric/current}" + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! -f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."fluent-bit".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."fluent-bit".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID updated: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 检查是否为 root 用户 +if [[ $EUID -ne 0 ]]; then + log_error "This script requires root privileges" + log_info "Please use: sudo $0" + exit 1 +fi + +# 停止可能运行的服务 +log_info "Stopping existing fluent-bit processes..." + +# 只匹配进程名为 fluent-bit 的进程 +pids=$(pgrep -x fluent-bit 2>/dev/null || true) + +if [[ -n "$pids" ]]; then + for pid in $pids; do + log_info "Stopping process PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有残留进程 + remaining_pids=$(pgrep -x fluent-bit 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "Force killing unresponsive processes..." + for pid in $remaining_pids; do + kill -9 "$pid" 2>/dev/null || true + done + fi +fi + +# 安装 Fluent Bit 依赖库 libpq5(离线模式) +log_info "Checking Fluent Bit dependency: libpq5 ..." +if ! 
ldconfig -p | grep -q libpq.so.5; then + if ls bin/libpq5_*.deb >/dev/null 2>&1; then + log_info "Installing local dependency package: libpq5" + DEBIAN_FRONTEND=noninteractive dpkg -i bin/libpq5_*.deb >/dev/null 2>&1 || { + log_error "Failed to install libpq5 from bin/, please check package validity" + exit 1 + } + else + log_error "Missing dependency: libpq5 (libpq.so.5). Please put bin/libpq5_*.deb in the bin/ directory." + exit 1 + fi +else + log_info "libpq.so.5 already present on system" +fi + +# 安装 Fluent Bit 依赖库 libyaml-0-2(离线模式) +log_info "Checking Fluent Bit dependency: libyaml-0.so.2 ..." +if ! ldconfig -p | grep -q libyaml-0.so.2; then + if ls bin/libyaml-0-2_*.deb >/dev/null 2>&1; then + log_info "Installing local dependency package: libyaml-0-2" + DEBIAN_FRONTEND=noninteractive dpkg -i bin/libyaml-0-2_*.deb >/dev/null 2>&1 || { + log_error "Failed to install libyaml-0-2 from bin/, please check package validity" + exit 1 + } + else + log_error "Missing dependency: libyaml-0-2 (libyaml-0.so.2). Please put bin/libyaml-0-2_*.deb in the bin/ directory." + exit 1 + fi +else + log_info "libyaml-0.so.2 already present on system" +fi + +# 清理可能存在的旧 fluent-bit 安装(避免配置文件冲突) +log_info "Cleaning up old fluent-bit installation if exists..." +if dpkg -l | grep -q "^ii.*fluent-bit"; then + log_info "Found existing fluent-bit package, removing..." + dpkg --purge fluent-bit 2>/dev/null || true + apt-get remove --purge -y fluent-bit 2>/dev/null || true +fi + +# 确保清理残留的配置文件 +if [[ -d "/etc/fluent-bit" ]]; then + log_info "Removing old fluent-bit configuration directory..." + rm -rf /etc/fluent-bit +fi + +# 安装 Fluent Bit 主包 +log_info "Installing Fluent Bit from deb package..." +deb_file="bin/fluent-bit_3.1.9_amd64.deb" +if [[ ! 
-f "$deb_file" ]]; then + log_error "Fluent Bit package not found: $deb_file" + exit 1 +fi + +DEBIAN_FRONTEND=noninteractive dpkg -i "$deb_file" >/dev/null 2>&1 || true + +# 验证 Fluent Bit 可以运行 +fb_version=$(/opt/fluent-bit/bin/fluent-bit --version 2>&1 | head -1) +log_info "Fluent Bit version: $fb_version" + +# 创建 fluent-bit 用户 +log_info "Creating fluent-bit user..." +if ! id "fluent-bit" &>/dev/null; then + useradd --no-create-home --shell /bin/false fluent-bit +fi + +# 创建配置目录 +log_info "Installing configuration files..." +mkdir -p /etc/fluent-bit +if [[ -d "config" ]]; then + cp -r config/* /etc/fluent-bit/ + chown -R fluent-bit:fluent-bit /etc/fluent-bit +fi + +# 创建日志和缓冲区目录 +log_info "Creating log and buffer directories..." +mkdir -p /logs/train /logs/infer /buffers +chmod 755 /logs/train /logs/infer +chmod 770 /buffers +chown -R fluent-bit:fluent-bit /logs /buffers + +# 启动 Fluent Bit +log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/" +config_path="/etc/fluent-bit/fluent-bit.conf" + +if [[ ! -f "$config_path" ]]; then + log_error "Configuration file not found: $config_path" + exit 1 +fi + +# 设置环境变量 +log_info "Setting environment variables..." + +# 获取非 127.0.0.1 的 IP 地址作为 HOSTNAME +if [[ -z "${HOSTNAME:-}" ]]; then + # 获取 177.x.x.x 段的 IP 地址 + HOSTNAME=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' | grep '^177\.' | head -1) + + # 如果没有找到 177.x.x.x 段的 IP,则获取第一个非 127.0.0.1 的 IP + if [[ -z "$HOSTNAME" ]]; then + HOSTNAME=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' | grep -v '^127\.' 
| head -1) + fi + + # 如果还是没有找到,使用 hostname 命令 + if [[ -z "$HOSTNAME" ]]; then + HOSTNAME=$(hostname) + fi +fi +export HOSTNAME + +export CLUSTER="${CLUSTER:-local}" +export RACK="${RACK:-dev}" +export ES_HOST="${ES_HOST:-localhost}" +export ES_PORT="${ES_PORT:-9200}" + +log_info "Environment variables:" +log_info " CLUSTER=$CLUSTER" +log_info " RACK=$RACK" +log_info " HOSTNAME=$HOSTNAME" +log_info " ES_HOST=$ES_HOST" +log_info " ES_PORT=$ES_PORT" + +# 检查 fluent-bit 二进制文件 +log_info "[DEBUG] Checking fluent-bit binary..." +if [[ ! -f "/opt/fluent-bit/bin/fluent-bit" ]]; then + log_error "fluent-bit binary not found at /opt/fluent-bit/bin/fluent-bit" + exit 1 +fi +log_info "[DEBUG] fluent-bit binary exists and is executable: $(ls -lh /opt/fluent-bit/bin/fluent-bit)" + +# 检查配置文件 +log_info "[DEBUG] Checking configuration file: $config_path" +if [[ ! -f "$config_path" ]]; then + log_error "Configuration file not found: $config_path" + exit 1 +fi +log_info "[DEBUG] Configuration file exists: $(ls -lh $config_path)" + +# 显示完整的启动命令 +log_info "[DEBUG] Full command to execute:" +log_info "[DEBUG] su -s /bin/bash fluent-bit -c 'env CLUSTER=\"$CLUSTER\" RACK=\"$RACK\" HOSTNAME=\"$HOSTNAME\" ES_HOST=\"$ES_HOST\" ES_PORT=\"$ES_PORT\" /opt/fluent-bit/bin/fluent-bit --config=\"$config_path\"'" + +# 清空或创建日志文件 +log_info "[DEBUG] Preparing log file: /var/log/fluent-bit.log" +: > /var/log/fluent-bit.log +chmod 666 /var/log/fluent-bit.log + +log_info "Command: /opt/fluent-bit/bin/fluent-bit --config=$config_path" +log_info "[DEBUG] Starting fluent-bit process as fluent-bit user (using su)..." +nohup su -s /bin/bash fluent-bit -c "env CLUSTER='$CLUSTER' RACK='$RACK' HOSTNAME='$HOSTNAME' ES_HOST='$ES_HOST' ES_PORT='$ES_PORT' /opt/fluent-bit/bin/fluent-bit --config='$config_path' >> /var/log/fluent-bit.log 2>&1" & + +bg_pid=$! +log_info "[DEBUG] Background process started with PID: $bg_pid" + +# 等待服务启动 +log_info "[DEBUG] Waiting 3 seconds for service to start..." 
+sleep 3
+
+# Locate the real fluent-bit PID ($bg_pid is only the nohup/su wrapper). "|| true" keeps an empty pgrep result from aborting the script if it runs with errexit/pipefail, so the diagnostics branch below stays reachable.
+log_info "[DEBUG] Searching for fluent-bit process..."
+log_info "[DEBUG] Running: pgrep -u fluent-bit -x fluent-bit"
+actual_pid=$(pgrep -u fluent-bit -x fluent-bit | head -n 1 || true)
+
+# List every fluent-bit related process to aid troubleshooting.
+log_info "[DEBUG] All fluent-bit related processes:"
+ps aux | grep fluent-bit | grep -v grep || log_warning "No fluent-bit processes found in ps output"
+
+if [[ -n "$actual_pid" ]]; then
+    log_success "Fluent Bit started successfully (PID: $actual_pid)"
+    log_info "[DEBUG] Process details: $(ps -p "$actual_pid" -o pid,user,cmd --no-headers)"
+
+    # Update the install record.
+    update_install_record "$actual_pid" "$INSTALL_DIR"
+else
+    log_error "Fluent Bit failed to start - no fluent-bit process found"
+    log_info "[DEBUG] Checking if background process $bg_pid still exists..."
+    if ps -p "$bg_pid" > /dev/null 2>&1; then
+        log_warning "Background shell process $bg_pid still exists"
+    else
+        log_warning "Background shell process $bg_pid has exited"
+    fi
+
+    log_info "[DEBUG] Last 20 lines of /var/log/fluent-bit.log:"
+    if [[ -f "/var/log/fluent-bit.log" ]]; then
+        while IFS= read -r line; do
+            log_info "[LOG] $line"
+        done < <(tail -n 20 /var/log/fluent-bit.log)
+    else
+        log_error "Log file /var/log/fluent-bit.log does not exist"
+    fi
+
+    exit 1
+fi
+
+log_success "Fluent Bit installation completed!"
diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh
new file mode 100755
index 0000000..faf702b
--- /dev/null
+++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# Packages the current directory (the Fluent Bit plugin) into a timestamped tar.gz placed in the invocation directory.
+set -e
+
+# Color definitions
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+# Remember the invocation directory and derive a timestamped package name
+CURRENT_DIR=$(pwd)
+PACKAGE_NAME="fluent-bit-$(date +%Y%m%d-%H%M%S)"
+PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"
+
+log_info "开始打包 Fluent Bit 安装包..."
+
+# Verify that every required file is present before packaging
+log_info "检查必要文件..."
+
+required_files=(
+    "install.sh"
+    "uninstall.sh"
+    "bin/fluent-bit_3.1.9_amd64.deb"
+    "check_health.sh"
+)
+
+missing_files=()
+for file in "${required_files[@]}"; do
+    if [[ ! -f "$file" ]]; then
+        missing_files+=("$file")
+    fi
+done
+
+if [[ ${#missing_files[@]} -gt 0 ]]; then
+    echo "缺少以下文件:"
+    for file in "${missing_files[@]}"; do
+        echo " - $file"
+    done
+    exit 1
+fi
+
+log_success "所有必要文件检查完成"
+
+# Create a temporary staging directory; the EXIT trap removes it even when a later step fails under "set -e"
+TEMP_DIR=$(mktemp -d)
+log_info "创建临时目录: $TEMP_DIR"
+trap 'rm -rf -- "$TEMP_DIR"' EXIT
+# Copy the plugin directory into the staging directory
+cp -r . "$TEMP_DIR/$PACKAGE_NAME"
+
+# Enter the staging directory so the archive contains only $PACKAGE_NAME/
+cd "$TEMP_DIR"
+
+# Create the archive
+log_info "创建压缩包: $PACKAGE_FILE"
+tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME"
+
+# Move the archive back to the invocation directory
+mv "$PACKAGE_FILE" "$CURRENT_DIR/"
+
+# Remove the staging directory
+rm -rf "$TEMP_DIR"
+
+# Return to the invocation directory
+cd "$CURRENT_DIR"
+
+# Print the result and usage instructions
+log_success "打包完成!"
+echo
+echo "安装包文件: $PACKAGE_FILE"
+echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)"
+echo
+echo "使用方法:"
+echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
+echo "2. 解压: tar -xzf $PACKAGE_FILE"
+echo "3. 进入目录: cd $PACKAGE_NAME"
+echo "4. 运行安装: sudo ./install.sh"
+echo
+echo "注意: 请确保所有必要文件都存在"
diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh
new file mode 100755
index 0000000..ceba076
--- /dev/null
+++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+set -euo pipefail
+# Interactive uninstaller: stops Fluent Bit, removes the package, binaries, configuration and data, then cleans the install record.
+echo "[INFO] Starting Fluent Bit uninstallation..."
+
+# Require root privileges
+if [[ $EUID -ne 0 ]]; then
+    echo "[ERROR] This script requires root privileges"
+    echo "[INFO] Please use: sudo $0"
+    exit 1
+fi
+
+echo "[WARNING] This operation will completely uninstall Fluent Bit"
+read -r -p "Confirm to continue? (y/N): " confirm
+
+if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
+    echo "[INFO] Uninstallation cancelled"
+    exit 0
+fi
+
+# Stop running processes
+echo "[INFO] Stopping Fluent Bit processes..."
+install_record="/opt/argus-metric/current/.install_record"
+stopped=false
+
+# First try to stop the service using the PID stored in the install record
+if [[ -f "$install_record" ]]; then
+    # Parse the JSON install record with jq when it is available
+    pid=""
+    if command -v jq &> /dev/null; then
+        pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "")
+    else
+        # Without jq, fall back to crude text parsing; "|| true" keeps a missing entry from aborting under errexit/pipefail
+        pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -n 1 || true)
+    fi
+
+    if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then
+        if kill -0 "$pid" 2>/dev/null; then
+            echo "[INFO] Stopping service via installation record (PID: $pid)..."
+            kill "$pid"
+            sleep 3
+
+            # Escalate to SIGKILL if the process is still alive
+            if kill -0 "$pid" 2>/dev/null; then
+                echo "[WARNING] Process unresponsive, force killing..."
+                kill -9 "$pid" 2>/dev/null || true
+            fi
+            echo "[SUCCESS] Fluent Bit process stopped"
+            stopped=true
+        else
+            echo "[WARNING] PID in installation record no longer exists"
+        fi
+    fi
+fi
+
+# Find and stop every remaining fluent-bit process. Match the exact process name (-x): a command-line match (-f) also hits this script whenever its own path contains "fluent-bit".
+pids=$(pgrep -x "fluent-bit" 2>/dev/null || true)
+if [[ -n "$pids" ]]; then
+    echo "[INFO] Found fluent-bit processes, stopping..."
+    for pid in $pids; do
+        echo "[INFO] Stopping process PID: $pid"
+        kill "$pid" 2>/dev/null || true
+    done
+    sleep 2
+
+    # Force-kill anything that ignored SIGTERM
+    remaining_pids=$(pgrep -x "fluent-bit" 2>/dev/null || true)
+    if [[ -n "$remaining_pids" ]]; then
+        echo "[WARNING] Processes unresponsive, force killing..."
+        for pid in $remaining_pids; do
+            echo "[INFO] Force killing process PID: $pid"
+            kill -9 "$pid" 2>/dev/null || true
+        done
+        sleep 1
+    fi
+
+    # Final check
+    if pgrep -x "fluent-bit" > /dev/null; then
+        echo "[ERROR] Unable to stop all fluent-bit processes"
+    else
+        echo "[SUCCESS] All Fluent Bit processes stopped"
+        stopped=true
+    fi
+else
+    echo "[INFO] No Fluent Bit processes running"
+fi
+
+if [[ "$stopped" == "false" ]]; then
+    echo "[WARNING] No Fluent Bit processes found to stop"
+fi
+
+# Uninstall the Fluent Bit package. "dpkg -s" queries the package directly; piping "dpkg -l" into "grep -q" can report a false negative under pipefail when grep exits early.
+echo "[INFO] Uninstalling Fluent Bit package..."
+if dpkg -s fluent-bit &> /dev/null; then
+    echo "[INFO] Found fluent-bit package installed via dpkg, uninstalling..."
+    dpkg --remove --force-remove-reinstreq fluent-bit || true
+    echo "[SUCCESS] Fluent Bit package uninstalled"
+else
+    echo "[INFO] No fluent-bit package found via package manager"
+fi
+
+# Remove binary files
+echo "[INFO] Removing Fluent Bit binary files..."
+binary_dir="/opt/fluent-bit"
+if [[ -d "$binary_dir" ]]; then
+    rm -rf "$binary_dir"
+    echo "[SUCCESS] Binary directory removed: $binary_dir"
+else
+    echo "[INFO] Binary directory does not exist"
+fi
+
+# Remove configuration files
+echo "[INFO] Removing configuration files..."
+config_dir="/etc/fluent-bit"
+if [[ -d "$config_dir" ]]; then
+    rm -rf "$config_dir"
+    echo "[SUCCESS] Configuration directory removed"
+else
+    echo "[INFO] Configuration directory does not exist"
+fi
+
+# Remove data directories
+echo "[INFO] Removing data directories..."
+data_dirs=("/logs" "/buffers")
+deleted=false
+for data_dir in "${data_dirs[@]}"; do
+    if [[ -d "$data_dir" ]]; then
+        rm -rf -- "${data_dir:?}"
+        echo "[SUCCESS] Data directory removed: $data_dir"
+        deleted=true
+    fi
+done
+
+[[ "$deleted" == "true" ]] || echo "[INFO] No data directories found"
+
+# Clean up the install record. The record is JSON, so drop the fluent-bit component with jq; a line-oriented sed pattern cannot match it.
+echo "[INFO] Cleaning up installation record..."
+if [[ ! -f "$install_record" ]]; then
+    echo "[INFO] Installation record file does not exist"
+elif command -v jq &> /dev/null && jq 'del(.components."fluent-bit")' "$install_record" > "$install_record.tmp"; then
+    mv "$install_record.tmp" "$install_record"
+    echo "[SUCCESS] Installation record cleaned"
+else
+    rm -f "$install_record.tmp"
+    echo "[WARNING] Could not update installation record (jq missing or record unparsable)"
+fi
+
+# Report on the fluent-bit user; it is intentionally left in place.
+echo "[INFO] Checking fluent-bit user status..."
+if id "fluent-bit" &>/dev/null; then
+    echo "[INFO] fluent-bit user exists"
+    echo "[WARNING] fluent-bit is a system user, may be used by other services"
+    echo "[INFO] fluent-bit user will be preserved for system stability"
+    echo "[INFO] To manually remove, run: sudo userdel fluent-bit"
+else
+    echo "[INFO] fluent-bit user does not exist"
+fi
+
+echo "[SUCCESS] Fluent Bit uninstallation completed!"
+echo +echo "Removed content:" +echo " - Binary directory: /opt/fluent-bit" +echo " - Configuration directory: /etc/fluent-bit" +echo " - Application log directory: /logs" +echo " - Buffer directory: /buffers" +echo +echo "Note:" +echo " - fluent-bit user preserved (system user, may be used by other services)" +echo " - For complete cleanup, manually check and remove related files" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter new file mode 100755 index 0000000..bccf467 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d548f65fe29db403603c0f0c6a5d15e3ac74b6ed69ec445258e8fff4bc88601 +size 19925095 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/check_health.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/check_health.sh new file mode 100755 index 0000000..ed168e3 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/check_health.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Node Exporter 健康检查脚本 +# 输出 JSON 格式结果 + +set -e + +# 检查 Node Exporter 健康状态 +check_health() { + local url="http://localhost:9100" + local metrics_url="$url/metrics" + local name="node-exporter" + local status="unhealth" + local reason="" + + # 检查 curl 是否可用 + if ! 
command -v curl &> /dev/null; then + reason="curl 命令不可用,无法进行健康检查" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + + # 测试根路径连接 + local http_code=$(curl -s -o /dev/null -w "%{http_code}" "$url" 2>/dev/null || echo "000") + + if [[ "$http_code" == "200" ]]; then + # 测试 metrics 端点 + local metrics_code=$(curl -s -o /dev/null -w "%{http_code}" "$metrics_url" 2>/dev/null || echo "000") + + if [[ "$metrics_code" == "200" ]]; then + status="health" + reason="success" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="Metrics 端点异常 (HTTP $metrics_code)" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + reason="HTTP 服务异常 (HTTP $http_code),请检查 Node Exporter 是否正在运行在端口 9100" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi +} + +# 主函数 +main() { + check_health +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh new file mode 100755 index 0000000..28ba2d1 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh @@ -0,0 +1,343 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! 
-f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."node-exporter".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."node-exporter".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID 已更新: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 显示帮助信息 +show_help() { + echo "Node Exporter 安装脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 安装 Node Exporter" + echo +} + +# 解析命令行参数 +INSTALL_DIR="" +for arg in "$@"; do + case $arg in + --help|-h) + show_help + exit 0 + ;; + *) + # 如果参数不是以--开头,则认为是安装目录 + if [[ ! "$arg" =~ ^-- ]]; then + INSTALL_DIR="$arg" + else + log_error "未知参数: $arg" + show_help + exit 1 + fi + ;; + esac +done + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查是否为 Linux 系统 + if [[ "$ID" != "ubuntu" && "$ID" != "debian" && "$ID" != "centos" && "$ID" != "rhel" && "$ID" != "fedora" ]]; then + log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整" + fi + + # 检查系统架构 + local arch=$(uname -m) + log_info "系统架构: $arch" + + if [[ "$arch" != "x86_64" && "$arch" != "amd64" ]]; then + log_warning "当前架构为 $arch,node_exporter 主要支持 x86_64/amd64" + fi +} + +stop_existing_service() { + log_info "检查并停止可能运行的 Node Exporter 服务..." + + # 当前脚本 PID,防止误杀 + SELF_PID=$$ + + # 1. 
停止 systemd 服务(如果存在) + if systemctl list-units --full -all | grep -q "node_exporter.service"; then + log_info "检测到 systemd 服务 node_exporter,正在停止..." + systemctl stop node_exporter || true + systemctl disable node_exporter || true + fi + + # 2. 清理可能存在的 PID 文件 + for pid_file in /var/run/node-exporter.pid /var/run/node_exporter.pid /tmp/node_exporter.pid; do + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "发现 Node Exporter (PID: $pid),正在停止..." + kill "$pid" + sleep 2 + kill -0 "$pid" 2>/dev/null && kill -9 "$pid" + fi + rm -f "$pid_file" + fi + done + + # 3. 用 pgrep 查找进程,排除当前脚本 + local pids=$(pgrep -f "node_exporter|node-exporter|/usr/local/bin/node-exporter" | grep -vw "$SELF_PID" || true) + if [[ -n "$pids" ]]; then + log_info "发现 Node Exporter 进程 (PID: $pids),正在停止..." + for pid in $pids; do + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" 2>/dev/null || true + sleep 1 + kill -0 "$pid" 2>/dev/null && kill -9 "$pid" 2>/dev/null || true + fi + done + fi + + # 4. 兜底:检查是否有进程占用 9100 端口 + local listen_pids=$(lsof -ti:9100 2>/dev/null || true) + if [[ -n "$listen_pids" ]]; then + log_warning "发现占用 9100 端口的进程 (PID: $listen_pids),强制终止..." + for pid in $listen_pids; do + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 5. 最终验证 + if netstat -tuln 2>/dev/null | grep -q ":9100 "; then + log_error "端口 9100 仍被占用,请手动检查" + return 1 + else + log_success "旧的 Node Exporter 已完全停止" + fi +} + + +# 安装 Node Exporter 二进制文件 +install_node_exporter() { + log_info "安装 Node Exporter..." + + local binary_file="bin/node_exporter" + local install_dir="/usr/local/bin" + + if [[ ! 
-f "$binary_file" ]]; then + log_error "找不到 Node Exporter 二进制文件: $binary_file" + exit 1 + fi + + # 停止可能运行的服务 + stop_existing_service + + # 复制二进制文件并重命名为统一格式 + cp "$binary_file" "$install_dir/node-exporter" + chmod +x "$install_dir/node-exporter" + + log_success "Node Exporter 二进制文件安装完成" +} + +# 创建用户和组 +create_user() { + log_info "创建 node_exporter 用户..." + + # 检查用户是否已存在 + if id "node_exporter" &>/dev/null; then + log_info "用户 node_exporter 已存在" + else + useradd --no-create-home --shell /bin/false node_exporter + log_success "用户 node_exporter 创建完成" + fi +} + +# 安装配置文件 +install_config() { + log_info "安装配置文件..." + + local config_dir="/etc/node_exporter" + + # 创建配置目录 + mkdir -p "$config_dir" + + # 创建文本文件收集器目录 + mkdir -p "/var/lib/node_exporter/textfile_collector" + chown node_exporter:node_exporter "/var/lib/node_exporter/textfile_collector" +} + +# 启动 Node Exporter 服务 +start_node_exporter() { + log_info "启动 Node Exporter 服务..." + + local binary_path="/usr/local/bin/node-exporter" + local log_file="/var/log/node-exporter.log" + local pid_file="/var/run/node-exporter.pid" + + # 检查服务是否已经在运行 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "Node Exporter 服务已在运行 (PID: $pid)" + return 0 + else + log_warning "发现过期的 PID 文件,正在清理..." + rm -f "$pid_file" + fi + fi + + # 检查端口是否被占用 + if netstat -tuln 2>/dev/null | grep -q ":9100 "; then + log_warning "端口 9100 已被占用,请检查是否有其他服务在运行" + return 1 + fi + + # 启动服务 + log_info "正在启动 Node Exporter..." + nohup "$binary_path" --web.listen-address=:9100 > "$log_file" 2>&1 & + local pid=$! 
+ + # 保存 PID + echo "$pid" > "$pid_file" + + # 等待服务启动 + sleep 2 + + # 检查服务是否成功启动 + if kill -0 "$pid" 2>/dev/null; then + log_success "Node Exporter 服务启动成功 (PID: $pid)" + log_info "日志文件: $log_file" + log_info "PID 文件: $pid_file" + + # 更新安装记录 + update_install_record "$pid" "$INSTALL_DIR" + else + log_error "Node Exporter 服务启动失败" + rm -f "$pid_file" + return 1 + fi +} + + + +# 显示安装信息 +show_install_info() { + log_success "Node Exporter 安装完成!" + echo + echo "安装信息:" + echo " 二进制文件: /usr/local/bin/node-exporter" + echo " 运行用户: node_exporter" + echo " 配置目录: /etc/node_exporter/" + echo " 默认端口: 9100" + echo + echo "使用方法:" + echo " 手动启动: /usr/local/bin/node-exporter --web.listen-address=:9100" + echo " 后台启动: nohup /usr/local/bin/node-exporter --web.listen-address=:9100 &" + echo + echo "测试连接:" + echo " curl http://localhost:9100/metrics" + echo " curl http://localhost:9100" + echo + echo "Prometheus 配置示例:" + echo " - job_name: 'node_exporter'" + echo " static_configs:" + echo " - targets: ['localhost:9100']" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Node Exporter 安装脚本 v1.0" + echo "==========================================" + echo + + check_root + check_system + + log_info "开始安装 Node Exporter..." 
+ + install_node_exporter + create_user + install_config + start_node_exporter + + show_install_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi + diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh new file mode 100755 index 0000000..b38c733 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# 获取当前目录 +CURRENT_DIR=$(pwd) +PACKAGE_NAME="node-exporter-$(date +%Y%m%d-%H%M%S)" +PACKAGE_FILE="${PACKAGE_NAME}.tar.gz" + +log_info "开始打包 Node Exporter 安装包..." + +# 检查必要文件 +log_info "检查必要文件..." + +required_files=( + "install.sh" + "uninstall.sh" + "bin/node_exporter" + "check_health.sh" +) + +missing_files=() +for file in "${required_files[@]}"; do + if [[ ! -f "$file" ]]; then + missing_files+=("$file") + fi +done + +if [[ ${#missing_files[@]} -gt 0 ]]; then + echo "缺少以下文件:" + for file in "${missing_files[@]}"; do + echo " - $file" + done + exit 1 +fi + +log_success "所有必要文件检查完成" + +# 创建临时目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 复制文件到临时目录 +cp -r . "$TEMP_DIR/$PACKAGE_NAME" + +# 进入临时目录 +cd "$TEMP_DIR" + +# 创建压缩包 +log_info "创建压缩包: $PACKAGE_FILE" +tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME" + +# 移动压缩包到原目录 +mv "$PACKAGE_FILE" "$CURRENT_DIR/" + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 返回原目录 +cd "$CURRENT_DIR" + +# 显示结果 +log_success "打包完成!" +echo +echo "安装包文件: $PACKAGE_FILE" +echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)" +echo +echo "使用方法:" +echo "1. 将 $PACKAGE_FILE 传输到目标服务器" +echo "2. 解压: tar -xzf $PACKAGE_FILE" +echo "3. 进入目录: cd $PACKAGE_NAME" +echo "4. 
运行安装: sudo ./install.sh" +echo +echo "注意: 请确保所有必要文件都存在" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh new file mode 100755 index 0000000..14801c1 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# Node Exporter 卸载脚本 +# 版本: 1.0 +# 作者: AIOps Team +# 日期: $(date +%Y-%m-%d) + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 停止运行中的进程 +stop_processes() { + log_info "停止 Node Exporter 进程..." + + local pid_file="/var/run/node-exporter.pid" + local stopped=false + + # 首先尝试通过 PID 文件停止服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "通过 PID 文件停止服务 (PID: $pid)..." + kill "$pid" + sleep 3 + + # 检查进程是否已停止 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." + kill -9 "$pid" 2>/dev/null || true + fi + log_success "Node Exporter 进程已停止" + stopped=true + else + log_warning "PID 文件存在但进程已不存在,清理 PID 文件" + rm -f "$pid_file" + fi + fi + + # 查找并杀死所有 node_exporter 和 node-exporter 进程 + local pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + log_info "发现 node_exporter 或 node-exporter 进程,正在停止..." 
+ for pid in $pids; do + log_info "停止进程 PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有进程在运行,如果有则强制终止 + local remaining_pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "进程未响应,强制终止..." + for pid in $remaining_pids; do + log_info "强制终止进程 PID: $pid" + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 最终检查 + if pgrep -f "node_exporter\|node-exporter" > /dev/null; then + log_error "无法停止所有 node_exporter 进程" + else + log_success "所有 Node Exporter 进程已停止" + stopped=true + fi + else + log_info "Node Exporter 进程未运行" + fi + + # 清理 PID 文件 + rm -f "$pid_file" + + if [[ "$stopped" == "false" ]]; then + log_warning "未发现需要停止的 Node Exporter 进程" + fi +} + +# 删除二进制文件 +remove_binary() { + log_info "删除 Node Exporter 二进制文件..." + + local binary_files=( + "/usr/local/bin/node-exporter" + "/usr/local/bin/node_exporter" + ) + + local deleted=false + for binary_file in "${binary_files[@]}"; do + if [[ -f "$binary_file" ]]; then + rm -f "$binary_file" + log_success "二进制文件已删除: $binary_file" + deleted=true + fi + done + + if [[ "$deleted" == "false" ]]; then + log_info "二进制文件不存在" + fi +} + +# 删除配置文件 +remove_config() { + log_info "删除配置文件..." + + local config_dir="/etc/node_exporter" + + if [[ -d "$config_dir" ]]; then + rm -rf "$config_dir" + log_success "配置目录已删除" + else + log_info "配置目录不存在" + fi +} + +# 删除数据目录 +remove_data_dir() { + log_info "删除数据目录..." + + local data_dir="/var/lib/node_exporter" + + if [[ -d "$data_dir" ]]; then + rm -rf "$data_dir" + log_success "数据目录已删除" + else + log_info "数据目录不存在" + fi +} + +# 检查用户状态(可选) +check_user_status() { + log_info "检查 node_exporter 用户状态..." 
+ + if id "node_exporter" &>/dev/null; then + log_info "检测到 node_exporter 用户存在" + log_warning "node_exporter 是系统用户,可能被其他服务使用" + log_info "为了系统稳定性,将保留 node_exporter 用户" + log_info "如需手动删除,请运行: sudo userdel node_exporter" + else + log_info "node_exporter 用户不存在" + fi +} + +# 清理日志文件 +cleanup_logs() { + log_info "清理日志文件..." + + # 清理 journal 日志 + journalctl --vacuum-time=1s --quiet || true + + # 删除安装脚本创建的日志文件 + rm -f /var/log/node-exporter.log + + log_success "日志文件已清理" +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "Node Exporter 卸载完成!" + echo + echo "已删除的内容:" + echo " - 二进制文件: /usr/local/bin/node-exporter" + echo " - 配置目录: /etc/node_exporter" + echo " - 数据目录: /var/lib/node_exporter" + echo " - 相关日志文件" + echo + echo "注意:" + echo " - node_exporter 用户已保留(系统用户,可能被其他服务使用)" + echo " - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Node Exporter 卸载脚本 v1.0" + echo "==========================================" + echo + + check_root + + log_warning "此操作将完全卸载 Node Exporter" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + log_info "开始卸载 Node Exporter..." 
+ + stop_processes + remove_binary + remove_config + remove_data_dir + cleanup_logs + + # 检查用户状态 + check_user_status + + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/check_health.sh b/src/metric/client-plugins/all-in-one-full/scripts/check_health.sh new file mode 100755 index 0000000..6b3c866 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/check_health.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# 整体健康检查脚本,调用各个组件的健康检查并将结果写入 .health_log 文件 + +set -e + +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/check_health.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "健康检查脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log" +INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 - 输出到 stderr 避免影响 JSON 结果 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" >&2 +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# 检查单个组件健康状态 +check_component() { + local component_name="$1" + local check_script_path="$2" + + log_info "检查 $component_name 健康状态..." + + if [[ ! -f "$check_script_path" ]]; then + log_error "健康检查脚本不存在: $check_script_path" + echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本不存在: $check_script_path\"}" + return 1 + fi + + if [[ ! 
-x "$check_script_path" ]]; then + log_error "健康检查脚本无执行权限: $check_script_path" + echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本无执行权限: $check_script_path\"}" + return 1 + fi + + # 执行健康检查脚本,只捕获 stdout,stderr 输出到终端 + local result + if result=$("$check_script_path" 2>/dev/null); then + log_success "$component_name 健康检查通过" + echo "$result" + return 0 + else + log_warning "$component_name 健康检查失败" + echo "$result" + return 1 + fi +} + +# 生成时间戳 +get_timestamp() { + date '+%Y-%m-%d %H:%M:%S' +} + +# 生成UTC时间戳 +get_utc_timestamp() { + date -u '+%Y-%m-%dT%H:%M:%SZ' +} + +# 获取主机名 +get_hostname() { + echo "${HOSTNAME:-$(hostname)}" +} + +# 创建健康状态目录 +create_health_dir() { + local hostname=$(get_hostname) + local health_dir="/private/argus/agent/$hostname/health" + + if [[ ! -d "$health_dir" ]]; then + log_info "创建健康状态目录: $health_dir" + mkdir -p "$health_dir" + fi + + echo "$health_dir" +} + +# 写入单个模块的健康状态JSON文件 +write_component_health_json() { + local component_name="$1" + local status="$2" + local error_msg="$3" + local health_dir="$4" + + # 生成模块名前缀-xxx.json格式的文件名 + local module_prefix="metric" + local filename="${module_prefix}-${component_name}.json" + local filepath="$health_dir/$filename" + + # 生成UTC时间戳 + local timestamp=$(get_utc_timestamp) + + # 构建JSON内容 + local json_content=$(cat << EOF +{ + "status": "$status", + "error": "$error_msg", + "timestamp": "$timestamp" +} +EOF +) + + # 写入文件 + echo "$json_content" > "$filepath" + log_info "已写入模块健康状态文件: $filepath" +} + +# 从安装记录文件中读取组件安装目录 +read_install_record() { + local install_record_file="$1" + + if [[ ! 
-f "$install_record_file" ]]; then + log_error "安装记录文件不存在: $install_record_file" + return 1 + fi + + # 检查是否有 jq 命令来解析 JSON + if command -v jq &> /dev/null; then + # 使用 jq 解析 JSON + local components_json + if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then + echo "$components_json" + return 0 + else + log_error "无法解析安装记录文件 JSON 格式: $install_record_file" + return 1 + fi + else + # 如果没有 jq,尝试简单的文本解析 + log_warning "jq 命令不可用,尝试简单文本解析" + + # 查找所有 install_dir 行 + local components=() + while IFS= read -r line; do + if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then + local install_dir="${BASH_REMATCH[1]}" + # 从路径中提取组件名称 + local component_name=$(basename "$install_dir") + components+=("$component_name:$install_dir") + fi + done < "$install_record_file" + + if [[ ${#components[@]} -gt 0 ]]; then + printf '%s\n' "${components[@]}" + return 0 + else + log_error "无法从安装记录文件中提取组件信息" + return 1 + fi + fi +} + +# 主函数 +main() { + echo "==========================================" >&2 + echo " 整体健康检查脚本" >&2 + echo "==========================================" >&2 + echo >&2 + + # 记录健康检查开始时间 + local start_time=$(get_timestamp) + log_info "健康检查开始时间: $start_time" + + # 创建健康状态目录 + local health_dir + health_dir=$(create_health_dir) + + # 从安装记录文件中读取组件信息 + log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE" + local components_info + if ! 
components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then + log_error "无法读取安装记录文件,健康检查终止" + exit 1 + fi + + # 存储所有检查结果 + local all_results=() + local overall_status="health" + + # 逐个检查组件 + while IFS= read -r component_info; do + if [[ -n "$component_info" ]]; then + IFS=':' read -r component_name install_dir <<< "$component_info" + local check_script_path="$install_dir/check_health.sh" + + local result + local component_status="healthy" + local error_msg="" + + if result=$(check_component "$component_name" "$check_script_path"); then + all_results+=("$result") + else + all_results+=("$result") + overall_status="unhealth" + component_status="unhealthy" + # 从结果中提取错误信息 + if command -v jq &> /dev/null; then + error_msg=$(echo "$result" | jq -r '.reason // ""' 2>/dev/null || echo "") + else + # 简单的文本解析提取错误信息 + if [[ "$result" =~ \"reason\":[[:space:]]*\"([^\"]+)\" ]]; then + error_msg="${BASH_REMATCH[1]}" + fi + fi + fi + + # 写入单个模块的健康状态JSON文件 + write_component_health_json "$component_name" "$component_status" "$error_msg" "$health_dir" + fi + done <<< "$components_info" + + # 记录健康检查结束时间 + local end_time=$(get_timestamp) + log_info "健康检查结束时间: $end_time" + + # 构建完整的健康检查结果 JSON + local health_check_result=$(cat << EOF +{ + "start_time": "$start_time", + "end_time": "$end_time", + "overall_status": "$overall_status", + "components": [ +$(printf '%s,\n' "${all_results[@]}" | sed '$s/,$//') + ] +} +EOF +) + + # 写入健康日志文件 + log_info "将健康检查结果写入日志文件: $HEALTH_LOG_FILE" + echo "$health_check_result" >> "$HEALTH_LOG_FILE" + + # 输出 JSON 结果到 stdout + echo "$health_check_result" + + # 显示总结到 stderr + echo >&2 + echo "==========================================" >&2 + echo " 健康检查总结" >&2 + echo "==========================================" >&2 + echo "开始时间: $start_time" >&2 + echo "结束时间: $end_time" >&2 + echo "整体状态: $overall_status" >&2 + echo "日志文件: $HEALTH_LOG_FILE" >&2 + echo >&2 + + if [[ "$overall_status" == "health" ]]; then + log_success "所有组件健康检查通过!" 
+ exit 0 + else + log_error "部分组件健康检查失败,请查看上述详细信息" + exit 1 + fi +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-full/scripts/check_version.sh b/src/metric/client-plugins/all-in-one-full/scripts/check_version.sh new file mode 100755 index 0000000..fce49f3 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/check_version.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# 版本校验脚本 +# 比较本地 LATEST_VERSION 与 FTP 的 VERSION 版本,如果不一致则更新对应版本 + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 - 输出到 stderr 避免影响函数返回值 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" >&2 +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# 动态获取当前版本目录 +get_current_version_dir() { + # 查找 /opt/argus-metric/versions/ 下的最新版本目录 + local versions_dir="/opt/argus-metric/versions" + if [[ -d "$versions_dir" ]]; then + # 按版本号排序,获取最新的版本目录 + local latest_version_dir=$(ls -1 "$versions_dir" 2>/dev/null | sort -V | tail -1) + if [[ -n "$latest_version_dir" ]]; then + echo "$versions_dir/$latest_version_dir" + else + echo "/opt/argus-metric" + fi + else + echo "/opt/argus-metric" + fi +} + +# 获取当前版本目录 +CURRENT_VERSION_DIR=$(get_current_version_dir) +# LATEST_VERSION 文件在根目录 +LOCAL_VERSION_FILE="/opt/argus-metric/LATEST_VERSION" +REMOTE_VERSION_URL="" +LOG_FILE="$CURRENT_VERSION_DIR/.version_check.log" + +# 从环境变量或配置文件获取 FTP 服务器信息 +get_ftp_config() { + # 优先从环境变量获取配置 + log_info "获取 FTP 配置信息..." 
+ + # 如果环境变量中没有设置,则尝试从配置文件读取 + if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then + local config_file="$SCRIPT_DIR/../config/config.env" + if [[ -f "$config_file" ]]; then + log_info "从配置文件读取 FTP 配置: $config_file" + source "$config_file" + fi + else + log_info "使用环境变量中的 FTP 配置" + fi + + # 设置默认值(如果环境变量和配置文件都没有设置) + FTP_SERVER="${FTP_SERVER:-localhost}" + FTP_USER="${FTP_USER:-ftpuser}" + FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" + + # 构建远程版本文件 URL + REMOTE_VERSION_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/LATEST_VERSION" + + log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}" +} + +# 获取远程版本号 +get_remote_version() { + log_info "从 FTP 服务器获取远程版本号..." + log_info "远程地址: $REMOTE_VERSION_URL" + + # 先测试 FTP 连接 + log_info "测试 FTP 连接..." + if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then + log_success "FTP 服务器连接成功" + else + log_error "无法连接到 FTP 服务器: $FTP_SERVER" + return 1 + fi + + # 测试 LATEST_VERSION 文件是否存在 + log_info "检查远程 LATEST_VERSION 文件是否存在..." 
+ if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/LATEST_VERSION" >/dev/null 2>&1; then + log_success "远程 LATEST_VERSION 文件存在" + else + log_error "远程 LATEST_VERSION 文件不存在或无法访问" + return 1 + fi + + # 获取远程版本号 + local remote_version + if remote_version=$(curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfL "ftp://${FTP_SERVER}/LATEST_VERSION" 2>/dev/null | tr -d '[:space:]'); then + if [[ -n "$remote_version" ]]; then + log_success "获取到远程版本号: $remote_version" + echo "$remote_version" + else + log_error "远程版本号为空" + return 1 + fi + else + log_error "获取远程版本号失败" + return 1 + fi +} + +# 获取本地版本号 +get_local_version() { + if [[ -f "$LOCAL_VERSION_FILE" ]]; then + local local_version=$(cat "$LOCAL_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + if [[ -n "$local_version" ]]; then + log_info "本地版本号: $local_version" + echo "$local_version" + else + log_warning "本地版本文件为空" + echo "" + fi + else + log_warning "本地版本文件不存在: $LOCAL_VERSION_FILE" + echo "" + fi +} + +# 更新到新版本 +update_to_version() { + local new_version="$1" + local temp_dir="/tmp/argus-update-$$" + local setup_script="$temp_dir/setup.sh" + + log_info "开始更新到版本: $new_version" + + # 创建临时目录 + mkdir -p "$temp_dir" + + # 下载最新的 setup.sh + log_info "从 FTP 服务器下载最新的安装脚本..." + local setup_url="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/setup.sh" + + if curl -fsS "$setup_url" -o "$setup_script"; then + log_success "安装脚本下载完成" + else + log_error "下载安装脚本失败: $setup_url" + rm -rf "$temp_dir" + return 1 + fi + + # 添加执行权限 + chmod +x "$setup_script" + + # 执行安装脚本 + log_info "执行安装脚本进行版本更新..." 
+ if "$setup_script" --server "$FTP_SERVER" --user "$FTP_USER" --password "$FTP_PASSWORD" --version "$new_version"; then + log_success "版本更新完成: $new_version" + rm -rf "$temp_dir" + return 0 + else + log_error "版本更新失败: $new_version" + rm -rf "$temp_dir" + return 1 + fi +} + +# 记录检查日志 +log_check() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[$timestamp] $message" >> "$LOG_FILE" +} + +# 主函数 +main() { + log_info "开始版本校验检查..." + log_check "版本校验检查开始" + + # 确保系统目录存在 + mkdir -p "/opt/argus-metric" + mkdir -p "$CURRENT_VERSION_DIR" + + log_info "当前版本目录: $CURRENT_VERSION_DIR" + + # 获取 FTP 配置 + get_ftp_config + + # 获取本地版本号 + local local_version + local_version=$(get_local_version) + + # 获取远程版本号 + local remote_version + if ! remote_version=$(get_remote_version); then + log_error "无法获取远程版本号,跳过本次检查" + log_check "版本校验失败:无法获取远程版本号" + exit 1 + fi + + # 比较版本号 + if [[ "$local_version" == "$remote_version" ]]; then + log_info "版本一致,无需更新 (本地: $local_version, 远程: $remote_version)" + log_check "版本校验完成:版本一致 ($local_version)" + else + log_info "检测到版本不一致 (本地: $local_version, 远程: $remote_version)" + log_check "检测到版本不一致:本地($local_version) -> 远程($remote_version)" + + # 更新到新版本 + if update_to_version "$remote_version"; then + log_success "版本更新成功: $local_version -> $remote_version" + log_check "版本更新成功:$local_version -> $remote_version" + else + log_error "版本更新失败" + log_check "版本更新失败:$local_version -> $remote_version" + exit 1 + fi + fi + + log_success "版本校验检查完成" + log_check "版本校验检查完成" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh new file mode 100755 index 0000000..722f2e8 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh @@ -0,0 +1,991 @@ +#!/bin/bash + +set -e + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' 
+NC='\033[0m' + +log_info() { + local message="[INFO] $1" + echo -e "${BLUE}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +log_success() { + local message="[SUCCESS] $1" + echo -e "${GREEN}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +log_warning() { + local message="[WARNING] $1" + echo -e "${YELLOW}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +log_error() { + local message="[ERROR] $1" + echo -e "${RED}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +# 配置变量 +INSTALL_DIR="${1:-$(pwd)}" # 使用第一个参数作为安装目录,如果没有参数则使用当前目录 +TEMP_DIR="/tmp/metrics-install-$$" +VERSION_FILE="version.json" +LOG_FILE="${INSTALL_DIR}/.install.log" # 安装日志文件 + + +# 加载配置文件 +load_config() { + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local config_file="$script_dir/config.env" + + if [[ -f "$config_file" ]]; then + log_info "加载配置文件: $config_file" + # 导出配置文件中的环境变量 + set -a # 自动导出所有变量 + source "$config_file" + set +a # 关闭自动导出 + log_success "配置文件加载完成" + else + log_warning "配置文件不存在: $config_file,使用默认配置" + fi +} + +# 复制配置文件到安装目录 +copy_config_files() { + log_info "复制配置文件到安装目录..." + + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local source_config="$script_dir/../config/config.env" + local target_config="$INSTALL_DIR/config.env" + + if [[ -f "$source_config" ]]; then + # 检查源文件和目标文件是否是同一个文件 + if [[ "$source_config" == "$target_config" ]]; then + log_info "配置文件已在目标位置,跳过复制" + log_success "配置文件已存在: $target_config" + else + if cp "$source_config" "$target_config"; then + log_success "配置文件复制完成: $target_config" + else + log_error "配置文件复制失败" + return 1 + fi + fi + else + log_warning "源配置文件不存在: $source_config" + fi + + # 复制版本校验脚本 + log_info "复制版本校验脚本到安装目录..." 
+ local target_check_version="$INSTALL_DIR/check_version.sh" + + # 检查目标文件是否已存在(从 artifact 包中解压出来的) + if [[ -f "$target_check_version" ]]; then + log_info "版本校验脚本已存在,设置执行权限..." + chmod +x "$target_check_version" + log_success "版本校验脚本权限设置完成: $target_check_version" + else + log_warning "版本校验脚本不存在: $target_check_version" + log_info "请确保 check_version.sh 已包含在 artifact 包中" + fi +} + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0 [安装目录]" + log_info "如果不指定安装目录,将使用当前目录: $(pwd)" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查系统架构 + arch=$(uname -m) + log_info "系统架构: $arch" + + # 检查磁盘空间 + available_space=$(df / | awk 'NR==2 {print $4}') + if [[ $available_space -lt 10485760 ]]; then # 10GB in KB + log_warning "可用磁盘空间不足 10GB,当前可用: $(($available_space / 1024 / 1024))GB" + fi + + # 检查内存 + total_mem=$(free -m | awk 'NR==2{print $2}') + if [[ $total_mem -lt 4096 ]]; then # 4GB + log_warning "系统内存不足 4GB,当前: ${total_mem}MB" + fi +} + +# 查找版本文件 +find_version_file() { + log_info "查找版本信息文件..." + + # 在当前目录查找 + if [[ -f "$VERSION_FILE" ]]; then + VERSION_FILE_PATH="$(pwd)/$VERSION_FILE" + log_success "找到版本文件: $VERSION_FILE" + return 0 + fi + + # 在 artifact 目录查找 + for version_dir in artifact/*/; do + if [[ -f "${version_dir}${VERSION_FILE}" ]]; then + VERSION_FILE_PATH="$(cd "$(dirname "${version_dir}${VERSION_FILE}")" && pwd)/$(basename "${version_dir}${VERSION_FILE}")" + log_success "找到版本文件: $VERSION_FILE_PATH" + return 0 + fi + done + + log_error "未找到版本信息文件 $VERSION_FILE" + exit 1 +} + +# 解析版本信息 +parse_version_info() { + log_info "解析版本信息..." + + if [[ ! -f "$VERSION_FILE_PATH" ]]; then + log_error "版本文件不存在: $VERSION_FILE_PATH" + exit 1 + fi + + # 使用 jq 解析 JSON(如果可用) + if command -v jq &> /dev/null; then + # 验证JSON文件格式 + if ! 
jq empty "$VERSION_FILE_PATH" 2>/dev/null; then + log_error "JSON文件格式错误,请检查 $VERSION_FILE_PATH" + exit 1 + fi + + VERSION=$(jq -r '.version' "$VERSION_FILE_PATH") + BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH") + + # 解析 artifact_list + if jq -e '.artifact_list' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.artifact_list | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/components.txt" + else + log_error "version.json 中缺少 artifact_list 字段" + exit 1 + fi + + # 解析 checksums + if jq -e '.checksums' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.checksums | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/checksums.txt" + else + log_error "version.json 中缺少 checksums 字段" + exit 1 + fi + + # 解析 install_order(现在包含完整的文件名) + if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt" + else + log_error "version.json 中缺少 install_order 字段" + exit 1 + fi + + else + log_warning "jq 未安装,使用简单的 JSON 解析" + # 简单的 JSON 解析 + VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') + BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') + + # 解析 artifact_list(跳过字段名本身) + grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -v '"artifact_list"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') + version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') + echo "$component:$version" >> "$TEMP_DIR/components.txt" + done + + # 解析 checksums(跳过字段名本身) + grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -v '"checksums"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') + checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') + echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt" + 
done + + # 解析 install_order(跳过字段名本身,只取数组元素) + grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -v '"install_order"' | grep -E '^\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') + echo "$component" >> "$TEMP_DIR/install_order.txt" + done + + # 验证解析结果 + if [[ ! -f "$TEMP_DIR/components.txt" || ! -s "$TEMP_DIR/components.txt" ]]; then + log_error "无法解析 artifact_list,请检查 version.json 格式" + exit 1 + fi + + if [[ ! -f "$TEMP_DIR/checksums.txt" || ! -s "$TEMP_DIR/checksums.txt" ]]; then + log_error "无法解析 checksums,请检查 version.json 格式" + exit 1 + fi + + if [[ ! -f "$TEMP_DIR/install_order.txt" || ! -s "$TEMP_DIR/install_order.txt" ]]; then + log_error "无法解析 install_order,请检查 version.json 格式" + exit 1 + fi + fi + + log_success "版本信息解析完成" + log_info " 版本: $VERSION" + log_info " 构建时间: $BUILD_TIME" + + component_count=0 + if [[ -f "$TEMP_DIR/components.txt" ]]; then + component_count=$(wc -l < "$TEMP_DIR/components.txt") + log_info " 组件数量: $component_count" + log_info " 组件列表:" + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + log_info " - $component v$version" + done < "$TEMP_DIR/components.txt" + else + log_error "components.txt 文件不存在" + exit 1 + fi +} + +# 验证文件完整性 +verify_checksums() { + log_info "验证文件完整性..." 
+ + artifact_dir=$(dirname "$VERSION_FILE_PATH") + log_info "Artifact 目录: $artifact_dir" + failed_verification=0 + + if [[ -f "$TEMP_DIR/checksums.txt" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + expected_checksum=$(echo "$line" | cut -d':' -f2-) + + # 查找匹配的 tar 文件 + actual_file="" + for file in "$artifact_dir/${component}-"*.tar.gz; do + if [[ -f "$file" ]]; then + actual_file="$file" + break + fi + done + + if [[ -z "$actual_file" ]]; then + log_error "找不到组件文件: $component" + failed_verification=1 + continue + fi + + # 计算实际校验和 + actual_checksum="sha256:$(sha256sum "$actual_file" | cut -d' ' -f1)" + + if [[ "$actual_checksum" == "$expected_checksum" ]]; then + log_success " $component: 校验通过" + else + log_error " $component: 校验失败" + log_error " 期望: $expected_checksum" + log_error " 实际: $actual_checksum" + failed_verification=1 + fi + done < "$TEMP_DIR/checksums.txt" + fi + + if [[ $failed_verification -eq 1 ]]; then + log_error "文件完整性验证失败" + exit 1 + fi + + log_success "所有文件校验通过" +} + +# 创建安装目录 +create_install_dirs() { + log_info "创建安装目录..." + + mkdir -p "$INSTALL_DIR" + mkdir -p "$TEMP_DIR" + + log_success "安装目录创建完成: $INSTALL_DIR" +} + +# 获取系统版本 +get_system_version() { + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + return 1 + fi + + source /etc/os-release + + # 提取主版本号 + case "$VERSION_ID" in + "20.04") + echo "ubuntu20" + ;; + "22.04") + echo "ubuntu22" + ;; + *) + log_warning "未识别的Ubuntu版本: $VERSION_ID,尝试使用ubuntu22" + echo "ubuntu22" + ;; + esac +} + +# 安装系统依赖包 +install_system_deps() { + log_info "开始安装系统依赖包(离线模式)..." + + local artifact_dir + artifact_dir=$(dirname "$VERSION_FILE_PATH") + local deps_dir="$artifact_dir/deps" + local system_version + system_version=$(get_system_version) + local version_deps_dir="$deps_dir/$system_version" + + if [[ ! 
-d "$version_deps_dir" ]]; then + log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir,跳过安装" + return 0 + fi + + log_info "找到系统版本依赖目录: $version_deps_dir" + + local deps_temp_dir="/tmp/argus_deps" + mkdir -p "$deps_temp_dir" + rm -rf "$deps_temp_dir"/* + + local FAILED_DEPS=() + local CORE_DEPS=(jq cron curl) # 核心依赖列表 + + # 遍历每个 tar.gz + for tar_file in "$version_deps_dir"/*.tar.gz; do + [[ -f "$tar_file" ]] || continue + + local tar_basename + tar_basename=$(basename "$tar_file") + log_info "处理依赖包: $tar_basename" + + local extract_dir="$deps_temp_dir/${tar_basename%.tar.gz}" + mkdir -p "$extract_dir" + + if tar -xzf "$tar_file" -C "$extract_dir"; then + log_success " $tar_basename 解压完成" + else + log_error " $tar_basename 解压失败" + FAILED_DEPS+=("$tar_basename") + continue + fi + + # 递归查找所有 deb 文件,一次性安装 + mapfile -t deb_files < <(find "$extract_dir" -type f -name "*.deb") + if [[ ${#deb_files[@]} -eq 0 ]]; then + log_warning " 没有找到 deb 包,跳过" + continue + fi + + log_info " 安装 ${#deb_files[@]} 个 deb 包..." + if dpkg -i "${deb_files[@]}" &>/tmp/dpkg_install.log; then + log_success " 所有 deb 包安装成功" + else + dpkg --configure -a || true + if dpkg -l | grep -q '^ii'; then + log_success " dpkg --configure 修复后安装成功" + else + log_error " 部分 deb 包安装失败,请手动安装" + for deb in "${deb_files[@]}"; do + pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null || true) + FAILED_DEPS+=("${pkg_name:-$deb}") + done + fi + fi + done + + # 启动 cron 服务或其它必要服务 + start_cron_service + + # 检查核心依赖是否都已安装 + local missing_core=() + for dep in "${CORE_DEPS[@]}"; do + if ! 
dpkg -s "$dep" &>/dev/null; then + missing_core+=("$dep") + fi + done + + if [[ ${#missing_core[@]} -gt 0 ]]; then + log_error "核心依赖安装失败,请手动安装以下组件:" + for d in "${missing_core[@]}"; do + echo " - $d" + done + exit 1 + fi + + # 最终处理其他安装失败的包 + if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then + log_error "以下系统依赖安装失败,请手动安装后重试:" + for f in "${FAILED_DEPS[@]}"; do + echo " - $f" + done + exit 1 + fi + + log_success "系统依赖安装完成,全部就绪" +} + +# 启动 cron 服务 +start_cron_service() { + log_info "检查并启动 cron 服务..." + + # 检查 cron 是否已经在运行 + if pgrep -x "cron" > /dev/null; then + log_success "cron 服务已在运行" + return 0 + fi + + # 检查 /usr/sbin/cron 是否存在 + if [[ ! -f "/usr/sbin/cron" ]]; then + log_warning "cron 可执行文件不存在,跳过启动" + return 1 + fi + + # 启动 cron 服务 + log_info "启动 cron 服务..." + if /usr/sbin/cron start 2>/dev/null || /usr/sbin/cron 2>/dev/null; then + log_success "cron 服务启动成功" + + sleep 2 + + if pgrep -x "cron" > /dev/null; then + log_success "cron 服务运行正常" + else + log_warning "cron 服务可能未正常启动" + fi + else + log_error "cron 服务启动失败" + return 1 + fi +} + +# 安装组件 +install_components() { + log_info "开始安装组件..." + + artifact_dir=$(dirname "$VERSION_FILE_PATH") + log_info "Artifact 目录: $artifact_dir" + install_count=0 + total_count=0 + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + total_count=$(wc -l < "$TEMP_DIR/install_order.txt") + fi + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + while IFS= read -r filename; do + install_count=$((install_count + 1)) + + # 从文件名中提取组件名(去掉时间戳后缀) + component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//') + + log_info "[$install_count/$total_count] 安装 $component..." + log_info " 文件名: $filename" + + # 直接使用完整的文件名 + tar_file="$artifact_dir/$filename" + + if [[ ! 
-f "$tar_file" ]]; then + log_error "找不到组件文件: $filename" + log_info " 期望路径: $tar_file" + log_info " 当前目录: $(pwd)" + log_info " 目录内容:" + ls -la "$artifact_dir" | while read line; do + log_info " $line" + done + exit 1 + fi + + log_info " 找到文件: $tar_file" + + # 解压到临时目录 + component_temp_dir="$TEMP_DIR/$component" + mkdir -p "$component_temp_dir" + + if tar -xzf "$tar_file" -C "$component_temp_dir" 2>/dev/null; then + log_success " $component 解压完成" + else + log_error " $component 解压失败" + exit 1 + fi + + # 查找解压后的目录 + extracted_dir="" + for dir in "$component_temp_dir"/*; do + if [[ -d "$dir" ]]; then + extracted_dir="$dir" + break + fi + done + + if [[ -z "$extracted_dir" ]]; then + log_error " $component 解压后未找到目录" + exit 1 + fi + + # 执行安装脚本 + if [[ -f "$extracted_dir/install.sh" ]]; then + log_info " 执行 $component 安装脚本..." + if (cd "$extracted_dir" && ./install.sh "$INSTALL_DIR"); then + log_success " $component 安装完成" + else + log_error " $component 安装失败" + exit 1 + fi + else + log_error " $component 缺少 install.sh 文件" + exit 1 + fi + + # 将解压后的目录移动到安装目录,保留组件目录 + component_install_dir="$INSTALL_DIR/$component" + # 简化安装逻辑:直接删除旧目录,不进行备份 + if [[ -d "$component_install_dir" ]]; then + log_info " 组件目录已存在,删除旧版本: $component_install_dir" + rm -rf "$component_install_dir" + # log_info " 组件目录已存在,备份后更新: $component_install_dir" + # mv "$component_install_dir" "${component_install_dir}.backup.$(date +%Y%m%d_%H%M%S)" + fi + mv "$extracted_dir" "$component_install_dir" + log_success " 组件目录已保存: $component_install_dir" + + # 清理临时文件 + rm -rf "$component_temp_dir" + done < "$TEMP_DIR/install_order.txt" + fi + + log_success "所有组件安装完成" +} + +# 创建安装记录 +create_install_record() { + log_info "创建安装记录..." + + # 等待一段时间确保所有进程都已启动 + log_info "等待进程启动..." 
+ sleep 3 + + local install_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + local install_record_file="$INSTALL_DIR/.install_record" + + # 创建 JSON 格式的安装记录 + cat > "$install_record_file" << EOF +{ + "version": "$VERSION", + "build_time": "$BUILD_TIME", + "install_time": "$install_time", + "install_dir": "$INSTALL_DIR", + "install_pid": $$, + "components": { +EOF + + # 添加组件信息 + local first_component=true + if [[ -f "$TEMP_DIR/components.txt" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + + # 获取组件的进程信息 + local component_pid="" + + # 根据组件名查找进程,使用多种方法确保能找到PID + case "$component" in + "node-exporter") + # 尝试多种方式查找node_exporter进程 + component_pid=$(pgrep -f "node_exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "node-exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "node_exporter" | awk '{print $2}' | head -1) + fi + ;; + "dcgm-exporter") + # 查找dcgm-exporter进程 + component_pid=$(pgrep -f "dcgm-exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "dcgm_exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "dcgm-exporter" | awk '{print $2}' | head -1) + fi + ;; + "fluent-bit") + # 查找fluent-bit进程 + component_pid=$(pgrep -f "fluent-bit" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "fluent_bit" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "fluent-bit" | awk '{print $2}' | head -1) + fi + ;; + "argus-agent") + # 查找argus-agent进程 + component_pid=$(pgrep -f "argus-agent" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "argus-agent" | awk '{print $2}' | head -1) + fi + ;; + esac + + # 记录找到的PID信息 + if [[ -n "$component_pid" ]]; then + log_info " 找到 $component 进程 PID: $component_pid" + else + log_warning 
" 未找到 $component 进程" + fi + + # 添加逗号分隔符 + if [[ "$first_component" == "true" ]]; then + first_component=false + else + echo "," >> "$install_record_file" + fi + + # 添加组件信息 + cat >> "$install_record_file" << EOF + "$component": { + "version": "$version", + "pid": "$component_pid", + "install_dir": "$INSTALL_DIR/$component" + } +EOF + done < "$TEMP_DIR/components.txt" + fi + + # 结束 JSON + cat >> "$install_record_file" << EOF + } +} +EOF + + log_success "安装记录已创建: $install_record_file" +} + +# 检查cron任务是否已存在 +check_cron_task_exists() { + local task_pattern="$1" + local temp_cron="$2" + + if grep -q "$task_pattern" "$temp_cron"; then + return 0 # 任务已存在 + else + return 1 # 任务不存在 + fi +} + +# 设置健康检查定时任务 +setup_health_check_cron() { + log_info "设置健康检查定时任务..." + + # 直接使用当前安装目录,不依赖current软链接 + # INSTALL_DIR 是 /opt/argus-metric/versions/1.34.0 + local check_health_script="$INSTALL_DIR/check_health.sh" + + # 检查健康检查脚本是否存在 + if [[ ! -f "$check_health_script" ]]; then + log_error "健康检查脚本不存在: $check_health_script" + return 1 + fi + + # 确保脚本有执行权限 + chmod +x "$check_health_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + + # 获取当前用户的crontab(如果存在) + crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" + + # 检查并删除旧的健康检查任务 + if check_cron_task_exists "check_health.sh" "$temp_cron"; then + log_info "发现旧的健康检查定时任务,正在更新..." 
+ # 删除所有包含check_health.sh的行 + grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的健康检查定时任务已删除" + fi + + # 添加新的定时任务(每5分钟执行一次) + echo "# Argus-Metrics 健康检查定时任务" >> "$temp_cron" + echo "*/5 * * * * $check_health_script >> $INSTALL_DIR/.health_cron.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "健康检查定时任务设置成功" + log_info " 执行频率: 每5分钟" + log_info " 日志文件: $INSTALL_DIR/.health_cron.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "健康检查定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "健康检查通过crontab自动执行" +} + +# 设置 DNS 同步定时任务 +setup_dns_sync_cron() { + log_info "设置 DNS 同步定时任务..." + + # 使用当前版本目录中的 DNS 同步脚本 + local sync_dns_script="$INSTALL_DIR/sync_dns.sh" + + # 检查 DNS 同步脚本是否存在 + if [[ ! -f "$sync_dns_script" ]]; then + log_warning "DNS 同步脚本不存在: $sync_dns_script" + log_warning "跳过 DNS 同步定时任务设置" + return 0 + fi + + # 确保脚本有执行权限 + chmod +x "$sync_dns_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + + # 获取当前用户的crontab(如果存在) + crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" + + # 检查并删除旧的 DNS 同步任务 + if check_cron_task_exists "sync_dns.sh" "$temp_cron"; then + log_info "发现旧的 DNS 同步定时任务,正在更新..." 
+ # 删除所有包含sync_dns.sh的行 + grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的 DNS 同步定时任务已删除" + fi + + # 添加新的定时任务(每1分钟执行一次) + # 直接使用版本目录中的 DNS 同步脚本 + echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron" + echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "DNS 同步定时任务设置成功" + log_info " 执行频率: 每1分钟" + log_info " 日志文件: $INSTALL_DIR/.dns_sync.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "DNS 同步定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "DNS 同步通过crontab自动执行" +} + +# 设置版本校验定时任务 +setup_version_check_cron() { + log_info "设置版本校验定时任务..." + + # 使用当前版本目录中的版本校验脚本 + local check_version_script="$INSTALL_DIR/check_version.sh" + + # 检查脚本是否存在 + if [[ ! -f "$check_version_script" ]]; then + log_warning "版本校验脚本不存在: $check_version_script" + log_info "跳过版本校验定时任务设置" + return 0 + fi + + # 确保脚本可执行 + chmod +x "$check_version_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" + + # 检查是否已存在版本校验定时任务 + if check_cron_task_exists "check_version.sh" "$temp_cron"; then + log_info "发现旧的版本校验定时任务,正在更新..." 
+ # 删除所有包含check_version.sh的行 + grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的版本校验定时任务已删除" + fi + + # 添加新的定时任务(每30分钟执行一次) + echo "# Argus-Metrics 版本校验定时任务" >> "$temp_cron" + echo "*/1 * * * * $check_version_script >> $INSTALL_DIR/.version_check.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "版本校验定时任务设置成功" + log_info " 执行频率: 每1分钟" + log_info " 日志文件: $INSTALL_DIR/.version_check.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "版本校验定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "版本校验通过crontab自动执行" +} + +# 设置自动重启定时任务 +setup_restart_cron() { + log_info "设置自动重启定时任务..." + + # 使用当前版本目录中的重启脚本 + local restart_script="$INSTALL_DIR/restart_unhealthy.sh" + + # 检查脚本是否存在 + if [[ ! -f "$restart_script" ]]; then + log_warning "重启脚本不存在: $restart_script" + log_info "跳过自动重启定时任务设置" + return 0 + fi + + # 确保脚本可执行 + chmod +x "$restart_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" + + # 检查是否已存在自动重启定时任务 + if check_cron_task_exists "restart_unhealthy.sh" "$temp_cron"; then + log_info "发现旧的自动重启定时任务,正在更新..." 
+ # 删除所有包含restart_unhealthy.sh的行 + grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的自动重启定时任务已删除" + fi + + # 添加新的定时任务(每2分钟执行一次) + echo "# Argus-Metrics 自动重启定时任务" >> "$temp_cron" + echo "*/2 * * * * $restart_script >> $INSTALL_DIR/.restart.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "自动重启定时任务设置成功" + log_info " 执行频率: 每2分钟" + log_info " 日志文件: $INSTALL_DIR/.restart.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "自动重启定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "自动重启检查通过crontab自动执行" +} + +# 显示安装信息 +show_install_info() { + log_success "Argus-Metrics All-in-One 安装完成!" + echo + log_info "安装日志已保存到: $LOG_FILE" + log_info "如需查看详细日志,请执行: cat $LOG_FILE" + echo +} + +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +trap cleanup EXIT + +# 主函数 +main() { + echo "==========================================" + echo " Argus-Metrics All-in-One 安装脚本 v1.0" + echo "==========================================" + echo + + # 初始化日志文件 + mkdir -p "$INSTALL_DIR" + echo "==========================================" > "$LOG_FILE" + echo " Argus-Metrics All-in-One 安装日志" >> "$LOG_FILE" + echo " 开始时间: $(date '+%Y-%m-%d %H:%M:%S')" >> "$LOG_FILE" + echo "==========================================" >> "$LOG_FILE" + + # 加载配置文件 + load_config + + log_info "安装目录: $INSTALL_DIR" + log_info "日志文件: $LOG_FILE" + echo + + check_root + check_system + find_version_file + create_install_dirs + install_system_deps + parse_version_info + verify_checksums + install_components + copy_config_files + create_install_record + setup_health_check_cron + setup_dns_sync_cron + setup_version_check_cron + setup_restart_cron + + # 注释掉立即执行健康检查,避免与cron任务重复执行 + # log_info "立即执行一次健康检查..." 
+ # local check_health_script="$INSTALL_DIR/check_health.sh" + # if [[ -f "$check_health_script" ]]; then + # if "$check_health_script" >> "$INSTALL_DIR/.health_check.log" 2>&1; then + # log_success "健康检查执行完成" + # else + # log_warning "健康检查执行失败,请检查日志: $INSTALL_DIR/.health_check.log" + # fi + # else + # log_warning "健康检查脚本不存在: $check_health_script" + # fi + + show_install_info +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh new file mode 100755 index 0000000..2c4bb6b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh @@ -0,0 +1,474 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "AIOps All-in-One 打包脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --force 强制重新打包,即使版本已存在" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 正常打包,跳过已存在的版本" + echo " $0 --force # 强制重新打包" + echo +} + +# 解析命令行参数 +FORCE_PACKAGE=false +if [[ "$1" == "--force" ]]; then + FORCE_PACKAGE=true + log_info "强制重新打包模式" +elif [[ "$1" == "--help" || "$1" == "-h" ]]; then + show_help + exit 0 +fi + +# 获取当前目录和版本 +CURRENT_DIR=$(pwd) +VERSION=$(cat config/VERSION 2>/dev/null || echo "1.0.0") +ARTIFACT_DIR="artifact/$VERSION" + +log_info "开始打包 AIOps All-in-One 安装包 v$VERSION" + +# 检查必要文件 +log_info "检查必要文件..." +if [[ ! -f "config/VERSION" ]]; then + log_error "VERSION 文件不存在" + exit 1 +fi + +if [[ ! 
-f "config/checklist" ]]; then + log_error "checklist 文件不存在" + exit 1 +fi + +# 检查是否已存在该版本 +if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then + log_info "检查版本 $VERSION 是否已存在..." + + # 检查 version.json 是否存在 + if [[ -f "$ARTIFACT_DIR/version.json" ]]; then + log_info "找到已存在的版本信息文件" + + # 检查是否所有组件文件都存在 + missing_files=0 + existing_components=0 + + # 解析已存在的 version.json 来检查文件 + if command -v jq &> /dev/null; then + # 使用 jq 解析 + while IFS= read -r component; do + existing_components=$((existing_components + 1)) + # 查找对应的 tar 文件 + found_file=false + for file in "$ARTIFACT_DIR/${component}-"*.tar.gz; do + if [[ -f "$file" ]]; then + found_file=true + break + fi + done + if [[ "$found_file" == "false" ]]; then + missing_files=$((missing_files + 1)) + log_warning " 缺少文件: $component" + fi + done < <(jq -r '.artifact_list | keys[]' "$ARTIFACT_DIR/version.json" 2>/dev/null) + else + # 简单的文件检查 + for file in "$ARTIFACT_DIR"/*.tar.gz; do + if [[ -f "$file" ]]; then + existing_components=$((existing_components + 1)) + fi + done + fi + + # 如果所有文件都存在,则跳过打包 + if [[ $missing_files -eq 0 && $existing_components -gt 0 ]]; then + log_success "版本 $VERSION 已完整打包,跳过重复打包" + echo + echo "现有文件:" + ls -la "$ARTIFACT_DIR" + echo + echo "如需强制重新打包,请删除目录: rm -rf $ARTIFACT_DIR" + echo "或使用: ./package.sh --force" + exit 0 + else + log_warning "版本 $VERSION 存在但不完整,将重新打包" + log_info " 现有组件: $existing_components" + log_info " 缺少文件: $missing_files" + fi + else + log_warning "版本目录存在但缺少 version.json,将重新打包" + fi +fi + +# 创建 artifact 目录 +mkdir -p "$ARTIFACT_DIR" +log_info "创建输出目录: $ARTIFACT_DIR" + +# 创建临时文件存储数据 +TEMP_DIR=$(mktemp -d) +COMPONENTS_FILE="$TEMP_DIR/components.txt" +VERSIONS_FILE="$TEMP_DIR/versions.txt" +DEPENDENCIES_FILE="$TEMP_DIR/dependencies.txt" +INSTALL_ORDER_FILE="$TEMP_DIR/install_order.txt" +CHECKSUMS_FILE="$TEMP_DIR/checksums.txt" +ARTIFACT_LIST_FILE="$TEMP_DIR/artifact_list.txt" + +# 解析 checklist 文件 +log_info "解析组件清单..." 
# ---------------------------------------------------------------------------
# Parse config/checklist.
#
# Each non-blank, non-comment line has the form
#     <component> <directory> <version> [dependency] [install-order]
# and is fanned out into the bookkeeping files under $TEMP_DIR:
#     $COMPONENTS_FILE                   component
#     $VERSIONS_FILE                     component:version
#     $TEMP_DIR/component_paths.txt      component:directory
#     $DEPENDENCIES_FILE                 component:dependency   (only if given)
#     $INSTALL_ORDER_FILE                order:component
# ---------------------------------------------------------------------------
line_num=0          # index of the current entry (blank/comment lines not counted)
component_count=0   # number of valid entries accepted

# `|| [[ -n "$line" ]]` keeps the final entry when the checklist has no
# trailing newline; plain `read` returns non-zero there and the entry would
# otherwise be dropped silently.
while IFS= read -r line || [[ -n "$line" ]]; do
    # Tolerate checklists saved with CRLF line endings; a stray CR would
    # otherwise end up inside the last field of the line.
    line=${line%$'\r'}

    [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue

    line_num=$((line_num + 1))

    # Split the entry: name, directory, version, [dependency], [install order]
    read -r component component_path version dep_component order <<< "$line"

    if [[ -z "$component" || -z "$component_path" || -z "$version" ]]; then
        log_warning "跳过无效行 $line_num: $line"
        continue
    fi

    # Record the component in the bookkeeping files
    echo "$component" >> "$COMPONENTS_FILE"
    echo "$component:$version" >> "$VERSIONS_FILE"
    echo "$component:$component_path" >> "$TEMP_DIR/component_paths.txt"

    # A component listing itself as its dependency is ignored
    if [[ -n "$dep_component" && "$dep_component" != "$component" ]]; then
        echo "$component:$dep_component" >> "$DEPENDENCIES_FILE"
    fi

    if [[ -n "$order" && "$order" =~ ^[0-9]+$ ]]; then
        echo "$order:$component" >> "$INSTALL_ORDER_FILE"
    else
        # No explicit order given: fall back to the position in the checklist
        echo "$line_num:$component" >> "$INSTALL_ORDER_FILE"
    fi

    component_count=$((component_count + 1))
    log_info " - $component v$version"
done < config/checklist

if [[ $component_count -eq 0 ]]; then
    log_error "没有找到有效的组件"
    rm -rf "$TEMP_DIR"
    exit 1
fi

log_success "找到 $component_count 个组件"

# Verify that every component directory exists.
# The name/path pairs are read straight back from component_paths.txt, so the
# component name is never interpreted as a regular expression and no
# per-component grep is needed. `read` with IFS=: puts everything after the
# first colon into component_path.
log_info "检查组件目录..."
missing_components=()

while IFS=: read -r component component_path; do
    if [[ ! -d "$component_path" ]]; then
        missing_components+=("$component:$component_path")
    fi
done < "$TEMP_DIR/component_paths.txt"

if [[ ${#missing_components[@]} -gt 0 ]]; then
    log_error "以下组件目录不存在:"
    for missing_entry in "${missing_components[@]}"; do
        echo " - $missing_entry"
    done
    rm -rf "$TEMP_DIR"
    exit 1
fi

# Package each component
log_info "开始打包组件..."

# Abort packaging: log message $1, return to the start directory, remove the
# scratch directory and exit with status 1.
fail_packaging() {
    log_error "$1"
    cd "$CURRENT_DIR"
    rm -rf "$TEMP_DIR"
    exit 1
}

# Print the SHA-256 hex digest of file $1. Uses sha256sum when available and
# falls back to `shasum -a 256`; returns non-zero when neither tool exists.
file_sha256() {
    if command -v sha256sum > /dev/null 2>&1; then
        sha256sum "$1" | cut -d' ' -f1
    elif command -v shasum > /dev/null 2>&1; then
        shasum -a 256 "$1" | cut -d' ' -f1
    else
        return 1
    fi
}

while IFS= read -r component; do
    # Look up version and path by exact name match (not as a regex).
    version=$(awk -F: -v name="$component" \
        '$1 == name { print $2; exit }' "$VERSIONS_FILE")
    component_path=$(awk -F: -v name="$component" \
        '$1 == name { print substr($0, length(name) + 2); exit }' \
        "$TEMP_DIR/component_paths.txt")
    if [[ -z "$component_path" ]]; then
        log_error "未找到组件 $component 的路径配置"
        log_info "请检查 component_paths.txt 文件或添加路径配置"
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    log_info "打包 $component v$version..."
    log_info " 组件路径: $component_path"

    # Enter the component directory.
    cd "$component_path" || fail_packaging "无法进入组件目录: $component_path"

    # Every component must ship its own package.sh.
    [[ -f "package.sh" ]] || fail_packaging "$component 缺少 package.sh 文件"

    # stdin is detached on purpose: this loop reads the component list from
    # stdin, and a package.sh that reads stdin would otherwise swallow the
    # remaining components.
    ./package.sh < /dev/null || fail_packaging "$component 打packaging_placeholder"

    # Locate the tarball produced by package.sh.
    # NOTE(review): the first *.tar.gz found anywhere below the component
    # directory is taken; confirm components never bundle other tarballs.
    tar_file=$(find . -name "*.tar.gz" -type f | head -n 1)
    [[ -n "$tar_file" ]] || fail_packaging "$component 打包失败,未找到生成的 tar 包"

    # Move it into the artifact directory.
    mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/"
    tar_filename=$(basename "$tar_file")

    # Record checksum and version; refuse to continue with an empty checksum.
    checksum=$(file_sha256 "$CURRENT_DIR/$ARTIFACT_DIR/$tar_filename") || checksum=""
    [[ -n "$checksum" ]] || fail_packaging "$component 校验和计算失败 (需要 sha256sum 或 shasum)"
    echo "$component:sha256:$checksum" >> "$CHECKSUMS_FILE"
    echo "$component:$version" >> "$ARTIFACT_LIST_FILE"

    # Remember the full tar file name for the install-order list.
    echo "$tar_filename" >> "$TEMP_DIR/install_order_files.txt"

    log_success " $component 打包完成: $tar_filename"

    # Back to the main directory for the next component.
    cd "$CURRENT_DIR"
done < "$COMPONENTS_FILE"

# Generate version.json
log_info "生成版本信息文件..."
version_json="$ARTIFACT_DIR/version.json"

# Print comma-joined JSON object members built from the "key:value" lines of
# file $1 (prints nothing when the file does not exist).
#   $2 = plain -> "key":"<field 2>"
#   $2 = list  -> "key":["<field 2>"]
#   $2 = rest  -> "key":"<everything after the first colon>"
json_members() {
    local file=$1 mode=$2
    [[ -f "$file" ]] || return 0
    awk -F: -v mode="$mode" '
        {
            if (mode == "list")      value = "[\"" $2 "\"]"
            else if (mode == "rest") value = "\"" substr($0, length($1) + 2) "\""
            else                     value = "\"" $2 "\""
            printf "%s\"%s\":%s", (NR > 1 ? "," : ""), $1, value
        }' "$file"
}

# Print the packaged tar file names sorted by their declared install order.
#   $1 artifact list  ("component:version", one line per packaged component)
#   $2 tar file names (line N belongs to the component on line N of $1)
#   $3 install order  ("order:component")
# Falls back to the packaging order when $1 or $3 is missing.
ordered_tar_files() {
    local artifacts=$1 files=$2 order=$3
    if [[ ! -f "$artifacts" || ! -f "$order" ]]; then
        cat "$files"
        return
    fi
    # Pass 1 maps component -> tar file, pass 2 emits "order:tarfile";
    # the stable numeric sort keeps checklist order for equal keys.
    paste -d: <(cut -d: -f1 "$artifacts") "$files" |
        awk -F: '
            pass == 1     { tarball[$1] = substr($0, length($1) + 2); next }
            ($2 in tarball) { print $1 ":" tarball[$2] }
        ' pass=1 - pass=2 "$order" |
        sort -s -t: -k1,1n |
        cut -d: -f2-
}

# Dependency map: "component":["dependency"]
deps_json=$(json_members "$DEPENDENCIES_FILE" list)

# Install order array, honouring the optional order column of the checklist.
order_array=""
if [[ -f "$TEMP_DIR/install_order_files.txt" ]]; then
    while IFS= read -r filename; do
        order_array="${order_array:+$order_array,}\"$filename\""
    done < <(ordered_tar_files "$ARTIFACT_LIST_FILE" \
        "$TEMP_DIR/install_order_files.txt" "$INSTALL_ORDER_FILE")
fi

# Artifact list: "component":"version"
artifact_json=$(json_members "$ARTIFACT_LIST_FILE" plain)

# Checksums: "component":"sha256:<digest>"
checksums_json=$(json_members "$CHECKSUMS_FILE" rest)

# Write the complete version.json
cat > "$version_json" << EOF
{
  "version": "$VERSION",
  "build_time": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
  "artifact_list": {
    $artifact_json
  },
  "checksums": {
    $checksums_json
  },
  "dependencies": {
    $deps_json
  },
  "install_order": [
    $order_array
  ]
}
EOF

log_success "版本信息文件生成完成: $version_json"

# Copy the install script into the artifact directory.
log_info "复制安装脚本..."
# Copy scripts/$1 into the artifact directory as $2 and mark it executable.
# $3 is the label used in the success message. A missing source file only
# produces a warning; packaging continues.
copy_artifact_script() {
    local source_path="scripts/$1"
    local target_path="$ARTIFACT_DIR/$2"
    local label=$3

    if [[ -f "$source_path" ]]; then
        cp "$source_path" "$target_path"
        chmod +x "$target_path"
        log_success "${label}脚本复制完成: $target_path"
    else
        log_warning "$source_path 文件不存在"
    fi
}

# Install script (renamed to install.sh inside the artifact).
copy_artifact_script "install_artifact.sh" "install.sh" "安装"

# Uninstall script (renamed to uninstall.sh inside the artifact).
log_info "复制卸载脚本..."
copy_artifact_script "uninstall_artifact.sh" "uninstall.sh" "卸载"

# Health-check script.
log_info "复制健康检查脚本..."
copy_artifact_script "check_health.sh" "check_health.sh" "健康检查"

# DNS sync script.
log_info "复制 DNS 同步脚本..."
copy_artifact_script "sync_dns.sh" "sync_dns.sh" "DNS 同步"

# Version check script.
log_info "复制版本校验脚本..."
copy_artifact_script "check_version.sh" "check_version.sh" "版本校验"

# Automatic restart script.
log_info "复制自动重启脚本..."
copy_artifact_script "restart_unhealthy.sh" "restart_unhealthy.sh" "自动重启"

# Copy the configuration file into the artifact directory.
log_info "复制配置文件..."
+if [[ -f "config/config.env" ]]; then + cp "config/config.env" "$ARTIFACT_DIR/" + log_success "配置文件复制完成: $ARTIFACT_DIR/config.env" +else + log_warning "config 目录不存在,跳过配置文件复制" +fi + +# DNS 配置文件不需要复制到版本目录,直接从 FTP 服务器根目录获取 + +# 复制 deps 目录到 artifact 目录 +log_info "复制系统依赖包..." +if [[ -d "deps" ]]; then + cp -r "deps" "$ARTIFACT_DIR/" + log_success "系统依赖包复制完成: $ARTIFACT_DIR/deps" + + # 显示deps目录内容 + log_info " 依赖包列表:" + find "$ARTIFACT_DIR/deps" -name "*.tar.gz" -exec basename {} \; | while read dep_file; do + log_info " - $dep_file" + done +else + log_warning "deps 目录不存在,跳过依赖包复制" +fi + +# 显示打包结果 +log_success "打包完成!" +echo +echo "版本: $VERSION" +echo "输出目录: $ARTIFACT_DIR" +echo "包含组件:" +if [[ -f "$ARTIFACT_LIST_FILE" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + echo " - $component v$version" + done < "$ARTIFACT_LIST_FILE" +fi +echo +echo "文件列表:" +ls -la "$ARTIFACT_DIR" +echo + +# 清理临时文件 +rm -rf "$TEMP_DIR" diff --git a/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh new file mode 100755 index 0000000..b292a8d --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "Argus-Metric Artifact 发布脚本" + echo + echo "用法: $0 <版本号> [选项]" + echo + echo "参数:" + echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" + echo + echo "选项:" + echo " --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)" + echo " --owner 指定文件所有者 (默认: 2133:2015)" + echo " -h, --help 显示此帮助信息" + echo + echo 
"示例:" + echo " $0 1.20.0 # 使用默认配置发布" + echo " $0 1.20.0 --output-dir /tmp/publish # 指定输出目录" + echo " $0 1.20.0 --owner 1000:1000 # 指定文件所有者" + echo " $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者" + echo +} + +# 默认配置 +DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/" +DEFAULT_OWNER="2133:2015" + +# 解析参数 +VERSION="" +PUBLISH_DIR="$DEFAULT_PUBLISH_DIR" +OWNER="$DEFAULT_OWNER" + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + --output-dir) + PUBLISH_DIR="$2" + shift 2 + ;; + --owner) + OWNER="$2" + shift 2 + ;; + *) + if [[ -z "$VERSION" ]]; then + VERSION="$1" + shift + else + log_error "未知参数: $1" + show_help + exit 1 + fi + ;; + esac +done + +# 检查版本号是否提供 +if [[ -z "$VERSION" ]]; then + log_error "请提供版本号参数" + show_help + exit 1 +fi + +ARTIFACT_DIR="artifact/$VERSION" + +# 检查版本目录是否存在 +if [[ ! -d "$ARTIFACT_DIR" ]]; then + log_error "版本目录不存在: $ARTIFACT_DIR" + exit 1 +fi + +log_info "开始发布版本: $VERSION" +log_info "输出目录: $PUBLISH_DIR" +log_info "文件所有者: $OWNER" + +# 确保发布目录存在 +log_info "确保发布目录存在: $PUBLISH_DIR" +mkdir -p "$PUBLISH_DIR" + +# 解析并校验所有者(仅在需要时 chown) +IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER" +if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then + log_error "--owner 格式不正确,应为 uid:gid" + exit 1 +fi + +CURRENT_UID=$(id -u) +CURRENT_GID=$(id -g) +if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then + if [[ "$CURRENT_UID" -ne 0 ]]; then + log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}" + log_error "请以目标用户运行脚本或预先调整目录权限" + exit 1 + fi + NEED_CHOWN=true +else + NEED_CHOWN=false +fi + +# 创建临时目录用于打包 +TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" +mkdir -p "$TEMP_PACKAGE_DIR" + +# 复制所有 tar.gz 文件到临时目录 +log_info "准备 artifact 文件..." 
# Remove the staging directory on every exit path, including early failures.
trap 'rm -rf "$TEMP_PACKAGE_DIR"' EXIT

# Stage helper script $1 into the package: prefer the copy inside the artifact
# directory, fall back to scripts/ in the current directory. $2 is the label
# used in the log messages; a missing script only produces a warning.
stage_script() {
    local name=$1 label=$2
    if [[ -f "$ARTIFACT_DIR/$name" ]]; then
        log_info "复制${label}脚本..."
        cp "$ARTIFACT_DIR/$name" "$TEMP_PACKAGE_DIR/"
    elif [[ -f "scripts/$name" ]]; then
        log_info "复制${label}脚本 (从当前目录)..."
        cp "scripts/$name" "$TEMP_PACKAGE_DIR/"
    else
        log_warning "未找到 $name 文件"
    fi
}

# Give file $1 to $OWNER when the earlier owner check decided it is needed.
chown_if_needed() {
    if [[ "$NEED_CHOWN" == true ]]; then
        chown "$OWNER" "$1"
    fi
}

# Collect the component tarballs. Only the top level of the artifact directory
# is scanned: tarballs under deps/ are shipped with the deps directory below
# and must not be duplicated in the package root.
tar_files=()
for file in "$ARTIFACT_DIR"/*.tar.gz; do
    if [[ -f "$file" ]]; then
        tar_files+=("$file")
    fi
done

if [[ ${#tar_files[@]} -eq 0 ]]; then
    log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件"
    exit 1
fi

for file in "${tar_files[@]}"; do
    filename=$(basename "$file")
    log_info " 准备: $filename"
    cp "$file" "$TEMP_PACKAGE_DIR/"
done

# Version metadata.
if [[ -f "$ARTIFACT_DIR/version.json" ]]; then
    log_info "复制版本信息文件..."
    cp "$ARTIFACT_DIR/version.json" "$TEMP_PACKAGE_DIR/"
fi

# Helper scripts: health check, DNS sync, version check, restart.
stage_script "check_health.sh" "健康检查"
stage_script "sync_dns.sh" " DNS 同步"
stage_script "check_version.sh" "版本校验"
stage_script "restart_unhealthy.sh" "重启失败"

# Install script, renamed to install.sh.
if [[ -f "scripts/install_artifact.sh" ]]; then
    log_info "复制安装脚本..."
    cp "scripts/install_artifact.sh" "$TEMP_PACKAGE_DIR/install.sh"
fi

# Uninstall script, renamed to uninstall.sh.
if [[ -f "scripts/uninstall_artifact.sh" ]]; then
    log_info "复制卸载脚本..."
    cp "scripts/uninstall_artifact.sh" "$TEMP_PACKAGE_DIR/uninstall.sh"
fi

# Configuration file.
if [[ -f "$ARTIFACT_DIR/config.env" ]]; then
    log_info "复制配置文件..."
    cp "$ARTIFACT_DIR/config.env" "$TEMP_PACKAGE_DIR/"
    log_success "配置文件复制完成"
else
    log_warning "未找到 config.env 文件"
fi

# The DNS configuration is copied straight into the publish directory further
# down; it is intentionally not part of the tarball.

# System dependency packages.
if [[ -d "$ARTIFACT_DIR/deps" ]]; then
    log_info "复制系统依赖包..."
    cp -r "$ARTIFACT_DIR/deps" "$TEMP_PACKAGE_DIR/"
    log_success "系统依赖包复制完成"
fi

# Build the release tarball (dots in the version become underscores).
# `tar -C` is used instead of `cd` so a relative --output-dir still resolves
# against the directory the script was started from.
TAR_NAME="argus-metric_${VERSION//./_}.tar.gz"
log_info "创建发布包: $TAR_NAME"
tar -czf "$PUBLISH_DIR/$TAR_NAME" -C "$TEMP_PACKAGE_DIR" .

# Set ownership only when it actually has to change.
if [[ "$NEED_CHOWN" == true ]]; then
    log_info "设置文件所有者为: $OWNER"
fi
chown_if_needed "$PUBLISH_DIR/$TAR_NAME"

# The staging directory is no longer needed.
rm -rf "$TEMP_PACKAGE_DIR"

# Update the LATEST_VERSION marker.
log_info "更新 LATEST_VERSION 文件..."
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
chown_if_needed "$PUBLISH_DIR/LATEST_VERSION"

# Copy the DNS configuration from config/ into the publish directory root.
if [[ -f "config/dns.conf" ]]; then
    log_info "复制 DNS 配置文件到发布目录根目录..."
    cp "config/dns.conf" "$PUBLISH_DIR/"
    chown_if_needed "$PUBLISH_DIR/dns.conf"
    log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
else
    log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
fi

# Copy setup.sh into the publish directory.
if [[ -f "scripts/setup.sh" ]]; then
    log_info "复制 setup.sh 到发布目录..."
    cp "scripts/setup.sh" "$PUBLISH_DIR/"
    chown_if_needed "$PUBLISH_DIR/setup.sh"
fi

# Report the result.
log_success "版本 $VERSION 发布完成!"
+echo +echo "发布目录: $PUBLISH_DIR" +echo "发布包: $PUBLISH_DIR/$TAR_NAME" +echo "包大小: $(du -h "$PUBLISH_DIR/$TAR_NAME" | cut -f1)" +echo "最新版本: $(cat "$PUBLISH_DIR/LATEST_VERSION")" +echo +echo "发布目录中的文件:" +ls -la "$PUBLISH_DIR" | while read line; do + echo " $line" +done +echo +echo "使用方法:" +echo " 1. 确保 /srv/ftp/share 目录可通过 FTP 访问" +echo " 2. 用户首先下载安装脚本:" +echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh" +echo " 3. 然后执行安装 (自动获取最新版本):" +echo " sudo sh setup.sh" +echo " 4. 或者指定版本安装:" +echo " sudo sh setup.sh --version $VERSION" +echo " 5. 或者指定不同的FTP服务器:" +echo " sudo sh setup.sh --server 192.168.1.100 --user myuser --password mypass" diff --git a/src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh b/src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh new file mode 100755 index 0000000..cd2065b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh @@ -0,0 +1,337 @@ +#!/bin/bash + +# 此脚本会检查各组件的健康状态,并重启不健康的组件 + +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/restart_unhealthy.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "自动重启脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +# 加载配置文件 +load_config() { + local config_file="$SCRIPT_DIR/config.env" + + if [[ -f "$config_file" ]]; then + log_info "加载配置文件: $config_file" + set -a + source 
"$config_file" + set +a + log_success "配置文件加载完成" + else + log_warning "配置文件不存在: $config_file,使用默认配置" + fi +} + +# 检查单个组件健康状态 +check_component_health() { + local component_name="$1" + local check_script_path="$2" + + if [[ ! -f "$check_script_path" ]]; then + log_error "$component_name: 健康检查脚本不存在: $check_script_path" + return 1 + fi + + if [[ ! -x "$check_script_path" ]]; then + chmod +x "$check_script_path" 2>/dev/null || true + fi + + # 执行健康检查,捕获退出码 + if "$check_script_path" > /dev/null 2>&1; then + return 0 + else + return 1 + fi +} + +# 重启单个组件 +restart_component() { + local component_name="$1" + local install_dir="$2" + + log_warning "正在重启组件: $component_name" + + # 先执行卸载脚本 + local uninstall_script="$install_dir/uninstall.sh" + if [[ -f "$uninstall_script" ]]; then + log_info "$component_name: 执行卸载脚本..." + chmod +x "$uninstall_script" 2>/dev/null || true + # 使用 yes 命令自动回答所有确认提示 + yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script") || true + log_info "$component_name: 卸载完成" + fi + + # 执行安装脚本 + local install_script="$install_dir/install.sh" + if [[ ! -f "$install_script" ]]; then + log_error "$component_name: 安装脚本不存在: $install_script" + return 1 + fi + + chmod +x "$install_script" 2>/dev/null || true + log_info "$component_name: 执行安装脚本..." 
+ + # 使用 yes 命令自动回答所有确认提示,传递 SCRIPT_DIR 作为参数 + yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR") || true + + log_info "$component_name: 安装脚本执行完成" + return 0 +} + +# 查找组件进程 PID +find_component_pid() { + local component_name="$1" + local component_pid="" + + case "$component_name" in + "node-exporter") + component_pid=$(pgrep -f "node_exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "node-exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "node_exporter" | awk '{print $2}' | head -1) + fi + ;; + "dcgm-exporter") + component_pid=$(pgrep -f "dcgm-exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "dcgm_exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "dcgm-exporter" | awk '{print $2}' | head -1) + fi + ;; + "fluent-bit") + component_pid=$(pgrep -f "fluent-bit" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "fluent_bit" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "fluent-bit" | awk '{print $2}' | head -1) + fi + ;; + "argus-agent") + component_pid=$(pgrep -f "argus-agent" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "argus-agent" | awk '{print $2}' | head -1) + fi + ;; + esac + + echo "$component_pid" +} + +# 更新安装记录文件中的 PID +update_install_record_pid() { + local component_name="$1" + local new_pid="$2" + + if [[ ! 
-f "$INSTALL_RECORD_FILE" ]]; then + log_error "安装记录文件不存在: $INSTALL_RECORD_FILE" + return 1 + fi + + # 读取当前 PID + local current_pid="" + if command -v jq &> /dev/null; then + current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null) + fi + + if [[ -z "$current_pid" ]]; then + log_warning "$component_name: 无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 sed 精确替换 PID,保持原有格式不变 + # 只替换指定组件块中的 pid 字段 + local temp_file="${INSTALL_RECORD_FILE}.tmp" + local in_component=0 + local updated=0 + + while IFS= read -r line; do + if [[ "$line" =~ \"$component_name\":[[:space:]]*\{ ]]; then + in_component=1 + echo "$line" + elif [[ $in_component -eq 1 && "$line" =~ \"pid\":[[:space:]]*\"$current_pid\" ]]; then + echo "$line" | sed "s/\"pid\": \"$current_pid\"/\"pid\": \"$new_pid\"/" + updated=1 + in_component=0 + else + echo "$line" + if [[ "$line" =~ ^[[:space:]]*\}[[:space:]]*$ ]]; then + in_component=0 + fi + fi + done < "$INSTALL_RECORD_FILE" > "$temp_file" + + # 验证替换是否成功 + if [[ $updated -eq 1 ]]; then + mv "$temp_file" "$INSTALL_RECORD_FILE" + log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)" + return 0 + else + log_error "$component_name: PID 替换失败" + rm -f "$temp_file" + return 1 + fi +} + +# 从安装记录文件中读取组件信息 +read_install_record() { + local install_record_file="$1" + + if [[ ! 
-f "$install_record_file" ]]; then + log_error "安装记录文件不存在: $install_record_file" + return 1 + fi + + # 检查是否有 jq 命令来解析 JSON + if command -v jq &> /dev/null; then + # 使用 jq 解析 JSON + local components_json + if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then + echo "$components_json" + return 0 + else + log_error "无法解析安装记录文件 JSON 格式: $install_record_file" + return 1 + fi + else + # 如果没有 jq,尝试简单的文本解析 + log_warning "jq 命令不可用,尝试简单文本解析" + + # 查找所有 install_dir 行 + local components=() + while IFS= read -r line; do + if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then + local install_dir="${BASH_REMATCH[1]}" + # 从路径中提取组件名称 + local component_name=$(basename "$install_dir") + components+=("$component_name:$install_dir") + fi + done < "$install_record_file" + + if [[ ${#components[@]} -gt 0 ]]; then + printf '%s\n' "${components[@]}" + return 0 + else + log_error "无法从安装记录文件中提取组件信息" + return 1 + fi + fi +} + +# 主函数 +main() { + log_info "==========================================" + log_info " 组件自动重启检查" + log_info "==========================================" + + # 检查是否是root用户 + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + exit 1 + fi + + # 加载配置文件 + load_config + + # 从安装记录文件中读取组件信息 + log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE" + local components_info + if ! 
components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then + log_error "无法读取安装记录文件,自动重启检查终止" + exit 1 + fi + + local restart_count=0 + local check_count=0 + + # 逐个检查组件 + while IFS= read -r component_info; do + if [[ -n "$component_info" ]]; then + IFS=':' read -r component_name install_dir <<< "$component_info" + check_count=$((check_count + 1)) + + local check_script_path="$install_dir/check_health.sh" + + log_info "检查组件: $component_name" + + # 检查健康状态 + if check_component_health "$component_name" "$check_script_path"; then + log_success "$component_name: 运行正常" + else + log_warning "$component_name: 健康检查失败,尝试重启" + restart_count=$((restart_count + 1)) + + # 执行重启 + restart_component "$component_name" "$install_dir" + + # 等待服务启动 + log_info "$component_name: 等待进程启动..." + sleep 10 + + # 查找新的进程 PID + local new_pid=$(find_component_pid "$component_name") + if [[ -n "$new_pid" ]]; then + log_info "$component_name: 找到新进程 PID: $new_pid" + update_install_record_pid "$component_name" "$new_pid" + else + log_warning "$component_name: 未找到新进程 PID" + fi + + # 再次检查健康状态 + if check_component_health "$component_name" "$check_script_path"; then + log_success "$component_name: 重启成功" + else + log_warning "$component_name: 重启后仍不健康,可能需要手动检查" + fi + fi + fi + done <<< "$components_info" + + log_info "==========================================" + log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count 个" + log_info "==========================================" + + exit 0 +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi + diff --git a/src/metric/client-plugins/all-in-one-full/scripts/setup.sh b/src/metric/client-plugins/all-in-one-full/scripts/setup.sh new file mode 100755 index 0000000..0c36bce --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/setup.sh @@ -0,0 +1,931 @@ +#!/bin/bash + +set -e + +# 加载配置文件(仅在解压后的目录中可用) +load_config() { + # setup.sh 脚本不需要配置文件,FTP参数通过命令行参数或环境变量提供 + log_info "setup.sh 脚本使用命令行参数或环境变量获取FTP配置" +} + +# 颜色定义 
+RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +FTP_SERVER="${FTP_SERVER}" +FTP_USER="${FTP_USER}" +FTP_PASS="${FTP_PASS}" +FTP_PORT="${FTP_PORT:-21}" +BASE_URL="" # FTP基础URL (将在check_ftp_params中设置) +LATEST_VERSION_URL="" # 版本文件URL (将在check_ftp_params中设置) +TEMP_DIR="/tmp/argus-metric-install-$$" + +# 安装目录配置 +DEFAULT_INSTALL_DIR="/opt/argus-metric" # 默认安装目录 +INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" # 可通过环境变量覆盖 +VERSIONS_DIR="$INSTALL_DIR/versions" # 版本目录 +BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录 +CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接 +LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件 + +# 检查必需的FTP参数 +check_ftp_params() { + local missing_params=() + + if [[ -z "$FTP_SERVER" ]]; then + missing_params+=("FTP_SERVER") + fi + + if [[ -z "$FTP_USER" ]]; then + missing_params+=("FTP_USER") + fi + + if [[ -z "$FTP_PASS" ]]; then + missing_params+=("FTP_PASS") + fi + + if [[ ${#missing_params[@]} -gt 0 ]]; then + log_error "缺少必需的FTP参数: ${missing_params[*]}" + log_error "请通过以下方式之一设置FTP参数:" + log_error " 1. 命令行参数: --server <地址> --user <用户名> --password <密码>" + log_error " 2. 环境变量: FTP_SERVER=<地址> FTP_USER=<用户名> FTP_PASS=<密码>" + log_error "" + log_error "示例:" + log_error " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + log_error " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + exit 1 + fi + + # 设置BASE_URL和LATEST_VERSION_URL + BASE_URL="ftp://${FTP_SERVER}:${FTP_PORT}" + LATEST_VERSION_URL="$BASE_URL/LATEST_VERSION" + + log_info "FTP配置:" + log_info " 服务器: $FTP_SERVER:$FTP_PORT" + log_info " 用户: $FTP_USER" +} + +# 获取最新版本号的函数 +get_latest_version() { + log_info "获取最新版本信息..." 
>&2 + log_info "尝试从URL获取: $LATEST_VERSION_URL" >&2 + + # 先测试FTP连接 + log_info "测试FTP连接..." >&2 + if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfI "$LATEST_VERSION_URL" >/dev/null 2>&1; then + log_error "无法连接到FTP服务器或文件不存在" >&2 + log_error "URL: $LATEST_VERSION_URL" >&2 + log_error "请检查:" >&2 + log_error " 1. FTP服务器是否运行: $FTP_SERVER:$FTP_PORT" >&2 + log_error " 2. 用户名密码是否正确: $FTP_USER" >&2 + log_error " 3. LATEST_VERSION文件是否存在" >&2 + log_error "手动测试命令: curl -u ${FTP_USER}:${FTP_PASS} ftp://${FTP_SERVER}/LATEST_VERSION" >&2 + exit 1 + fi + + # 获取文件内容 + if ! LATEST_VERSION=$(curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$LATEST_VERSION_URL" 2>/dev/null | tr -d '[:space:]'); then + log_error "下载LATEST_VERSION文件失败" >&2 + exit 1 + fi + + log_info "原始获取内容: '$LATEST_VERSION'" >&2 + + if [[ -z "$LATEST_VERSION" ]]; then + log_error "获取到的版本信息为空" >&2 + log_error "可能的原因:" >&2 + log_error " 1. LATEST_VERSION文件为空" >&2 + log_error " 2. 文件内容格式不正确" >&2 + log_error " 3. 网络传输问题" >&2 + log_error "请检查FTP服务器上的 /srv/ftp/share/LATEST_VERSION 文件" >&2 + exit 1 + fi + + log_info "检测到最新版本: $LATEST_VERSION" >&2 + echo "$LATEST_VERSION" +} + +# 解析参数 +ARGUS_VERSION="" # 使用不同的变量名避免与系统VERSION冲突 +ACTION="install" +FORCE_INSTALL=false + +while [[ $# -gt 0 ]]; do + case $1 in + --version) + ARGUS_VERSION="$2" + shift 2 + ;; + --server) + FTP_SERVER="$2" + shift 2 + ;; + --user) + FTP_USER="$2" + shift 2 + ;; + --password) + FTP_PASS="$2" + shift 2 + ;; + --port) + FTP_PORT="$2" + shift 2 + ;; + --uninstall) + ACTION="uninstall" + shift + ;; + --install-dir) + INSTALL_DIR="$2" + shift 2 + ;; + # 简化安装逻辑:不再支持回滚和备份列表功能 + # --rollback) + # ACTION="rollback" + # shift + # ;; + # --backup-list) + # ACTION="backup-list" + # shift + # ;; + --status) + ACTION="status" + shift + ;; + --force) + FORCE_INSTALL=true + shift + ;; + --help) + echo "Argus Metric FTP在线安装脚本" + echo + echo "用法: curl -u <用户名>:<密码> ftp://<服务器>/setup.sh -o setup.sh && sh setup.sh [选项]" + echo + echo "必需参数 (必须通过命令行参数或环境变量设置):" + echo " --server SERVER 
FTP服务器地址 (必须)" + echo " --user USER FTP用户名 (必须)" + echo " --password PASS FTP密码 (必须)" + echo + echo "可选参数:" + echo " --version VERSION 指定版本 (默认: 自动获取最新版本)" + echo " --port PORT FTP端口 (默认: 21)" + echo " --install-dir DIR 安装目录 (默认: /opt/argus-metric)" + echo " --force 强制重新安装 (即使相同版本)" + echo " --uninstall 卸载 (自动确认)" + # echo " --rollback 回滚到上一个备份版本" + # echo " --backup-list 列出所有备份版本" + echo " --status 显示当前安装状态" + echo " --help 显示帮助" + echo + echo "环境变量:" + echo " FTP_SERVER FTP服务器地址 (必须)" + echo " FTP_USER FTP用户名 (必须)" + echo " FTP_PASS FTP密码 (必须)" + echo " FTP_PORT FTP端口 (默认: 21)" + echo + echo "示例:" + echo " # 方式1: 使用命令行参数" + echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + echo " " + echo " # 方式2: 使用环境变量" + echo " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + echo " " + echo " # 指定版本安装" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --version 1.30.0" + echo " " + echo " # 强制重新安装" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --force" + echo " " + echo " # 卸载" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --uninstall" + exit 0 + ;; + *) + log_error "未知参数: $1" + echo "使用 --help 查看帮助信息" + exit 1 + ;; + esac +done + +# 清理函数 +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +trap cleanup EXIT + +# 创建安装目录结构 +create_install_directories() { + log_info "创建安装目录结构..." 
+ + # 创建主要目录 + mkdir -p "$VERSIONS_DIR" + mkdir -p "$BACKUPS_DIR" + + log_success "安装目录结构创建完成: $INSTALL_DIR" +} + +# 获取当前安装的版本 +get_current_version() { + # 优先从LATEST_VERSION文件读取 + if [[ -f "$LATEST_VERSION_FILE" ]]; then + local version_from_file=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + if [[ -n "$version_from_file" ]]; then + # 确保版本号格式一致(不带v前缀) + echo "$version_from_file" + return 0 + fi + fi + + # 如果文件不存在或为空,从软链接读取 + if [[ -L "$CURRENT_LINK" ]]; then + local current_path=$(readlink "$CURRENT_LINK") + # 从版本目录名中提取版本号(现在不带v前缀) + basename "$current_path" + else + echo "" + fi +} + +# 检查是否已安装 +check_installed() { + if [[ -L "$CURRENT_LINK" ]] && [[ -d "$CURRENT_LINK" ]]; then + local current_version=$(get_current_version) + if [[ -n "$current_version" ]]; then + log_info "检测到已安装版本: v$current_version" + return 0 + fi + fi + return 1 +} + +# 更新LATEST_VERSION文件 +update_latest_version_file() { + local version="$1" + log_info "更新LATEST_VERSION文件: $version" + + if echo "$version" > "$LATEST_VERSION_FILE"; then + log_success "LATEST_VERSION文件已更新" + else + log_error "更新LATEST_VERSION文件失败" + return 1 + fi +} + +# 初始化 DNS 配置文件到系统目录 +init_dns_config_to_system() { + log_info "初始化 DNS 配置文件到系统目录..." + + # 系统 DNS 配置文件 + local system_dns_conf="$INSTALL_DIR/dns.conf" + + # 如果系统目录中还没有 dns.conf,创建一个空的占位文件 + if [[ ! 
-f "$system_dns_conf" ]]; then + touch "$system_dns_conf" + chmod 644 "$system_dns_conf" + log_success "DNS 配置文件占位文件已创建: $system_dns_conf" + log_info "DNS 同步脚本将从 FTP 服务器下载实际的 DNS 配置" + else + log_info "DNS 配置文件已存在: $system_dns_conf" + fi +} + +# 备份当前版本 +backup_current_version() { + local current_version=$(get_current_version) + if [[ -z "$current_version" ]]; then + log_info "没有当前版本需要备份" + return 0 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_name="$current_version" + local backup_path="$BACKUPS_DIR/$backup_name" + + log_info "备份当前版本 $current_version 到: $backup_path" + + # 如果备份已存在,先删除 + if [[ -d "$backup_path" ]]; then + log_info "备份版本已存在,覆盖: $backup_path" + rm -rf "$backup_path" + fi + + # 复制当前版本目录(跟随软链接复制实际内容) + if cp -rL "$CURRENT_LINK" "$backup_path"; then + log_success "版本备份完成: $backup_name" + + else + log_error "版本备份失败" + exit 1 + fi +} + +# 回滚到备份版本 +rollback_to_backup() { + local backup_name="$1" + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_path="$BACKUPS_DIR/$backup_name" + + if [[ ! -d "$backup_path" ]]; then + log_error "备份不存在: $backup_path" + return 1 + fi + + log_info "回滚到备份版本: $backup_name" + + # 停止当前服务 + stop_services + + # 检查是否存在对应的版本目录 + local version_dir="$VERSIONS_DIR/$backup_name" + + if [[ ! -d "$version_dir" ]]; then + log_info "版本目录不存在,从备份恢复版本目录: $version_dir" + # 从备份目录恢复到版本目录 + mkdir -p "$VERSIONS_DIR" + cp -r "$backup_path" "$version_dir" + fi + + # 恢复软链接指向版本目录 + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本回滚完成: $backup_name" + + # 更新LATEST_VERSION文件 + update_latest_version_file "$backup_name" + + return 0 + else + log_error "版本回滚失败" + return 1 + fi +} + +# 停止服务 +stop_services() { + log_info "停止当前服务..." + + # 检查服务是否正在运行 + if ! 
check_services_running; then + log_info "服务未运行,无需停止" + return 0 + fi + + # 尝试使用卸载脚本停止服务 + if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then + cd "$CURRENT_LINK" + chmod +x uninstall.sh + + # 自动确认停止服务(避免交互式确认) + echo "y" | ./uninstall.sh >/dev/null 2>&1 + local stop_exit_code=$? + + if [[ $stop_exit_code -eq 0 ]]; then + log_success "服务停止完成" + else + log_warning "停止服务时出现警告,尝试手动停止" + manual_stop_services + fi + else + log_warning "未找到卸载脚本,尝试手动停止服务" + manual_stop_services + fi +} + +# 手动停止服务 +manual_stop_services() { + log_info "手动停止服务..." + + # 停止 node_exporter + if pgrep -f "node_exporter" >/dev/null 2>&1; then + pkill -f "node_exporter" && log_info "node_exporter 已停止" + fi + + # 停止 dcgm_exporter + if pgrep -f "dcgm_exporter" >/dev/null 2>&1; then + pkill -f "dcgm_exporter" && log_info "dcgm_exporter 已停止" + fi + + # 等待进程完全停止 + sleep 2 + + # 检查是否还有残留进程 + if pgrep -f "node_exporter\|dcgm_exporter" >/dev/null 2>&1; then + log_warning "仍有服务进程运行,尝试强制停止" + pkill -9 -f "node_exporter\|dcgm_exporter" 2>/dev/null || true + fi + + log_success "手动停止服务完成" +} + +# 启动服务 +start_services() { + log_info "启动服务..." 
#######################################
# Report basic facts about the host and warn when the root filesystem is
# low on space.
#
# Globals:   arch, available_space (written; intentionally not local so the
#            names stay visible to the rest of the script as before)
# Arguments: none
# Outputs:   log lines via log_info / log_warning / log_error
# Returns:   0; exits 1 when /etc/os-release is missing
#######################################
check_system() {
    log_info "检查系统要求..."

    # The OS is identified through /etc/os-release only.
    if [[ ! -f /etc/os-release ]]; then
        log_error "无法检测操作系统版本"
        exit 1
    fi

    # Source the file in a subshell so its variables do not leak into this
    # shell. Declared separately so `local` does not mask the exit status;
    # identification is informational, so a failure is tolerated.
    local OS_INFO
    OS_INFO=$(source /etc/os-release && echo "${NAME:-} ${VERSION_ID:-}") || true
    log_info "检测到操作系统: $OS_INFO"

    arch=$(uname -m)
    log_info "系统架构: $arch"

    # -P forces one record per filesystem (no wrapping for long device
    # names) and -k fixes the unit to KiB, so column 4 is the free space in
    # KiB. 1 GiB is therefore 1048576, not 1024.
    available_space=$(df -Pk / | awk 'NR==2 {print $4}')
    if [[ "$available_space" =~ ^[0-9]+$ ]] && (( available_space < 1048576 )); then
        # Reported in MB: anything below the threshold would round to 0GB.
        log_warning "可用磁盘空间不足 1GB,当前可用: $((available_space / 1024))MB"
    fi
}
+ log_info "安装目录: $INSTALL_DIR" + + # 创建安装目录结构(必须先创建,以便备份时目录存在) + create_install_directories + + # 检查是否已安装 + local is_upgrade=false + if check_installed; then + local current_version=$(get_current_version) + if [[ "$current_version" == "$ARGUS_VERSION" ]]; then + if [[ "$FORCE_INSTALL" == true ]]; then + log_info "检测到相同版本 v$ARGUS_VERSION,但使用了 --force 参数,将强制重新安装" + is_upgrade=true + # 简化安装逻辑:不再备份当前版本 + # backup_current_version + else + log_info "版本 v$ARGUS_VERSION 已安装,无需重复安装" + log_info "如需强制重新安装,请使用 --force 参数" + return 0 + fi + else + log_info "检测到版本升级: v$current_version -> v$ARGUS_VERSION" + is_upgrade=true + + # 简化安装逻辑:不再备份当前版本 + # backup_current_version + fi + fi + + # 创建临时目录 + mkdir -p "$TEMP_DIR" + cd "$TEMP_DIR" + + # 下载发布包,使用新的命名规范 + TAR_NAME="argus-metric_$(echo $ARGUS_VERSION | tr '.' '_').tar.gz" + log_info "下载发布包: $TAR_NAME" + log_info "从FTP服务器下载: $FTP_SERVER:$FTP_PORT, 用户: $FTP_USER" + + # 构造curl命令并显示(隐藏密码) + CURL_CMD="curl -u \"${FTP_USER}:***\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\"" + log_info "执行命令: $CURL_CMD" + + if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$BASE_URL/$TAR_NAME" -o "$TAR_NAME"; then + log_error "下载发布包失败: $BASE_URL/$TAR_NAME" + log_error "完整命令: curl -u \"${FTP_USER}:${FTP_PASS}\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\"" + log_error "请检查FTP服务器连接、用户名密码是否正确" + exit 1 + fi + + # 解压发布包到当前目录 + log_info "解压发布包..." + if ! tar -xzf "$TAR_NAME"; then + log_error "解压发布包失败" + exit 1 + fi + + # 显示解压后的文件结构 + log_info "解压后的文件结构:" + ls -la "$TEMP_DIR" + + # 准备版本目录 + local version_dir="$VERSIONS_DIR/$ARGUS_VERSION" + log_info "安装到版本目录: $version_dir" + + # 如果升级,先停止服务 + if [[ "$is_upgrade" == true ]]; then + stop_services + fi + + # 创建版本目录 + if [[ -d "$version_dir" ]]; then + log_info "版本目录已存在,备份后更新" + rm -rf "$version_dir" + fi + + # 创建新的版本目录 + mkdir -p "$version_dir" + + # 移动解压的文件到版本目录 + log_info "移动文件到版本目录: $TEMP_DIR/* -> $version_dir/" + + # 检查源目录是否有内容 + if [[ ! 
"$(ls -A "$TEMP_DIR" 2>/dev/null)" ]]; then + log_error "临时目录为空,无法移动文件" + exit 1 + fi + + # 检查目标目录是否存在 + if [[ ! -d "$version_dir" ]]; then + log_error "目标版本目录不存在: $version_dir" + exit 1 + fi + + # 执行文件移动 + if mv "$TEMP_DIR"/* "$version_dir" 2>/dev/null; then + log_success "文件移动到版本目录完成" + else + log_error "移动文件到版本目录失败" + log_error "源目录内容:" + ls -la "$TEMP_DIR" || true + log_error "目标目录状态:" + ls -la "$version_dir" || true + log_error "权限检查:" + ls -ld "$TEMP_DIR" "$version_dir" || true + exit 1 + fi + + # 执行安装脚本 + log_info "执行安装脚本..." + cd "$version_dir" + if [[ -f "install.sh" ]]; then + chmod +x install.sh + # 传递安装根目录给安装脚本,让install_artifact.sh安装到正确的版本目录 + if ./install.sh "$version_dir"; then + log_success "安装脚本执行完成" + else + log_error "安装脚本执行失败" + # 简化安装逻辑:不再自动回滚 + # if [[ "$is_upgrade" == true ]]; then + # log_warning "升级失败,尝试回滚到之前版本..." + # # 确保备份目录存在 + # mkdir -p "$BACKUPS_DIR" + # local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + # if [[ -n "$latest_backup" ]]; then + # rollback_to_backup "$latest_backup" + # return 1 + # fi + # fi + exit 1 + fi + else + log_error "未找到安装脚本 install.sh" + exit 1 + fi + + # 更新软链接指向新版本 + log_info "更新当前版本链接..." + + # 如果 current 已经存在且是目录,先删除它 + if [[ -d "$CURRENT_LINK" ]] && [[ ! -L "$CURRENT_LINK" ]]; then + log_warning "发现 current 是目录而不是符号链接,正在删除..." + rm -rf "$CURRENT_LINK" + fi + + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本链接更新完成: $CURRENT_LINK -> $version_dir" + else + log_error "版本链接更新失败" + exit 1 + fi + + # 更新LATEST_VERSION文件 + update_latest_version_file "$ARGUS_VERSION" + + # 初始化 DNS 配置文件到系统目录 + init_dns_config_to_system + + # 启动服务 + # start_services + + log_success "Argus Metric v$ARGUS_VERSION 安装完成!" 
#######################################
# Uninstall Argus Metric: stop the services, run the uninstall script of
# the current version, and delete the whole installation directory.
#
# Globals:   INSTALL_DIR, CURRENT_LINK (read)
# Arguments: none
# Returns:   0 on success or when nothing is installed; exits 1 when the
#            uninstall script fails, INSTALL_DIR is unsafe to delete, or the
#            directory cannot be removed
#######################################
uninstall_argus_metric() {
    log_info "开始卸载 Argus Metric..."
    log_info "安装目录: $INSTALL_DIR"

    if ! check_installed; then
        log_info "未检测到已安装的 Argus Metric"
        return 0
    fi

    # Declared separately so `local` does not mask the helper's status.
    local current_version
    current_version=$(get_current_version) || true
    log_info "检测到当前版本: v$current_version"

    stop_services

    log_info "执行卸载脚本..."
    if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then
        # The user asked for --uninstall explicitly, so the script's
        # confirmation prompt is answered automatically.
        log_info "自动确认卸载操作..."

        # Run in a subshell: the working directory of this shell must not
        # end up inside the tree that is deleted below. Capturing the status
        # with `|| rc=$?` also keeps the failure branch reachable when the
        # caller runs under `set -e`.
        local uninstall_exit_code=0
        (
            cd "$CURRENT_LINK" || exit 1
            chmod +x uninstall.sh
            echo "y" | ./uninstall.sh
        ) || uninstall_exit_code=$?

        if [[ $uninstall_exit_code -eq 0 ]]; then
            log_success "卸载脚本执行完成"
        else
            log_error "卸载脚本执行失败 (退出码: $uninstall_exit_code)"
            exit 1
        fi
    else
        log_warning "未找到卸载脚本,执行基本清理"
    fi

    log_info "清理安装目录..."
    if [[ -d "$INSTALL_DIR" ]]; then
        # Never let a misconfigured INSTALL_DIR turn this into `rm -rf /`.
        if [[ "$INSTALL_DIR" == "/" ]]; then
            log_error "安装目录无效,拒绝删除: $INSTALL_DIR"
            exit 1
        fi

        log_warning "这将删除整个安装目录: $INSTALL_DIR"
        log_warning "包括所有版本、备份和配置文件"

        # Non-interactive by design: remove without asking again.
        if rm -rf "${INSTALL_DIR:?}"; then
            log_success "安装目录已完全清理: $INSTALL_DIR"
        else
            log_error "清理安装目录失败"
            exit 1
        fi
    else
        log_info "安装目录不存在,无需清理"
    fi

    log_success "Argus Metric 卸载完成!"
}
check_installed; then + log_error "没有检测到已安装的版本,无法回滚" + exit 1 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + # 获取最新的备份 + local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + if [[ -z "$latest_backup" ]]; then + log_error "没有找到可用的备份版本" + exit 1 + fi + + log_info "将回滚到备份版本: $latest_backup" + + if rollback_to_backup "$latest_backup"; then + log_success "回滚完成!" + + # 显示当前状态 + echo + show_status + else + log_error "回滚失败" + exit 1 + fi +} + +# 主函数 +main() { + echo "==========================================" + echo " Argus Metric 在线安装脚本 v1.0" + echo "==========================================" + echo + + # 加载配置文件 + load_config + + # 对于状态操作,不需要FTP参数和root权限 + # 简化安装逻辑:不再支持备份列表操作 + if [[ "$ACTION" == "status" ]]; then + show_status + return 0 + fi + # if [[ "$ACTION" == "status" || "$ACTION" == "backup-list" ]]; then + # if [[ "$ACTION" == "status" ]]; then + # show_status + # elif [[ "$ACTION" == "backup-list" ]]; then + # list_backups + # fi + # return 0 + # fi + + check_root + + # 更新目录配置变量(在设置INSTALL_DIR后) + VERSIONS_DIR="$INSTALL_DIR/versions" + BACKUPS_DIR="$INSTALL_DIR/backups" + CURRENT_LINK="$INSTALL_DIR/current" + LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" + + # 简化安装逻辑:不再支持回滚操作 + # if [[ "$ACTION" == "rollback" ]]; then + # rollback_version + # return 0 + # fi + + check_ftp_params + check_system + + if [[ "$ACTION" == "uninstall" ]]; then + uninstall_argus_metric + else + install_argus_metric + fi + + echo + log_info "操作完成!" 
#######################################
# Resolve the FTP connection settings used to fetch dns.conf.
#
# Precedence per setting: value already present in the environment, then
# $SCRIPT_DIR/config.env, then the built-in default.
#
# Globals:   SCRIPT_DIR (read)
#            FTP_SERVER, FTP_USER, FTP_PASSWORD, REMOTE_DNS_CONF_URL (written)
# Arguments: none
# Returns:   0
#######################################
get_ftp_config() {
    log_info "获取 FTP 配置信息..."

    # Remember what the caller supplied: sourcing config.env below must only
    # fill the gaps, not overwrite explicitly provided values.
    local env_server="${FTP_SERVER:-}"
    local env_user="${FTP_USER:-}"
    local env_password="${FTP_PASSWORD:-}"

    if [[ -z "$env_server" || -z "$env_user" || -z "$env_password" ]]; then
        if [[ -f "$SCRIPT_DIR/config.env" ]]; then
            source "$SCRIPT_DIR/config.env"
        fi
    fi

    FTP_SERVER="${env_server:-${FTP_SERVER:-localhost}}"
    FTP_USER="${env_user:-${FTP_USER:-ftpuser}}"

    # NOTE(review): the fallback password is a credential hard-coded in the
    # script. It is kept for backward compatibility; deployments should set
    # FTP_PASSWORD in the environment or in config.env instead.
    if [[ -z "$env_password" && -z "${FTP_PASSWORD:-}" ]]; then
        log_warning "未配置 FTP_PASSWORD,使用内置默认密码"
    fi
    FTP_PASSWORD="${env_password:-${FTP_PASSWORD:-ZGClab1234!}}"

    REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf"
}
#######################################
# Verify that every DNS server listed in dns.conf has a nameserver entry in
# resolv.conf, and rewrite resolv.conf when one is missing.
#
# Globals:   RESOLV_CONF (read)
# Arguments: $1 - path to the dns.conf file
# Returns:   0
#######################################
ensure_dns_in_resolv() {
    local dns_conf="$1"
    local dns_ips
    mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
    if [[ ${#dns_ips[@]} -eq 0 ]]; then
        return 0
    fi

    local ip
    for ip in "${dns_ips[@]}"; do
        # Compare whole fields rather than grepping for a substring: as a
        # regex, "nameserver 10.0.0.1" also matches "nameserver 10.0.0.10",
        # "nameserver 10x0.0.1" and commented-out "# nameserver ..." lines.
        if ! awk -v ip="$ip" '
                $1 == "nameserver" && $2 == ip { found = 1 }
                END { exit !found }
            ' "$RESOLV_CONF" 2>/dev/null; then
            log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复"
            update_resolv_conf "$dns_conf"
            return
        fi
    done
    log_info "/etc/resolv.conf 已包含所有 DNS"
}
# Abort unless the script runs with root privileges (effective UID 0).
# Prints the sudo hint for this script and exits 1 otherwise.
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0"
    exit 1
}
#######################################
# Read version, build time and component install order from version.json.
#
# Uses jq when available. Without jq a minimal text parser is used that
# only understands the flat layout of version.json.
#
# Globals:   VERSION_FILE_PATH, TEMP_DIR (read)
#            VERSION, BUILD_TIME (written)
# Outputs:   writes one archive file name per line to
#            $TEMP_DIR/install_order.txt
# Returns:   0; exits 1 when the file or its install_order field is missing
#######################################
parse_version_info() {
    log_info "解析版本信息..."

    if [[ ! -f "$VERSION_FILE_PATH" ]]; then
        log_error "版本文件不存在: $VERSION_FILE_PATH"
        exit 1
    fi

    local order_file="$TEMP_DIR/install_order.txt"

    if command -v jq &> /dev/null; then
        VERSION=$(jq -r '.version' "$VERSION_FILE_PATH")
        BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH")

        # install_order holds the complete archive file names.
        if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
            jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$order_file"
        else
            log_error "version.json 中缺少 install_order 字段"
            exit 1
        fi
    else
        log_warning "jq 未安装,使用简单的 JSON 解析"
        # -m1: take the first occurrence only, so a nested "version" key
        # further down cannot turn VERSION into a multi-line value.
        VERSION=$(grep -m1 '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/')
        BUILD_TIME=$(grep -m1 '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/')

        if ! grep -q '"install_order"' "$VERSION_FILE_PATH"; then
            log_error "version.json 中缺少 install_order 字段"
            exit 1
        fi

        # Emit only the strings between the "[" that follows "install_order"
        # and its closing "]". Matching every line that starts with a quoted
        # string would also pick up the "install_order" key itself and any
        # "key": value line that follows the array.
        awk '
            /"install_order"[[:space:]]*:/ {
                in_array = 1
                sub(/.*"install_order"[[:space:]]*:/, "")
            }
            in_array {
                line = $0
                close_pos = index(line, "]")
                if (close_pos) line = substr(line, 1, close_pos - 1)
                while (match(line, /"[^"]*"/)) {
                    print substr(line, RSTART + 1, RLENGTH - 2)
                    line = substr(line, RSTART + RLENGTH)
                }
                if (close_pos) exit
            }
        ' "$VERSION_FILE_PATH" > "$order_file"
    fi

    log_success "版本信息解析完成"
    log_info " 版本: $VERSION"
    log_info " 构建时间: $BUILD_TIME"
}
+ + artifact_dir=$(dirname "$VERSION_FILE_PATH") + uninstall_count=0 + total_count=0 + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + total_count=$(wc -l < "$TEMP_DIR/install_order.txt") + fi + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + while IFS= read -r filename; do + uninstall_count=$((uninstall_count + 1)) + + # 从文件名中提取组件名(去掉时间戳后缀) + component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//') + + log_info "[$uninstall_count/$total_count] 卸载 $component..." + + # 直接使用完整的文件名 + tar_file="$artifact_dir/$filename" + + if [[ ! -f "$tar_file" ]]; then + log_error "找不到组件文件: $filename" + exit 1 + fi + + # 解压到临时目录 + component_temp_dir="$TEMP_DIR/$component" + mkdir -p "$component_temp_dir" + + if tar -xzf "$tar_file" -C "$component_temp_dir"; then + log_success " $component 解压完成" + else + log_error " $component 解压失败" + exit 1 + fi + + # 查找解压后的目录 + extracted_dir="" + for dir in "$component_temp_dir"/*; do + if [[ -d "$dir" ]]; then + extracted_dir="$dir" + break + fi + done + + if [[ -z "$extracted_dir" ]]; then + log_error " $component 解压后未找到目录" + exit 1 + fi + + # 执行卸载脚本 + if [[ -f "$extracted_dir/uninstall.sh" ]]; then + log_info " 执行 $component 卸载脚本..." + # 所有组件都只需要一个确认 + if (cd "$extracted_dir" && echo "y" | ./uninstall.sh); then + log_success " $component 卸载完成" + else + log_error " $component 卸载失败" + exit 1 + fi + else + log_warning " $component 缺少 uninstall.sh 文件,跳过卸载" + fi + + # 清理临时文件 + rm -rf "$component_temp_dir" + done < "$TEMP_DIR/install_order.txt" + fi + + log_success "所有组件卸载完成" +} + +# 清理全局文件 +cleanup_global_files() { + log_info "清理全局文件..." 
# EXIT-trap handler: remove the scratch directory if it was created.
cleanup() {
    [[ -d "$TEMP_DIR" ]] || return 0
    rm -rf "$TEMP_DIR"
}
#######################################
# Print the version recorded in config/VERSION (relative to the current
# working directory).
#
# Whitespace, including a carriage return from CRLF line endings, is
# stripped so the value can be compared and used in paths. A missing or
# empty file yields "0.0.0".
#
# Outputs: the version string on stdout
#######################################
get_current_version() {
    local version=""
    if [[ -f "config/VERSION" ]]; then
        version=$(tr -d '[:space:]' < config/VERSION)
    fi
    printf '%s\n' "${version:-0.0.0}"
}
#######################################
# Print the current version, the components listed in config/checklist and
# the archives already built for that version under artifact/<version>.
#
# Paths are relative to the current working directory.
# Outputs: human-readable report on stdout
#######################################
show_version() {
    local current_version
    current_version=$(get_current_version)
    log_info "当前版本: $current_version"

    if [[ -f "config/checklist" ]]; then
        echo
        echo "组件清单:"
        local line component component_path version extra
        while IFS= read -r line; do
            # Skip blank lines and comments.
            [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
            # Checklist columns: name, directory path, version, then the
            # optional dependency / install-order fields. The version is the
            # THIRD column; reading it as the second prints the path.
            read -r component component_path version extra <<< "$line"
            if [[ -n "$component" && -n "$version" ]]; then
                echo " - $component v$version"
            fi
        done < config/checklist
    fi

    # Report the archives built for this version, if any.
    local artifact_dir="artifact/$current_version"
    if [[ -d "$artifact_dir" ]]; then
        echo
        echo "已构建的组件:"
        local file filename size
        for file in "$artifact_dir"/*.tar.gz; do
            # With no match the glob stays literal; -f filters that out.
            [[ -f "$file" ]] || continue
            filename=$(basename "$file")
            size=$(du -h "$file" | cut -f1)
            echo " - $filename ($size)"
        done

        if [[ -f "$artifact_dir/version.json" ]]; then
            echo
            echo "版本信息文件: $artifact_dir/version.json"
        fi
    else
        echo
        log_warning "未找到对应的构建目录: $artifact_dir"
        # The build script documented in the README is package_artifact.sh.
        log_info "运行 ./package_artifact.sh 进行构建"
    fi
}
-d "artifact" ]]; then + log_warning "artifact 目录不存在" + return + fi + + for version_dir in artifact/*/; do + if [[ -d "$version_dir" ]]; then + local version=$(basename "$version_dir") + local current_version=$(get_current_version) + + if [[ "$version" == "$current_version" ]]; then + echo " * $version (当前版本)" + else + echo " $version" + fi + + # 显示该版本的组件 + local component_count=0 + for file in "$version_dir"/*.tar.gz; do + if [[ -f "$file" ]]; then + component_count=$((component_count + 1)) + fi + done + + if [[ $component_count -gt 0 ]]; then + echo " 包含 $component_count 个组件" + fi + fi + done +} + +# 清理旧版本 +clean_versions() { + local current_version=$(get_current_version) + local keep_versions=5 # 保留最近5个版本 + + log_info "清理旧版本 (保留最近 $keep_versions 个版本)..." + + if [[ ! -d "artifact" ]]; then + log_warning "artifact 目录不存在" + return + fi + + # 获取所有版本目录,按修改时间排序 + local versions=() + while IFS= read -r -d '' version_dir; do + versions+=("$(basename "$version_dir")") + done < <(find artifact -maxdepth 1 -type d -name "[0-9]*" -print0 | sort -z) + + local total_versions=${#versions[@]} + local versions_to_remove=$((total_versions - keep_versions)) + + if [[ $versions_to_remove -le 0 ]]; then + log_info "无需清理,当前只有 $total_versions 个版本" + return + fi + + log_info "将删除 $versions_to_remove 个旧版本..." + + for ((i=0; i