#!/bin/bash
#
# Checks the health of every installed component and restarts the unhealthy
# ones.
#
# Components and their install directories are read from the JSON file
# ".install_record" located next to this script. For each component the
# script runs "<install_dir>/check_health.sh"; on failure it runs
# "<install_dir>/uninstall.sh" (if present) followed by
# "<install_dir>/install.sh", records the new PID in the install record and
# re-checks the health once.
#
# Must be run as root. An optional "config.env" next to this script is
# sourced with all of its variables exported.

# Directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"

# Colour escape sequences; they are expanded by printf's %b in _log.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

#######################################
# Prints one timestamped, colour-tagged log line to stdout.
# Only the colour codes are escape-expanded; the message is printed verbatim
# so backslashes inside paths or messages are never interpreted.
# Globals:   NC (read)
# Arguments: $1 - colour escape sequence
#            $2 - level tag (INFO, SUCCESS, ...)
#            $3 - message
#######################################
_log() {
    local color="$1"
    local level="$2"
    local message="$3"
    printf '%b[%s]%b %s - %s\n' \
        "$color" "$level" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$message"
}

# Log helpers: $1 is the message.
log_info() {
    _log "$BLUE" "INFO" "$1"
}

log_success() {
    _log "$GREEN" "SUCCESS" "$1"
}

log_warning() {
    _log "$YELLOW" "WARNING" "$1"
}

log_error() {
    _log "$RED" "ERROR" "$1"
}

#######################################
# Sources "$SCRIPT_DIR/config.env" if it exists, exporting every variable it
# defines (set -a). A missing file is only a warning.
# Globals: SCRIPT_DIR (read)
#######################################
load_config() {
    local config_file="$SCRIPT_DIR/config.env"

    if [[ -f "$config_file" ]]; then
        log_info "加载配置文件: $config_file"
        set -a
        # shellcheck source=/dev/null
        source "$config_file"
        set +a
        log_success "配置文件加载完成"
    else
        log_warning "配置文件不存在: $config_file,使用默认配置"
    fi
}

#######################################
# Runs a component's health-check script and reports the result.
# Arguments: $1 - component name (used in log messages only)
#            $2 - path to the health-check script
# Returns:   0 if the script exits 0; 1 if it is missing or exits non-zero
#######################################
check_component_health() {
    local component_name="$1"
    local check_script_path="$2"

    if [[ ! -f "$check_script_path" ]]; then
        log_error "$component_name: 健康检查脚本不存在: $check_script_path"
        return 1
    fi

    # Best effort: make the script executable if it is not already.
    if [[ ! -x "$check_script_path" ]]; then
        chmod +x "$check_script_path" 2>/dev/null || true
    fi

    # Only the exit status matters; the script's output is discarded.
    if "$check_script_path" > /dev/null 2>&1; then
        return 0
    else
        return 1
    fi
}

#######################################
# Restarts a component by running its uninstall script (if present) and then
# its install script from inside the install directory.
# Both scripts are fed an endless stream of "y" so confirmation prompts are
# answered automatically. Script failures are logged but do not abort the
# restart (best effort).
# Globals:   SCRIPT_DIR (read; passed to install.sh as its first argument)
# Arguments: $1 - component name
#            $2 - component install directory
# Returns:   1 if install.sh does not exist, 0 otherwise
#######################################
restart_component() {
    local component_name="$1"
    local install_dir="$2"
    local rc=0

    log_warning "正在重启组件: $component_name"

    # Uninstall first, when an uninstall script is available.
    local uninstall_script="$install_dir/uninstall.sh"
    if [[ -f "$uninstall_script" ]]; then
        log_info "$component_name: 执行卸载脚本..."
        chmod +x "$uninstall_script" 2>/dev/null || true
        # "yes" answers every confirmation prompt automatically.
        yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script") || rc=$?
        if [[ $rc -ne 0 ]]; then
            log_warning "$component_name: 卸载脚本返回非零退出码: $rc"
        fi
        log_info "$component_name: 卸载完成"
    fi

    # Then run the install script.
    local install_script="$install_dir/install.sh"
    if [[ ! -f "$install_script" ]]; then
        log_error "$component_name: 安装脚本不存在: $install_script"
        return 1
    fi

    chmod +x "$install_script" 2>/dev/null || true
    log_info "$component_name: 执行安装脚本..."
    # "yes" answers every confirmation prompt; SCRIPT_DIR is passed as $1.
    rc=0
    yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR") || rc=$?
    if [[ $rc -ne 0 ]]; then
        log_warning "$component_name: 安装脚本返回非零退出码: $rc"
    fi

    log_info "$component_name: 安装脚本执行完成"
    return 0
}

#######################################
# Looks up the PID of a running component.
# Tries "pgrep -f" with each known spelling of the process name, then falls
# back to filtering "ps aux". Unknown component names yield an empty result.
# Arguments: $1 - component name (node-exporter, dcgm-exporter, fluent-bit,
#                 argus-agent)
# Outputs:   the first matching PID, or an empty line when none is found
#######################################
find_component_pid() {
    local component_name="$1"
    local component_pid=""
    local -a pgrep_patterns=()
    local ps_pattern=""
    local pattern

    case "$component_name" in
        "node-exporter")
            pgrep_patterns=("node_exporter" "node-exporter")
            ps_pattern="node_exporter"
            ;;
        "dcgm-exporter")
            pgrep_patterns=("dcgm-exporter" "dcgm_exporter")
            ps_pattern="dcgm-exporter"
            ;;
        "fluent-bit")
            pgrep_patterns=("fluent-bit" "fluent_bit")
            ps_pattern="fluent-bit"
            ;;
        "argus-agent")
            pgrep_patterns=("argus-agent")
            ps_pattern="argus-agent"
            ;;
    esac

    for pattern in "${pgrep_patterns[@]}"; do
        component_pid=$(pgrep -f "$pattern" | head -1)
        if [[ -n "$component_pid" ]]; then
            break
        fi
    done

    # Last resort when pgrep found nothing.
    if [[ -z "$component_pid" && -n "$ps_pattern" ]]; then
        component_pid=$(ps aux | grep -v grep | grep "$ps_pattern" | awk '{print $2}' | head -1)
    fi

    echo "$component_pid"
}

#######################################
# Replaces the recorded PID of one component in the install record file.
# The file is rewritten line by line so its formatting is preserved: only the
# "pid" line inside the named component's block is changed. The current PID
# is read with jq; when jq is unavailable or no PID is recorded the update is
# skipped.
# Globals:   INSTALL_RECORD_FILE (read; the file is replaced on success)
# Arguments: $1 - component name
#            $2 - new PID
# Returns:   0 when the PID was replaced, 1 otherwise
#######################################
update_install_record_pid() {
    local component_name="$1"
    local new_pid="$2"

    if [[ ! -f "$INSTALL_RECORD_FILE" ]]; then
        log_error "安装记录文件不存在: $INSTALL_RECORD_FILE"
        return 1
    fi

    # Read the currently recorded PID.
    local current_pid=""
    if command -v jq &> /dev/null; then
        current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null)
    fi

    if [[ -z "$current_pid" ]]; then
        log_warning "$component_name: 无法读取当前 PID,跳过更新"
        return 1
    fi

    local temp_file="${INSTALL_RECORD_FILE}.tmp"
    local in_component=0
    local updated=0
    local line matched key_part

    # "|| [[ -n $line ]]" keeps a final line that has no trailing newline
    # (typically the closing brace of the JSON document).
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" =~ \""$component_name"\":[[:space:]]*\{ ]]; then
            # Start of the target component's block.
            in_component=1
            printf '%s\n' "$line"
        elif [[ $in_component -eq 1 && "$line" =~ (\"pid\":[[:space:]]*)\""$current_pid"\" ]]; then
            # Rebuild the line around the exact text the regex matched, so
            # the replacement works for any spacing after the colon.
            matched="${BASH_REMATCH[0]}"
            key_part="${BASH_REMATCH[1]}"
            printf '%s%s"%s"%s\n' \
                "${line%%"$matched"*}" "$key_part" "$new_pid" "${line#*"$matched"}"
            updated=1
            in_component=0
        else
            printf '%s\n' "$line"
            # A line holding only "}" or "}," closes the current block.
            if [[ "$line" =~ ^[[:space:]]*\},?[[:space:]]*$ ]]; then
                in_component=0
            fi
        fi
    done < "$INSTALL_RECORD_FILE" > "$temp_file"

    # Only replace the record when a substitution actually happened.
    if [[ $updated -eq 1 ]] && mv "$temp_file" "$INSTALL_RECORD_FILE"; then
        log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)"
        return 0
    else
        log_error "$component_name: PID 替换失败"
        rm -f "$temp_file"
        return 1
    fi
}

#######################################
# Lists the components recorded in an install record file.
# Uses jq when available; otherwise falls back to scanning for "install_dir"
# lines and deriving the component name from the directory's basename.
# All log messages go to stderr because callers capture stdout.
# Arguments: $1 - path to the install record file
# Outputs:   one "name:install_dir" line per component on stdout
# Returns:   0 on success, 1 if the file is missing or cannot be parsed
#######################################
read_install_record() {
    local install_record_file="$1"

    if [[ ! -f "$install_record_file" ]]; then
        log_error "安装记录文件不存在: $install_record_file" >&2
        return 1
    fi

    if command -v jq &> /dev/null; then
        # Preferred path: let jq parse the JSON.
        local components_json
        if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
            echo "$components_json"
            return 0
        else
            log_error "无法解析安装记录文件 JSON 格式: $install_record_file" >&2
            return 1
        fi
    else
        # Fallback without jq: naive text scan for install_dir entries.
        log_warning "jq 命令不可用,尝试简单文本解析" >&2

        local components=()
        local line install_dir component_name
        while IFS= read -r line || [[ -n "$line" ]]; do
            if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then
                install_dir="${BASH_REMATCH[1]}"
                # The component name is taken from the directory name.
                component_name=$(basename "$install_dir")
                components+=("$component_name:$install_dir")
            fi
        done < "$install_record_file"

        if [[ ${#components[@]} -gt 0 ]]; then
            printf '%s\n' "${components[@]}"
            return 0
        else
            log_error "无法从安装记录文件中提取组件信息" >&2
            return 1
        fi
    fi
}

#######################################
# Entry point: checks every recorded component and restarts unhealthy ones.
# Globals: EUID, INSTALL_RECORD_FILE (read)
# Exits:   1 when not run as root or the install record cannot be read,
#          0 after all components have been processed
#######################################
main() {
    log_info "=========================================="
    log_info " 组件自动重启检查"
    log_info "=========================================="

    # Root is required.
    if [[ $EUID -ne 0 ]]; then
        log_error "此脚本需要 root 权限运行"
        exit 1
    fi

    load_config

    log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
    local components_info
    if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
        log_error "无法读取安装记录文件,自动重启检查终止"
        exit 1
    fi

    local restart_count=0
    local check_count=0
    local component_info component_name install_dir check_script_path new_pid

    # The list is read on fd 3 so that health-check scripts, which inherit
    # stdin, cannot swallow the remaining component entries.
    while IFS= read -r -u 3 component_info; do
        if [[ -z "$component_info" ]]; then
            continue
        fi

        # Split "name:install_dir"; anything after the first colon belongs
        # to the directory.
        IFS=':' read -r component_name install_dir <<< "$component_info"
        check_count=$((check_count + 1))

        check_script_path="$install_dir/check_health.sh"

        log_info "检查组件: $component_name"

        if check_component_health "$component_name" "$check_script_path"; then
            log_success "$component_name: 运行正常"
            continue
        fi

        log_warning "$component_name: 健康检查失败,尝试重启"
        restart_count=$((restart_count + 1))

        restart_component "$component_name" "$install_dir"

        # Give the service time to come up.
        log_info "$component_name: 等待进程启动..."
        sleep 10

        # Record the PID of the freshly started process.
        new_pid=$(find_component_pid "$component_name")
        if [[ -n "$new_pid" ]]; then
            log_info "$component_name: 找到新进程 PID: $new_pid"
            update_install_record_pid "$component_name" "$new_pid"
        else
            log_warning "$component_name: 未找到新进程 PID"
        fi

        # Verify the restart worked.
        if check_component_health "$component_name" "$check_script_path"; then
            log_success "$component_name: 重启成功"
        else
            log_warning "$component_name: 重启后仍不健康,可能需要手动检查"
        fi
    done 3<<< "$components_info"

    log_info "=========================================="
    log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count 个"
    log_info "=========================================="

    exit 0
}

# Run main only when executed directly, not when sourced.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    main "$@"
fi