329 lines
10 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Checks the health of every recorded component and restarts the unhealthy ones.

# Absolute directory containing this script; the install record lives beside it.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
INSTALL_RECORD_FILE="${SCRIPT_DIR}/.install_record"

# Colour escape sequences, stored unexpanded; `echo -e` in the log helpers
# turns them into real escape characters.
NC="\033[0m"
BLUE="\033[0;34m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
RED="\033[0;31m"
# Logging helpers. Each prints one line to stdout in the form
#   <coloured [LEVEL]> <YYYY-MM-DD HH:MM:SS> - <message>
# Arguments: $1 - message text (backslash escapes are expanded by `echo -e`).

# Shared formatter: $1 colour escape, $2 level tag, $3 message.
_log_line() {
  local colour="$1"
  local level="$2"
  local message="$3"
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  echo -e "${colour}[${level}]${NC} ${stamp} - ${message}"
}

log_info() {
  _log_line "$BLUE" "INFO" "$1"
}

log_success() {
  _log_line "$GREEN" "SUCCESS" "$1"
}

log_warning() {
  _log_line "$YELLOW" "WARNING" "$1"
}

log_error() {
  _log_line "$RED" "ERROR" "$1"
}
# Load optional settings from config.env located next to this script.
# Globals:  SCRIPT_DIR (read). Variables assigned while sourcing config.env
#           are exported, because allexport is on for the duration.
# Returns:  status of the last log call.
load_config() {
  local config_file="$SCRIPT_DIR/config.env"

  # Missing configuration is not an error: warn and keep the defaults.
  if [[ ! -f "$config_file" ]]; then
    log_warning "配置文件不存在: $config_file,使用默认配置"
    return
  fi

  log_info "加载配置文件: $config_file"
  # allexport is enabled only while sourcing so child scripts see the settings.
  set -a
  source "$config_file"
  set +a
  log_success "配置文件加载完成"
}
# Run a component's health-check script and report whether it passed.
# Arguments:
#   $1 - component name (used only in the error message)
#   $2 - path to the health-check script
# Returns:
#   0 if the script exits with status 0;
#   1 if the script is missing or exits with any non-zero status.
check_component_health() {
  local component_name="$1"
  local check_script_path="$2"

  if [[ ! -f "$check_script_path" ]]; then
    log_error "$component_name: 健康检查脚本不存在: $check_script_path"
    return 1
  fi

  # Best effort: if the execute bit cannot be set, the run below fails and
  # the component is reported unhealthy.
  if [[ ! -x "$check_script_path" ]]; then
    chmod +x "$check_script_path" 2>/dev/null || true
  fi

  # Give the script /dev/null as stdin. The caller invokes this function from
  # inside a `while read` loop, so an inherited stdin would let the script
  # consume the rest of the loop's input and silently end the loop early.
  # The script's own output is discarded; only its exit status matters.
  if "$check_script_path" < /dev/null > /dev/null 2>&1; then
    return 0
  fi
  return 1
}
# Restart one component by running its uninstall.sh (if present) followed by
# its install.sh, both from inside the component's install directory.
# Globals:   SCRIPT_DIR (read; passed to install.sh as its first argument)
# Arguments:
#   $1 - component name (used in log messages)
#   $2 - component install directory (absolute, or relative to the current
#        working directory)
# Returns:
#   1 if install.sh does not exist; 0 otherwise. The exit status of the
#   uninstall/install scripts themselves is deliberately ignored.
restart_component() {
  local component_name="$1"
  local install_dir="$2"

  log_warning "正在重启组件: $component_name"

  # The scripts are invoked after `cd "$install_dir"`. With a relative
  # install_dir the script path would then be resolved relative to the new
  # directory and never be found, so anchor it to the current directory first.
  # Absolute paths are left untouched.
  if [[ -n "$install_dir" && "$install_dir" != /* ]]; then
    install_dir="$PWD/$install_dir"
  fi

  # Uninstall first; a missing uninstall script is simply skipped.
  local uninstall_script="$install_dir/uninstall.sh"
  if [[ -f "$uninstall_script" ]]; then
    log_info "$component_name: 执行卸载脚本..."
    chmod +x "$uninstall_script" 2>/dev/null || true
    # `yes` answers every confirmation prompt; failures are tolerated on
    # purpose so that a broken uninstall does not block the reinstall.
    yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script") || true
    log_info "$component_name: 卸载完成"
  fi

  # Reinstall.
  local install_script="$install_dir/install.sh"
  if [[ ! -f "$install_script" ]]; then
    log_error "$component_name: 安装脚本不存在: $install_script"
    return 1
  fi
  chmod +x "$install_script" 2>/dev/null || true
  log_info "$component_name: 执行安装脚本..."
  # `yes` answers every confirmation prompt; SCRIPT_DIR is the only argument.
  yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR") || true
  log_info "$component_name: 安装脚本执行完成"
  return 0
}
# Print the PID of a running component process.
# Arguments: $1 - component name (node-exporter, dcgm-exporter, fluent-bit
#                 or argus-agent; any other name yields an empty result)
# Outputs:   the first PID found, or an empty line when nothing matches.
# Lookup order: each `pgrep -f` pattern in turn, then a `ps aux | grep`
# fallback; the first non-empty answer wins.
find_component_pid() {
  local component_name="$1"
  local component_pid=""
  local -a pgrep_patterns=()
  local ps_pattern=""
  local pattern

  # Per-component search terms (hyphen and underscore spellings).
  case "$component_name" in
    "node-exporter")
      pgrep_patterns=("node_exporter" "node-exporter")
      ps_pattern="node_exporter"
      ;;
    "dcgm-exporter")
      pgrep_patterns=("dcgm-exporter" "dcgm_exporter")
      ps_pattern="dcgm-exporter"
      ;;
    "fluent-bit")
      pgrep_patterns=("fluent-bit" "fluent_bit")
      ps_pattern="fluent-bit"
      ;;
    "argus-agent")
      pgrep_patterns=("argus-agent")
      ps_pattern="argus-agent"
      ;;
  esac

  for pattern in "${pgrep_patterns[@]}"; do
    component_pid=$(pgrep -f "$pattern" | head -1)
    if [[ -n "$component_pid" ]]; then
      break
    fi
  done

  # Last resort: scan the process table directly.
  if [[ -z "$component_pid" && -n "$ps_pattern" ]]; then
    component_pid=$(ps aux | grep -v grep | grep "$ps_pattern" | awk '{print $2}' | head -1)
  fi

  echo "$component_pid"
}
# Replace the recorded PID of one component in the install record file,
# leaving every other byte of the file's formatting untouched.
# Globals:   INSTALL_RECORD_FILE (read; the file is rewritten via a temp file)
# Arguments:
#   $1 - component name (key under .components in the record)
#   $2 - new PID to store
# Returns:
#   0 when the PID line was rewritten;
#   1 when the record is missing, the current PID cannot be read (jq is
#     required for that), no matching "pid" line is found inside the
#     component's block, or the rewritten file cannot be moved into place.
# Limitation: the rewrite is line-based and expects pretty-printed JSON in
# which the component key and its quoted "pid" value sit on separate lines.
update_install_record_pid() {
  local component_name="$1"
  local new_pid="$2"

  if [[ ! -f "$INSTALL_RECORD_FILE" ]]; then
    log_error "安装记录文件不存在: $INSTALL_RECORD_FILE"
    return 1
  fi

  # Read the PID currently on record; it pins exactly which value to replace.
  local current_pid=""
  if command -v jq &> /dev/null; then
    current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null)
  fi
  if [[ -z "$current_pid" ]]; then
    log_warning "$component_name: 无法读取当前 PID跳过更新"
    return 1
  fi

  local temp_file="${INSTALL_RECORD_FILE}.tmp"
  local in_component=0
  local updated=0
  local line gap old_field new_field prefix suffix

  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline;
  # without it the closing brace of such a file would be dropped.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Quoted parts of a [[ =~ ]] pattern match literally, so the component
    # name and PID are never interpreted as regular expressions.
    if [[ "$line" =~ \""$component_name"\":[[:space:]]*\{ ]]; then
      in_component=1
    elif [[ $in_component -eq 1 && "$line" =~ \"pid\":([[:space:]]*)\""$current_pid"\" ]]; then
      # Rebuild the field with the captured whitespace so the replacement
      # works for any spacing after the colon and the layout is preserved.
      gap="${BASH_REMATCH[1]}"
      old_field="\"pid\":${gap}\"${current_pid}\""
      new_field="\"pid\":${gap}\"${new_pid}\""
      prefix=${line%%"$old_field"*}
      suffix=${line#*"$old_field"}
      line="${prefix}${new_field}${suffix}"
      updated=1
      in_component=0
    elif [[ "$line" =~ ^[[:space:]]*\}[[:space:]]*$ ]]; then
      # A bare closing brace ends the component block.
      in_component=0
    fi
    printf '%s\n' "$line"
  done < "$INSTALL_RECORD_FILE" > "$temp_file"

  # Only swap the file in when a PID line was actually rewritten.
  if [[ $updated -eq 1 ]]; then
    if ! mv "$temp_file" "$INSTALL_RECORD_FILE"; then
      log_error "$component_name: PID 替换失败"
      rm -f "$temp_file"
      return 1
    fi
    log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)"
    return 0
  fi

  log_error "$component_name: PID 替换失败"
  rm -f "$temp_file"
  return 1
}
# List the components recorded in the install record file.
# Arguments: $1 - path to the install record (JSON)
# Outputs:   one "name:install_dir" line per component on stdout.
#            All diagnostics go to stderr: callers capture stdout with $(...),
#            so a log line on stdout would be parsed as a component entry.
# Returns:   0 on success; 1 if the file is missing, cannot be parsed, or
#            yields no components in the text fallback.
# With jq, the name is the key under .components. Without jq, a plain-text
# fallback is used and the name is the basename of each install_dir value.
read_install_record() {
  local install_record_file="$1"

  if [[ ! -f "$install_record_file" ]]; then
    log_error "安装记录文件不存在: $install_record_file" >&2
    return 1
  fi

  # Preferred path: real JSON parsing with jq.
  if command -v jq &> /dev/null; then
    local components_json
    if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
      printf '%s\n' "$components_json"
      return 0
    fi
    log_error "无法解析安装记录文件 JSON 格式: $install_record_file" >&2
    return 1
  fi

  # Fallback: scan for "install_dir" lines.
  log_warning "jq 命令不可用,尝试简单文本解析" >&2
  local components=()
  local line install_dir component_name
  # `|| [[ -n "$line" ]]` also processes a last line without a newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then
      install_dir="${BASH_REMATCH[1]}"
      # The component name is taken from the last path element.
      component_name=$(basename -- "$install_dir")
      components+=("$component_name:$install_dir")
    fi
  done < "$install_record_file"

  if [[ ${#components[@]} -gt 0 ]]; then
    printf '%s\n' "${components[@]}"
    return 0
  fi

  log_error "无法从安装记录文件中提取组件信息" >&2
  return 1
}
# Entry routine: check every recorded component and restart the unhealthy ones.
# Globals:  EUID, INSTALL_RECORD_FILE (read)
# Exits:    1 when not run as root or when the install record cannot be read;
#           0 after all components have been processed, even if some remain
#           unhealthy (those are only reported in the log).
main() {
  local components_info component_info
  local component_name install_dir check_script_path new_pid
  local restart_count=0
  local check_count=0

  log_info "=========================================="
  log_info " 组件自动重启检查"
  log_info "=========================================="

  # Root is required.
  if [[ $EUID -ne 0 ]]; then
    log_error "此脚本需要 root 权限运行"
    exit 1
  fi

  load_config

  # Fetch the "name:install_dir" list from the install record.
  log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
  if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
    log_error "无法读取安装记录文件,自动重启检查终止"
    exit 1
  fi

  # The list is read from fd 3 rather than stdin, so that nothing run inside
  # the loop body (health checks, install scripts) can consume the remaining
  # entries and end the loop early.
  while IFS= read -r -u 3 component_info; do
    [[ -n "$component_info" ]] || continue

    # Split on the first ':'; the rest of the line is the install directory.
    IFS=':' read -r component_name install_dir <<< "$component_info"
    check_count=$((check_count + 1))
    check_script_path="$install_dir/check_health.sh"
    log_info "检查组件: $component_name"

    if check_component_health "$component_name" "$check_script_path"; then
      log_success "$component_name: 运行正常"
      continue
    fi

    log_warning "$component_name: 健康检查失败,尝试重启"
    restart_count=$((restart_count + 1))

    # fd 3 is closed for the restart so processes started by the install
    # script do not inherit the loop's input descriptor.
    restart_component "$component_name" "$install_dir" 3<&-

    # Give the service time to come up before looking for its process.
    log_info "$component_name: 等待进程启动..."
    sleep 10

    # Record the PID of the freshly started process.
    new_pid=$(find_component_pid "$component_name")
    if [[ -n "$new_pid" ]]; then
      log_info "$component_name: 找到新进程 PID: $new_pid"
      update_install_record_pid "$component_name" "$new_pid"
    else
      log_warning "$component_name: 未找到新进程 PID"
    fi

    # Verify the restart.
    if check_component_health "$component_name" "$check_script_path"; then
      log_success "$component_name: 重启成功"
    else
      log_warning "$component_name: 重启后仍不健康,可能需要手动检查"
    fi
  done 3<<< "$components_info"

  log_info "=========================================="
  log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count"
  log_info "=========================================="
  exit 0
}
# Script entry point: run main only when this file is executed directly;
# when it is sourced, only the definitions above are loaded.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"