329 lines
10 KiB
Bash
Executable File
329 lines
10 KiB
Bash
Executable File
#!/bin/bash
|
||
|
||
# 此脚本会检查各组件的健康状态,并重启不健康的组件
|
||
|
||
# Absolute path of the directory containing this script, resolved from
# BASH_SOURCE so it does not depend on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Install record kept next to the script; the functions in this file read
# component names, install directories and PIDs from it.
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"

# ANSI color escape sequences used by the log_* helpers. They are stored as
# literal backslash strings and must be expanded at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
|
||
|
||
# Logging helpers.

# Print an informational message to stdout as
# "<BLUE>[INFO]<NC> <timestamp> - <message>".
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are not interpreted as escape sequences)
log_info() {
    # %b expands the escape sequences stored in the color variables, while %s
    # keeps the timestamp and the message untouched.
    printf '%b[INFO]%b %s - %s\n' "${BLUE}" "${NC}" "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
|
||
|
||
# Print a success message to stdout as
# "<GREEN>[SUCCESS]<NC> <timestamp> - <message>".
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are not interpreted as escape sequences)
log_success() {
    # %b expands the escape sequences stored in the color variables, while %s
    # keeps the timestamp and the message untouched.
    printf '%b[SUCCESS]%b %s - %s\n' "${GREEN}" "${NC}" "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
|
||
|
||
# Print a warning message to stdout as
# "<YELLOW>[WARNING]<NC> <timestamp> - <message>".
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are not interpreted as escape sequences)
log_warning() {
    # %b expands the escape sequences stored in the color variables, while %s
    # keeps the timestamp and the message untouched.
    printf '%b[WARNING]%b %s - %s\n' "${YELLOW}" "${NC}" "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
|
||
|
||
# Print an error message to stdout as
# "<RED>[ERROR]<NC> <timestamp> - <message>".
# Globals:   RED, NC (read)
# Arguments: $1 - message text; printed verbatim (backslashes in the message
#                 are not interpreted as escape sequences)
log_error() {
    # %b expands the escape sequences stored in the color variables, while %s
    # keeps the timestamp and the message untouched.
    printf '%b[ERROR]%b %s - %s\n' "${RED}" "${NC}" "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
|
||
|
||
# Load optional settings from "config.env" located in SCRIPT_DIR.
# The file is sourced with allexport (set -a) enabled, so every variable it
# assigns is exported to child processes; allexport is switched off again
# afterwards whether or not sourcing succeeded.
# Globals:   SCRIPT_DIR (read)
# Returns:   0 when the file is absent or was sourced successfully,
#            1 when sourcing the file returned a non-zero status.
load_config() {
    local config_file="$SCRIPT_DIR/config.env"

    if [[ ! -f "$config_file" ]]; then
        log_warning "配置文件不存在: $config_file,使用默认配置"
        return 0
    fi

    log_info "加载配置文件: $config_file"
    set -a
    # shellcheck source=/dev/null
    source "$config_file"
    # Declared only after sourcing so the config file cannot collide with it.
    local source_status=$?
    set +a

    if (( source_status != 0 )); then
        log_error "配置文件加载失败: $config_file (退出码: $source_status)"
        return 1
    fi

    log_success "配置文件加载完成"
    return 0
}
|
||
|
||
# Run a component's health-check script and report whether it succeeded.
# Arguments: $1 - component name (used only in the error message)
#            $2 - path to the health-check script
# Returns:   0 when the script exits with status 0,
#            1 when the script is missing or exits non-zero.
check_component_health() {
    local component_name="$1"
    local check_script_path="$2"

    if [[ ! -f "$check_script_path" ]]; then
        log_error "$component_name: 健康检查脚本不存在: $check_script_path"
        return 1
    fi

    # Best effort: if the mode cannot be changed, the execution below fails
    # and the component is reported as unhealthy.
    if [[ ! -x "$check_script_path" ]]; then
        chmod +x "$check_script_path" 2>/dev/null || true
    fi

    # Only the exit status matters, so all output is discarded. stdin comes
    # from /dev/null so the script cannot consume input that belongs to the
    # caller (e.g. the remaining lines of a `while read` loop).
    if "$check_script_path" < /dev/null > /dev/null 2>&1; then
        return 0
    fi
    return 1
}
|
||
|
||
# Restart a component by running its uninstall.sh (when present) followed by
# its install.sh, both from inside the component's install directory.
# Both scripts get an endless stream of "y" lines on stdin so interactive
# confirmation prompts are answered automatically.
# Globals:   SCRIPT_DIR (read; passed to install.sh as its first argument)
# Arguments: $1 - component name (used in log messages)
#            $2 - component install directory
# Returns:   0 when install.sh exits with status 0,
#            1 when install.sh is missing or exits non-zero.
restart_component() {
    local component_name="$1"
    local install_dir="$2"
    local uninstall_script="$install_dir/uninstall.sh"
    local install_script="$install_dir/install.sh"
    local status

    log_warning "正在重启组件: $component_name"

    # Uninstall first; a failing uninstall is reported but does not stop the
    # reinstall attempt.
    if [[ -f "$uninstall_script" ]]; then
        log_info "$component_name: 执行卸载脚本..."
        chmod +x "$uninstall_script" 2>/dev/null || true
        yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script")
        # PIPESTATUS[1] is the script's own status; `yes` normally ends via
        # SIGPIPE and its status is irrelevant.
        status=${PIPESTATUS[1]}
        if (( status != 0 )); then
            log_warning "$component_name: 卸载脚本返回非零退出码 $status,继续安装"
        else
            log_info "$component_name: 卸载完成"
        fi
    fi

    if [[ ! -f "$install_script" ]]; then
        log_error "$component_name: 安装脚本不存在: $install_script"
        return 1
    fi

    chmod +x "$install_script" 2>/dev/null || true
    log_info "$component_name: 执行安装脚本..."

    yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR")
    status=${PIPESTATUS[1]}
    if (( status != 0 )); then
        log_error "$component_name: 安装脚本执行失败 (退出码: $status)"
        return 1
    fi

    log_info "$component_name: 安装脚本执行完成"
    return 0
}
|
||
|
||
# Look up the PID of a running component by matching process command lines.
# For a known component each name variant is tried with `pgrep -f` in order;
# if none matches, a `ps aux | grep` lookup on the first variant is used as a
# last resort. Only the first matching PID is reported.
# Arguments: $1 - component name (node-exporter, dcgm-exporter, fluent-bit
#                 or argus-agent; any other name yields an empty result)
# Outputs:   the PID on stdout, or an empty line when nothing was found
find_component_pid() {
    local component_name="$1"
    local component_pid=""
    local -a patterns=()
    local pattern

    # Command-line patterns per component, in lookup order.
    case "$component_name" in
        "node-exporter") patterns=("node_exporter" "node-exporter") ;;
        "dcgm-exporter") patterns=("dcgm-exporter" "dcgm_exporter") ;;
        "fluent-bit")    patterns=("fluent-bit" "fluent_bit") ;;
        "argus-agent")   patterns=("argus-agent") ;;
    esac

    for pattern in "${patterns[@]}"; do
        component_pid=$(pgrep -f "$pattern" | head -1)
        if [[ -n "$component_pid" ]]; then
            break
        fi
    done

    # Fallback for the case where pgrep produced nothing.
    if [[ -z "$component_pid" && ${#patterns[@]} -gt 0 ]]; then
        component_pid=$(ps aux | grep -v grep | grep "${patterns[0]}" | awk '{print $2}' | head -1)
    fi

    echo "$component_pid"
}
|
||
|
||
# Replace the stored PID of one component in the install record.
# The record is edited line by line instead of being re-serialized, so its
# existing formatting is preserved. The current PID is read with jq; without
# jq (or without a stored PID) the update is skipped.
# Globals:   INSTALL_RECORD_FILE (read; the file is rewritten on success)
# Arguments: $1 - component name (key under .components in the record)
#            $2 - new PID to store
# Returns:   0 when the PID line was replaced,
#            1 when the record is missing, the current PID cannot be read,
#              no matching PID line was found, or the file cannot be replaced.
update_install_record_pid() {
    local component_name="$1"
    local new_pid="$2"

    if [[ ! -f "$INSTALL_RECORD_FILE" ]]; then
        log_error "安装记录文件不存在: $INSTALL_RECORD_FILE"
        return 1
    fi

    # Read the PID currently stored for this component.
    local current_pid=""
    if command -v jq &> /dev/null; then
        current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null)
    fi

    if [[ -z "$current_pid" ]]; then
        log_warning "$component_name: 无法读取当前 PID,跳过更新"
        return 1
    fi

    # Line that opens this component's object:  "<name>": {
    local header_re="\"${component_name}\"[[:space:]]*:[[:space:]]*\{"
    # The pid line; group 1 is everything up to the value (including an
    # opening quote, if any) and group 2 is everything after it. Capturing the
    # surroundings keeps whatever spacing and quoting the file already uses.
    local pid_re='^(.*"pid"[[:space:]]*:[[:space:]]*"?)'"${current_pid}"'([^0-9].*)?$'
    # Line that closes an object, with or without a trailing comma.
    local close_re='^[[:space:]]*\},?[[:space:]]*$'

    local temp_file="${INSTALL_RECORD_FILE}.tmp"
    local in_component=0
    local updated=0
    local line

    # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" =~ $header_re ]]; then
            in_component=1
        elif (( in_component )) && [[ "$line" =~ $pid_re ]]; then
            line="${BASH_REMATCH[1]}${new_pid}${BASH_REMATCH[2]}"
            updated=1
            in_component=0
        elif [[ "$line" =~ $close_re ]]; then
            in_component=0
        fi
        printf '%s\n' "$line"
    done < "$INSTALL_RECORD_FILE" > "$temp_file"

    if (( ! updated )); then
        log_error "$component_name: PID 替换失败"
        rm -f "$temp_file"
        return 1
    fi

    if ! mv "$temp_file" "$INSTALL_RECORD_FILE"; then
        log_error "$component_name: 无法写入安装记录文件: $INSTALL_RECORD_FILE"
        rm -f "$temp_file"
        return 1
    fi

    log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)"
    return 0
}
|
||
|
||
# List the components found in an install record file.
# With jq available the record is parsed as JSON and every entry under
# .components is reported with its key. Without jq the file is scanned for
# "install_dir" lines and the last path element of each directory is used as
# the component name (the record key itself is not parsed in that mode).
# Diagnostics are sent to stderr because callers capture this function's
# stdout as the component list.
# Arguments: $1 - path to the install record file
# Outputs:   one "<component_name>:<install_dir>" line per component on stdout
# Returns:   0 on success; 1 when the file is missing, cannot be parsed by
#            jq, or (text mode) contains no install_dir entries.
read_install_record() {
    local install_record_file="$1"

    if [[ ! -f "$install_record_file" ]]; then
        log_error "安装记录文件不存在: $install_record_file" >&2
        return 1
    fi

    # Preferred path: let jq parse the JSON.
    if command -v jq &> /dev/null; then
        local components_json
        if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
            printf '%s\n' "$components_json"
            return 0
        fi
        log_error "无法解析安装记录文件 JSON 格式: $install_record_file" >&2
        return 1
    fi

    # Fallback: simple line-based text scan.
    log_warning "jq 命令不可用,尝试简单文本解析" >&2

    local -a components=()
    local line install_dir component_name
    # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then
            install_dir="${BASH_REMATCH[1]}"
            # Last path element, ignoring one trailing slash.
            component_name="${install_dir%/}"
            component_name="${component_name##*/}"
            components+=("$component_name:$install_dir")
        fi
    done < "$install_record_file"

    if [[ ${#components[@]} -eq 0 ]]; then
        log_error "无法从安装记录文件中提取组件信息" >&2
        return 1
    fi

    printf '%s\n' "${components[@]}"
    return 0
}
|
||
|
||
# Entry point: check every component listed in the install record and try to
# restart those whose health check fails.
# For an unhealthy component the sequence is: restart, wait for the process
# to come up, look up the new PID and store it in the install record, then
# run the health check once more and log the outcome.
# Globals:   INSTALL_RECORD_FILE (read)
#            RESTART_WAIT_SECONDS (read, optional) - seconds to wait after a
#              restart before looking for the new process; defaults to 10 and
#              may be set in the environment or in config.env
# Exits:     1 when not run as root or when the install record cannot be
#            read; 0 otherwise (including when restarts did not help).
main() {
    log_info "=========================================="
    log_info " 组件自动重启检查"
    log_info "=========================================="

    # Refuse to run unless the effective user is root.
    if [[ $EUID -ne 0 ]]; then
        log_error "此脚本需要 root 权限运行"
        exit 1
    fi

    load_config

    log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
    local components_info
    if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
        log_error "无法读取安装记录文件,自动重启检查终止"
        exit 1
    fi

    local restart_count=0
    local check_count=0
    local component_info component_name install_dir check_script_path new_pid

    # Resolved after load_config so config.env can override it; anything that
    # is not a plain number falls back to the default.
    local wait_seconds="${RESTART_WAIT_SECONDS:-10}"
    if [[ ! "$wait_seconds" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        wait_seconds=10
    fi

    # The component list is fed through FD 3 rather than stdin, so commands
    # run inside the loop cannot consume the remaining entries.
    while IFS= read -r component_info <&3; do
        [[ -n "$component_info" ]] || continue

        # Entry format is "<name>:<install_dir>"; everything after the first
        # colon belongs to the directory.
        IFS=':' read -r component_name install_dir <<< "$component_info"
        check_count=$((check_count + 1))
        check_script_path="$install_dir/check_health.sh"

        log_info "检查组件: $component_name"

        if check_component_health "$component_name" "$check_script_path"; then
            log_success "$component_name: 运行正常"
            continue
        fi

        log_warning "$component_name: 健康检查失败,尝试重启"
        restart_count=$((restart_count + 1))

        # The restart result is not acted upon here; the follow-up health
        # check below decides what gets reported.
        restart_component "$component_name" "$install_dir"

        log_info "$component_name: 等待进程启动..."
        sleep "$wait_seconds"

        # Record the PID of the freshly started process, if one is found.
        new_pid=$(find_component_pid "$component_name")
        if [[ -n "$new_pid" ]]; then
            log_info "$component_name: 找到新进程 PID: $new_pid"
            update_install_record_pid "$component_name" "$new_pid"
        else
            log_warning "$component_name: 未找到新进程 PID"
        fi

        if check_component_health "$component_name" "$check_script_path"; then
            log_success "$component_name: 重启成功"
        else
            log_warning "$component_name: 重启后仍不健康,可能需要手动检查"
        fi
    done 3<<< "$components_info"

    log_info "=========================================="
    log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count 个"
    log_info "=========================================="

    exit 0
}
|
||
|
||
# Script entry point: run main only when this file is executed directly,
# not when it is sourced by another script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    main "$@"
fi
|
||
|