parent
687d29993a
commit
28ef5df6e4
@ -26,6 +26,37 @@ log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
}
|
||||
|
||||
# 更新安装记录
|
||||
# Update the node-exporter PID stored in the JSON install record.
#
# Arguments:
#   $1 - new PID to record (written as a JSON string)
#   $2 - installation base directory holding ".install_record"
#        (optional, defaults to /opt/argus-metric/current)
# Returns:
#   0 when the record does not exist yet, when jq is unavailable, or when the
#     record was updated
#   1 when the current PID cannot be read or the record cannot be rewritten
update_install_record() {
    local pid="$1"
    local install_base_dir="${2:-/opt/argus-metric/current}"
    local install_record="$install_base_dir/.install_record"
    local tmp_record="$install_record.tmp"
    local current_pid

    # No record yet: nothing to update here.
    if [[ ! -f "$install_record" ]]; then
        log_info "安装记录文件不存在,将由主安装脚本创建"
        return 0
    fi

    if ! command -v jq &> /dev/null; then
        log_warning "jq 命令不可用,无法更新安装记录文件"
        return 0
    fi

    # Declared separately from the assignment so jq's exit status is not
    # hidden behind "local".
    current_pid=$(jq -r '.components."node-exporter".pid // ""' "$install_record" 2>/dev/null)
    if [[ -z "$current_pid" ]]; then
        log_warning "无法读取当前 PID,跳过更新"
        return 1
    fi

    # Rewrite only the pid field (kept as a string) through a temp file, and
    # report success only when both the jq rewrite and the move succeeded.
    if jq --arg new_pid "$pid" '.components."node-exporter".pid = $new_pid' "$install_record" > "$tmp_record" \
        && mv "$tmp_record" "$install_record"; then
        log_info "PID 已更新: $current_pid -> $pid"
        return 0
    fi

    rm -f "$tmp_record"
    log_error "安装记录文件更新失败: $install_record"
    return 1
}
|
||||
|
||||
# 显示帮助信息
|
||||
show_help() {
|
||||
echo "Node Exporter 安装脚本"
|
||||
@ -97,61 +128,66 @@ check_system() {
|
||||
fi
|
||||
}
|
||||
|
||||
# 停止可能运行的服务
|
||||
stop_existing_service() {
|
||||
log_info "检查并停止可能运行的服务..."
|
||||
log_info "检查并停止可能运行的 Node Exporter 服务..."
|
||||
|
||||
local pid_file="/var/run/node-exporter.pid"
|
||||
# 当前脚本 PID,防止误杀
|
||||
SELF_PID=$$
|
||||
|
||||
# 检查并停止通过 PID 文件管理的服务
|
||||
if [[ -f "$pid_file" ]]; then
|
||||
local pid=$(cat "$pid_file")
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
log_info "发现正在运行的 Node Exporter 服务 (PID: $pid),正在停止..."
|
||||
kill "$pid"
|
||||
sleep 2
|
||||
# 1. 停止 systemd 服务(如果存在)
|
||||
if systemctl list-units --full -all | grep -q "node_exporter.service"; then
|
||||
log_info "检测到 systemd 服务 node_exporter,正在停止..."
|
||||
systemctl stop node_exporter || true
|
||||
systemctl disable node_exporter || true
|
||||
fi
|
||||
|
||||
# 2. 清理可能存在的 PID 文件
|
||||
for pid_file in /var/run/node-exporter.pid /var/run/node_exporter.pid /tmp/node_exporter.pid; do
|
||||
if [[ -f "$pid_file" ]]; then
|
||||
local pid=$(cat "$pid_file")
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
log_warning "进程未响应,强制终止..."
|
||||
kill -9 "$pid" 2>/dev/null || true
|
||||
log_info "发现 Node Exporter (PID: $pid),正在停止..."
|
||||
kill "$pid"
|
||||
sleep 2
|
||||
kill -0 "$pid" 2>/dev/null && kill -9 "$pid"
|
||||
fi
|
||||
rm -f "$pid_file"
|
||||
log_success "服务已停止"
|
||||
else
|
||||
log_warning "发现过期的 PID 文件,正在清理..."
|
||||
rm -f "$pid_file"
|
||||
fi
|
||||
done
|
||||
|
||||
# 3. 用 pgrep 查找进程,排除当前脚本
|
||||
local pids=$(pgrep -f "node_exporter|node-exporter|/usr/local/bin/node-exporter" | grep -vw "$SELF_PID" || true)
|
||||
if [[ -n "$pids" ]]; then
|
||||
log_info "发现 Node Exporter 进程 (PID: $pids),正在停止..."
|
||||
for pid in $pids; do
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
kill "$pid" 2>/dev/null || true
|
||||
sleep 1
|
||||
kill -0 "$pid" 2>/dev/null && kill -9 "$pid" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# 查找并停止所有 node_exporter 和 node-exporter 进程
|
||||
local pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true)
|
||||
if [[ -n "$pids" ]]; then
|
||||
log_info "发现 node_exporter 或 node-exporter 进程,正在停止..."
|
||||
for pid in $pids; do
|
||||
log_info "停止进程 PID: $pid"
|
||||
kill "$pid" 2>/dev/null || true
|
||||
# 4. 兜底:检查是否有进程占用 9100 端口
|
||||
local listen_pids=$(lsof -ti:9100 2>/dev/null || true)
|
||||
if [[ -n "$listen_pids" ]]; then
|
||||
log_warning "发现占用 9100 端口的进程 (PID: $listen_pids),强制终止..."
|
||||
for pid in $listen_pids; do
|
||||
kill -9 "$pid" 2>/dev/null || true
|
||||
done
|
||||
sleep 2
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
# 检查是否还有进程在运行,如果有则强制终止
|
||||
local remaining_pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true)
|
||||
if [[ -n "$remaining_pids" ]]; then
|
||||
log_warning "进程未响应,强制终止..."
|
||||
for pid in $remaining_pids; do
|
||||
log_info "强制终止进程 PID: $pid"
|
||||
kill -9 "$pid" 2>/dev/null || true
|
||||
done
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
# 最终检查
|
||||
if pgrep -f "node_exporter\|node-exporter" > /dev/null; then
|
||||
log_error "无法停止所有 node_exporter 进程"
|
||||
else
|
||||
log_success "所有 node_exporter 进程已停止"
|
||||
fi
|
||||
# 5. 最终验证
|
||||
if netstat -tuln 2>/dev/null | grep -q ":9100 "; then
|
||||
log_error "端口 9100 仍被占用,请手动检查"
|
||||
return 1
|
||||
else
|
||||
log_success "旧的 Node Exporter 已完全停止"
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
# 安装 Node Exporter 二进制文件
|
||||
install_node_exporter() {
|
||||
log_info "安装 Node Exporter..."
|
||||
@ -243,6 +279,9 @@ start_node_exporter() {
|
||||
log_success "Node Exporter 服务启动成功 (PID: $pid)"
|
||||
log_info "日志文件: $log_file"
|
||||
log_info "PID 文件: $pid_file"
|
||||
|
||||
# 更新安装记录
|
||||
update_install_record "$pid" "$INSTALL_DIR"
|
||||
else
|
||||
log_error "Node Exporter 服务启动失败"
|
||||
rm -f "$pid_file"
|
||||
@ -301,3 +340,4 @@ main() {
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
|
||||
|
@ -512,9 +512,12 @@ install_components() {
|
||||
|
||||
# 将解压后的目录移动到安装目录,保留组件目录
|
||||
component_install_dir="$INSTALL_DIR/$component"
|
||||
# 简化安装逻辑:直接删除旧目录,不进行备份
|
||||
if [[ -d "$component_install_dir" ]]; then
|
||||
log_info " 组件目录已存在,备份后更新: $component_install_dir"
|
||||
mv "$component_install_dir" "${component_install_dir}.backup.$(date +%Y%m%d_%H%M%S)"
|
||||
log_info " 组件目录已存在,删除旧版本: $component_install_dir"
|
||||
rm -rf "$component_install_dir"
|
||||
# log_info " 组件目录已存在,备份后更新: $component_install_dir"
|
||||
# mv "$component_install_dir" "${component_install_dir}.backup.$(date +%Y%m%d_%H%M%S)"
|
||||
fi
|
||||
mv "$extracted_dir" "$component_install_dir"
|
||||
log_success " 组件目录已保存: $component_install_dir"
|
||||
@ -536,7 +539,7 @@ create_install_record() {
|
||||
sleep 3
|
||||
|
||||
local install_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
||||
local install_record_file=".install_record"
|
||||
local install_record_file="$INSTALL_DIR/.install_record"
|
||||
|
||||
# 创建 JSON 格式的安装记录
|
||||
cat > "$install_record_file" << EOF
|
||||
@ -799,6 +802,59 @@ setup_version_check_cron() {
|
||||
log_info "版本校验通过crontab自动执行"
|
||||
}
|
||||
|
||||
# 设置自动重启定时任务
|
||||
# Install (or refresh) the crontab entry that runs restart_unhealthy.sh every
# two minutes.
#
# Globals:
#   INSTALL_DIR (read) - directory containing restart_unhealthy.sh; the job's
#                        output is appended to $INSTALL_DIR/.restart.log
# Returns:
#   0 when the job was installed, or when the restart script is missing
#     (setup is skipped)
#   1 when a temp file cannot be created or crontab rejects the new table
setup_restart_cron() {
    log_info "设置自动重启定时任务..."

    local restart_script="$INSTALL_DIR/restart_unhealthy.sh"
    local cron_marker="# Argus-Metrics 自动重启定时任务"
    local temp_cron

    if [[ ! -f "$restart_script" ]]; then
        log_warning "重启脚本不存在: $restart_script"
        log_info "跳过自动重启定时任务设置"
        return 0
    fi

    chmod +x "$restart_script"

    # mktemp gives an unpredictable, exclusively created file instead of the
    # guessable /tmp/crontab_<pid>.
    if ! temp_cron=$(mktemp "${TMPDIR:-/tmp}/crontab_XXXXXX"); then
        log_error "自动重启定时任务设置失败"
        return 1
    fi

    # Start from the current crontab; an absent crontab yields an empty file.
    crontab -l > "$temp_cron" 2>/dev/null || : > "$temp_cron"

    if grep -q "restart_unhealthy.sh" "$temp_cron"; then
        log_info "发现旧的自动重启定时任务,正在更新..."
        # Drop both the old job line and its marker comment so repeated
        # installs do not pile up marker lines. grep -v exits 1 when nothing
        # remains, which is fine here.
        grep -v -F -e "restart_unhealthy.sh" -e "$cron_marker" "$temp_cron" > "$temp_cron.new" || true
        mv "$temp_cron.new" "$temp_cron"
        log_info "旧的自动重启定时任务已删除"
    fi

    # Append the job (every 2 minutes).
    {
        echo "$cron_marker"
        echo "*/2 * * * * $restart_script >> $INSTALL_DIR/.restart.log 2>&1"
    } >> "$temp_cron"

    if ! crontab "$temp_cron"; then
        log_error "自动重启定时任务设置失败"
        rm -f "$temp_cron"
        return 1
    fi

    log_success "自动重启定时任务设置成功"
    log_info " 执行频率: 每2分钟"
    log_info " 日志文件: $INSTALL_DIR/.restart.log"
    log_info " 查看定时任务: crontab -l"
    log_info " 删除定时任务: crontab -e"

    rm -f "$temp_cron"

    log_info "自动重启检查通过crontab自动执行"
}
|
||||
|
||||
# 显示安装信息
|
||||
show_install_info() {
|
||||
log_success "Argus-Metrics All-in-One 安装完成!"
|
||||
@ -838,6 +894,7 @@ main() {
|
||||
setup_health_check_cron
|
||||
setup_dns_sync_cron
|
||||
setup_version_check_cron
|
||||
setup_restart_cron
|
||||
show_install_info
|
||||
}
|
||||
|
||||
|
@ -416,6 +416,16 @@ else
|
||||
log_warning "scripts/check_version.sh 文件不存在"
|
||||
fi
|
||||
|
||||
# 复制`自动重启`脚本到 artifact 目录
|
||||
log_info "复制自动重启脚本..."
|
||||
if [[ -f "scripts/restart_unhealthy.sh" ]]; then
|
||||
cp "scripts/restart_unhealthy.sh" "$ARTIFACT_DIR/restart_unhealthy.sh"
|
||||
chmod +x "$ARTIFACT_DIR/restart_unhealthy.sh"
|
||||
log_success "自动重启脚本复制完成: $ARTIFACT_DIR/restart_unhealthy.sh"
|
||||
else
|
||||
log_warning "scripts/restart_unhealthy.sh 文件不存在"
|
||||
fi
|
||||
|
||||
# 复制配置文件到 artifact 目录
|
||||
log_info "复制配置文件..."
|
||||
if [[ -f "config/config.env" ]]; then
|
||||
|
@ -120,6 +120,17 @@ else
|
||||
log_warning "未找到 check_version.sh 文件"
|
||||
fi
|
||||
|
||||
# 复制重启失败脚本
|
||||
if [[ -f "$ARTIFACT_DIR/restart_unhealthy.sh" ]]; then
|
||||
log_info "复制重启失败脚本..."
|
||||
cp "$ARTIFACT_DIR/restart_unhealthy.sh" "$TEMP_PACKAGE_DIR/"
|
||||
elif [[ -f "scripts/restart_unhealthy.sh" ]]; then
|
||||
log_info "复制重启失败脚本 (从当前目录)..."
|
||||
cp "scripts/restart_unhealthy.sh" "$TEMP_PACKAGE_DIR/"
|
||||
else
|
||||
log_warning "未找到 restart_unhealthy.sh 文件"
|
||||
fi
|
||||
|
||||
# 复制安装脚本并重命名为 install.sh
|
||||
if [[ -f "scripts/install_artifact.sh" ]]; then
|
||||
log_info "复制安装脚本..."
|
||||
|
328
src/metric/client-plugins/all-in-one/scripts/restart_unhealthy.sh
Executable file
328
src/metric/client-plugins/all-in-one/scripts/restart_unhealthy.sh
Executable file
@ -0,0 +1,328 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 此脚本会检查各组件的健康状态,并重启不健康的组件
|
||||
|
||||
# 获取脚本所在目录
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"
|
||||
|
||||
# 颜色定义
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m'
|
||||
|
||||
# 日志函数
|
||||
# Shared formatter for the log_* helpers.
#
# Arguments:
#   $1 - colour escape sequence for the level tag (e.g. "$BLUE")
#   $2 - level label printed inside the brackets
#   $3 - message text
# Globals:
#   NC (read) - colour reset sequence
# Outputs:
#   one line on stdout: "<colour>[LABEL]<reset> <timestamp> - <message>"
#
# Only the colour codes go through %b; the message is printed with %s so
# backslashes in it are emitted literally rather than being expanded.
_log_line() {
    local color="$1"
    local label="$2"
    local message="$3"
    printf '%b[%s]%b %s - %s\n' "$color" "$label" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$message"
}

# Print an informational message ($1) with a timestamp.
log_info() {
    _log_line "$BLUE" "INFO" "$1"
}

# Print a success message ($1) with a timestamp.
log_success() {
    _log_line "$GREEN" "SUCCESS" "$1"
}

# Print a warning message ($1) with a timestamp.
log_warning() {
    _log_line "$YELLOW" "WARNING" "$1"
}

# Print an error message ($1) with a timestamp.
log_error() {
    _log_line "$RED" "ERROR" "$1"
}
|
||||
|
||||
# 加载配置文件
|
||||
# Source "$SCRIPT_DIR/config.env" (if present) and export every variable it
# defines.
#
# Globals:
#   SCRIPT_DIR (read) - directory expected to contain config.env
# Returns:
#   0 when the file was loaded or does not exist (defaults are used)
#   1 when sourcing the file fails
load_config() {
    local config_file="$SCRIPT_DIR/config.env"
    local source_status=0

    if [[ ! -f "$config_file" ]]; then
        log_warning "配置文件不存在: $config_file,使用默认配置"
        return 0
    fi

    log_info "加载配置文件: $config_file"

    # allexport is enabled only around the source call and always switched
    # off again, even when sourcing fails.
    set -a
    # shellcheck source=/dev/null
    source "$config_file" || source_status=$?
    set +a

    if [[ $source_status -ne 0 ]]; then
        log_error "配置文件加载失败: $config_file"
        return 1
    fi

    log_success "配置文件加载完成"
    return 0
}
|
||||
|
||||
# 检查单个组件健康状态
|
||||
# Run a component's health-check script and report whether it succeeded.
#
# Arguments:
#   $1 - component name (used in log messages only)
#   $2 - path to the component's health-check script
# Returns:
#   0 when the script exits successfully
#   1 when the script is missing or exits non-zero
check_component_health() {
    local component_name="$1"
    local check_script_path="$2"

    if [[ ! -f "$check_script_path" ]]; then
        log_error "$component_name: 健康检查脚本不存在: $check_script_path"
        return 1
    fi

    # Best effort: make the script executable if it is not already.
    [[ -x "$check_script_path" ]] || chmod +x "$check_script_path" 2>/dev/null || true

    # Output is discarded; only the exit status matters. stdin comes from
    # /dev/null so the script cannot consume input belonging to a caller that
    # is iterating over its own stdin.
    if "$check_script_path" < /dev/null > /dev/null 2>&1; then
        return 0
    fi
    return 1
}
|
||||
|
||||
# 重启单个组件
|
||||
# Restart a component by running its uninstall.sh (if present) followed by
# its install.sh, both from inside the component's install directory.
#
# Arguments:
#   $1 - component name (used in log messages only)
#   $2 - component install directory containing install.sh / uninstall.sh
# Globals:
#   SCRIPT_DIR (read) - passed to install.sh as its first argument
# Returns:
#   0 when install.sh ran successfully
#   1 when install.sh is missing or exits non-zero
restart_component() {
    local component_name="$1"
    local install_dir="$2"
    local uninstall_script="$install_dir/uninstall.sh"
    local install_script="$install_dir/install.sh"

    log_warning "正在重启组件: $component_name"

    # Check for the installer before touching anything: uninstalling a
    # component that cannot be reinstalled would leave it removed.
    if [[ ! -f "$install_script" ]]; then
        log_error "$component_name: 安装脚本不存在: $install_script"
        return 1
    fi

    if [[ -f "$uninstall_script" ]]; then
        log_info "$component_name: 执行卸载脚本..."
        chmod +x "$uninstall_script" 2>/dev/null || true
        # "yes" answers any confirmation prompt; the pipeline's status is
        # that of the subshell running the script.
        if yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script"); then
            log_info "$component_name: 卸载完成"
        else
            # A failed uninstall is not fatal; the install is still attempted.
            log_warning "$component_name: 卸载脚本执行失败,继续执行安装"
        fi
    fi

    chmod +x "$install_script" 2>/dev/null || true
    log_info "$component_name: 执行安装脚本..."

    if ! yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR"); then
        log_error "$component_name: 安装脚本执行失败"
        return 1
    fi

    log_info "$component_name: 安装脚本执行完成"
    return 0
}
|
||||
|
||||
# 查找组件进程 PID
|
||||
# Print the PID of a running process belonging to the given component.
#
# Arguments:
#   $1 - component name: node-exporter, dcgm-exporter, fluent-bit or
#        argus-agent
# Outputs:
#   the first matching PID on stdout, or an empty line when no process is
#   found or the component name is not recognised
find_component_pid() {
    local component_name="$1"
    local component_pid=""
    local pattern
    local -a patterns=()

    # Command-line patterns tried in order for each component; the first
    # entry is also the one used by the ps-based fallback below.
    case "$component_name" in
        "node-exporter") patterns=("node_exporter" "node-exporter") ;;
        "dcgm-exporter") patterns=("dcgm-exporter" "dcgm_exporter") ;;
        "fluent-bit")    patterns=("fluent-bit" "fluent_bit") ;;
        "argus-agent")   patterns=("argus-agent") ;;
    esac

    for pattern in "${patterns[@]}"; do
        component_pid=$(pgrep -f "$pattern" | head -1)
        if [[ -n "$component_pid" ]]; then
            break
        fi
    done

    # Fallback: scan the ps listing when pgrep found nothing.
    if [[ -z "$component_pid" && ${#patterns[@]} -gt 0 ]]; then
        component_pid=$(ps aux | grep -v grep | grep "${patterns[0]}" | awk '{print $2}' | head -1)
    fi

    echo "$component_pid"
}
|
||||
|
||||
# 更新安装记录文件中的 PID
|
||||
# Replace the PID recorded for one component in the install record, leaving
# every other line of the file exactly as it was.
#
# Arguments:
#   $1 - component name (key under "components" in the record)
#   $2 - new PID to store
# Globals:
#   INSTALL_RECORD_FILE (read) - path to the JSON install record
# Returns:
#   0 when the pid line was rewritten
#   1 when the record is missing, the current PID cannot be read (jq is
#     needed for that), or no matching pid line was found
update_install_record_pid() {
    local component_name="$1"
    local new_pid="$2"

    if [[ ! -f "$INSTALL_RECORD_FILE" ]]; then
        log_error "安装记录文件不存在: $INSTALL_RECORD_FILE"
        return 1
    fi

    # Read the PID currently stored for this component.
    local current_pid=""
    if command -v jq &> /dev/null; then
        current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null)
    fi

    if [[ -z "$current_pid" ]]; then
        log_warning "$component_name: 无法读取当前 PID,跳过更新"
        return 1
    fi

    local temp_file="${INSTALL_RECORD_FILE}.tmp"
    # Opening line of the component's block, e.g.  "node-exporter": {
    local block_start_re="\"$component_name\":[[:space:]]*\{"
    # The pid line, captured around the value so the replacement keeps the
    # original spacing whatever it is.
    local pid_line_re="^(.*\"pid\":[[:space:]]*\")${current_pid}(\".*)\$"
    # A closing brace on its own line, with or without a trailing comma.
    local block_end_re='^[[:space:]]*\}[[:space:]]*,?[[:space:]]*$'
    local in_component=0
    local updated=0
    local line

    # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" =~ $block_start_re ]]; then
            in_component=1
        elif [[ $in_component -eq 1 && "$line" =~ $pid_line_re ]]; then
            line="${BASH_REMATCH[1]}${new_pid}${BASH_REMATCH[2]}"
            updated=1
            in_component=0
        elif [[ "$line" =~ $block_end_re ]]; then
            in_component=0
        fi
        printf '%s\n' "$line"
    done < "$INSTALL_RECORD_FILE" > "$temp_file"

    # Only replace the record when a pid line was actually rewritten.
    if [[ $updated -eq 1 ]] && mv "$temp_file" "$INSTALL_RECORD_FILE"; then
        log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)"
        return 0
    fi

    log_error "$component_name: PID 替换失败"
    rm -f "$temp_file"
    return 1
}
|
||||
|
||||
# 从安装记录文件中读取组件信息
|
||||
# List the components found in an install record.
#
# Arguments:
#   $1 - path to the JSON install record
# Outputs:
#   stdout: one "name:install_dir" line per component
#   stderr: all diagnostics, so that callers capturing stdout receive only
#           component lines
# Returns:
#   0 when at least the parsing step succeeded, 1 otherwise
read_install_record() {
    local install_record_file="$1"

    if [[ ! -f "$install_record_file" ]]; then
        log_error "安装记录文件不存在: $install_record_file" >&2
        return 1
    fi

    # Preferred path: let jq parse the JSON.
    if command -v jq &> /dev/null; then
        local components_json
        if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
            echo "$components_json"
            return 0
        fi
        log_error "无法解析安装记录文件 JSON 格式: $install_record_file" >&2
        return 1
    fi

    # Fallback without jq: pick up every "install_dir" line and use the last
    # path element as the component name.
    log_warning "jq 命令不可用,尝试简单文本解析" >&2

    local components=()
    local dir_re='"install_dir":[[:space:]]*"([^"]+)"'
    local line
    local install_dir
    local component_name

    # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" =~ $dir_re ]]; then
            install_dir="${BASH_REMATCH[1]}"
            component_name=$(basename "$install_dir")
            components+=("$component_name:$install_dir")
        fi
    done < "$install_record_file"

    if [[ ${#components[@]} -eq 0 ]]; then
        log_error "无法从安装记录文件中提取组件信息" >&2
        return 1
    fi

    printf '%s\n' "${components[@]}"
    return 0
}
|
||||
|
||||
# 主函数
|
||||
# Entry point: check every component listed in the install record and
# restart the ones whose health check fails.
#
# Globals:
#   INSTALL_RECORD_FILE (read) - path to the JSON install record
# Exits:
#   1 when not run as root or when the install record cannot be read,
#   0 otherwise (including when a restarted component is still unhealthy)
main() {
    log_info "=========================================="
    log_info " 组件自动重启检查"
    log_info "=========================================="

    # Root is required.
    if [[ $EUID -ne 0 ]]; then
        log_error "此脚本需要 root 权限运行"
        exit 1
    fi

    load_config

    log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
    local components_info
    if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
        log_error "无法读取安装记录文件,自动重启检查终止"
        exit 1
    fi

    local restart_count=0
    local check_count=0
    # Loop variables are local so they do not leak into the global scope.
    local component_info component_name install_dir check_script_path new_pid

    # Each line of components_info is "name:install_dir".
    while IFS= read -r component_info; do
        [[ -n "$component_info" ]] || continue

        IFS=':' read -r component_name install_dir <<< "$component_info"
        check_count=$((check_count + 1))
        check_script_path="$install_dir/check_health.sh"

        log_info "检查组件: $component_name"

        # Health checks get /dev/null as stdin: the loop's stdin is the
        # component list, and a check script that reads stdin would otherwise
        # swallow the remaining components.
        if check_component_health "$component_name" "$check_script_path" < /dev/null; then
            log_success "$component_name: 运行正常"
            continue
        fi

        log_warning "$component_name: 健康检查失败,尝试重启"
        restart_count=$((restart_count + 1))

        restart_component "$component_name" "$install_dir"

        # Give the service time to come up before looking for its process.
        log_info "$component_name: 等待进程启动..."
        sleep 10

        # Record the PID of the restarted process, if one can be found.
        new_pid=$(find_component_pid "$component_name")
        if [[ -n "$new_pid" ]]; then
            log_info "$component_name: 找到新进程 PID: $new_pid"
            update_install_record_pid "$component_name" "$new_pid"
        else
            log_warning "$component_name: 未找到新进程 PID"
        fi

        # Verify the restart.
        if check_component_health "$component_name" "$check_script_path" < /dev/null; then
            log_success "$component_name: 重启成功"
        else
            log_warning "$component_name: 重启后仍不健康,可能需要手动检查"
        fi
    done <<< "$components_info"

    log_info "=========================================="
    log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count 个"
    log_info "=========================================="

    exit 0
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
|
@ -160,14 +160,15 @@ while [[ $# -gt 0 ]]; do
|
||||
INSTALL_DIR="$2"
|
||||
shift 2
|
||||
;;
|
||||
--rollback)
|
||||
ACTION="rollback"
|
||||
shift
|
||||
;;
|
||||
--backup-list)
|
||||
ACTION="backup-list"
|
||||
shift
|
||||
;;
|
||||
# 简化安装逻辑:不再支持回滚和备份列表功能
|
||||
# --rollback)
|
||||
# ACTION="rollback"
|
||||
# shift
|
||||
# ;;
|
||||
# --backup-list)
|
||||
# ACTION="backup-list"
|
||||
# shift
|
||||
# ;;
|
||||
--status)
|
||||
ACTION="status"
|
||||
shift
|
||||
@ -192,8 +193,8 @@ while [[ $# -gt 0 ]]; do
|
||||
echo " --install-dir DIR 安装目录 (默认: /opt/argus-metric)"
|
||||
echo " --force 强制重新安装 (即使相同版本)"
|
||||
echo " --uninstall 卸载 (自动确认)"
|
||||
echo " --rollback 回滚到上一个备份版本"
|
||||
echo " --backup-list 列出所有备份版本"
|
||||
# echo " --rollback 回滚到上一个备份版本"
|
||||
# echo " --backup-list 列出所有备份版本"
|
||||
echo " --status 显示当前安装状态"
|
||||
echo " --help 显示帮助"
|
||||
echo
|
||||
@ -550,8 +551,8 @@ install_argus_metric() {
|
||||
if [[ "$FORCE_INSTALL" == true ]]; then
|
||||
log_info "检测到相同版本 v$ARGUS_VERSION,但使用了 --force 参数,将强制重新安装"
|
||||
is_upgrade=true
|
||||
# 备份当前版本
|
||||
backup_current_version
|
||||
# 简化安装逻辑:不再备份当前版本
|
||||
# backup_current_version
|
||||
else
|
||||
log_info "版本 v$ARGUS_VERSION 已安装,无需重复安装"
|
||||
log_info "如需强制重新安装,请使用 --force 参数"
|
||||
@ -561,8 +562,8 @@ install_argus_metric() {
|
||||
log_info "检测到版本升级: v$current_version -> v$ARGUS_VERSION"
|
||||
is_upgrade=true
|
||||
|
||||
# 备份当前版本
|
||||
backup_current_version
|
||||
# 简化安装逻辑:不再备份当前版本
|
||||
# backup_current_version
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -654,17 +655,17 @@ install_argus_metric() {
|
||||
log_success "安装脚本执行完成"
|
||||
else
|
||||
log_error "安装脚本执行失败"
|
||||
# 如果是升级失败,尝试回滚
|
||||
if [[ "$is_upgrade" == true ]]; then
|
||||
log_warning "升级失败,尝试回滚到之前版本..."
|
||||
# 确保备份目录存在
|
||||
mkdir -p "$BACKUPS_DIR"
|
||||
local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1)
|
||||
if [[ -n "$latest_backup" ]]; then
|
||||
rollback_to_backup "$latest_backup"
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
# 简化安装逻辑:不再自动回滚
|
||||
# if [[ "$is_upgrade" == true ]]; then
|
||||
# log_warning "升级失败,尝试回滚到之前版本..."
|
||||
# # 确保备份目录存在
|
||||
# mkdir -p "$BACKUPS_DIR"
|
||||
# local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1)
|
||||
# if [[ -n "$latest_backup" ]]; then
|
||||
# rollback_to_backup "$latest_backup"
|
||||
# return 1
|
||||
# fi
|
||||
# fi
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
@ -807,13 +808,14 @@ show_status() {
|
||||
echo " 无"
|
||||
fi
|
||||
|
||||
echo
|
||||
log_info "备份版本:"
|
||||
if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then
|
||||
ls -1t "$BACKUPS_DIR" 2>/dev/null | sed 's/^/ - /'
|
||||
else
|
||||
echo " 无"
|
||||
fi
|
||||
# 简化安装逻辑:不再显示备份版本信息
|
||||
# echo
|
||||
# log_info "备份版本:"
|
||||
# if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then
|
||||
# ls -1t "$BACKUPS_DIR" 2>/dev/null | sed 's/^/ - /'
|
||||
# else
|
||||
# echo " 无"
|
||||
# fi
|
||||
else
|
||||
log_warning "Argus Metric 未安装"
|
||||
log_info "安装目录: $INSTALL_DIR"
|
||||
@ -881,15 +883,20 @@ main() {
|
||||
# 加载配置文件
|
||||
load_config
|
||||
|
||||
# 对于状态和备份列表操作,不需要FTP参数和root权限
|
||||
if [[ "$ACTION" == "status" || "$ACTION" == "backup-list" ]]; then
|
||||
if [[ "$ACTION" == "status" ]]; then
|
||||
show_status
|
||||
elif [[ "$ACTION" == "backup-list" ]]; then
|
||||
list_backups
|
||||
fi
|
||||
# 对于状态操作,不需要FTP参数和root权限
|
||||
# 简化安装逻辑:不再支持备份列表操作
|
||||
if [[ "$ACTION" == "status" ]]; then
|
||||
show_status
|
||||
return 0
|
||||
fi
|
||||
# if [[ "$ACTION" == "status" || "$ACTION" == "backup-list" ]]; then
|
||||
# if [[ "$ACTION" == "status" ]]; then
|
||||
# show_status
|
||||
# elif [[ "$ACTION" == "backup-list" ]]; then
|
||||
# list_backups
|
||||
# fi
|
||||
# return 0
|
||||
# fi
|
||||
|
||||
check_root
|
||||
|
||||
@ -899,11 +906,11 @@ main() {
|
||||
CURRENT_LINK="$INSTALL_DIR/current"
|
||||
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION"
|
||||
|
||||
# 对于回滚操作,不需要FTP参数
|
||||
if [[ "$ACTION" == "rollback" ]]; then
|
||||
rollback_version
|
||||
return 0
|
||||
fi
|
||||
# 简化安装逻辑:不再支持回滚操作
|
||||
# if [[ "$ACTION" == "rollback" ]]; then
|
||||
# rollback_version
|
||||
# return 0
|
||||
# fi
|
||||
|
||||
check_ftp_params
|
||||
check_system
|
||||
|
Loading…
x
Reference in New Issue
Block a user