286 lines
8.2 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Overall health-check script: runs every component's health check and
# writes the aggregated result to the .health_log file.
set -e

# PID-file guard: skip this run when a previous invocation is still alive.
PIDFILE="/var/run/check_health.pid"
# The PID is quoted so an empty or malformed file cannot be word-split into
# unexpected arguments for kill; kill -0 only probes for process existence.
if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
  echo "健康检查脚本已在运行中,跳过本次执行" >&2
  exit 0
fi
echo $$ > "$PIDFILE"
# Single quotes defer expansion until the trap fires, and the path is quoted
# inside the handler so it is passed to rm as exactly one argument.
trap 'rm -f -- "$PIDFILE"' EXIT
# Absolute directory containing this script; the log and install-record
# files live next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HEALTH_LOG_FILE="${SCRIPT_DIR}/.health_log"
INSTALL_RECORD_FILE="${SCRIPT_DIR}/.install_record"

# ANSI colour codes, stored as escape text and interpreted by the log_*
# helpers at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging helpers - they write to stderr so the JSON result on stdout is
# never polluted.
#######################################
# Print an informational message.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   "[INFO] <message>" on stderr
#######################################
log_info() {
  # %b expands the colour escape codes; %s keeps any backslashes in the
  # message (e.g. inside paths) exactly as given.
  printf '%b[INFO]%b %s\n' "${BLUE-}" "${NC-}" "${1-}" >&2
}
#######################################
# Print a success message.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   "[SUCCESS] <message>" on stderr
#######################################
log_success() {
  # %b expands the colour escape codes; %s keeps any backslashes in the
  # message exactly as given.
  printf '%b[SUCCESS]%b %s\n' "${GREEN-}" "${NC-}" "${1-}" >&2
}
#######################################
# Print a warning message.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   "[WARNING] <message>" on stderr
#######################################
log_warning() {
  # %b expands the colour escape codes; %s keeps any backslashes in the
  # message exactly as given.
  printf '%b[WARNING]%b %s\n' "${YELLOW-}" "${NC-}" "${1-}" >&2
}
#######################################
# Print an error message.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed literally
# Outputs:   "[ERROR] <message>" on stderr
#######################################
log_error() {
  # %b expands the colour escape codes; %s keeps any backslashes in the
  # message exactly as given.
  printf '%b[ERROR]%b %s\n' "${RED-}" "${NC-}" "${1-}" >&2
}
#######################################
# Run the health check of a single component.
# Arguments: $1 - component name
#            $2 - path of the component's health-check script
# Outputs:   the JSON record for the component on stdout (progress
#            messages go to stderr through the log_* helpers)
# Returns:   0 when the check script succeeds, 1 when it is missing, not
#            executable, or exits non-zero
#######################################
check_component() {
  local component_name="$1"
  local check_script_path="$2"
  local result
  local rc=0

  log_info "检查 $component_name 健康状态..."

  if [[ ! -f "$check_script_path" ]]; then
    log_error "健康检查脚本不存在: $check_script_path"
    printf '%s\n' "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本不存在: $check_script_path\"}"
    return 1
  fi

  if [[ ! -x "$check_script_path" ]]; then
    log_error "健康检查脚本无执行权限: $check_script_path"
    printf '%s\n' "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本无执行权限: $check_script_path\"}"
    return 1
  fi

  # Capture only the script's stdout; its stderr is intentionally discarded.
  # "|| rc=$?" records the exit status without tripping `set -e`.
  result=$("$check_script_path" 2>/dev/null) || rc=$?

  if (( rc == 0 )); then
    log_success "$component_name 健康检查通过"
    printf '%s\n' "$result"
    return 0
  fi

  log_warning "$component_name 健康检查失败"
  # A script that fails without printing anything would otherwise hand an
  # empty record to the caller, which cannot be embedded in a JSON array.
  # Emit a fallback record in the same shape as the ones above instead.
  if [[ -z "${result//[[:space:]]/}" ]]; then
    result="{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本执行失败 (退出码: $rc)\"}"
  fi
  printf '%s\n' "$result"
  return 1
}
# Print the current time as "YYYY-MM-DD HH:MM:SS" (no -u flag, so `date`
# uses its default time zone).
get_timestamp() {
  local fmt='+%Y-%m-%d %H:%M:%S'
  date "$fmt"
}
# Print the current UTC time in ISO-8601 form, e.g. 2024-01-02T03:04:05Z.
get_utc_timestamp() {
  local fmt='+%Y-%m-%dT%H:%M:%SZ'
  date -u "$fmt"
}
# Print the host name: $HOSTNAME when it is set and non-empty, otherwise
# whatever the `hostname` command reports.
get_hostname() {
  local name="${HOSTNAME:-}"
  if [[ -z "$name" ]]; then
    # A failing `hostname` must not abort the function; an empty name is
    # printed in that case.
    name=$(hostname) || true
  fi
  echo "$name"
}
#######################################
# Ensure the per-host health-status directory exists and print its path.
# Arguments: $1 - (optional) base directory; defaults to
#                 /private/argus/agent
# Outputs:   the directory path "<base>/<hostname>/health" on stdout
# Returns:   0 on success, 1 when the directory cannot be created
#######################################
create_health_dir() {
  local base_dir="${1:-/private/argus/agent}"
  local host
  # Declared separately from the assignment so `local` does not mask the
  # command's exit status; a failure here still leaves an empty host name.
  host=$(get_hostname) || true
  local health_dir="$base_dir/$host/health"

  if [[ ! -d "$health_dir" ]]; then
    log_info "创建健康状态目录: $health_dir"
    # Report the failure explicitly: this function is normally called inside
    # a command substitution, where `set -e` does not stop on a failed mkdir.
    if ! mkdir -p -- "$health_dir"; then
      log_error "无法创建健康状态目录: $health_dir"
      return 1
    fi
  fi

  printf '%s\n' "$health_dir"
}
# Escape a string for use inside a double-quoted JSON value. Handles
# backslash, double quote, newline, carriage return and tab; other control
# characters are passed through unchanged.
_json_escape() {
  local s="${1-}"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# Write the health status of one module to "<dir>/metric-<name>.json".
# Arguments: $1 - component name (used in the file name)
#            $2 - status string
#            $3 - error message (may be empty)
#            $4 - directory that receives the file
# Outputs:   a JSON object with "status", "error" and "timestamp" keys,
#            written to the file; a log line on stderr
# Returns:   0 on success, 1 when the file cannot be written
#######################################
write_component_health_json() {
  local component_name="$1"
  local status="$2"
  local error_msg="$3"
  local health_dir="$4"

  # File name follows the "<module prefix>-<component>.json" convention.
  local module_prefix="metric"
  local filepath="$health_dir/${module_prefix}-${component_name}.json"

  local timestamp
  timestamp=$(get_utc_timestamp)

  # Error text comes from the component's own check script, so it may hold
  # quotes, backslashes or newlines; escape it to keep the file valid JSON.
  local status_json error_json
  status_json=$(_json_escape "$status")
  error_json=$(_json_escape "$error_msg")

  if ! printf '{\n"status": "%s",\n"error": "%s",\n"timestamp": "%s"\n}\n' \
    "$status_json" "$error_json" "$timestamp" > "$filepath"; then
    log_error "无法写入模块健康状态文件: $filepath"
    return 1
  fi
  log_info "已写入模块健康状态文件: $filepath"
}
#######################################
# Read the component install directories from the install-record file.
# Arguments: $1 - path of the install-record JSON file
# Outputs:   one "<component>:<install_dir>" line per component on stdout.
#            With jq the component is the key under ".components"; without
#            jq it is the last path element of install_dir.
# Returns:   0 when at least the parse succeeded, 1 when the file is
#            missing, cannot be parsed, or holds no install_dir entry
#######################################
read_install_record() {
  local install_record_file="$1"

  if [[ ! -f "$install_record_file" ]]; then
    log_error "安装记录文件不存在: $install_record_file"
    return 1
  fi

  # Preferred path: let jq parse the JSON.
  if command -v jq > /dev/null 2>&1; then
    local components_json
    if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
      printf '%s\n' "$components_json"
      return 0
    fi
    log_error "无法解析安装记录文件 JSON 格式: $install_record_file"
    return 1
  fi

  # Fallback without jq: plain text scan for "install_dir" entries.
  log_warning "jq 命令不可用,尝试简单文本解析"

  local pattern='"install_dir":[[:space:]]*"([^"]+)"'
  local -a components=()
  local line remainder install_dir trimmed component_name

  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline,
  # which is the normal case for compact single-line JSON.
  while IFS= read -r line || [[ -n "$line" ]]; do
    remainder="$line"
    # Consume every match on the line, not only the first one.
    while [[ "$remainder" =~ $pattern ]]; do
      install_dir="${BASH_REMATCH[1]}"
      # Same result as basename(1), without a fork: drop trailing slashes,
      # then keep the last path element.
      trimmed="${install_dir%"${install_dir##*[!/]}"}"
      component_name="${trimmed##*/}"
      [[ -n "$component_name" ]] || component_name="/"
      components+=("$component_name:$install_dir")
      remainder="${remainder#*"${BASH_REMATCH[0]}"}"
    done
  done < "$install_record_file"

  if [[ ${#components[@]} -gt 0 ]]; then
    printf '%s\n' "${components[@]}"
    return 0
  fi

  log_error "无法从安装记录文件中提取组件信息"
  return 1
}
#######################################
# Entry point: check every installed component and report the result.
# Globals:   INSTALL_RECORD_FILE (read), HEALTH_LOG_FILE (appended to)
# Arguments: none used
# Outputs:   the aggregated JSON result on stdout; banner, progress and
#            summary on stderr; the same JSON appended to HEALTH_LOG_FILE;
#            one status file per component via write_component_health_json
# Exits:     0 when every component passed, 1 when any component failed or
#            the install record could not be read
#######################################
main() {
  echo "==========================================" >&2
  echo " 整体健康检查脚本" >&2
  echo "==========================================" >&2
  echo >&2

  # Record when the run started.
  local start_time
  start_time=$(get_timestamp)
  log_info "健康检查开始时间: $start_time"

  # Directory that receives the per-component status files.
  local health_dir
  health_dir=$(create_health_dir)

  # Component list: one "<name>:<install_dir>" line per component.
  log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
  local components_info
  if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
    log_error "无法读取安装记录文件,健康检查终止"
    exit 1
  fi

  local -a all_results=()
  local overall_status="health"
  # Loop variables are declared here so they do not leak into the global
  # scope through `read`.
  local component_info component_name install_dir check_script_path
  local result component_status error_msg

  while IFS= read -r component_info; do
    [[ -n "$component_info" ]] || continue

    IFS=':' read -r component_name install_dir <<< "$component_info"
    check_script_path="$install_dir/check_health.sh"
    component_status="healthy"
    error_msg=""

    # stdin is detached: inside this loop stdin is the component list, and a
    # check script that reads stdin would swallow the remaining components.
    if ! result=$(check_component "$component_name" "$check_script_path" < /dev/null); then
      overall_status="unhealth"
      component_status="unhealthy"
      # Pull the failure reason out of the component's JSON record.
      if command -v jq > /dev/null 2>&1; then
        error_msg=$(jq -r '.reason // ""' <<< "$result" 2>/dev/null || true)
      elif [[ "$result" =~ \"reason\":[[:space:]]*\"([^\"]+)\" ]]; then
        error_msg="${BASH_REMATCH[1]}"
      fi
    fi

    # An empty record would leave a dangling comma or blank element in the
    # "components" array, so only non-empty records are collected.
    if [[ -n "$result" ]]; then
      all_results+=("$result")
    fi

    write_component_health_json "$component_name" "$component_status" "$error_msg" "$health_dir"
  done <<< "$components_info"

  # Record when the run finished.
  local end_time
  end_time=$(get_timestamp)
  log_info "健康检查结束时间: $end_time"

  # Join the collected records with ",<newline>" and no trailing comma.
  local nl=$'\n'
  local components_json=""
  local entry
  for entry in "${all_results[@]}"; do
    components_json+="${components_json:+,${nl}}${entry}"
  done

  local health_check_result
  printf -v health_check_result \
    '{\n"start_time": "%s",\n"end_time": "%s",\n"overall_status": "%s",\n"components": [\n%s\n]\n}' \
    "$start_time" "$end_time" "$overall_status" "$components_json"

  # Append the result to the health log, then emit it on stdout.
  log_info "将健康检查结果写入日志文件: $HEALTH_LOG_FILE"
  printf '%s\n' "$health_check_result" >> "$HEALTH_LOG_FILE"
  printf '%s\n' "$health_check_result"

  # Human-readable summary on stderr.
  echo >&2
  echo "==========================================" >&2
  echo " 健康检查总结" >&2
  echo "==========================================" >&2
  echo "开始时间: $start_time" >&2
  echo "结束时间: $end_time" >&2
  echo "整体状态: $overall_status" >&2
  echo "日志文件: $HEALTH_LOG_FILE" >&2
  echo >&2

  if [[ "$overall_status" == "health" ]]; then
    log_success "所有组件健康检查通过!"
    exit 0
  else
    log_error "部分组件健康检查失败,请查看上述详细信息"
    exit 1
  fi
}
# Script entry point: run main only when this file is executed directly,
# not when it is sourced by another script.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi