286 lines
8.2 KiB
Bash
Executable File
286 lines
8.2 KiB
Bash
Executable File
#!/bin/bash

# Overall health-check script: invokes each component's health check and
# writes the combined result to the .health_log file.

set -e

# PID-file guard: skip this run when a previous invocation is still alive.
PIDFILE="/var/run/check_health.pid"

if [[ -f "$PIDFILE" ]]; then
  # Read only the first line and accept nothing but a positive integer.
  # An empty or corrupted file (or values such as "0" / "-1", which kill
  # would treat as a process group or "every process") must never be
  # mistaken for a running instance.
  existing_pid=""
  read -r existing_pid < "$PIDFILE" || true
  if [[ "$existing_pid" =~ ^[1-9][0-9]*$ ]] \
    && kill -0 "$existing_pid" 2>/dev/null; then
    echo "健康检查脚本已在运行中,跳过本次执行" >&2
    exit 0
  fi
fi

echo "$$" > "$PIDFILE"

# Single quotes defer expansion until the trap fires, and the quoted
# expansion keeps the path intact as a single argument to rm.
trap 'rm -f -- "$PIDFILE"' EXIT
|
||
|
||
# Resolve the directory that contains this script; the health log and the
# install record are both looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HEALTH_LOG_FILE="${SCRIPT_DIR}/.health_log"
INSTALL_RECORD_FILE="${SCRIPT_DIR}/.install_record"
|
||
|
||
# ANSI colour escape sequences used by the log_* helpers. They are stored
# as literal backslash sequences and expanded when a message is printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Resets the terminal back to its default colour.
NC='\033[0m'
|
||
|
||
# Logging helpers - they write to stderr so the JSON result on stdout
# stays clean.

# Print an informational message to stderr, tagged with a blue [INFO].
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed verbatim
log_info() {
  # %b expands the escape sequences held in the colour variables, while %s
  # prints the message literally so backslashes in it (e.g. inside paths)
  # are not reinterpreted as escapes.
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1" >&2
}
|
||
|
||
# Print a success message to stderr, tagged with a green [SUCCESS].
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
log_success() {
  # %b expands the colour escapes; %s keeps the message literal so any
  # backslashes it contains are not interpreted.
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1" >&2
}
|
||
|
||
# Print a warning message to stderr, tagged with a yellow [WARNING].
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
log_warning() {
  # %b expands the colour escapes; %s keeps the message literal so any
  # backslashes it contains are not interpreted.
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2
}
|
||
|
||
# Print an error message to stderr, tagged with a red [ERROR].
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
log_error() {
  # %b expands the colour escapes; %s keeps the message literal so any
  # backslashes it contains are not interpreted.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
|
||
|
||
# Escape a string so it can be embedded inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   the escaped text on stdout (no trailing newline)
_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Check the health of a single component by running its check script.
# Arguments:
#   $1 - component name
#   $2 - path to the component's health-check script
# Outputs:
#   stdout - the JSON printed by the check script; when the script is
#            missing, not executable, or prints nothing, a synthesized
#            object with "name" and "status" (plus "reason" on failure)
#            is printed instead
#   stderr - progress messages via the log_* helpers
# Returns:
#   0 when the check script exits successfully, 1 otherwise
check_component() {
  local component_name="$1"
  local check_script_path="$2"
  local name_json path_json result

  # Escaped copies are used only inside the synthesized JSON objects so a
  # quote or backslash in the name/path cannot produce malformed JSON.
  name_json=$(_json_escape "$component_name")
  path_json=$(_json_escape "$check_script_path")

  log_info "检查 $component_name 健康状态..."

  if [[ ! -f "$check_script_path" ]]; then
    log_error "健康检查脚本不存在: $check_script_path"
    printf '%s\n' "{\"name\": \"$name_json\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本不存在: $path_json\"}"
    return 1
  fi

  if [[ ! -x "$check_script_path" ]]; then
    log_error "健康检查脚本无执行权限: $check_script_path"
    printf '%s\n' "{\"name\": \"$name_json\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本无执行权限: $path_json\"}"
    return 1
  fi

  # Run the check script and capture only its stdout; its stderr is
  # intentionally discarded so it cannot mix with our own log output.
  if result=$("$check_script_path" 2>/dev/null); then
    log_success "$component_name 健康检查通过"
    # A silent script would otherwise contribute an empty array element to
    # the caller's aggregate JSON. The status value follows the
    # health/unhealth vocabulary used for the overall status in this script.
    if [[ -z "$result" ]]; then
      result="{\"name\": \"$name_json\", \"status\": \"health\"}"
    fi
    printf '%s\n' "$result"
    return 0
  fi

  log_warning "$component_name 健康检查失败"
  if [[ -z "$result" ]]; then
    result="{\"name\": \"$name_json\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本未输出任何结果\"}"
  fi
  printf '%s\n' "$result"
  return 1
}
|
||
|
||
# Emit the current local time formatted as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
  local fmt='+%Y-%m-%d %H:%M:%S'
  date "$fmt"
}
|
||
|
||
# Emit the current UTC time formatted as "YYYY-MM-DDTHH:MM:SSZ".
get_utc_timestamp() {
  local fmt='+%Y-%m-%dT%H:%M:%SZ'
  date -u "$fmt"
}
|
||
|
||
# Emit the host name: the HOSTNAME variable when it is set and non-empty,
# otherwise whatever the `hostname` command prints.
get_hostname() {
  if [[ -n "${HOSTNAME}" ]]; then
    echo "${HOSTNAME}"
  else
    echo "$(hostname)"
  fi
}
|
||
|
||
# Ensure the per-host health-status directory exists and print its path.
# Outputs: stdout - the directory path; stderr - a log line when the
#          directory has to be created
create_health_dir() {
  local host
  host=$(get_hostname)
  local target="/private/argus/agent/${host}/health"

  if [[ -d "$target" ]]; then
    : # already present, nothing to create
  else
    log_info "创建健康状态目录: $target"
    mkdir -p "$target"
  fi

  echo "$target"
}
|
||
|
||
# Escape a string so it can be embedded inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   the escaped text on stdout (no trailing newline)
_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Write the health-status JSON file for a single module.
# Arguments:
#   $1 - component name (used in the file name)
#   $2 - status string
#   $3 - error message (may be empty)
#   $4 - directory in which the file is written
# Outputs:
#   Creates or overwrites "<dir>/metric-<component>.json" containing the
#   keys "status", "error" and "timestamp"; logs the path to stderr.
write_component_health_json() {
  local component_name="$1"
  local status="$2"
  local error_msg="$3"
  local health_dir="$4"

  # File name has the form <module prefix>-<component>.json
  local module_prefix="metric"
  local filename="${module_prefix}-${component_name}.json"
  local filepath="$health_dir/$filename"

  # UTC timestamp recorded in the file
  local timestamp
  timestamp=$(get_utc_timestamp)

  # The error text originates from a check script's output, so it may
  # contain quotes, backslashes or newlines; escape it (and the status)
  # to keep the file valid JSON.
  local status_json error_json
  status_json=$(_json_escape "$status")
  error_json=$(_json_escape "$error_msg")

  printf '{\n    "status": "%s",\n    "error": "%s",\n    "timestamp": "%s"\n}\n' \
    "$status_json" "$error_json" "$timestamp" > "$filepath"

  log_info "已写入模块健康状态文件: $filepath"
}
|
||
|
||
# Read the component install directories from the install-record file.
# Arguments:
#   $1 - path to the install-record JSON file
# Outputs:
#   stdout - one "name:install_dir" line per component
#   stderr - diagnostics via the log_* helpers
# Returns:
#   0 on success; 1 when the file is missing, cannot be parsed, or (in the
#   fallback parser) contains no install_dir entries
read_install_record() {
  local install_record_file="$1"

  if [[ ! -f "$install_record_file" ]]; then
    log_error "安装记录文件不存在: $install_record_file"
    return 1
  fi

  # Preferred path: let jq parse the JSON when it is available.
  if command -v jq &> /dev/null; then
    local components_json
    if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
      echo "$components_json"
      return 0
    fi
    log_error "无法解析安装记录文件 JSON 格式: $install_record_file"
    return 1
  fi

  # Fallback without jq: scan the text for "install_dir" entries.
  log_warning "jq 命令不可用,尝试简单文本解析"

  local -a components=()
  local line install_dir component_name
  # Kept in a variable so the regex is used exactly as written.
  local pattern='"install_dir":[[:space:]]*"([^"]+)"'

  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # A compact JSON file may carry several entries on one line: consume
    # the line match by match instead of stopping at the first one.
    while [[ "$line" =~ $pattern ]]; do
      install_dir="${BASH_REMATCH[1]}"
      # In this fallback the component name is taken from the last path
      # segment of the install directory.
      component_name=$(basename "$install_dir")
      components+=("$component_name:$install_dir")
      line=${line#*"${BASH_REMATCH[0]}"}
    done
  done < "$install_record_file"

  if [[ ${#components[@]} -gt 0 ]]; then
    printf '%s\n' "${components[@]}"
    return 0
  fi

  log_error "无法从安装记录文件中提取组件信息"
  return 1
}
|
||
|
||
# Main entry point: runs every component's health check, writes one status
# file per component, appends the aggregate JSON to the health log and
# prints it on stdout.
# Globals:   INSTALL_RECORD_FILE, HEALTH_LOG_FILE (read)
# Outputs:   stdout - aggregate JSON result; stderr - banner, logs, summary
# Exits:     0 when every component passed; 1 when any component failed or
#            the install record could not be read
main() {
  local start_time end_time health_dir components_info
  local component_info component_name install_dir check_script_path
  local result component_status error_msg
  local components_json health_check_result
  local -a all_results=()
  local overall_status="health"
  local reason_pattern='"reason":[[:space:]]*"([^"]+)"'

  echo "==========================================" >&2
  echo " 整体健康检查脚本" >&2
  echo "==========================================" >&2
  echo >&2

  # Record when the health check started.
  start_time=$(get_timestamp)
  log_info "健康检查开始时间: $start_time"

  # Make sure the per-host health-status directory exists.
  health_dir=$(create_health_dir)

  # Load the "name:install_dir" list from the install record.
  log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
  if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
    log_error "无法读取安装记录文件,健康检查终止"
    exit 1
  fi

  # Check the components one by one. The list is fed on file descriptor 3
  # rather than stdin: a check script that reads stdin would otherwise
  # swallow the remaining entries and silently end the loop early.
  while IFS= read -r -u 3 component_info; do
    [[ -n "$component_info" ]] || continue

    IFS=':' read -r component_name install_dir <<< "$component_info"
    check_script_path="$install_dir/check_health.sh"
    component_status="healthy"
    error_msg=""

    if result=$(check_component "$component_name" "$check_script_path"); then
      all_results+=("$result")
    else
      all_results+=("$result")
      overall_status="unhealth"
      component_status="unhealthy"

      # Pull the failure reason out of the component's JSON result.
      if command -v jq &> /dev/null; then
        error_msg=$(jq -r '.reason // ""' <<< "$result" 2>/dev/null) || error_msg=""
      elif [[ "$result" =~ $reason_pattern ]]; then
        # No jq available: fall back to a simple text match.
        error_msg="${BASH_REMATCH[1]}"
      fi
    fi

    # Write this module's own health-status JSON file.
    write_component_health_json "$component_name" "$component_status" "$error_msg" "$health_dir"
  done 3<<< "$components_info"

  # Record when the health check finished.
  end_time=$(get_timestamp)
  log_info "健康检查结束时间: $end_time"

  # Join the component results with ",\n"; the command substitution drops
  # the final newline and the suffix removal drops the final comma.
  components_json=""
  if [[ ${#all_results[@]} -gt 0 ]]; then
    components_json=$(printf '%s,\n' "${all_results[@]}")
    components_json=${components_json%,}
  fi

  # Build the complete health-check result JSON.
  health_check_result=$(printf '{\n    "start_time": "%s",\n    "end_time": "%s",\n    "overall_status": "%s",\n    "components": [\n%s\n    ]\n}' \
    "$start_time" "$end_time" "$overall_status" "$components_json")

  # Append the result to the health log file.
  log_info "将健康检查结果写入日志文件: $HEALTH_LOG_FILE"
  printf '%s\n' "$health_check_result" >> "$HEALTH_LOG_FILE"

  # Emit the JSON result on stdout.
  printf '%s\n' "$health_check_result"

  # Human-readable summary on stderr.
  echo >&2
  echo "==========================================" >&2
  echo " 健康检查总结" >&2
  echo "==========================================" >&2
  echo "开始时间: $start_time" >&2
  echo "结束时间: $end_time" >&2
  echo "整体状态: $overall_status" >&2
  echo "日志文件: $HEALTH_LOG_FILE" >&2
  echo >&2

  if [[ "$overall_status" == "health" ]]; then
    log_success "所有组件健康检查通过!"
    exit 0
  else
    log_error "部分组件健康检查失败,请查看上述详细信息"
    exit 1
  fi
}
|
||
|
||
# Script entry point: run main only when this file is executed directly,
# not when it is sourced by another script.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"