#!/bin/bash
#
# Offline installer / (re)starter for Fluent Bit.
#
# Usage: sudo ./<this-script> [INSTALL_DIR]
#   INSTALL_DIR  Directory holding the .install_record file
#                (default: /opt/argus-metric/current).
#
# What the script does, in order:
#   1. Stops any running process named exactly "fluent-bit".
#   2. Installs libpq5 / libyaml-0-2 from local .deb files when the shared
#      libraries are not already known to ldconfig.
#   3. Purges a previously installed fluent-bit package and /etc/fluent-bit.
#   4. Installs bin/fluent-bit_3.1.9_amd64.deb and copies config/* to
#      /etc/fluent-bit.
#   5. Starts Fluent Bit in the background as the "fluent-bit" user and
#      records its PID in the install record (when that file already exists).
#
# The bin/ and config/ paths are relative, so the script must be run from the
# directory that contains them.
#
# Optional environment variables (defaults in parentheses):
#   CLUSTER (local), RACK (dev), ES_HOST (localhost), ES_PORT (9200), HOSTNAME.

set -e

# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

readonly FB_BIN="/opt/fluent-bit/bin/fluent-bit"
readonly FB_LOG="/var/log/fluent-bit.log"

# Logging helpers: print "$1" with a colored level prefix on stdout.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $1"
}

#######################################
# Update the fluent-bit PID stored in the install record.
# Arguments:
#   $1 - new PID (stored as a JSON string)
#   $2 - install base directory (default: /opt/argus-metric/current)
# Returns:
#   0 when the record was updated, does not exist yet, or jq is unavailable;
#   1 when the current PID cannot be read or the record cannot be rewritten.
#######################################
update_install_record() {
  local pid="$1"
  # Use the install directory passed in, falling back to the default.
  local install_base_dir="${2:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local current_pid

  # No record file means this is a first-time install; the main install
  # script is responsible for creating it.
  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  # The file exists, so this is a restart: only the PID field is updated.
  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return 0
  fi

  # Read the current PID. Declaration and assignment are separate so a jq
  # failure is not masked by `local`.
  current_pid=$(jq -r '.components."fluent-bit".pid // ""' "$install_record" 2>/dev/null) \
    || current_pid=""
  if [[ -z "$current_pid" ]]; then
    log_warning "无法读取当前 PID,跳过更新"
    return 1
  fi

  # Rewrite only the pid field (kept as a string); all other fields are
  # preserved. Write to a temp file first so a jq failure cannot truncate
  # the record, and only report success when both steps succeeded.
  if jq --arg new_pid "$pid" '.components."fluent-bit".pid = $new_pid' \
    "$install_record" > "$install_record.tmp" \
    && mv "$install_record.tmp" "$install_record"; then
    log_info "PID updated: $current_pid -> $pid"
  else
    rm -f -- "$install_record.tmp"
    log_warning "Failed to update PID in $install_record"
    return 1
  fi
}

#######################################
# Stop every process whose name is exactly "fluent-bit": SIGTERM first,
# then SIGKILL for anything still alive after 2 seconds.
#######################################
stop_existing_fluent_bit() {
  local pid
  local -a pids=() remaining=()

  mapfile -t pids < <(pgrep -x fluent-bit 2>/dev/null || true)
  if (( ${#pids[@]} == 0 )); then
    return 0
  fi

  for pid in "${pids[@]}"; do
    log_info "Stopping process PID: $pid"
    # The process may already be gone; that is not an error.
    kill "$pid" 2>/dev/null || true
  done
  sleep 2

  # Check for processes that ignored SIGTERM.
  mapfile -t remaining < <(pgrep -x fluent-bit 2>/dev/null || true)
  if (( ${#remaining[@]} > 0 )); then
    log_warning "Force killing unresponsive processes..."
    for pid in "${remaining[@]}"; do
      kill -9 "$pid" 2>/dev/null || true
    done
  fi
}

#######################################
# Ensure a shared library is available, installing it from a local .deb
# under bin/ when ldconfig does not list it (offline mode).
# Arguments:
#   $1 - label used in the "Checking ..." message
#   $2 - shared-object name looked up in `ldconfig -p`
#   $3 - Debian package name; matches bin/<package>_*.deb
# Exits the script with status 1 when the package is missing or fails to
# install.
#######################################
ensure_local_dependency() {
  local label="$1"
  local soname="$2"
  local package="$3"
  local -a debs

  log_info "Checking Fluent Bit dependency: $label ..."
  # -F: match the soname literally (dots are not regex wildcards).
  if ldconfig -p | grep -qF -- "$soname"; then
    log_info "$soname already present on system"
    return 0
  fi

  # Expand the glob directly instead of parsing `ls`. When nothing matches,
  # the pattern stays literal and the -e test below fails.
  debs=(bin/"${package}"_*.deb)
  if [[ ! -e "${debs[0]}" ]]; then
    log_error "Missing dependency: $package ($soname). Please put bin/${package}_*.deb in the bin/ directory."
    exit 1
  fi

  log_info "Installing local dependency package: $package"
  if ! DEBIAN_FRONTEND=noninteractive dpkg -i "${debs[@]}" >/dev/null 2>&1; then
    log_error "Failed to install $package from bin/, please check package validity"
    exit 1
  fi
}

log_info "Starting Fluent Bit installation..."

# Parse command-line arguments
INSTALL_DIR="${1:-/opt/argus-metric/current}"

# Require root
if [[ $EUID -ne 0 ]]; then
  log_error "This script requires root privileges"
  log_info "Please use: sudo $0"
  exit 1
fi

# Stop any running instance (matches only processes named fluent-bit)
log_info "Stopping existing fluent-bit processes..."
stop_existing_fluent_bit

# Install Fluent Bit runtime dependencies (offline mode)
ensure_local_dependency "libpq5" "libpq.so.5" "libpq5"
ensure_local_dependency "libyaml-0.so.2" "libyaml-0.so.2" "libyaml-0-2"

# Remove any previous fluent-bit installation (avoids config file conflicts)
log_info "Cleaning up old fluent-bit installation if exists..."
if dpkg -l | grep -q "^ii.*fluent-bit"; then
  log_info "Found existing fluent-bit package, removing..."
  # Best effort: either command may fail if the other already removed it.
  dpkg --purge fluent-bit 2>/dev/null || true
  apt-get remove --purge -y fluent-bit 2>/dev/null || true
fi

# Make sure leftover configuration is gone
if [[ -d "/etc/fluent-bit" ]]; then
  log_info "Removing old fluent-bit configuration directory..."
  rm -rf /etc/fluent-bit
fi

# Install the Fluent Bit main package
log_info "Installing Fluent Bit from deb package..."
deb_file="bin/fluent-bit_3.1.9_amd64.deb"

if [[ ! -f "$deb_file" ]]; then
  log_error "Fluent Bit package not found: $deb_file"
  exit 1
fi

# dpkg is kept best-effort (a non-zero status does not abort), but the
# failure is no longer silent and the binary is verified right afterwards.
if ! DEBIAN_FRONTEND=noninteractive dpkg -i "$deb_file" >/dev/null 2>&1; then
  log_warning "dpkg reported an error while installing $deb_file; verifying installed binary"
fi

if [[ ! -f "$FB_BIN" ]]; then
  log_error "fluent-bit binary not found at $FB_BIN"
  exit 1
fi
if [[ ! -x "$FB_BIN" ]]; then
  log_error "fluent-bit binary is not executable: $FB_BIN"
  exit 1
fi

# Verify Fluent Bit can run
fb_version=$("$FB_BIN" --version 2>&1 | head -1)
log_info "Fluent Bit version: $fb_version"

# Create the fluent-bit user
log_info "Creating fluent-bit user..."
if ! id "fluent-bit" &>/dev/null; then
  useradd --no-create-home --shell /bin/false fluent-bit
fi

# Create the configuration directory
log_info "Installing configuration files..."
mkdir -p /etc/fluent-bit
if [[ -d "config" ]]; then
  cp -r config/* /etc/fluent-bit/
  chown -R fluent-bit:fluent-bit /etc/fluent-bit
fi

# Create log and buffer directories
log_info "Creating log and buffer directories..."
mkdir -p /logs/train /logs/infer /buffers
chmod 755 /logs/train /logs/infer
chmod 770 /buffers
chown -R fluent-bit:fluent-bit /logs /buffers

# Start Fluent Bit
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"
config_path="/etc/fluent-bit/fluent-bit.conf"

if [[ ! -f "$config_path" ]]; then
  log_error "Configuration file not found: $config_path"
  exit 1
fi

# Set environment variables
log_info "Setting environment variables..."
# Use a non-loopback IP address as HOSTNAME when HOSTNAME is empty.
# NOTE(review): Bash sets HOSTNAME itself at startup, so this branch only
# runs when HOSTNAME is explicitly exported as an empty string. Confirm
# whether the IP lookup is meant to take precedence over the host name.
if [[ -z "${HOSTNAME:-}" ]]; then
  # Source address of the default route, looked up once.
  src_ip=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' || true)

  # Prefer an address in the 177.x.x.x range.
  HOSTNAME=$(grep '^177\.' <<< "$src_ip" | head -1)

  # Otherwise take the first address that is not in 127.x.x.x.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(grep -v '^127\.' <<< "$src_ip" | head -1)
  fi

  # Last resort: the hostname command.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(hostname)
  fi
fi

export HOSTNAME
export CLUSTER="${CLUSTER:-local}"
export RACK="${RACK:-dev}"
export ES_HOST="${ES_HOST:-localhost}"
export ES_PORT="${ES_PORT:-9200}"

log_info "Environment variables:"
log_info "  CLUSTER=$CLUSTER"
log_info "  RACK=$RACK"
log_info "  HOSTNAME=$HOSTNAME"
log_info "  ES_HOST=$ES_HOST"
log_info "  ES_PORT=$ES_PORT"

# Binary was verified right after installation; only report its details here.
log_info "[DEBUG] Checking fluent-bit binary..."
log_info "[DEBUG] fluent-bit binary exists and is executable: $(ls -lh "$FB_BIN")"

# Configuration file was verified above; only report its details here.
log_info "[DEBUG] Checking configuration file: $config_path"
log_info "[DEBUG] Configuration file exists: $(ls -lh "$config_path")"

# Show the full start command
log_info "[DEBUG] Full command to execute:"
log_info "[DEBUG] su -s /bin/bash fluent-bit -c 'env CLUSTER=\"$CLUSTER\" RACK=\"$RACK\" HOSTNAME=\"$HOSTNAME\" ES_HOST=\"$ES_HOST\" ES_PORT=\"$ES_PORT\" /opt/fluent-bit/bin/fluent-bit --config=\"$config_path\"'"

# Truncate or create the log file.
# NOTE(review): mode 666 makes the log world-writable so the fluent-bit user
# can append to it; chown fluent-bit + 644 would be tighter — confirm no
# other writer depends on the current mode before changing it.
log_info "[DEBUG] Preparing log file: $FB_LOG"
: > "$FB_LOG"
chmod 666 "$FB_LOG"

log_info "Command: $FB_BIN --config=$config_path"
log_info "[DEBUG] Starting fluent-bit process as fluent-bit user (using su)..."
# The redirection is inside the su command, so the log is written by the
# fluent-bit user, not by root.
nohup su -s /bin/bash fluent-bit -c "env CLUSTER='$CLUSTER' RACK='$RACK' HOSTNAME='$HOSTNAME' ES_HOST='$ES_HOST' ES_PORT='$ES_PORT' /opt/fluent-bit/bin/fluent-bit --config='$config_path' >> /var/log/fluent-bit.log 2>&1" &

bg_pid=$!
log_info "[DEBUG] Background process started with PID: $bg_pid"

# Wait for the service to start
log_info "[DEBUG] Waiting 3 seconds for service to start..."
sleep 3

# Find the PID of the actual fluent-bit process (bg_pid is the su wrapper)
log_info "[DEBUG] Searching for fluent-bit process..."
log_info "[DEBUG] Running: pgrep -u fluent-bit -x fluent-bit"
actual_pid=$(pgrep -u fluent-bit -x fluent-bit | head -1)

# Show every fluent-bit related process
log_info "[DEBUG] All fluent-bit related processes:"
ps aux | grep fluent-bit | grep -v grep || log_warning "No fluent-bit processes found in ps output"

if [[ -n "$actual_pid" ]]; then
  log_success "Fluent Bit started successfully (PID: $actual_pid)"
  log_info "[DEBUG] Process details: $(ps -p "$actual_pid" -o pid,user,cmd --no-headers)"

  # Update the install record. A failure here is reported but not fatal:
  # Fluent Bit is already running, and the function itself treats an
  # unreadable record as a skipped update.
  update_install_record "$actual_pid" "$INSTALL_DIR" \
    || log_warning "Install record was not updated (Fluent Bit is running, PID: $actual_pid)"
else
  log_error "Fluent Bit failed to start - no fluent-bit process found"
  log_info "[DEBUG] Checking if background process $bg_pid still exists..."
  if ps -p "$bg_pid" > /dev/null 2>&1; then
    log_warning "Background shell process $bg_pid still exists"
  else
    log_warning "Background shell process $bg_pid has exited"
  fi

  log_info "[DEBUG] Last 20 lines of $FB_LOG:"
  if [[ -f "$FB_LOG" ]]; then
    while IFS= read -r line; do
      log_info "[LOG] $line"
    done < <(tail -n 20 "$FB_LOG")
  else
    log_error "Log file $FB_LOG does not exist"
  fi
  exit 1
fi

log_success "Fluent Bit installation completed!"