292 lines
9.6 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -e
# ANSI colour sequences for the log_* helpers. They are stored as literal
# backslash sequences and only turned into escape bytes when printed.
NC="\033[0m" # reset / no colour
BLUE="\033[0;34m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
RED="\033[0;31m"
# Logging helpers
#######################################
# Print an informational message with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
# Note: only the colour variables are escape-expanded (%b); the message is
# printed verbatim (%s), so backslashes in relayed text are not interpreted.
#######################################
log_info() {
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}
#######################################
# Print a success message with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
# Note: the message is printed verbatim (%s); only the colour variables are
# escape-expanded (%b).
#######################################
log_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"
}
#######################################
# Print a warning message with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
# Note: the message is printed verbatim (%s); only the colour variables are
# escape-expanded (%b).
#######################################
log_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
#######################################
# Print an error message with a red "[ERROR]" tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout (same stream as the other log_* helpers)
# Note: the message is printed verbatim (%s); only the colour variables are
# escape-expanded (%b).
#######################################
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1"
}
# Command-line parsing: the first positional argument selects the installation
# directory; when it is missing or empty the default location is used.
if [[ -n "${1:-}" ]]; then
  INSTALL_DIR="$1"
else
  INSTALL_DIR="/opt/argus-metric/current"
fi
log_info "Starting Fluent Bit installation..."
#######################################
# Refresh the fluent-bit PID stored in the install record (JSON).
# Globals:   none
# Arguments: $1 - PID to record
#            $2 - installation base directory
#                 (default: /opt/argus-metric/current)
# Outputs:   progress and warnings via log_info / log_warning
# Returns:   0 when the record file is absent, when jq is unavailable, or
#            when the pid field was rewritten;
#            1 when the current PID cannot be read or the rewrite fails
#######################################
update_install_record() {
  local pid="$1"
  local install_base_dir="${2:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local tmp_record="$install_record.tmp"
  local current_pid

  # No record file means a first-time install; the main installer creates it.
  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  # The file exists, i.e. a restart: only the pid field is updated, via jq.
  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return 0
  fi

  # Assigned separately from `local` so jq's exit status is not masked; a
  # failed read is folded into the "empty PID" path below.
  current_pid=$(jq -r '.components."fluent-bit".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
  if [[ -z "$current_pid" ]]; then
    log_warning "无法读取当前 PID跳过更新"
    return 1
  fi

  # --arg keeps the PID a JSON string; all other fields are preserved.
  # Write to a temp file and swap it in only if jq succeeded, so a failed run
  # is neither reported as an update nor leaves a stray temp file behind.
  if jq --arg new_pid "$pid" '.components."fluent-bit".pid = $new_pid' "$install_record" > "$tmp_record" \
    && mv "$tmp_record" "$install_record"; then
    log_info "PID updated: $current_pid -> $pid"
  else
    rm -f "$tmp_record"
    log_warning "Failed to update PID in install record: $install_record"
    return 1
  fi
}
# Refuse to continue unless the effective user is root (EUID 0).
if (( EUID != 0 )); then
  log_error "This script requires root privileges"
  log_info "Please use: sudo $0"
  exit 1
fi
# Stop any fluent-bit instance that may already be running.
log_info "Stopping existing fluent-bit processes..."
# -x limits the match to processes whose name is exactly "fluent-bit".
running=$(pgrep -x fluent-bit 2>/dev/null || true)
if [[ -n "$running" ]]; then
  # First pass: default kill signal for every PID found.
  while read -r target; do
    log_info "Stopping process PID: $target"
    kill "$target" 2>/dev/null || true
  done <<< "$running"
  sleep 2
  # Second pass: SIGKILL anything still alive after the pause.
  leftover=$(pgrep -x fluent-bit 2>/dev/null || true)
  if [[ -n "$leftover" ]]; then
    log_warning "Force killing unresponsive processes..."
    while read -r target; do
      kill -9 "$target" 2>/dev/null || true
    done <<< "$leftover"
  fi
fi
#######################################
# Make sure a shared library needed by Fluent Bit is available, installing it
# from a bundled .deb (offline mode) when `ldconfig -p` does not list it.
# Arguments: $1 - Debian package name; the bundled file is bin/<name>_*.deb
#            $2 - shared-object name to look for in the `ldconfig -p` output
#            $3 - label used in the "Checking ..." line (default: $1)
# Outputs:   progress via log_info; failures via log_error, including the
#            output of dpkg
# Returns:   0 when the library is present or was installed; exits the script
#            with status 1 when the package is missing or fails to install
#######################################
ensure_offline_dependency() {
  local pkg="$1" soname="$2" label="${3:-$1}"
  local -a debs
  local dpkg_output line

  log_info "Checking Fluent Bit dependency: $label ..."
  # -F: the soname is a literal string; its dots must not act as wildcards.
  if ldconfig -p | grep -qF -- "$soname"; then
    log_info "$soname already present on system"
    return 0
  fi

  # Expand the glob directly instead of parsing `ls`. An unmatched glob stays
  # literal, hence the existence test on the first element.
  debs=(bin/"${pkg}"_*.deb)
  if [[ ! -e "${debs[0]}" ]]; then
    log_error "Missing dependency: $pkg ($soname). Please put bin/${pkg}_*.deb in the bin/ directory."
    exit 1
  fi

  log_info "Installing local dependency package: $pkg"
  if ! dpkg_output=$(DEBIAN_FRONTEND=noninteractive dpkg -i "${debs[@]}" 2>&1); then
    log_error "Failed to install $pkg from bin/, please check package validity"
    # Show what dpkg reported so the failure can be diagnosed.
    while IFS= read -r line; do
      log_error "[dpkg] $line"
    done <<< "$dpkg_output"
    exit 1
  fi
}

# Fluent Bit runtime dependencies, installed from bin/ when absent.
ensure_offline_dependency "libpq5" "libpq.so.5"
ensure_offline_dependency "libyaml-0-2" "libyaml-0.so.2" "libyaml-0.so.2"
# Remove any previous fluent-bit installation (avoids configuration file
# conflicts with the package installed afterwards).
log_info "Cleaning up old fluent-bit installation if exists..."
# Query the exact package name. Grepping `dpkg -l` for "fluent-bit" also
# matches other installed packages that merely mention it, and its "^ii"
# filter skips half-installed or config-only leftovers.
fb_pkg_status=$(dpkg-query -W -f='${Status}' fluent-bit 2>/dev/null || true)
if [[ -n "$fb_pkg_status" && "$fb_pkg_status" != *not-installed* ]]; then
  log_info "Found existing fluent-bit package, removing..."
  # Best effort: failures of either command are deliberately ignored.
  dpkg --purge fluent-bit 2>/dev/null || true
  apt-get remove --purge -y fluent-bit 2>/dev/null || true
fi
# Make sure leftover configuration files are removed as well.
if [[ -d "/etc/fluent-bit" ]]; then
  log_info "Removing old fluent-bit configuration directory..."
  rm -rf /etc/fluent-bit
fi
# Install the Fluent Bit main package from the bundled .deb.
log_info "Installing Fluent Bit from deb package..."
deb_file="bin/fluent-bit_3.1.9_amd64.deb"
fb_binary="/opt/fluent-bit/bin/fluent-bit"
if [[ ! -f "$deb_file" ]]; then
  log_error "Fluent Bit package not found: $deb_file"
  exit 1
fi
# A non-zero dpkg status stays non-fatal, as before, but its output is kept
# and reported instead of being discarded.
dpkg_status=0
dpkg_output=$(DEBIAN_FRONTEND=noninteractive dpkg -i "$deb_file" 2>&1) || dpkg_status=$?
if (( dpkg_status != 0 )); then
  log_warning "dpkg -i $deb_file exited with status $dpkg_status"
  while IFS= read -r dpkg_line; do
    log_warning "[dpkg] $dpkg_line"
  done <<< "$dpkg_output"
fi
# Verify the install actually produced a runnable binary. The version
# pipeline below cannot fail (its status is that of `head`), so without this
# check a broken install would only surface later in the script.
if [[ ! -x "$fb_binary" ]]; then
  log_error "Fluent Bit binary missing or not executable after install: $fb_binary"
  exit 1
fi
fb_version=$("$fb_binary" --version 2>&1 | head -1)
log_info "Fluent Bit version: $fb_version"
# Ensure the fluent-bit account exists; it is created without a home
# directory and with /bin/false as its shell.
log_info "Creating fluent-bit user..."
id "fluent-bit" &>/dev/null || useradd --no-create-home --shell /bin/false fluent-bit
# Create the configuration directory and, when a local config/ directory is
# present, copy its contents there and hand ownership to fluent-bit.
log_info "Installing configuration files..."
conf_dir="/etc/fluent-bit"
mkdir -p "$conf_dir"
if [[ -d "config" ]]; then
  cp -r config/* "$conf_dir"/
  chown -R fluent-bit:fluent-bit "$conf_dir"
fi
# Create the log and buffer directories and give them to fluent-bit.
log_info "Creating log and buffer directories..."
for log_dir in /logs/train /logs/infer; do
  mkdir -p "$log_dir"
  chmod 755 "$log_dir"
done
mkdir -p /buffers
chmod 770 /buffers
chown -R fluent-bit:fluent-bit /logs /buffers
# Start-up phase: the main configuration file must exist before going on.
config_path="/etc/fluent-bit/fluent-bit.conf"
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"
[[ -f "$config_path" ]] || {
  log_error "Configuration file not found: $config_path"
  exit 1
}
# Export the environment variables handed to the Fluent Bit process.
log_info "Setting environment variables..."
# HOSTNAME: keep a value supplied through the environment; otherwise derive
# one from the host's IP address.
# Bash always sets HOSTNAME as a shell variable, so testing "$HOSTNAME" for
# emptiness never triggers the IP lookup. printenv only sees what the caller
# actually exported.
if [[ -z "$(printenv HOSTNAME)" ]]; then
  # Source address reported by `ip route get 8.8.8.8`, looked up once.
  src_ips=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' || true)
  # Prefer an address in the 177.x.x.x range.
  HOSTNAME=$(grep '^177\.' <<< "$src_ips" | head -1)
  # Otherwise take the first address that is not in 127.x.x.x.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(grep -v '^127\.' <<< "$src_ips" | head -1)
  fi
  # Last resort: whatever the hostname command reports.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(hostname)
  fi
fi
export HOSTNAME
# Remaining settings: caller-provided values win, defaults otherwise.
export CLUSTER="${CLUSTER:-local}"
export RACK="${RACK:-dev}"
export ES_HOST="${ES_HOST:-localhost}"
export ES_PORT="${ES_PORT:-9200}"
log_info "Environment variables:"
log_info " CLUSTER=$CLUSTER"
log_info " RACK=$RACK"
log_info " HOSTNAME=$HOSTNAME"
log_info " ES_HOST=$ES_HOST"
log_info " ES_PORT=$ES_PORT"
# Pre-flight check: the fluent-bit binary.
log_info "[DEBUG] Checking fluent-bit binary..."
fb_bin="/opt/fluent-bit/bin/fluent-bit"
if [[ ! -f "$fb_bin" ]]; then
  log_error "fluent-bit binary not found at /opt/fluent-bit/bin/fluent-bit"
  exit 1
fi
# The line below reports the binary as executable, so actually verify that
# rather than mere existence.
if [[ ! -x "$fb_bin" ]]; then
  log_error "fluent-bit binary is not executable: $fb_bin"
  exit 1
fi
log_info "[DEBUG] fluent-bit binary exists and is executable: $(ls -lh "$fb_bin")"
# Pre-flight check: the configuration file.
log_info "[DEBUG] Checking configuration file: $config_path"
if [[ ! -f "$config_path" ]]; then
  log_error "Configuration file not found: $config_path"
  exit 1
fi
log_info "[DEBUG] Configuration file exists: $(ls -lh "$config_path")"
# Show the start command.
log_info "[DEBUG] Full command to execute:"
log_info "[DEBUG] su -s /bin/bash fluent-bit -c 'env CLUSTER=\"$CLUSTER\" RACK=\"$RACK\" HOSTNAME=\"$HOSTNAME\" ES_HOST=\"$ES_HOST\" ES_PORT=\"$ES_PORT\" /opt/fluent-bit/bin/fluent-bit --config=\"$config_path\"'"
# Truncate or create the log file. The redirection into it is performed by
# the fluent-bit user's shell, so that user must own it; it no longer needs
# to be world-writable.
log_info "[DEBUG] Preparing log file: /var/log/fluent-bit.log"
: > /var/log/fluent-bit.log
chown fluent-bit:fluent-bit /var/log/fluent-bit.log
chmod 644 /var/log/fluent-bit.log
log_info "Command: /opt/fluent-bit/bin/fluent-bit --config=$config_path"
log_info "[DEBUG] Starting fluent-bit process as fluent-bit user (using su)..."
# Build the command string with %q so every value is shell-quoted for the
# bash that su starts; a quote or space inside a value can then neither break
# the command line nor inject extra commands.
printf -v launch_cmd \
  'env CLUSTER=%q RACK=%q HOSTNAME=%q ES_HOST=%q ES_PORT=%q /opt/fluent-bit/bin/fluent-bit --config=%q >> /var/log/fluent-bit.log 2>&1' \
  "$CLUSTER" "$RACK" "$HOSTNAME" "$ES_HOST" "$ES_PORT" "$config_path"
nohup su -s /bin/bash fluent-bit -c "$launch_cmd" &
# PID of the backgrounded nohup/su job, not of fluent-bit itself.
bg_pid=$!
log_info "[DEBUG] Background process started with PID: $bg_pid"
# Give the service time to start.
log_info "[DEBUG] Waiting 3 seconds for service to start..."
sleep 3
# Look up the PID of the actual fluent-bit process (owned by the fluent-bit
# user, process name exactly "fluent-bit"); the first match is used.
log_info "[DEBUG] Searching for fluent-bit process..."
log_info "[DEBUG] Running: pgrep -u fluent-bit -x fluent-bit"
actual_pid=$(pgrep -u fluent-bit -x fluent-bit | head -1)
# List every fluent-bit related process for diagnostics.
log_info "[DEBUG] All fluent-bit related processes:"
ps aux | grep fluent-bit | grep -v grep || log_warning "No fluent-bit processes found in ps output"
if [[ -n "$actual_pid" ]]; then
  log_success "Fluent Bit started successfully (PID: $actual_pid)"
  log_info "[DEBUG] Process details: $(ps -p "$actual_pid" -o pid,user,cmd --no-headers)"
  # Update the install record. The helper returns non-zero when it skips the
  # update; under `set -e` that would abort the script even though Fluent Bit
  # is running, so it is downgraded to a warning here.
  update_install_record "$actual_pid" "$INSTALL_DIR" \
    || log_warning "Install record was not updated for PID $actual_pid"
else
  log_error "Fluent Bit failed to start - no fluent-bit process found"
  log_info "[DEBUG] Checking if background process $bg_pid still exists..."
  if ps -p "$bg_pid" > /dev/null 2>&1; then
    log_warning "Background shell process $bg_pid still exists"
  else
    log_warning "Background shell process $bg_pid has exited"
  fi
  # Relay the tail of the service log to help diagnose the failed start.
  log_info "[DEBUG] Last 20 lines of /var/log/fluent-bit.log:"
  if [[ -f "/var/log/fluent-bit.log" ]]; then
    tail -20 /var/log/fluent-bit.log | while IFS= read -r line; do
      log_info "[LOG] $line"
    done
  else
    log_error "Log file /var/log/fluent-bit.log does not exist"
  fi
  exit 1
fi
log_success "Fluent Bit installation completed!"