#!/bin/bash
#
# Offline installer for Fluent Bit.
#
# Installs the bundled .deb packages from ./bin, copies ./config into
# /etc/fluent-bit, and starts fluent-bit in the background as the
# "fluent-bit" user. Must be run as root from the package directory.
#
# Usage: sudo <this script> [INSTALL_DIR]
#   INSTALL_DIR  directory holding .install_record
#                (default: /opt/argus-metric/current)

# Abort on the first unhandled command failure.
set -e

# ANSI color escape sequences used by the log_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
readonly RED GREEN YELLOW BLUE NC

# Logging helpers.

# Print an informational message to stdout, prefixed with a blue [INFO] tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
log_info() {
  # %b expands the escape sequences stored in the color variables; %s prints
  # the message verbatim so backslashes inside it are not interpreted.
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}

# Print a success message to stdout, prefixed with a green [SUCCESS] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
log_success() {
  # %b expands the color escapes; %s keeps the message itself literal.
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}

# Print a warning message to stdout, prefixed with a yellow [WARNING] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
log_warning() {
  # %b expands the color escapes; %s keeps the message itself literal.
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "${1-}"
}

# Print an error message to stdout, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
log_error() {
  # %b expands the color escapes; %s keeps the message itself literal.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}"
}

log_info "Starting Fluent Bit installation..."

# Parse command-line arguments: the optional first argument is the
# installation directory that holds the install record.
INSTALL_DIR="${1:-/opt/argus-metric/current}"

#######################################
# Update the fluent-bit PID stored in the JSON install record.
# Only an existing record is touched; a missing record is left for the main
# installer to create.
# Arguments:
#   $1 - PID to store (written as a JSON string)
#   $2 - installation base directory containing .install_record
#        (default: /opt/argus-metric/current)
# Outputs:
#   Progress and warning messages via log_info / log_warning.
# Returns:
#   0 if the record does not exist, jq is unavailable, or the PID was updated;
#   1 if the current PID cannot be read or the record cannot be rewritten.
#######################################
update_install_record() {
  local pid="$1"
  local install_base_dir="${2:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local tmp_record="$install_record.tmp"
  local current_pid

  # No record yet: first installation, nothing to update here.
  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  # The record exists (restart scenario); jq is required to edit it.
  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return 0
  fi

  # Read the PID currently stored; a jq failure is treated like "no PID".
  current_pid=$(jq -r '.components."fluent-bit".pid // ""' "$install_record" 2>/dev/null) \
    || current_pid=""
  if [[ -z "$current_pid" ]]; then
    log_warning "无法读取当前 PID,跳过更新"
    return 1
  fi

  # Rewrite only the pid field (kept as a string) through a temp file, and
  # replace the record only if jq succeeded so a failure never truncates it.
  if jq --arg new_pid "$pid" '.components."fluent-bit".pid = $new_pid' \
      "$install_record" > "$tmp_record" \
    && mv -- "$tmp_record" "$install_record"; then
    log_info "PID updated: $current_pid -> $pid"
  else
    rm -f -- "$tmp_record"
    log_warning "Failed to rewrite install record: $install_record"
    return 1
  fi
}

# Require root: the steps below install packages, create a user and write
# under /etc, /var/log, /logs and /buffers.
if [[ "$EUID" -ne 0 ]]; then
  log_error "This script requires root privileges"
  log_info "Please use: sudo $0"
  exit 1
fi

# Stop any fluent-bit that is already running.
log_info "Stopping existing fluent-bit processes..."

# -x matches only processes whose name is exactly "fluent-bit". pgrep exits
# non-zero when nothing matches, so it is guarded against `set -e`.
pids=$(pgrep -x fluent-bit 2>/dev/null || true)

if [[ -n "$pids" ]]; then
  # Unquoted on purpose: pgrep prints one PID per line and each is a word.
  for pid in $pids; do
    log_info "Stopping process PID: $pid"
    kill "$pid" 2>/dev/null || true
  done
  sleep 2

  # Anything still alive after the grace period is killed with SIGKILL.
  remaining_pids=$(pgrep -x fluent-bit 2>/dev/null || true)
  if [[ -n "$remaining_pids" ]]; then
    log_warning "Force killing unresponsive processes..."
    for pid in $remaining_pids; do
      kill -9 "$pid" 2>/dev/null || true
    done
  fi
fi

#######################################
# Ensure a shared library needed by Fluent Bit is present, installing it from
# a bundled .deb in ./bin when it is not (offline mode).
# Arguments:
#   $1 - label used in the "Checking ..." log line
#   $2 - shared-object name looked up in the ldconfig cache
#   $3 - Debian package name; bin/<name>_*.deb is installed if needed
# Outputs:
#   Progress via log_info / log_error; dpkg output on stderr if dpkg fails.
# Returns:
#   0 when the library is available; exits the script with 1 otherwise.
#######################################
ensure_offline_dependency() {
  local label="$1"
  local soname="$2"
  local pkg="$3"
  local -a debs=()
  local dpkg_output

  log_info "Checking Fluent Bit dependency: ${label} ..."

  # -F: match the soname literally (its dots are not regex wildcards).
  if ldconfig -p | grep -qF -- "$soname"; then
    log_info "${soname} already present on system"
    return 0
  fi

  # Expand the glob without parsing ls; the array stays empty on no match.
  mapfile -t debs < <(compgen -G "bin/${pkg}_*.deb")
  if (( ${#debs[@]} == 0 )); then
    log_error "Missing dependency: ${pkg} (${soname}). Please put bin/${pkg}_*.deb in the bin/ directory."
    exit 1
  fi

  log_info "Installing local dependency package: ${pkg}"
  # dpkg output is kept quiet on success but shown when the install fails.
  if ! dpkg_output=$(DEBIAN_FRONTEND=noninteractive dpkg -i "${debs[@]}" 2>&1); then
    log_error "Failed to install ${pkg} from bin/, please check package validity"
    printf '%s\n' "$dpkg_output" >&2
    exit 1
  fi
}

# Fluent Bit runtime dependency: libpq5 (offline mode).
ensure_offline_dependency "libpq5" "libpq.so.5" "libpq5"

# Fluent Bit runtime dependency: libyaml-0-2 (offline mode).
ensure_offline_dependency "libyaml-0.so.2" "libyaml-0.so.2" "libyaml-0-2"

# Remove a previously installed fluent-bit package so its configuration files
# cannot conflict with the ones installed below.
log_info "Cleaning up old fluent-bit installation if exists..."
# Query the exact package name instead of grepping the whole `dpkg -l` table,
# which would also match other packages whose line contains "fluent-bit".
if dpkg-query -W -f='${Status}' fluent-bit 2>/dev/null \
    | grep -q '^install ok installed$'; then
  log_info "Found existing fluent-bit package, removing..."
  # Best effort: either tool may be unable to remove the package.
  dpkg --purge fluent-bit 2>/dev/null || true
  apt-get remove --purge -y fluent-bit 2>/dev/null || true
fi

# Make sure no leftover configuration directory remains.
if [[ -d "/etc/fluent-bit" ]]; then
  log_info "Removing old fluent-bit configuration directory..."
  rm -rf /etc/fluent-bit
fi

# Install the main Fluent Bit package from the bundled .deb.
log_info "Installing Fluent Bit from deb package..."
deb_file="bin/fluent-bit_3.1.9_amd64.deb"
if [[ ! -f "$deb_file" ]]; then
  log_error "Fluent Bit package not found: $deb_file"
  exit 1
fi

# A non-zero dpkg status is tolerated (best effort), but its output is shown
# instead of being discarded so a broken install can be diagnosed.
if ! dpkg_output=$(DEBIAN_FRONTEND=noninteractive dpkg -i "$deb_file" 2>&1); then
  log_warning "dpkg -i $deb_file reported errors; checking the installed binary"
  printf '%s\n' "$dpkg_output" >&2
fi

# Verify that Fluent Bit can actually run; piping through `head` would hide a
# failure, so the status of the binary itself is checked.
if ! fb_version=$(/opt/fluent-bit/bin/fluent-bit --version 2>&1); then
  log_error "Fluent Bit cannot be executed after installation: $fb_version"
  exit 1
fi
# Keep only the first line of the version output.
fb_version=${fb_version%%$'\n'*}
log_info "Fluent Bit version: $fb_version"

# Create the fluent-bit user (no home directory, no login shell) unless it
# already exists.
log_info "Creating fluent-bit user..."
if ! id "fluent-bit" &>/dev/null; then
  useradd --no-create-home --shell /bin/false fluent-bit
fi

# Create the configuration directory and populate it from ./config.
log_info "Installing configuration files..."
mkdir -p /etc/fluent-bit
if [[ -d "config" ]]; then
  # "config/." copies the directory's contents, including dotfiles, and does
  # not fail when the directory is empty (unlike the glob config/*).
  cp -r config/. /etc/fluent-bit/
  chown -R fluent-bit:fluent-bit /etc/fluent-bit
fi

# Create the log and buffer directories and hand them to the fluent-bit user.
log_info "Creating log and buffer directories..."
mkdir -p /logs/train /logs/infer /buffers
chmod 755 /logs/train /logs/infer
chmod 770 /buffers
chown -R fluent-bit:fluent-bit /logs /buffers

# Start Fluent Bit: locate the main configuration file first.
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"
config_path="/etc/fluent-bit/fluent-bit.conf"

if [[ ! -f "$config_path" ]]; then
  log_error "Configuration file not found: $config_path"
  exit 1
fi

# Set the environment variables passed to Fluent Bit.
log_info "Setting environment variables..."

# When HOSTNAME is empty, derive it from the source IP of the default route.
# NOTE(review): Bash normally populates HOSTNAME itself, so this branch likely
# runs only when HOSTNAME was explicitly set to an empty value — confirm
# whether an IP address was meant to be used unconditionally.
if [[ -z "${HOSTNAME:-}" ]]; then
  # Ask the routing table once for the source address used to reach 8.8.8.8.
  src_ips=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' || true)

  # Prefer an address in the 177.x.x.x range.
  HOSTNAME=$(grep -m1 '^177\.' <<< "$src_ips" || true)

  # Otherwise take the first address that is not 127.x.x.x.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(grep -m1 -v '^127\.' <<< "$src_ips" || true)
  fi

  # Last resort: the output of the hostname command.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(hostname)
  fi
fi
export HOSTNAME

# Caller-provided values win; otherwise fall back to local defaults.
export CLUSTER="${CLUSTER:-local}"
export RACK="${RACK:-dev}"
export ES_HOST="${ES_HOST:-localhost}"
export ES_PORT="${ES_PORT:-9200}"

log_info "Environment variables:"
log_info "  CLUSTER=$CLUSTER"
log_info "  RACK=$RACK"
log_info "  HOSTNAME=$HOSTNAME"
log_info "  ES_HOST=$ES_HOST"
log_info "  ES_PORT=$ES_PORT"

# Check the fluent-bit binary.
log_info "[DEBUG] Checking fluent-bit binary..."
if [[ ! -f "/opt/fluent-bit/bin/fluent-bit" ]]; then
  log_error "fluent-bit binary not found at /opt/fluent-bit/bin/fluent-bit"
  exit 1
fi
# The log line below states the binary is executable, so verify that too.
if [[ ! -x "/opt/fluent-bit/bin/fluent-bit" ]]; then
  log_error "fluent-bit binary is not executable: /opt/fluent-bit/bin/fluent-bit"
  exit 1
fi
log_info "[DEBUG] fluent-bit binary exists and is executable: $(ls -lh /opt/fluent-bit/bin/fluent-bit)"

# Check the configuration file.
log_info "[DEBUG] Checking configuration file: $config_path"
if [[ ! -f "$config_path" ]]; then
  log_error "Configuration file not found: $config_path"
  exit 1
fi
log_info "[DEBUG] Configuration file exists: $(ls -lh "$config_path")"

# Show the full start command.
log_info "[DEBUG] Full command to execute:"
log_info "[DEBUG] su -s /bin/bash fluent-bit -c 'env CLUSTER=\"$CLUSTER\" RACK=\"$RACK\" HOSTNAME=\"$HOSTNAME\" ES_HOST=\"$ES_HOST\" ES_PORT=\"$ES_PORT\" /opt/fluent-bit/bin/fluent-bit --config=\"$config_path\"'"

# Truncate or create the log file.
log_info "[DEBUG] Preparing log file: /var/log/fluent-bit.log"
: > /var/log/fluent-bit.log
# The file must be writable by the fluent-bit user, which appends to it below.
# NOTE(review): 666 makes the log world-writable; consider chown to
# fluent-bit with a tighter mode if no other writer depends on it.
chmod 666 /var/log/fluent-bit.log

log_info "Command: /opt/fluent-bit/bin/fluent-bit --config=$config_path"
log_info "[DEBUG] Starting fluent-bit process as fluent-bit user (using su)..."

# Build the command line with %q so every value is shell-quoted for the bash
# started by su; a value containing a quote can no longer break out of it.
printf -v start_cmd \
  'env CLUSTER=%q RACK=%q HOSTNAME=%q ES_HOST=%q ES_PORT=%q /opt/fluent-bit/bin/fluent-bit --config=%q >> /var/log/fluent-bit.log 2>&1' \
  "$CLUSTER" "$RACK" "$HOSTNAME" "$ES_HOST" "$ES_PORT" "$config_path"
nohup su -s /bin/bash fluent-bit -c "$start_cmd" &

# PID of the backgrounded nohup/su wrapper, not of fluent-bit itself.
bg_pid=$!
log_info "[DEBUG] Background process started with PID: $bg_pid"

# Give the service time to start.
log_info "[DEBUG] Waiting 3 seconds for service to start..."
sleep 3

# Find the PID of the real fluent-bit process (owned by the fluent-bit user).
log_info "[DEBUG] Searching for fluent-bit process..."
log_info "[DEBUG] Running: pgrep -u fluent-bit -x fluent-bit"
# The pipeline's status is that of `head`, so an empty pgrep result leaves
# actual_pid empty without tripping `set -e`.
actual_pid=$(pgrep -u fluent-bit -x fluent-bit | head -1)

# List every fluent-bit related process for diagnostics.
log_info "[DEBUG] All fluent-bit related processes:"
ps aux | grep fluent-bit | grep -v grep || log_warning "No fluent-bit processes found in ps output"

# Report the outcome: success when a fluent-bit process was found, otherwise
# print diagnostics and fail.
if [[ -n "$actual_pid" ]]; then
  log_success "Fluent Bit started successfully (PID: $actual_pid)"
  log_info "[DEBUG] Process details: $(ps -p "$actual_pid" -o pid,user,cmd --no-headers)"

  # Update the install record. Fluent Bit is already running at this point,
  # so a record that could not be updated must not abort the script via
  # `set -e`; it is reported as a warning instead.
  update_install_record "$actual_pid" "$INSTALL_DIR" \
    || log_warning "Install record was not updated"
else
  log_error "Fluent Bit failed to start - no fluent-bit process found"
  log_info "[DEBUG] Checking if background process $bg_pid still exists..."
  if ps -p "$bg_pid" > /dev/null 2>&1; then
    log_warning "Background shell process $bg_pid still exists"
  else
    log_warning "Background shell process $bg_pid has exited"
  fi

  log_info "[DEBUG] Last 20 lines of /var/log/fluent-bit.log:"
  if [[ -f "/var/log/fluent-bit.log" ]]; then
    tail -20 /var/log/fluent-bit.log | while IFS= read -r line; do
      log_info "[LOG] $line"
    done
  else
    log_error "Log file /var/log/fluent-bit.log does not exist"
  fi

  exit 1
fi

log_success "Fluent Bit installation completed!"