dev_1.0.0_yuyr 完成 log和bind模块开发部署测试 #8

Merged
yuyr merged 26 commits from dev_1.0.0_yuyr into dev_1.0.0 2025-09-22 16:39:39 +08:00
19 changed files with 445 additions and 0 deletions
Showing only changes of commit 653a0c04a6 - Show all commits

View File

@ -1,2 +1,7 @@
测试log模块开发
elasticsearch: 部署镜像构建及启动脚本解决账号问题、挂载目录、使用supervisor守护
kibana: 镜像构建
fluent-bit: 安装包、脚本准备,交付给大鹏统一组织客户端侧安装流程
init: EK 初始化脚本、数据视图创建脚本等

View File

@ -0,0 +1,74 @@
# Local logging stack: a single-node Elasticsearch, a Kibana instance, and two
# Ubuntu containers that each run start-fluent-bit.sh (host01 / host02).
version: "3.8"
services:
  # Elasticsearch with security disabled; started through the wrapper script
  # found under the mounted ./private directory.
  es:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4
    environment:
      discovery.type: single-node
      xpack.security.enabled: "false"
      ES_JAVA_OPTS: "-Xms512m -Xmx512m"
    volumes:
      - ./private:/private
    ports:
      - "9200:9200"
    command: /private/es/scripts/start-es.sh
    healthcheck:
      test:
        - CMD-SHELL
        - "curl -fs http://localhost:9200 >/dev/null || exit 1"
      interval: 10s
      timeout: 5s
      retries: 30

  # Kibana is started only after the es healthcheck reports healthy.
  kibana:
    image: docker.elastic.co/kibana/kibana:8.13.4
    environment:
      ELASTICSEARCH_HOSTS: "http://es:9200"
    volumes:
      - ./private:/private
    ports:
      - "5601:5601"
    command: /private/kibana/scripts/start-kibana.sh
    depends_on:
      es:
        condition: service_healthy

  # First simulated host; container port 2020 is published on host port 2020.
  fluent-bit-host01:
    image: ubuntu:22.04
    environment:
      CLUSTER: local
      RACK: dev
      HOSTNAME: host01
      ES_HOST: es
      ES_PORT: "9200"
    volumes:
      - ./start-fluent-bit.sh:/private/start-fluent-bit.sh:ro
      - ./fluent-bit-bundle.tar.gz:/private/fluent-bit-bundle.tar.gz:ro
    ports:
      - "2020:2020"
    depends_on:
      es:
        condition: service_healthy
    command: /private/start-fluent-bit.sh
    healthcheck:
      test:
        - CMD-SHELL
        - "curl -fs http://localhost:2020/api/v2/metrics >/dev/null || exit 1"
      interval: 15s
      timeout: 10s
      retries: 30

  # Second simulated host; container port 2020 is published on host port 2021.
  fluent-bit-host02:
    image: ubuntu:22.04
    environment:
      CLUSTER: local
      RACK: dev
      HOSTNAME: host02
      ES_HOST: es
      ES_PORT: "9200"
    volumes:
      - ./start-fluent-bit.sh:/private/start-fluent-bit.sh:ro
      - ./fluent-bit-bundle.tar.gz:/private/fluent-bit-bundle.tar.gz:ro
    ports:
      - "2021:2020"
    depends_on:
      es:
        condition: service_healthy
    command: /private/start-fluent-bit.sh
    healthcheck:
      test:
        - CMD-SHELL
        - "curl -fs http://localhost:2020/api/v2/metrics >/dev/null || exit 1"
      interval: 15s
      timeout: 10s
      retries: 30

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Wrapper around the Elasticsearch image entrypoint: points the default data
# directory at /private/es/data, then hands over to the stock entrypoint.
set -euo pipefail

persistent_dir=/private/es/data
expected_dir=/usr/share/elasticsearch/data

echo "[INFO] Starting Elasticsearch with private directory setup..."

# Make sure the persistent data directory exists.
mkdir -p "$persistent_dir"

# Replace the directory Elasticsearch expects with a symlink to the persistent
# one; a failure to remove the old path is deliberately ignored.
rm -rf "$expected_dir" 2>/dev/null || true
ln -sf "$persistent_dir" "$expected_dir"

# Give ownership to UID/GID 1000 (the UID Elasticsearch runs as, per the
# original author's note).
chown -R 1000:1000 "$persistent_dir"

echo "[INFO] Data directory linked: $expected_dir -> $persistent_dir"

# Hand over to the image's original entrypoint.
exec /usr/local/bin/docker-entrypoint.sh elasticsearch

Binary file not shown.

View File

@ -0,0 +1,46 @@
#!/bin/bash
# Installs Fluent Bit from the bundle mounted at /private/fluent-bit-bundle.tar.gz
# inside an Ubuntu container, waits for Elasticsearch, then runs Fluent Bit in
# the foreground.
#
# Required environment:
#   ES_HOST  Elasticsearch host name
#   ES_PORT  Elasticsearch HTTP port
set -euo pipefail

# Fail fast: without these the script would otherwise abort (set -u) only
# after the slow package installation below.
: "${ES_HOST:?ES_HOST must be set}"
: "${ES_PORT:?ES_PORT must be set}"

fluent_bit_bin=/opt/fluent-bit/bin/fluent-bit
fluent_bit_deb=/tmp/packages/fluent-bit_3.1.9_amd64.deb

echo "[INFO] Starting Fluent Bit setup in Ubuntu container..."

# Install the tools this script needs (curl is used by the readiness loop).
echo "[INFO] Installing required packages..."
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl

# Unpack the bundle into /tmp (the paths used below expect etc/ and packages/).
echo "[INFO] Extracting fluent-bit bundle..."
cd /tmp
tar -xzf /private/fluent-bit-bundle.tar.gz

# Install Fluent Bit from the bundled deb. dpkg may fail on missing
# dependencies, so its status is ignored and apt resolves them afterwards.
echo "[INFO] Installing Fluent Bit from deb package..."
dpkg -i "$fluent_bit_deb" || true
apt-get install -f -y -qq

# Verify that the binary runs.
echo "[INFO] Fluent Bit version:"
"$fluent_bit_bin" --version

# Install the bundled configuration.
mkdir -p /etc/fluent-bit
cp -r /tmp/etc/* /etc/fluent-bit/

# Create the log and buffer directories.
mkdir -p /logs/train /logs/infer /buffers
chmod 755 /logs/train /logs/infer /buffers

# Block until Elasticsearch answers its cluster health endpoint.
echo "[INFO] Waiting for Elasticsearch to be ready..."
while ! curl -fs "http://${ES_HOST}:${ES_PORT}/_cluster/health" >/dev/null 2>&1; do
  echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..."
  sleep 5
done
echo "[INFO] Elasticsearch is ready"

# Replace this shell with Fluent Bit so it receives container signals directly.
echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
exec "$fluent_bit_bin" \
  --config=/etc/fluent-bit/fluent-bit.conf

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Wrapper around the Kibana image launcher: points the default data directory
# at /private/kibana/data, then starts Kibana.
set -euo pipefail

persistent_dir=/private/kibana/data
expected_dir=/usr/share/kibana/data

echo "[INFO] Starting Kibana with private directory setup..."

# Make sure the persistent data directory exists.
mkdir -p "$persistent_dir"

# Replace the directory Kibana expects with a symlink to the persistent one;
# a failure to remove the old path is deliberately ignored.
rm -rf "$expected_dir" 2>/dev/null || true
ln -sf "$persistent_dir" "$expected_dir"

# Give ownership to UID/GID 1000 (the UID Kibana runs as, per the original
# author's note).
chown -R 1000:1000 "$persistent_dir"

echo "[INFO] Data directory linked: $expected_dir -> $persistent_dir"

# Hand over to the image's original launcher.
exec /usr/local/bin/kibana-docker

47
src/log/misc/01_bootstrap.sh Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Prepares the working tree for the compose stack:
#   - creates private/{es,kibana}/{data,scripts}
#   - copies the ES/Kibana start scripts into place and marks them executable
#   - builds fluent-bit-bundle.tar.gz if it does not exist yet
set -euo pipefail

root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
bundle="$root/fluent-bit-bundle.tar.gz"

# Staging directory for the bundle; removed on any exit path.
staging=""
cleanup() {
  if [[ -n "$staging" ]]; then
    rm -rf "$staging"
  fi
}
trap cleanup EXIT

# Pack the given members of a directory into the bundle.
# A failed tar must not leave a partial archive behind, because the existence
# check below would treat it as a valid bundle on the next run.
#   $1      directory to archive from
#   $2...   members (relative to $1) to include
create_bundle() {
  local source_dir="$1"
  shift
  if ! tar -czf "$bundle" -C "$source_dir" "$@"; then
    rm -f "$bundle"
    echo "[WARN] 无法创建fluent-bit bundle请确保fluent-bit配置和packages目录存在"
  fi
}

# Create the private directory structure (only ES and Kibana need it).
echo "[INFO] Creating private directory structure..."
mkdir -p "$root/private/es/data"
mkdir -p "$root/private/es/scripts"
mkdir -p "$root/private/kibana/data"
mkdir -p "$root/private/kibana/scripts"

# Copy the startup scripts.
echo "[INFO] Copying startup scripts..."
cp "$root/scripts/start-es.sh" "$root/private/es/scripts/"
cp "$root/scripts/start-kibana.sh" "$root/private/kibana/scripts/"

# Make them executable.
chmod +x "$root/private/es/scripts/start-es.sh"
chmod +x "$root/private/kibana/scripts/start-kibana.sh"

# Data directories are handed to UID 1000 (used by both containers, per the
# original author's note). Best effort: sudo may be unavailable.
sudo chown -R 1000:1000 "$root/private/es/data" "$root/private/kibana/data" 2>/dev/null || true

# Build the fluent-bit bundle when it is missing.
if [[ ! -f "$bundle" ]]; then
  echo "[INFO] Creating fluent-bit bundle..."
  if [[ -d "$root/private/fluent-bit" ]]; then
    create_bundle "$root/private/fluent-bit" etc/ packages/
  elif [[ -d "$root/fluent-bit" && -d "$root/packages" ]]; then
    # Assemble the etc/ + packages/ layout in a temporary directory.
    staging="$(mktemp -d)"
    mkdir -p "$staging/etc" "$staging/packages"
    cp -r "$root"/fluent-bit/* "$staging/etc/"
    cp -r "$root"/packages/* "$staging/packages/"
    create_bundle "$staging" .
  else
    echo "[WARN] 无法创建fluent-bit bundle请确保fluent-bit配置和packages目录存在"
  fi
fi

if [[ ! -f "$root/start-fluent-bit.sh" ]]; then
  echo "[WARN] start-fluent-bit.sh 不存在,请确保已创建该启动脚本"
fi

echo "[OK] 初始化完成: private/{es,kibana}, fluent-bit-bundle.tar.gz, start-fluent-bit.sh"

10
src/log/misc/02_up.sh Executable file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Starts the logging-mvp compose project in the background.
set -euo pipefail

# Run from the parent of this script's directory.
cd "$(dirname "$0")/.."

# Prefer the "docker compose" plugin; fall back to the standalone
# docker-compose binary when the plugin is not usable.
compose=(docker compose)
if ! "${compose[@]}" version >/dev/null 2>&1; then
  if ! command -v docker-compose >/dev/null 2>&1; then
    echo "需要 Docker Compose请安装后重试" >&2
    exit 1
  fi
  compose=(docker-compose)
fi

"${compose[@]}" -p logging-mvp up -d --remove-orphans
echo "[OK] 服务已启动ES http://localhost:9200 Kibana http://localhost:5601 Fluent-Bit host01 http://localhost:2020 Fluent-Bit host02 http://localhost:2021"

17
src/log/misc/03_send_test.sh Executable file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Sends test log lines to both fluent-bit containers by running the
# per-host helper scripts in turn.
set -euo pipefail

# The helper paths below are relative, so switch to the parent of this
# script's directory first (same convention as the up/down scripts); this
# makes the script work regardless of the caller's working directory.
cd "$(dirname "$0")/.."

echo "[INFO] 向两个fluent-bit容器发送测试日志..."

# Send logs to host01.
echo "[INFO] 发送日志到 host01..."
./scripts/03_send_test_host01.sh
echo ""

# Send logs to host02.
echo "[INFO] 发送日志到 host02..."
./scripts/03_send_test_host02.sh
echo ""

echo "[OK] 已完成向两个主机发送测试日志"

View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Appends demo training and inference log lines (including a multi-line
# traceback) to files inside the fluent-bit-host01 container.
set -euo pipefail

# Name of the fluent-bit-host01 container.
container_name="logging-mvp-fluent-bit-host01-1"

# Check that the container is running. The name list is captured first and
# then matched as an exact, whole-line fixed string: piping straight into
# "grep -q" can fail under pipefail when grep exits early, and a plain regex
# match would also accept containers whose names merely contain this one.
running_containers="$(docker ps --format '{{.Names}}')"
if ! grep -Fxq -- "$container_name" <<<"$running_containers"; then
  echo "[ERROR] Fluent Bit容器 $container_name 未运行"
  exit 1
fi

# Create the log directories.
docker exec "$container_name" mkdir -p /logs/train /logs/infer

# Write training logs (host01).
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"

# Write inference logs (host01): one error line followed by a traceback.
docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
docker exec "$container_name" sh -c "cat <<'STACK' >> /logs/infer/infer-demo.log
Traceback (most recent call last):
File \"inference.py\", line 15, in <module>
raise RuntimeError(\"CUDA out of memory on host01\")
RuntimeError: CUDA out of memory on host01
STACK"

echo "[OK] 已通过docker exec写入测试日志到 host01 容器内:"
echo " - /logs/train/train-demo.log"
echo " - /logs/infer/infer-demo.log"

View File

@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Appends demo training and inference log lines to files inside the
# fluent-bit-host02 container.
set -euo pipefail

# Name of the fluent-bit-host02 container.
container_name="logging-mvp-fluent-bit-host02-1"

# Check that the container is running. The name list is captured first and
# then matched as an exact, whole-line fixed string: piping straight into
# "grep -q" can fail under pipefail when grep exits early, and a plain regex
# match would also accept containers whose names merely contain this one.
running_containers="$(docker ps --format '{{.Names}}')"
if ! grep -Fxq -- "$container_name" <<<"$running_containers"; then
  echo "[ERROR] Fluent Bit容器 $container_name 未运行"
  exit 1
fi

# Create the log directories.
docker exec "$container_name" mkdir -p /logs/train /logs/infer

# Write training logs (host02).
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=1 loss=1.45 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=2 loss=1.38 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=3 loss=1.32 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"

# Write inference logs (host02).
docker exec "$container_name" sh -c "printf '%s WARN [host02] inference slow on batch=5 latency=2.3s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host02] inference completed batch=6 latency=0.8s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"

echo "[OK] 已通过docker exec写入测试日志到 host02 容器内:"
echo " - /logs/train/train-demo.log"
echo " - /logs/infer/infer-demo.log"

7
src/log/misc/04_query_es.sh Executable file
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Lists the train-/infer-/logstash indices and prints document counts for
# train-* and infer-*.
#
# Environment:
#   ES  Elasticsearch base URL (default: http://localhost:9200)
set -euo pipefail

ES="${ES:-http://localhost:9200}"
echo "[i] 查询 ES 端点:$ES"

# "grep -E" replaces the deprecated egrep alias. No matching index is not an
# error, hence the "|| true".
curl -fsS "$ES/_cat/indices?v" | grep -E 'train-|infer-|logstash' || true

# Extract the numeric "count" field from each _count response.
printf "train-* 计数:"; curl -fsS "$ES/train-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
printf "infer-* 计数:"; curl -fsS "$ES/infer-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo

10
src/log/misc/05_down.sh Executable file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Stops and removes the containers of the logging-mvp compose project.
set -euo pipefail

# Run from the parent of this script's directory.
cd "$(dirname "$0")/.."

# Prefer the "docker compose" plugin; fall back to the standalone
# docker-compose binary when the plugin is not usable.
compose=(docker compose)
if ! "${compose[@]}" version >/dev/null 2>&1; then
  if ! command -v docker-compose >/dev/null 2>&1; then
    echo "需要 Docker Compose请安装后重试" >&2
    exit 1
  fi
  compose=(docker-compose)
fi

"${compose[@]}" -p logging-mvp down
echo "[OK] 已停止所有容器"

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Restarts the Fluent Bit containers of the logging-mvp compose project.
set -euo pipefail

# Run from the parent of this script's directory.
cd "$(dirname "$0")/.."

# Prefer the "docker compose" plugin; fall back to the standalone
# docker-compose binary when the plugin is not usable.
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
  if command -v docker-compose >/dev/null 2>&1; then
    compose_cmd="docker-compose"
  else
    echo "需要 Docker Compose请安装后重试" >&2
    exit 1
  fi
fi

# The compose file defines the services fluent-bit-host01 and
# fluent-bit-host02; there is no service named plain "fluent-bit", so both
# must be named explicitly for the restart to succeed.
$compose_cmd -p logging-mvp restart fluent-bit-host01 fluent-bit-host02
echo "[OK] 已重启 fluent-bit该镜像不支持 SIGHUP 热重载)"

View File

@ -0,0 +1,44 @@
#!/usr/bin/env bash
# Waits for Kibana to report itself available, then creates the "train" and
# "infer" data views through the Kibana data views API.
#
# Environment:
#   KB  Kibana base URL (default: http://localhost:5601)
# Requires: curl, jq
set -euo pipefail

KB="${KB:-http://localhost:5601}"

# Without jq the status could never be parsed and the wait loop would only
# fail after the full timeout, so check for it up front.
if ! command -v jq >/dev/null 2>&1; then
  echo "[ERROR] 需要 jq请安装后重试" >&2
  exit 1
fi

# Poll $KB/api/status until the overall level is "available".
# Returns 0 once available, 1 after max_attempts polls spaced 5 seconds apart.
wait_for_kibana() {
  echo "[i] 等待 Kibana 启动..."
  local max_attempts=60
  local attempt=1
  # Declared separately from assignment so a failing command substitution is
  # not masked by the exit status of "local".
  local body status
  while [ "$attempt" -le "$max_attempts" ]; do
    # Fetch the status once per attempt and parse that same response.
    if body="$(curl -fs "$KB/api/status" 2>/dev/null)"; then
      status="$(jq -r '.status.overall.level // "unknown"' <<<"$body" 2>/dev/null || echo "unknown")"
      if [ "$status" = "available" ]; then
        echo "[OK] Kibana 已启动 (status: $status)"
        return 0
      fi
      echo " 等待中... ($attempt/$max_attempts, status: $status)"
    else
      echo " 等待中... ($attempt/$max_attempts, 连接失败)"
    fi
    sleep 5
    # Plain arithmetic expansion: unlike ((attempt++)) its exit status can
    # never trip "set -e".
    attempt=$((attempt + 1))
  done
  echo "[ERROR] Kibana 启动超时"
  return 1
}

# Create one data view.
#   $1  data view name
#   $2  index pattern (sent as "title")
#   $3  time field name (default: @timestamp)
# Returns 1 when the API call fails.
create_view() {
  local name="$1" pattern="$2" timefield="${3:-@timestamp}"
  echo "[i] 创建 Data View: $name ($pattern, time=$timefield)"
  if curl -fsS -X POST "$KB/api/data_views/data_view" \
    -H 'kbn-xsrf: true' \
    -H 'Content-Type: application/json' \
    -d "{\"data_view\":{\"name\":\"$name\",\"title\":\"$pattern\",\"timeFieldName\":\"$timefield\"}}" \
    >/dev/null; then
    echo " -> OK"
  else
    echo " -> 失败(可能是没有匹配索引)"
    return 1
  fi
}

# Wait for Kibana, then create both views; a failed view does not abort.
wait_for_kibana || exit 1
create_view "train" "train-*" "@timestamp" || true
create_view "infer" "infer-*" "@timestamp" || true
echo "[DONE] 若提示失败,请先确保已产生 train-*/infer-* 索引,再重试本脚本。"

View File

@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Sets number_of_replicas to 0 on all train-* and infer-* indices.
#
# Environment:
#   ES  Elasticsearch base URL (default: http://localhost:9200)
set -euo pipefail

ES="${ES:-http://localhost:9200}"

# The patterns are quoted so the shell passes them to Elasticsearch verbatim
# instead of expanding them against files in the current directory.
for idx in 'train-*' 'infer-*'; do
  echo "[i] 将 $idx 副本数设置为 0"
  # Best effort: a failure for one pattern does not stop the other.
  curl -fsS -X PUT "$ES/$idx/_settings" -H 'Content-Type: application/json' -d '{"index":{"number_of_replicas":0}}' || true
done

echo "[OK] 完成"

33
src/log/misc/99_clean.sh Executable file
View File

@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Interactive cleanup.
#   Option 1: delete the train-*/infer-*/logstash-* indices and empty the
#             fluent-bit buffers (persistent data is kept).
#   Option 2: empty the buffers, the ES/Kibana data directories and the
#             fluent-bit log directories.
#
# Environment:
#   ES  Elasticsearch base URL (default: http://localhost:9200), option 1 only
set -euo pipefail

# Run from the parent of this script's directory; the rm paths are relative.
cd "$(dirname "$0")/.."

echo "清理选项:"
echo "1. 清理索引和缓冲区(保留持久化数据)"
echo "2. 完全清理(包括持久化数据目录)"
read -rp "请选择 (1/2): " choice

case "$choice" in
  1)
    read -rp "危险操作:删除 ES 索引 train-*, infer-*(以及 logstash-*),并清空 buffers。确认? (yes/NO) " ans
    if [[ "${ans:-NO}" != "yes" ]]; then echo "已取消"; exit 0; fi
    ES="${ES:-http://localhost:9200}"
    # Patterns are quoted so the shell does not expand them against local
    # files.
    for idx in 'train-*' 'infer-*' 'logstash-*'; do
      echo "[i] 删除索引 $idx"
      # Elasticsearch 8 rejects wildcard DELETE requests by default
      # (action.destructive_requires_name), so resolve the pattern to
      # concrete index names first and delete them one by one.
      matching_indices="$(curl -fsS "$ES/_cat/indices/$idx?h=index" || true)"
      while read -r index_name; do
        [[ -n "$index_name" ]] || continue
        curl -fsS -X DELETE "$ES/$index_name" || true
      done <<<"$matching_indices"
    done
    rm -rf ./private/fluent-bit/buffers/* || true
    echo "[OK] 索引和缓冲区清理完成"
    ;;
  2)
    read -rp "危险操作:删除所有数据包括持久化存储!确认? (yes/NO) " ans
    if [[ "${ans:-NO}" != "yes" ]]; then echo "已取消"; exit 0; fi
    rm -rf ./private/fluent-bit/buffers/* ./private/es/data/* ./private/kibana/data/* || true
    rm -rf ./private/fluent-bit/logs/train/* ./private/fluent-bit/logs/infer/* || true
    echo "[OK] 完全清理完成(包括持久化数据)"
    ;;
  *)
    echo "已取消"
    exit 0
    ;;
esac

19
src/log/misc/start-es.sh Normal file
View File

@ -0,0 +1,19 @@
#!/bin/bash
# Wrapper around the Elasticsearch image entrypoint: points the default data
# directory at /private/es/data, then hands over to the stock entrypoint.
set -euo pipefail

persistent_dir=/private/es/data
expected_dir=/usr/share/elasticsearch/data

echo "[INFO] Starting Elasticsearch with private directory setup..."

# Make sure the persistent data directory exists.
mkdir -p "$persistent_dir"

# Replace the directory Elasticsearch expects with a symlink to the persistent
# one; a failure to remove the old path is deliberately ignored.
rm -rf "$expected_dir" 2>/dev/null || true
ln -sf "$persistent_dir" "$expected_dir"

# Give ownership to UID/GID 1000 (the UID Elasticsearch runs as, per the
# original author's note).
chown -R 1000:1000 "$persistent_dir"

echo "[INFO] Data directory linked: $expected_dir -> $persistent_dir"

# Hand over to the image's original entrypoint.
exec /usr/local/bin/docker-entrypoint.sh elasticsearch

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Wrapper around the Kibana image launcher: points the default data directory
# at /private/kibana/data, then starts Kibana.
set -euo pipefail

persistent_dir=/private/kibana/data
expected_dir=/usr/share/kibana/data

echo "[INFO] Starting Kibana with private directory setup..."

# Make sure the persistent data directory exists.
mkdir -p "$persistent_dir"

# Replace the directory Kibana expects with a symlink to the persistent one;
# a failure to remove the old path is deliberately ignored.
rm -rf "$expected_dir" 2>/dev/null || true
ln -sf "$persistent_dir" "$expected_dir"

# Give ownership to UID/GID 1000 (the UID Kibana runs as, per the original
# author's note).
chown -R 1000:1000 "$persistent_dir"

echo "[INFO] Data directory linked: $expected_dir -> $persistent_dir"

# Hand over to the image's original launcher.
exec /usr/local/bin/kibana-docker