diff --git a/src/log/README.md b/src/log/README.md
index f40fbca..4ce441f 100644
--- a/src/log/README.md
+++ b/src/log/README.md
@@ -1,2 +1,7 @@
 测试log模块开发
 
+
+elasticsearch: 部署镜像构建及启动脚本(解决账号问题、挂载目录、使用supervisor守护)
+kibana: 镜像构建
+fluent-bit: 安装包,脚本准备, 交付给大鹏统一组织客户端侧安装流程
+init: EK初始化脚本:数据视图创建脚本等
\ No newline at end of file
diff --git a/src/log/docker-compose.yml b/src/log/docker-compose.yml
new file mode 100644
index 0000000..2237568
--- /dev/null
+++ b/src/log/docker-compose.yml
@@ -0,0 +1,76 @@
+version: "3.8"
+services:
+  es:
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - ES_JAVA_OPTS=-Xms512m -Xmx512m
+    volumes:
+      - ./private:/private
+    ports: ["9200:9200"]
+    command: /private/es/scripts/start-es.sh
+    healthcheck:
+      test: ["CMD-SHELL", "curl -fs http://localhost:9200 >/dev/null || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+
+  kibana:
+    image: docker.elastic.co/kibana/kibana:8.13.4
+    environment:
+      - ELASTICSEARCH_HOSTS=http://es:9200
+    volumes:
+      - ./private:/private
+    ports: ["5601:5601"]
+    command: /private/kibana/scripts/start-kibana.sh
+    depends_on:
+      es:
+        condition: service_healthy
+
+  fluent-bit-host01:
+    image: ubuntu:22.04
+    environment:
+      - CLUSTER=local
+      - RACK=dev
+      - HOSTNAME=host01
+      - ES_HOST=es
+      - ES_PORT=9200
+    # The start script and bundle live in ./fluent-bit/ next to this file.
+    volumes:
+      - ./fluent-bit/start-fluent-bit.sh:/private/start-fluent-bit.sh:ro
+      - ./fluent-bit/fluent-bit-bundle.tar.gz:/private/fluent-bit-bundle.tar.gz:ro
+    ports: ["2020:2020"]
+    depends_on:
+      es:
+        condition: service_healthy
+    command: /private/start-fluent-bit.sh
+    healthcheck:
+      test: ["CMD-SHELL", "curl -fs http://localhost:2020/api/v2/metrics >/dev/null || exit 1"]
+      interval: 15s
+      timeout: 10s
+      retries: 30
+
+  fluent-bit-host02:
+    image: ubuntu:22.04
+    environment:
+      - CLUSTER=local
+      - RACK=dev
+      - HOSTNAME=host02
+      - ES_HOST=es
+      - ES_PORT=9200
+    # The start script and bundle live in ./fluent-bit/ next to this file.
+    volumes:
+      - ./fluent-bit/start-fluent-bit.sh:/private/start-fluent-bit.sh:ro
+      - ./fluent-bit/fluent-bit-bundle.tar.gz:/private/fluent-bit-bundle.tar.gz:ro
+    ports: ["2021:2020"]
+    depends_on:
+      es:
+        condition: service_healthy
+    command: /private/start-fluent-bit.sh
+    healthcheck:
+      test: ["CMD-SHELL", "curl -fs http://localhost:2020/api/v2/metrics >/dev/null || exit 1"]
+      interval: 15s
+      timeout: 10s
+      retries: 30
+
diff --git a/src/log/elasticsearch/start-es.sh b/src/log/elasticsearch/start-es.sh
new file mode 100644
index 0000000..18edb9f
--- /dev/null
+++ b/src/log/elasticsearch/start-es.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "[INFO] Starting Elasticsearch with private directory setup..."
+
+# Create the data directory
+mkdir -p /private/es/data
+
+# Symlink it to the data directory Elasticsearch expects
+rm -rf /usr/share/elasticsearch/data 2>/dev/null || true
+ln -sf /private/es/data /usr/share/elasticsearch/data
+
+# Set ownership (Elasticsearch runs as UID 1000)
+chown -R 1000:1000 /private/es/data
+
+echo "[INFO] Data directory linked: /usr/share/elasticsearch/data -> /private/es/data"
+
+# Hand over to the original Elasticsearch entrypoint
+exec /usr/local/bin/docker-entrypoint.sh elasticsearch
\ No newline at end of file
diff --git a/src/log/fluent-bit/fluent-bit-bundle.tar.gz b/src/log/fluent-bit/fluent-bit-bundle.tar.gz
new file mode 100644
index 0000000..e46b153
Binary files /dev/null and b/src/log/fluent-bit/fluent-bit-bundle.tar.gz differ
diff --git a/src/log/fluent-bit/start-fluent-bit.sh b/src/log/fluent-bit/start-fluent-bit.sh
new file mode 100755
index 0000000..ff90080
--- /dev/null
+++ b/src/log/fluent-bit/start-fluent-bit.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "[INFO] Starting Fluent Bit setup in Ubuntu container..."
+
+# Install the required tools
+echo "[INFO] Installing required packages..."
+export DEBIAN_FRONTEND=noninteractive
+apt-get update -qq
+apt-get install -y -qq curl
+
+# Extract the bundle into /tmp
+echo "[INFO] Extracting fluent-bit bundle..."
+cd /tmp
+tar -xzf /private/fluent-bit-bundle.tar.gz
+
+# Install Fluent Bit from the deb package
+echo "[INFO] Installing Fluent Bit from deb package..."
+dpkg -i /tmp/packages/fluent-bit_3.1.9_amd64.deb || true
+apt-get install -f -y -qq  # resolve missing dependencies
+
+# Verify that Fluent Bit can run
+echo "[INFO] Fluent Bit version:"
+/opt/fluent-bit/bin/fluent-bit --version
+
+# Create the configuration directory
+mkdir -p /etc/fluent-bit
+cp -r /tmp/etc/* /etc/fluent-bit/
+
+# Create the log and buffer directories
+mkdir -p /logs/train /logs/infer /buffers
+chmod 755 /logs/train /logs/infer /buffers
+
+# Wait for Elasticsearch to become ready
+echo "[INFO] Waiting for Elasticsearch to be ready..."
+while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do
+  echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..."
+  sleep 5
+done
+echo "[INFO] Elasticsearch is ready"
+
+# Start Fluent Bit
+echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
+echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
+exec /opt/fluent-bit/bin/fluent-bit \
+  --config=/etc/fluent-bit/fluent-bit.conf
\ No newline at end of file
diff --git a/src/log/kibana/start-kibana.sh b/src/log/kibana/start-kibana.sh
new file mode 100644
index 0000000..b6c0d07
--- /dev/null
+++ b/src/log/kibana/start-kibana.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "[INFO] Starting Kibana with private directory setup..."
+
+# Create the data directory
+mkdir -p /private/kibana/data
+
+# Symlink it to the data directory Kibana expects
+rm -rf /usr/share/kibana/data 2>/dev/null || true
+ln -sf /private/kibana/data /usr/share/kibana/data
+
+# Set ownership (Kibana runs as UID 1000)
+chown -R 1000:1000 /private/kibana/data
+
+echo "[INFO] Data directory linked: /usr/share/kibana/data -> /private/kibana/data"
+
+# Hand over to the original Kibana entrypoint
+exec /usr/local/bin/kibana-docker
\ No newline at end of file
diff --git a/src/log/misc/01_bootstrap.sh b/src/log/misc/01_bootstrap.sh
new file mode 100755
index 0000000..8129f03
--- /dev/null
+++ b/src/log/misc/01_bootstrap.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Directory of this script, and the project root one level above it
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+root="$(cd "$script_dir/.." && pwd)"
+
+# Create the private directory layout (only ES and Kibana need it)
+echo "[INFO] Creating private directory structure..."
+mkdir -p "$root/private/es/data"
+mkdir -p "$root/private/es/scripts"
+mkdir -p "$root/private/kibana/data"
+mkdir -p "$root/private/kibana/scripts"
+
+# Copy the startup scripts, which sit in the same directory as this script
+echo "[INFO] Copying startup scripts..."
+cp "$script_dir/start-es.sh" "$root/private/es/scripts/"
+cp "$script_dir/start-kibana.sh" "$root/private/kibana/scripts/"
+
+# Make them executable
+chmod +x "$root/private/es/scripts/start-es.sh"
+chmod +x "$root/private/kibana/scripts/start-kibana.sh"
+
+# Set data directory ownership (the ES and Kibana containers both use UID 1000)
+sudo chown -R 1000:1000 "$root/private/es/data" "$root/private/kibana/data" 2>/dev/null || true
+
+# Check whether the fluent-bit files exist
+if [[ ! -f "$root/fluent-bit-bundle.tar.gz" ]]; then
+  echo "[INFO] Creating fluent-bit bundle..."
+  # Build the bundle (if the source directories exist)
+  cd "$root"
+  if [[ -d "private/fluent-bit" ]]; then
+    cd private/fluent-bit && tar -czf ../../fluent-bit-bundle.tar.gz etc/ packages/ 2>/dev/null && cd ../..
+  elif [[ -d "fluent-bit" && -d "packages" ]]; then
+    # Stage a temporary directory layout for packaging
+    mkdir -p temp-bundle/etc temp-bundle/packages
+    cp -r fluent-bit/* temp-bundle/etc/
+    cp -r packages/* temp-bundle/packages/
+    cd temp-bundle && tar -czf ../fluent-bit-bundle.tar.gz . && cd ..
+    rm -rf temp-bundle
+  else
+    echo "[WARN] 无法创建fluent-bit bundle,请确保fluent-bit配置和packages目录存在"
+  fi
+fi
+
+if [[ ! -f "$root/start-fluent-bit.sh" ]]; then
+  echo "[WARN] start-fluent-bit.sh 不存在,请确保已创建该启动脚本"
+fi
+
+echo "[OK] 初始化完成: private/{es,kibana}, fluent-bit-bundle.tar.gz, start-fluent-bit.sh"
\ No newline at end of file
diff --git a/src/log/misc/02_up.sh b/src/log/misc/02_up.sh
new file mode 100755
index 0000000..5e49baa
--- /dev/null
+++ b/src/log/misc/02_up.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -euo pipefail
+cd "$(dirname "$0")/.."
+compose_cmd="docker compose"
+if ! $compose_cmd version >/dev/null 2>&1; then
+  if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
+    echo "需要 Docker Compose,请安装后重试" >&2; exit 1; fi
+fi
+$compose_cmd -p logging-mvp up -d --remove-orphans
+echo "[OK] 服务已启动:ES http://localhost:9200 Kibana http://localhost:5601 Fluent-Bit host01 http://localhost:2020 Fluent-Bit host02 http://localhost:2021"
diff --git a/src/log/misc/03_send_test.sh b/src/log/misc/03_send_test.sh
new file mode 100755
index 0000000..f971f32
--- /dev/null
+++ b/src/log/misc/03_send_test.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Resolve the per-host helpers next to this script, independent of the caller's cwd
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "[INFO] 向两个fluent-bit容器发送测试日志..."
+
+# Send logs to host01
+echo "[INFO] 发送日志到 host01..."
+"$script_dir/03_send_test_host01.sh"
+
+echo ""
+
+# Send logs to host02
+echo "[INFO] 发送日志到 host02..."
+"$script_dir/03_send_test_host02.sh"
+
+echo ""
+echo "[OK] 已完成向两个主机发送测试日志"
\ No newline at end of file
diff --git a/src/log/misc/03_send_test_host01.sh b/src/log/misc/03_send_test_host01.sh
new file mode 100755
index 0000000..41e3c28
--- /dev/null
+++ b/src/log/misc/03_send_test_host01.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Name of the fluent-bit-host01 container
+container_name="logging-mvp-fluent-bit-host01-1"
+
+# Check that the container is running (exact name match, not a substring)
+if ! docker ps --format '{{.Names}}' | grep -Fxq -- "$container_name"; then
+  echo "[ERROR] Fluent Bit容器 $container_name 未运行"
+  exit 1
+fi
+
+# Create the log directories
+docker exec "$container_name" mkdir -p /logs/train /logs/infer
+
+# Write training logs (host01)
+docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
+docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
+
+# Write inference logs (host01)
+docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
+docker exec "$container_name" sh -c "cat <<'STACK' >> /logs/infer/infer-demo.log
+Traceback (most recent call last):
+  File \"inference.py\", line 15, in
+    raise RuntimeError(\"CUDA out of memory on host01\")
+RuntimeError: CUDA out of memory on host01
+STACK"
+
+echo "[OK] 已通过docker exec写入测试日志到 host01 容器内:"
+echo " - /logs/train/train-demo.log"
+echo " - /logs/infer/infer-demo.log"
\ No newline at end of file
diff --git a/src/log/misc/03_send_test_host02.sh b/src/log/misc/03_send_test_host02.sh
new file mode 100755
index 0000000..5d0c4f6
--- /dev/null
+++ b/src/log/misc/03_send_test_host02.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Name of the fluent-bit-host02 container
+container_name="logging-mvp-fluent-bit-host02-1"
+
+# Check that the container is running (exact name match, not a substring)
+if ! docker ps --format '{{.Names}}' | grep -Fxq -- "$container_name"; then
+  echo "[ERROR] Fluent Bit容器 $container_name 未运行"
+  exit 1
+fi
+
+# Create the log directories
+docker exec "$container_name" mkdir -p /logs/train /logs/infer
+
+# Write training logs (host02)
+docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=1 loss=1.45 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
+docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=2 loss=1.38 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
+docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=3 loss=1.32 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
+
+# Write inference logs (host02)
+docker exec "$container_name" sh -c "printf '%s WARN [host02] inference slow on batch=5 latency=2.3s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
+docker exec "$container_name" sh -c "printf '%s INFO [host02] inference completed batch=6 latency=0.8s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
+
+echo "[OK] 已通过docker exec写入测试日志到 host02 容器内:"
+echo " - /logs/train/train-demo.log"
+echo " - /logs/infer/infer-demo.log"
\ No newline at end of file
diff --git a/src/log/misc/04_query_es.sh b/src/log/misc/04_query_es.sh
new file mode 100755
index 0000000..2cf427e
--- /dev/null
+++ b/src/log/misc/04_query_es.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+ES="${ES:-http://localhost:9200}"
+echo "[i] 查询 ES 端点:$ES"
+curl -fsS "$ES/_cat/indices?v" | grep -E 'train-|infer-|logstash' || true
+printf "train-* 计数:"; curl -fsS "$ES/train-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
+printf "infer-* 计数:"; curl -fsS "$ES/infer-*/_count" | sed -E 's/.*"count":([0-9]+).*/\1/'; echo
diff --git a/src/log/misc/05_down.sh b/src/log/misc/05_down.sh
new file mode 100755
index 0000000..2ec8050
--- /dev/null
+++ b/src/log/misc/05_down.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -euo pipefail
+cd "$(dirname "$0")/.."
+compose_cmd="docker compose"
+if ! $compose_cmd version >/dev/null 2>&1; then
+  if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
+    echo "需要 Docker Compose,请安装后重试" >&2; exit 1; fi
+fi
+$compose_cmd -p logging-mvp down
+echo "[OK] 已停止所有容器"
diff --git a/src/log/misc/06_restart_fluentbit.sh b/src/log/misc/06_restart_fluentbit.sh
new file mode 100755
index 0000000..e0f1a58
--- /dev/null
+++ b/src/log/misc/06_restart_fluentbit.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+set -euo pipefail
+cd "$(dirname "$0")/.."
+compose_cmd="docker compose"
+if ! $compose_cmd version >/dev/null 2>&1; then
+  if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
+    echo "需要 Docker Compose,请安装后重试" >&2; exit 1; fi
+fi
+# Restart both Fluent Bit services defined in docker-compose.yml
+$compose_cmd -p logging-mvp restart fluent-bit-host01 fluent-bit-host02
+echo "[OK] 已重启 fluent-bit(该镜像不支持 SIGHUP 热重载)"
diff --git a/src/log/misc/07_create_data_views.sh b/src/log/misc/07_create_data_views.sh
new file mode 100755
index 0000000..9accf13
--- /dev/null
+++ b/src/log/misc/07_create_data_views.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+set -euo pipefail
+KB="${KB:-http://localhost:5601}"
+
+# Wait until Kibana has fully started
+wait_for_kibana() {
+  echo "[i] 等待 Kibana 启动..."
+  local max_attempts=60
+  local attempt=1
+
+  while [ $attempt -le $max_attempts ]; do
+    if curl -fs "$KB/api/status" >/dev/null 2>&1; then
+      local status=$(curl -s "$KB/api/status" | jq -r '.status.overall.level // "unknown"')
+      if [ "$status" = "available" ]; then
+        echo "[OK] Kibana 已启动 (status: $status)"
+        return 0
+      else
+        echo " 等待中... ($attempt/$max_attempts, status: $status)"
+      fi
+    else
+      echo " 等待中... ($attempt/$max_attempts, 连接失败)"
+    fi
+    sleep 5
+    ((attempt++))
+  done
+
+  echo "[ERROR] Kibana 启动超时"
+  return 1
+}
+create_view() {
+  local name="$1" pattern="$2" timefield="${3:-@timestamp}"
+  echo "[i] 创建 Data View: $name ($pattern, time=$timefield)"
+  curl -fsS -X POST "$KB/api/data_views/data_view" \
+    -H 'kbn-xsrf: true' \
+    -H 'Content-Type: application/json' \
+    -d "{\"data_view\":{\"name\":\"$name\",\"title\":\"$pattern\",\"timeFieldName\":\"$timefield\"}}" \
+    >/dev/null && echo " -> OK" || { echo " -> 失败(可能是没有匹配索引)"; return 1; }
+}
+# Wait for Kibana to start
+wait_for_kibana || exit 1
+
+create_view "train" "train-*" "@timestamp" || true
+create_view "infer" "infer-*" "@timestamp" || true
+echo "[DONE] 若提示失败,请先确保已产生 train-*/infer-* 索引,再重试本脚本。"
diff --git a/src/log/misc/08_fix_replicas.sh b/src/log/misc/08_fix_replicas.sh
new file mode 100755
index 0000000..54e175e
--- /dev/null
+++ b/src/log/misc/08_fix_replicas.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+ES="${ES:-http://localhost:9200}"
+for idx in 'train-*' 'infer-*'; do
+  echo "[i] 将 $idx 副本数设置为 0"
+  curl -fsS -X PUT "$ES/$idx/_settings" -H 'Content-Type: application/json' -d '{"index":{"number_of_replicas":0}}' || true
+done
+echo "[OK] 完成"
diff --git a/src/log/misc/99_clean.sh b/src/log/misc/99_clean.sh
new file mode 100755
index 0000000..2cdd8be
--- /dev/null
+++ b/src/log/misc/99_clean.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+cd "$(dirname "$0")/.."
+
+echo "清理选项:"
+echo "1. 清理索引和缓冲区(保留持久化数据)"
+echo "2. 完全清理(包括持久化数据目录)"
+read -rp "请选择 (1/2): " choice
+
+case $choice in
+  1)
+    read -rp "危险操作:删除 ES 索引 train-*, infer-*(以及 logstash-*),并清空 buffers。确认? (yes/NO) " ans
+    if [[ "${ans:-NO}" != "yes" ]]; then echo "已取消"; exit 0; fi
+    ES="${ES:-http://localhost:9200}"
+    # Elasticsearch 8 rejects wildcard deletes by default, so resolve each
+    # quoted pattern to concrete index names and delete them one by one.
+    for pattern in 'train-*' 'infer-*' 'logstash-*'; do
+      for idx in $(curl -fsS "$ES/_cat/indices/$pattern?h=index" || true); do
+        echo "[i] 删除索引 $idx"
+        curl -fsS -X DELETE "$ES/$idx" || true
+      done
+    done
+    rm -rf ./private/fluent-bit/buffers/* || true
+    echo "[OK] 索引和缓冲区清理完成"
+    ;;
+  2)
+    read -rp "危险操作:删除所有数据包括持久化存储!确认? (yes/NO) " ans
+    if [[ "${ans:-NO}" != "yes" ]]; then echo "已取消"; exit 0; fi
+    rm -rf ./private/fluent-bit/buffers/* ./private/es/data/* ./private/kibana/data/* || true
+    rm -rf ./private/fluent-bit/logs/train/* ./private/fluent-bit/logs/infer/* || true
+    echo "[OK] 完全清理完成(包括持久化数据)"
+    ;;
+  *)
+    echo "已取消"
+    exit 0
+    ;;
+esac
diff --git a/src/log/misc/start-es.sh b/src/log/misc/start-es.sh
new file mode 100644
index 0000000..18edb9f
--- /dev/null
+++ b/src/log/misc/start-es.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "[INFO] Starting Elasticsearch with private directory setup..."
+
+# Create the data directory
+mkdir -p /private/es/data
+
+# Symlink it to the data directory Elasticsearch expects
+rm -rf /usr/share/elasticsearch/data 2>/dev/null || true
+ln -sf /private/es/data /usr/share/elasticsearch/data
+
+# Set ownership (Elasticsearch runs as UID 1000)
+chown -R 1000:1000 /private/es/data
+
+echo "[INFO] Data directory linked: /usr/share/elasticsearch/data -> /private/es/data"
+
+# Hand over to the original Elasticsearch entrypoint
+exec /usr/local/bin/docker-entrypoint.sh elasticsearch
\ No newline at end of file
diff --git a/src/log/misc/start-kibana.sh b/src/log/misc/start-kibana.sh
new file mode 100644
index 0000000..b6c0d07
--- /dev/null
+++ b/src/log/misc/start-kibana.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "[INFO] Starting Kibana with private directory setup..."
+
+# Create the data directory
+mkdir -p /private/kibana/data
+
+# Symlink it to the data directory Kibana expects
+rm -rf /usr/share/kibana/data 2>/dev/null || true
+ln -sf /private/kibana/data /usr/share/kibana/data
+
+# Set ownership (Kibana runs as UID 1000)
+chown -R 1000:1000 /private/kibana/data
+
+echo "[INFO] Data directory linked: /usr/share/kibana/data -> /private/kibana/data"
+
+# Hand over to the original Kibana entrypoint
+exec /usr/local/bin/kibana-docker
\ No newline at end of file