Reviewed-on: #17 Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn> Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
46 lines
1.6 KiB
Bash
Executable File
46 lines
1.6 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Writes demo training and inference log lines into the Fluent Bit host01
# container through `docker exec`.

# Strict mode: abort on command failure, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Name of the fluent-bit-host01 container.
# The default is the name this script has always used; set the
# FLUENT_BIT_CONTAINER environment variable to target a differently named
# container (NOTE(review): the default looks like a Compose-generated
# <project>-<service>-<index> name — confirm against the compose file).
container_name="${FLUENT_BIT_CONTAINER:-logging-mvp-fluent-bit-host01-1}"

#######################################
# Poll `docker ps` until a container with exactly the given name is listed.
# Arguments:
#   $1 - container name; compared literally against whole lines of the
#        `docker ps --format '{{.Names}}'` output
#   $2 - optional maximum number of polls (default: 30)
#   $3 - optional seconds to sleep between polls (default: 5)
# Outputs:
#   One "[INFO]" progress line on stdout after every failed poll that will
#   be followed by another poll.
# Returns:
#   0 as soon as the name is listed, 1 if it never appears within the
#   allowed number of polls.
#######################################
wait_for_container() {
  local name="$1"
  local attempts="${2:-30}"
  local delay="${3:-5}"
  local i

  for ((i = 1; i <= attempts; i++)); do
    # -F: literal string, -x: whole-line match, so "foo" never matches
    # "foo-1". Output is discarded with a redirect (rather than grep -q) so
    # grep reads all of its input and `docker ps` is not cut off mid-write,
    # which could fail the pipeline when pipefail is enabled.
    if docker ps --format '{{.Names}}' | grep -Fx -- "$name" >/dev/null; then
      return 0
    fi

    # After the final poll there is nothing left to wait for: do not claim
    # to be waiting and do not burn another delay before reporting failure.
    if ((i == attempts)); then
      break
    fi

    echo "[INFO] 等待容器 $name 启动中... ($i/$attempts)"
    sleep "$delay"
  done

  return 1
}

# Abort early when the Fluent Bit container never shows up in `docker ps`.
# The error goes to stderr so it is not mixed into regular output.
if ! wait_for_container "$container_name"; then
  echo "[ERROR] Fluent Bit容器 $container_name 未运行" >&2
  exit 1
fi

# Create the log directories inside the container.
docker exec "$container_name" mkdir -p /logs/train /logs/infer

# Append training log lines (host01).
# The \$(date ...) is escaped so it is expanded by the shell inside the
# container, not by this script; printf's %s receives that timestamp.
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"

# Append inference log lines (host01): one timestamped ERROR line followed by
# a multi-line Python traceback.
docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"

# The heredoc is evaluated by the container's sh; the quoted 'STACK' delimiter
# keeps its body literal. Continuation lines carry the standard Python
# traceback indentation (2 spaces for "File", 4 for the source line).
docker exec "$container_name" sh -c "cat <<'STACK' >> /logs/infer/infer-demo.log
Traceback (most recent call last):
  File \"inference.py\", line 15, in <module>
    raise RuntimeError(\"CUDA out of memory on host01\")
RuntimeError: CUDA out of memory on host01
STACK"

# Summary of what was written.
echo "[OK] 已通过docker exec写入测试日志到 host01 容器内:"
echo " - /logs/train/train-demo.log"
echo " - /logs/infer/infer-demo.log"