argus/src/sys/debug/scripts/07_logs_send_and_assert.sh
yuyr 34cb239bf4 完成H20服务器部署及重启测试 (#51)
当前部署情况
- h1: 部署server & client
- h2: 部署client
- 部署2025-11-25
- 部署目录:  /home2/argus/server  ,  /home2/argus/client
- 部署使用账号:argus

网络拓扑:
- h1 作为docker swarm manager
- h2 作为worker加入docker swarm
- docker swarm 上创建overlay network

访问方式:
- 通过ssh到h1服务器,端口转发 20006-20011 端口到笔记本本地;
- 门户网址:http://localhost:20006/dashboard

部署截图:
![image.png](/attachments/86c1a7af-dacc-4ba7-a182-f7cefd4e6427)
![image.png](/attachments/06f20852-771c-4264-b031-e6acd0f6ea1c)
![image.png](/attachments/091ab5a8-95bf-466f-a394-3255dcb49735)

注意事项:
- server各容器使用域名作为overlay network上alias别名,实现域名访问,当前版本禁用bind作为域名解析,原因是容器重启后IP变化场景bind机制复杂且不稳定。
- client 构建是内置安装包,容器启动时执行安装流程,后续重启容器跳过安装步骤。
- UID/GID:部署使用 argus账号 uid=2133, gid=2015。

Reviewed-on: #51
Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn>
Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
Reviewed-by: huhy <husteryezi@163.com>
2025-11-25 15:54:29 +08:00

71 lines
2.2 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
log "Sending logs and asserting ES counts"
get_count() {
local idx="$1"
curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
}
train0=$(get_count "train-*")
infer0=$(get_count "infer-*")
base=$((train0 + infer0))
log "initial counts: train=${train0} infer=${infer0} total=${base}"
service_id() {
compose ps -q "$1"
}
send_logs() {
local sid="$1"; local hosttag="$2"
docker exec "$sid" sh -lc 'mkdir -p /logs/train /logs/infer'
docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
}
CID_A="$(service_id node-a)"
CID_B="$(service_id node-b)"
[[ -n "$CID_A" && -n "$CID_B" ]] || { echo "[ERR] node containers not found" >&2; exit 1; }
send_logs "$CID_A" "host01"
send_logs "$CID_B" "host02"
log "Waiting for ES to ingest"
sleep 10
train1=$(get_count "train-*")
infer1=$(get_count "infer-*")
final=$((train1 + infer1))
log "final counts: train=${train1} infer=${infer1} total=${final}"
if (( final <= base )); then
echo "[ERR] ES total did not increase (${base} -> ${final})" >&2
exit 1
fi
if (( final < 4 )); then
echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
exit 1
fi
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then
echo "[ERR] ES health not green/yellow: $es_health" >&2
exit 1
fi
if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
echo "[WARN] Kibana status endpoint not available"
fi
log "ES counts increased and services healthy"