当前部署情况 - h1: 部署server & client - h2: 部署client - 部署2025-11-25 - 部署目录: /home2/argus/server , /home2/argus/client - 部署使用账号:argus 网络拓扑: - h1 作为docker swarm manager - h2 作为worker加入docker swarm - docker swarm 上创建overlay network 访问方式: - 通过ssh到h1服务器,端口转发 20006-20011 端口到笔记本本地; - 门户网址:http://localhost:20006/dashboard 部署截图:    注意事项: - server各容器使用域名作为overlay network上alias别名,实现域名访问,当前版本禁用bind作为域名解析,原因是容器重启后IP变化场景bind机制复杂且不稳定。 - client 构建是内置安装包,容器启动时执行安装流程,后续重启容器跳过安装步骤。 - UID/GID:部署使用 argus账号 uid=2133, gid=2015。 Reviewed-on: #51 Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn> Reviewed-by: xuxt <xuxt@zgclab.edu.cn> Reviewed-by: huhy <husteryezi@163.com>
93 lines
3.6 KiB
Bash
Executable File
93 lines
3.6 KiB
Bash
Executable File
#!/usr/bin/env bash
# Smoke test: writes demo log lines inside the node-a/node-b containers and
# asserts that the Elasticsearch document counts grow accordingly.
set -euo pipefail

echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..."

# Load port variables from the optional .env file that lives one directory
# above this script.
# `CDPATH=` and `--` keep the lookup honest: with CDPATH exported by the
# caller, a relative `cd` may print the directory it picked (polluting the
# captured value) or resolve to a different directory altogether.
TEST_ROOT="$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
if [[ -f "${TEST_ROOT}/.env" ]]; then
  # allexport: every variable assigned by .env is exported to child processes.
  set -a
  # shellcheck disable=SC1091
  source "${TEST_ROOT}/.env"
  set +a
fi
|
|
|
|
#######################################
# Robust count helper: tolerates 404/503 and non-JSON responses and always
# prints a non-negative integer, so callers can use the value in $(( )).
# Globals:   ES_HTTP_PORT (read, defaults to 9200)
# Arguments: $1 - index name or pattern, e.g. "train-*"
# Outputs:   the document count on stdout; 0 whenever the request does not
#            return HTTP 200 or the body does not yield a plain integer
# Returns:   0
#######################################
get_count() {
  local idx="$1"
  local tmp code
  local val=0

  # Without a scratch file there is nothing to parse; report 0 like any
  # other failed lookup instead of aborting the caller.
  tmp=$(mktemp) || { echo 0; return 0; }

  # `|| true`: a refused connection still yields a status string via -w,
  # and must not fail the assignment.
  code=$(curl -s -o "$tmp" -w "%{http_code}" \
    "http://localhost:${ES_HTTP_PORT:-9200}/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)

  if [[ "$code" == "200" ]]; then
    val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || true)
  fi
  rm -f "$tmp"

  # Enforce the integer contract: anything that is not a single run of
  # digits (empty, float, negative, several lines) collapses to 0.
  if [[ ! "$val" =~ ^[0-9]+$ ]]; then
    val=0
  fi
  echo "$val"
}
|
|
|
|
# Snapshot the document counts before any log line is written, so the later
# assertions can compare against this baseline.
train0="$(get_count 'train-*')"
infer0="$(get_count 'infer-*')"
base=$(( train0 + infer0 ))
printf '%s\n' "[INFO] initial counts: train=${train0} infer=${infer0} total=${base}"
|
|
|
|
#######################################
# Append demo log lines (two training, one inference) inside a container.
# Arguments: $1 - container name to `docker exec` into
#            $2 - host tag embedded in every log line, e.g. "host01"
# Outputs:   an error message on stderr when the container name is empty
# Returns:   1 without calling docker when the container name is empty;
#            otherwise the status of the final `docker exec`
#######################################
send_logs() {
  local cname="$1"
  local hosttag="$2"
  # Kept in single quotes so the timestamp is computed by the shell inside
  # the container at write time, not by this script.
  local stamp='ts=$(date -u +%Y-%m-%dT%H:%M:%SZ)'

  # An empty name would reach docker as `docker exec ""`; fail with a
  # message that names the real problem instead.
  if [[ -z "$cname" ]]; then
    echo "[ERR] send_logs: no container name given for ${hosttag}" >&2
    return 1
  fi

  docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
  docker exec "$cname" sh -lc "${stamp}; echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$cname" sh -lc "${stamp}; echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$cname" sh -lc "${stamp}; echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
}
|
|
|
|
# Determine container names: either the exact standalone name or a name
# containing the argus-sys-node-*-1 form.
# `|| true` is deliberate: grep exits 1 when nothing matches, and with
# `set -e` + `pipefail` that would end the script here without any message.
# The explicit checks below report the problem instead.
node_a=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-a$|argus-sys-node-a-1' | head -n1 || true)
node_b=$(docker ps --format '{{.Names}}' | grep -E '^argus-node-b$|argus-sys-node-b-1' | head -n1 || true)

if [[ -z "$node_a" ]]; then
  echo "[ERR] no running container found for node-a (argus-node-a / argus-sys-node-a-1)" >&2
  exit 1
fi
if [[ -z "$node_b" ]]; then
  echo "[ERR] no running container found for node-b (argus-node-b / argus-sys-node-b-1)" >&2
  exit 1
fi

send_logs "$node_a" "host01"
send_logs "$node_b" "host02"
|
|
|
|
echo "[INFO] Waiting for ES to ingest..."

# POST _refresh for both index patterns. Output is discarded and failures are
# ignored, because the indices may not have been created yet.
refresh_indices() {
  local pattern
  for pattern in 'train-*' 'infer-*'; do
    curl -s -X POST "http://localhost:${ES_HTTP_PORT:-9200}/${pattern}/_refresh" >/dev/null 2>&1 || true
  done
}

# Proactive refresh before the first count.
refresh_indices

# Poll up to 60 times, sleeping 2s between polls, until the total has both
# grown past the baseline and reached the threshold (>=4).
final=0
threshold=4
for attempt in {1..60}; do
  train1="$(get_count 'train-*')"
  infer1="$(get_count 'infer-*')"
  final=$(( train1 + infer1 ))

  # Leave the loop as soon as neither failure condition holds.
  (( final <= base || final < threshold )) || break

  echo "[..] waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}"
  # Refresh again to speed up visibility of freshly indexed documents.
  refresh_indices
  sleep 2
done
|
|
echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}"

# The total must have grown compared with the baseline taken before sending.
if (( final <= base )); then
  echo "[ERR] ES total did not increase (${base} -> ${final})" >&2
  exit 1
fi

# Minimal threshold to be tolerant: each node writes 2 train + 1 infer lines,
# yet only 4 documents are required. Reuse the threshold the wait loop polled
# for (falling back to 4), so the two checks cannot drift apart.
min_docs="${threshold:-4}"
if (( final < min_docs )); then
  echo "[ERR] ES total below expected threshold: ${final} < ${min_docs}" >&2
  exit 1
fi
|
|
|
|
# Health endpoints
# Extract the cluster status with jq (already used for the count lookups).
# `|| true` is deliberate: when ES is unreachable or answers with something
# unparsable, the pipeline fails, and under `set -e` + `pipefail` the script
# would exit right here without ever printing the error below.
es_health=$(curl -s "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" \
  | jq -r '.status // empty' 2>/dev/null || true)
if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then
  echo "[ERR] ES health not green/yellow: ${es_health:-unknown}" >&2
  exit 1
fi

# Kibana is checked on a best-effort basis: an unavailable status endpoint
# only produces a warning and does not fail the run.
if ! curl -fs "http://localhost:${KIBANA_PORT:-5601}/api/status" >/dev/null 2>&1; then
  echo "[WARN] Kibana status endpoint not available"
fi

echo "[OK] ES counts increased and services healthy"
|