[#29] 支持gpu节点系统集成测试

This commit is contained in:
yuyr 2025-10-28 14:12:49 +08:00
parent 26c39604d5
commit 8e01264e3f
19 changed files with 311 additions and 32 deletions

View File

@ -1,47 +1,96 @@
#!/bin/bash
set -euo pipefail
echo "[INFO] Starting Fluent Bit setup in Ubuntu container..."
echo "[INFO] Starting Fluent Bit setup in Ubuntu container (offline-first)..."
# 安装必要的工具
echo "[INFO] Installing required packages..."
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl
# 解压bundle到/tmp
echo "[INFO] Extracting fluent-bit bundle..."
cp -r /private/etc /tmp
cp -r /private/packages /tmp
cd /tmp
# Stage bundle to /tmp (read-only mount under /private)
echo "[INFO] Staging fluent-bit bundle..."
rm -rf /tmp/flb && mkdir -p /tmp/flb
cp -r /private/etc /tmp/flb/
mkdir -p /tmp/flb/packages
cp -r /private/packages/* /tmp/flb/packages/ 2>/dev/null || true
# 安装 Fluent Bit 从 deb 包
echo "[INFO] Installing Fluent Bit from deb package..."
dpkg -i /tmp/packages/fluent-bit_3.1.9_amd64.deb || true
apt-get install -f -y -qq # 解决依赖问题
# Helper: make sure a shared library is known to the dynamic linker, installing
# it offline-first.
# Arguments:
#   $1 - soname to look for in the `ldconfig -p` listing (e.g. libpq.so.5)
#   $2 - glob, relative to /tmp/flb/packages, of the local .deb providing it
# Outputs: progress / warning lines on stdout.
# Returns: always 0 (best effort); callers verify the final binary separately.
ensure_lib() {
  local soname="$1"; shift
  local pattern="$1"; shift
  local cache candidate deb="" apt_pkg=""

  # Capture the listing before matching: piping ldconfig straight into
  # `grep -q` can fail under `set -o pipefail` when grep exits early and
  # ldconfig is killed by SIGPIPE, which would look like "library missing".
  cache="$(ldconfig -p 2>/dev/null || true)"
  # -F: the soname contains dots and must be matched literally, not as a regex.
  if grep -qF -- "$soname" <<<"$cache"; then
    echo "[OK] $soname already present"
    return 0
  fi

  # $pattern is intentionally unquoted so the glob expands (sorted); an
  # unmatched glob stays literal and is rejected by the -f test.
  for candidate in /tmp/flb/packages/$pattern; do
    if [[ -f "$candidate" ]]; then
      deb="$candidate"
      break
    fi
  done

  if [[ -n "$deb" ]]; then
    echo "[INFO] Installing local dependency: $(basename "$deb")"
    # Best effort: unresolved dependencies are handled by the fallback below.
    dpkg -i "$deb" >/dev/null 2>&1 || true
  else
    echo "[WARN] Local deb for $soname not found (pattern=$pattern)"
  fi

  cache="$(ldconfig -p 2>/dev/null || true)"
  if ! grep -qF -- "$soname" <<<"$cache"; then
    # Map every soname this script asks for to its apt package name.
    case "$soname" in
      libpq.so.5)       apt_pkg="libpq5" ;;
      libyaml-0.so.2)   apt_pkg="libyaml-0-2" ;;
      libsasl2.so.2)    apt_pkg="libsasl2-2" ;;
      libldap-2.5.so.0) apt_pkg="libldap-2.5-0" ;;
    esac
    if [[ -n "$apt_pkg" ]]; then
      echo "[WARN] $soname still missing after local install; attempting apt fallback"
      apt-get update -qq || true
      apt-get install -y -qq "$apt_pkg" || true
    else
      echo "[WARN] $soname still missing after local install; no apt fallback package known"
    fi
  fi
  # Refresh the linker cache so later checks see anything installed above.
  ldconfig 2>/dev/null || true
}
# Offline-first: satisfy runtime deps from local debs, fallback to apt only if necessary
ensure_lib "libpq.so.5" "libpq5_*_amd64.deb"
ensure_lib "libyaml-0.so.2" "libyaml-0-2_*_amd64.deb"
ensure_lib "libsasl2.so.2" "libsasl2-2_*_amd64.deb"
ensure_lib "libldap-2.5.so.0" "libldap-2.5-0_*_amd64.deb"
# Install fluent-bit main package from local bundle
# Picks the first fluent-bit_*_amd64.deb found in the staged bundle directory
# (/tmp/flb/packages); FLB_DEB stays empty when none matches.
FLB_DEB="$(ls /tmp/flb/packages/fluent-bit_*_amd64.deb 2>/dev/null | head -n1 || true)"
if [[ -z "$FLB_DEB" ]]; then
# Without the main package there is nothing to run, so abort the setup.
echo "[ERROR] fluent-bit deb not found under /private/packages" >&2
exit 1
fi
echo "[INFO] Installing Fluent Bit: $(basename "$FLB_DEB")"
# Best effort: dpkg output and exit status are discarded here; the checks
# below decide whether the installation actually worked.
dpkg -i "$FLB_DEB" >/dev/null 2>&1 || true
# If dpkg reported unresolved dependencies, try apt -f only as last resort
if ! command -v /opt/fluent-bit/bin/fluent-bit >/dev/null 2>&1; then
echo "[WARN] fluent-bit binary missing after dpkg; attempting apt --fix-broken"
apt-get install -f -y -qq || true
fi
# Ensure runtime library dependencies are satisfied (libsasl2, libldap are required via libpq/curl)
# MISSING collects, space-separated, the library names that ldd reports as
# "not found" for the fluent-bit binary; it is empty when all resolve (or
# when ldd itself fails, since its errors are suppressed).
MISSING=$(ldd /opt/fluent-bit/bin/fluent-bit 2>/dev/null | awk '/not found/{print $1}' | xargs -r echo || true)
if [[ -n "$MISSING" ]]; then
echo "[WARN] missing shared libs: $MISSING"
# Online fallback: installs the two known runtime packages regardless of
# which names are listed in MISSING, then lets apt repair broken deps.
apt-get update -qq || true
apt-get install -y -qq libsasl2-2 libldap-2.5-0 || true
apt-get install -f -y -qq || true
fi
# 验证 Fluent Bit 可以运行
echo "[INFO] Fluent Bit version:"
/opt/fluent-bit/bin/fluent-bit --version
/opt/fluent-bit/bin/fluent-bit --version || { echo "[ERROR] fluent-bit not installed or libraries missing" >&2; exit 1; }
# 创建配置目录
# Place configuration
mkdir -p /etc/fluent-bit
cp -r /tmp/etc/* /etc/fluent-bit/
cp -r /tmp/flb/etc/* /etc/fluent-bit/
# 创建日志和缓冲区目录
# Create logs/buffers dirs
mkdir -p /logs/train /logs/infer /buffers
chmod 755 /logs/train /logs/infer /buffers
# 等待 Elasticsearch 就绪
echo "[INFO] Waiting for Elasticsearch to be ready..."
while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do
echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..."
sleep 5
# Readiness gate for Elasticsearch: probe its TCP port through bash's
# /dev/tcp pseudo-device (no curl needed), once per second, 120 attempts max.
echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..."
for ((i = 1; i <= 120; i++)); do
  # Opening fd 3 read/write succeeds only if the TCP connection is accepted.
  if exec 3<>/dev/tcp/${ES_HOST}/${ES_PORT}; then
    # Close both directions of the probe socket again.
    exec 3<&- 3>&-
    echo "[INFO] Elasticsearch is ready"
    break
  fi
  # Last attempt failed: give up instead of sleeping once more.
  if [[ $i -eq 120 ]]; then
    echo "[ERROR] ES not reachable" >&2
    exit 1
  fi
  sleep 1
done
echo "[INFO] Elasticsearch is ready"
# 启动 Fluent Bit
echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
exec /opt/fluent-bit/bin/fluent-bit \
--config=/etc/fluent-bit/fluent-bit.conf
exec /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf

View File

@ -32,3 +32,42 @@ fi
echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}"
echo "[INFO] Fluent-bit files should be in fluent-bit/ directory"
# Prepare Fluent Bit offline dependencies: copy the libyaml/libpq debs shipped
# with the metric all-in-one-full bundle into ../fluent-bit/build/packages.
# Files already present in the target directory are left untouched.
FLB_BUILD_PACKAGES_DIR="$root/../fluent-bit/build/packages"
mkdir -p "$FLB_BUILD_PACKAGES_DIR"
for f in \
  "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \
  "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb; do
  # An unmatched glob stays literal, so skip anything that is not a real file.
  [[ -f "$f" ]] || continue
  base="$(basename "$f")"
  if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
    cp "$f" "$FLB_BUILD_PACKAGES_DIR/"
    echo " [+] copied $base"
  fi
done

# Extra: unpack the additional runtime dependencies (libsasl2/ldap and friends)
# from the all-in-one-full ubuntu22/curl.tar.gz so they can be installed offline.
CURLOPT_TAR="$project_root/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz"
if [[ -f "$CURLOPT_TAR" ]]; then
  tmpdir=$(mktemp -d)
  if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then
    # Patterns are quoted here so they cannot expand against the current
    # directory; they are expanded below, relative to the extracted tree.
    for p in \
      'libsasl2-2_*_amd64.deb' \
      'libsasl2-modules-db_*_amd64.deb' \
      'libldap-2.5-0_*_amd64.deb' \
      'libidn2-0_*_amd64.deb' \
      'libbrotli1_*_amd64.deb' \
      'libssl3_*_amd64.deb'; do
      src=""
      # $p is intentionally unquoted so the glob expands; take the first match.
      for candidate in "$tmpdir"/curl/$p; do
        if [[ -f "$candidate" ]]; then
          src="$candidate"
          break
        fi
      done
      [[ -n "$src" ]] || continue
      base="$(basename "$src")"
      # Explicit if: "A || B && C" would report "staged" even when the file
      # already existed and nothing was copied.
      if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
        cp "$src" "$FLB_BUILD_PACKAGES_DIR/"
        echo " [+] staged $base"
      fi
    done
  fi
  rm -rf "$tmpdir"
fi

View File

@ -1 +1 @@
1.30.0
1.33.0

View File

@ -32,7 +32,7 @@
- 一键执行
- `cd src/sys/tests`
- `./scripts/00_e2e_test.sh`
- `./scripts/00_e2e_test.sh`(CPU-only);`./scripts/00_e2e_test.sh --enable-gpu`(启用 GPU 流程)
- 分步执行(推荐用于排查)
- `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env`
@ -43,6 +43,11 @@
- `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点
- `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
- `./scripts/10_metric_publish.sh` 发布 metric 客户端包到 FTP
- `./scripts/11_metric_node_install.sh` 在 CPU 节点安装并验证端点
- `./scripts/12_metric_gpu_install.sh` 在 GPU 节点安装并等待 9100/9400 就绪(仅启用 GPU 时)
- `./scripts/13_metric_verify.sh` 对 master/Prometheus/数据面/Grafana 做综合校验(含 GPU 时校验 dcgm 指标)
- `./scripts/14_metric_cleanup.sh` 清理 FTP 产物
- `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/`
- 重置环境
@ -107,6 +112,7 @@
- 判定:
- `private/argus/etc/master.argus.com` 存在且为 master IP
- 在 node-a/node-b 内 `getent hosts master.argus.com` 成功解析到 master IP
- 在 metric CPU/GPU 节点内可解析 `master.argus.com`、`prom.metric.argus.com`
- `05_agent_register.sh`
- 目的:确认两个节点注册到 master 并持久化 `node.json`
@ -137,3 +143,16 @@
---
如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。
---
## 可选:GPU 流程说明
- 前置条件:宿主安装 NVIDIA 驱动与 `nvidia-container-toolkit`,`nvidia-smi` 在宿主可用。
- 启用方式:
- 一键:`./scripts/00_e2e_test.sh --enable-gpu`
- 分步:设置 `ARGUS_SYS_ENABLE_GPU=true` 后执行 `01_bootstrap.sh`、`02_up.sh`;或直接在 `.env` 中设置 `ENABLE_GPU=true` 后单独运行 `02_up.sh`
- `01_bootstrap.sh` 会写入:
- `METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001`
- `METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100`
- `METRIC_TEST_DCGM_GPU=172.31.0.51:9400`
- 验证点:`04_verify_dns_routing.sh` 增加对 metric 节点的域名解析;`12_metric_gpu_install.sh` 等待 9100/9400;`13_metric_verify_*` 校验 dcgm 指标与 Grafana 面板。

View File

@ -104,6 +104,45 @@ ensure_image "argus-web-frontend:latest"
ensure_image "argus-web-proxy:latest"
ensure_image "argus-alertmanager:latest"
# Stage Fluent Bit offline dependencies into src/log/fluent-bit/build/packages:
# first the libyaml/libpq debs shipped with the metric all-in-one-full bundle.
# Files already present in the target directory are left untouched.
echo "[INFO] Preparing Fluent Bit local dependency packages..."
FLB_BUILD_PACKAGES_DIR="$REPO_ROOT/src/log/fluent-bit/build/packages"
mkdir -p "$FLB_BUILD_PACKAGES_DIR"
for f in \
  "$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \
  "$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb; do
  # An unmatched glob stays literal, so skip anything that is not a real file.
  [[ -f "$f" ]] || continue
  base="$(basename "$f")"
  if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
    cp "$f" "$FLB_BUILD_PACKAGES_DIR/"
    echo " [+] copied $base"
  fi
done

# Extra: unpack the additional runtime dependencies (libsasl2/ldap and friends)
# from the all-in-one-full ubuntu22/curl.tar.gz so they can be installed offline.
CURLOPT_TAR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz"
if [[ -f "$CURLOPT_TAR" ]]; then
  tmpdir=$(mktemp -d)
  if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then
    # Patterns are quoted here so they cannot expand against the current
    # directory; they are expanded below, relative to the extracted tree.
    for p in \
      'libsasl2-2_*_amd64.deb' \
      'libsasl2-modules-db_*_amd64.deb' \
      'libldap-2.5-0_*_amd64.deb' \
      'libidn2-0_*_amd64.deb' \
      'libbrotli1_*_amd64.deb' \
      'libssl3_*_amd64.deb'; do
      src=""
      # $p is intentionally unquoted so the glob expands; take the first match.
      for candidate in "$tmpdir"/curl/$p; do
        if [[ -f "$candidate" ]]; then
          src="$candidate"
          break
        fi
      done
      [[ -n "$src" ]] || continue
      base="$(basename "$src")"
      # Explicit if: "A || B && C" would report "staged" even when the file
      # already existed and nothing was copied.
      if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
        cp "$src" "$FLB_BUILD_PACKAGES_DIR/"
        echo " [+] staged $base"
      fi
    done
  fi
  rm -rf "$tmpdir"
fi
echo "[INFO] Building agent binary..."
pushd "$REPO_ROOT/src/agent" >/dev/null
./scripts/build_binary.sh
@ -217,6 +256,13 @@ ARGUS_BUILD_GID=$ARGUS_BUILD_GID
# GPU 配置
ENABLE_GPU=$GPU_AVAILABLE
# 测试节点CPU/GPU默认标识与实例
METRIC_TEST_HOSTNAME_CPU=test-metric-node-001
METRIC_TEST_INSTANCE_CPU=172.31.0.50:9100
METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001
METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100
METRIC_TEST_DCGM_GPU=172.31.0.51:9400
# Master/日志/监控等服务的宿主机端口(自动分配)
MASTER_PORT=$MASTER_PORT_VAL
ES_HTTP_PORT=$ES_HTTP_PORT_VAL

View File

@ -20,7 +20,15 @@ if [[ -f "$TEST_ROOT/.env" ]]; then
set -a; source "$TEST_ROOT/.env"; set +a
fi
REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
# GPU switch precedence: an explicit ARGUS_SYS_ENABLE_GPU of "true"/"false"
# wins; any other value (or unset) falls back to ENABLE_GPU (e.g. from .env),
# and finally to "false".
case "${ARGUS_SYS_ENABLE_GPU:-}" in
  true)  REQUEST_GPU=true ;;
  false) REQUEST_GPU=false ;;
  *)     REQUEST_GPU=${ENABLE_GPU:-false} ;;
esac
GPU_AVAILABLE=false
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"

View File

@ -34,6 +34,25 @@ else
echo "[WARN] bind container not found; skip dig"
fi
# Verify that a container can resolve a list of domain names.
# Arguments:
#   $1   - container name, translated to a container id via cid_by_name
#   $2.. - domain names to resolve with `getent hosts` inside the container
# Outputs: progress lines on stdout, resolution failures on stderr.
# Returns: 0 when every domain resolves or the container does not exist
#          (the check is skipped with a warning); 1 on the first failure.
check_inside() {
  local cname="$1"; shift
  local domains=("$@")
  # Keep all working variables local so callers' globals (notably CID, which
  # the surrounding script also uses) are not clobbered.
  local cid d res
  cid="$(cid_by_name "$cname")"
  if [[ -z "$cid" ]]; then
    echo "[WARN] container $cname not found; skip"
    return 0
  fi
  for d in "${domains[@]}"; do
    echo "[INFO] Checking resolution inside $cname for $d..."
    # Resolve once and reuse the captured output for both the check and the log.
    if ! res="$(docker exec "$cid" getent hosts "$d" 2>/dev/null)"; then
      echo "[ERR] $cname cannot resolve $d" >&2
      return 1
    fi
    # First field of the first line is the resolved address.
    echo "[OK] $cname resolved $d -> $(awk 'NR==1{print $1}' <<<"$res")"
  done
}
for node in argus-node-a argus-node-b; do
CID="$(cid_by_name "$node")"
echo "[INFO] Checking resolution inside $node..."
@ -47,4 +66,8 @@ done
popd >/dev/null
# 追加:在 metric 节点中验证 master 与 prom 域名解析
check_inside argus-metric-test-node master.argus.com prom.metric.argus.com || exit 1
check_inside argus-metric-test-gpu-node master.argus.com prom.metric.argus.com || exit 1
echo "[OK] DNS routing verified"

View File

@ -62,3 +62,21 @@ echo "[SYS-METRIC] GPU 节点安装完成"
'
echo "[SYS-METRIC] Metric GPU 节点安装流程完成"
# Readiness probe: poll an exporter's /metrics endpoint from inside $CONTAINER
# until it answers. The retry budget is 30, with a 2 second pause after each
# failed attempt; once the budget is used up the script exits with an error.
# Arguments: $1 - exporter label used in messages, $2 - port to probe
wait_for_exporter() {
  local label="$1" port="$2"
  retries=30
  while ! docker exec "$CONTAINER" bash -lc "curl -fsS --max-time 2 http://localhost:${port}/metrics >/dev/null"; do
    if (( retries == 0 )); then
      echo "[ERR] ${label} ${port} 未就绪" >&2
      exit 1
    fi
    retries=$(( retries - 1 ))
    sleep 2
  done
  echo "[OK] ${label} 端点可访问"
}

echo "[SYS-METRIC] 等待 dcgm-exporter(9400) 与 node-exporter(9100) 就绪..."
wait_for_exporter dcgm-exporter 9400
wait_for_exporter node-exporter 9100
# Marker file recording that both exporter endpoints answered.
mkdir -p "$TEST_ROOT/tmp" && touch "$TEST_ROOT/tmp/gpu_install_ready"

View File

@ -51,3 +51,16 @@ PY
awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; }
echo "[OK] sample timestamp advanced"
echo "[DONE] dataplane verify"
# Additional check (only when GPU is enabled): both exporter endpoints on the
# GPU node must answer /metrics from inside the container.
if [[ "${ENABLE_GPU:-false}" == "true" ]]; then
  echo
  echo "[VERIFY:DATA][GPU] curl endpoints on gpu node"
  # 9100 is checked first, then 9400; the first failure aborts.
  for port in 9100 9400; do
    docker exec argus-metric-test-gpu-node bash -lc "curl -fsS --max-time 5 http://localhost:${port}/metrics >/dev/null" || {
      echo "[ERR] gpu node ${port} not reachable" >&2
      exit 1
    }
  done
  echo "[OK] gpu node endpoints reachable"
fi

View File

@ -71,4 +71,17 @@ print('OK',val)
PY
echo "[OK] cluster node online count >= 1 via Grafana proxy"
# Load the generated test environment (exporting every variable) when present.
if [[ -f "$TEST_ROOT/.env" ]]; then
  set -a
  source "$TEST_ROOT/.env"
  set +a
fi

# Optional: GPU panel query (only when GPU is enabled). The DCGM GPU
# utilisation series is queried via proxy_query and the saved result is
# checked by assert_vector_recent_nonempty (called with 300).
if [[ "${ENABLE_GPU:-false}" == "true" ]]; then
  echo "[VERIFY:GRAF-PANELS] GPU Panels — DCGM GPU UTIL"
  Q_GPU_UTIL='DCGM_FI_DEV_GPU_UTIL'
  proxy_query "$Q_GPU_UTIL" "$TMP_DIR/graf_panel_dcgm_util.json"
  if ! assert_vector_recent_nonempty "$TMP_DIR/graf_panel_dcgm_util.json" 300 >/dev/null; then
    echo "[ERR] dcgm gpu util no recent sample via Grafana proxy" >&2
    exit 1
  fi
  echo "[OK] dcgm gpu util has recent samples via Grafana proxy"
fi
echo "[DONE] grafana panels verify"

View File

@ -12,7 +12,7 @@ if [[ -f "$TEST_ROOT/.env" ]]; then
fi
PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1"
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"
HOSTNAME="${METRIC_TEST_HOSTNAME:-${METRIC_TEST_HOSTNAME_CPU:-test-metric-node-001}}"
nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json"
@ -145,3 +145,54 @@ assert val>=1, f"count < 1: {val}"
PY
echo "[OK] up count satisfied"
echo "[DONE] prometheus verify"
# ========== GPU 验证(可选) ==========
if [[ "${ENABLE_GPU:-false}" == "true" ]]; then
echo
echo "[VERIFY:PROM][GPU] dcgm targets & up metric"
GPU_IP_PORT="${METRIC_TEST_DCGM_GPU:-172.31.0.51:9400}"
GPU_IP="${GPU_IP_PORT%%:*}"
# 1) file_sd 目标存在(在 Prometheus 容器内生成的 targets 文件)
TARGETS_FILE="$TEST_ROOT/private/argus/metric/prometheus/targets/dcgm_exporter.json"
if [[ ! -f "$TARGETS_FILE" ]]; then
echo "[ERR] $TARGETS_FILE missing" >&2; exit 1
fi
if ! jq -r '.[].targets[]' "$TARGETS_FILE" 2>/dev/null | grep -q "^${GPU_IP}:9400$"; then
echo "[ERR] dcgm target not found for ${GPU_IP}:9400" >&2
exit 1
fi
echo "[OK] dcgm target present in file_sd"
# 2) up{job="dcgm", ip=GPU_IP} == 1
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"dcgm\",ip=\"$GPU_IP\"}==1" > "$TMP_DIR/prom_dcgm_up.json"
python3 - "$TMP_DIR/prom_dcgm_up.json" <<'PY'
import json,sys
j=json.load(open(sys.argv[1]))
res=j.get('data',{}).get('result',[])
assert res, 'up==1 empty for dcgm'
val=float(res[0]['value'][1])
assert val==1.0, f'up not 1: {val}'
print('OK')
PY
echo "[OK] up{job=dcgm,ip=$GPU_IP} == 1"
# 3) 至少一个 GPU 指标存在(优先 DCGM_FI_DEV_GPU_UTIL若无则尝试 DCGM_FI_DEV_FB_USED
# Run one instant query against the Prometheus HTTP API and report whether it
# returned at least one series.
# Globals:   PROM_BASE (read) - base URL of the Prometheus API
# Arguments: $1 - PromQL expression, $2 - file that receives the raw JSON
# Returns:   0 if status is "success" and the result list is non-empty,
#            non-zero otherwise (including when the HTTP request fails).
query_one() {
  local q="$1" out="$2"
  # This function is called inside `if … || …`, where `set -e` is suppressed,
  # so a failed request must be turned into a return explicitly; otherwise
  # python would be fed an empty or partial file.
  curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=$q" > "$out" || return 1
  python3 - "$out" <<'PY'
import json,sys
j=json.load(open(sys.argv[1]))
ok=(j.get('status')=='success' and len(j.get('data',{}).get('result',[]))>0)
raise SystemExit(0 if ok else 1)
PY
}
if query_one 'DCGM_FI_DEV_GPU_UTIL' "$TMP_DIR/prom_dcgm_util.json" || query_one 'DCGM_FI_DEV_FB_USED' "$TMP_DIR/prom_dcgm_fb.json"; then
echo "[OK] dcgm metrics present"
else
echo "[ERR] no dcgm metrics found" >&2; exit 1
fi
echo "[DONE] prometheus gpu verify"
fi