diff --git a/src/log/fluent-bit/build/packages/libbrotli1_1.0.9-2build6_amd64.deb b/src/log/fluent-bit/build/packages/libbrotli1_1.0.9-2build6_amd64.deb new file mode 100644 index 0000000..ab0e6d8 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libbrotli1_1.0.9-2build6_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libidn2-0_2.3.2-2build1_amd64.deb b/src/log/fluent-bit/build/packages/libidn2-0_2.3.2-2build1_amd64.deb new file mode 100644 index 0000000..017d14f Binary files /dev/null and b/src/log/fluent-bit/build/packages/libidn2-0_2.3.2-2build1_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libldap-2.5-0_2.5.19+dfsg-0ubuntu0.22.04.1_amd64.deb b/src/log/fluent-bit/build/packages/libldap-2.5-0_2.5.19+dfsg-0ubuntu0.22.04.1_amd64.deb new file mode 100644 index 0000000..375f621 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libldap-2.5-0_2.5.19+dfsg-0ubuntu0.22.04.1_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb b/src/log/fluent-bit/build/packages/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb new file mode 100644 index 0000000..9832c54 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libsasl2-2_2.1.27+dfsg2-3ubuntu1.2_amd64.deb b/src/log/fluent-bit/build/packages/libsasl2-2_2.1.27+dfsg2-3ubuntu1.2_amd64.deb new file mode 100644 index 0000000..a5a960c Binary files /dev/null and b/src/log/fluent-bit/build/packages/libsasl2-2_2.1.27+dfsg2-3ubuntu1.2_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libsasl2-modules-db_2.1.27+dfsg2-3ubuntu1.2_amd64.deb b/src/log/fluent-bit/build/packages/libsasl2-modules-db_2.1.27+dfsg2-3ubuntu1.2_amd64.deb new file mode 100644 index 0000000..fb1d510 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libsasl2-modules-db_2.1.27+dfsg2-3ubuntu1.2_amd64.deb differ diff --git 
a/src/log/fluent-bit/build/packages/libssl3_3.0.2-0ubuntu1.20_amd64.deb b/src/log/fluent-bit/build/packages/libssl3_3.0.2-0ubuntu1.20_amd64.deb new file mode 100644 index 0000000..cfc883f Binary files /dev/null and b/src/log/fluent-bit/build/packages/libssl3_3.0.2-0ubuntu1.20_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libyaml-0-2_0.2.2-1build2_amd64.deb b/src/log/fluent-bit/build/packages/libyaml-0-2_0.2.2-1build2_amd64.deb new file mode 100644 index 0000000..a995886 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libyaml-0-2_0.2.2-1build2_amd64.deb differ diff --git a/src/log/fluent-bit/build/start-fluent-bit.sh b/src/log/fluent-bit/build/start-fluent-bit.sh index 5db6aa7..5b4cd35 100755 --- a/src/log/fluent-bit/build/start-fluent-bit.sh +++ b/src/log/fluent-bit/build/start-fluent-bit.sh @@ -1,47 +1,96 @@ #!/bin/bash set -euo pipefail -echo "[INFO] Starting Fluent Bit setup in Ubuntu container..." +echo "[INFO] Starting Fluent Bit setup in Ubuntu container (offline-first)..." -# 安装必要的工具 -echo "[INFO] Installing required packages..." export DEBIAN_FRONTEND=noninteractive -apt-get update -qq -apt-get install -y -qq curl -# 解压bundle到/tmp -echo "[INFO] Extracting fluent-bit bundle..." -cp -r /private/etc /tmp -cp -r /private/packages /tmp -cd /tmp +# Stage bundle to /tmp (read-only mount under /private) +echo "[INFO] Staging fluent-bit bundle..." +rm -rf /tmp/flb && mkdir -p /tmp/flb +cp -r /private/etc /tmp/flb/ +mkdir -p /tmp/flb/packages +cp -r /private/packages/* /tmp/flb/packages/ 2>/dev/null || true -# 安装 Fluent Bit 从 deb 包 -echo "[INFO] Installing Fluent Bit from deb package..." 
-dpkg -i /tmp/packages/fluent-bit_3.1.9_amd64.deb || true -apt-get install -f -y -qq # 解决依赖问题 +# Helper: make sure shared library $1 is known to ldconfig; install it from the local deb matching glob $2, falling back to apt +ensure_lib() { + local soname="$1"; shift + local pattern="$1"; shift + if grep -qF -- "$soname" <<<"$(ldconfig -p 2>/dev/null || true)"; then # here-string, not a pipe: grep -q exiting early must not fail the check under pipefail + echo "[OK] $soname already present" + return 0 + fi + local deb; deb="$(ls /tmp/flb/packages/$pattern 2>/dev/null | head -n1 || true)" + if [[ -n "$deb" ]]; then + echo "[INFO] Installing local dependency: $(basename "$deb")" + dpkg -i "$deb" >/dev/null 2>&1 || true + else + echo "[WARN] Local deb for $soname not found (pattern=$pattern)" + fi + if ! grep -qF -- "$soname" <<<"$(ldconfig -p 2>/dev/null || true)"; then + echo "[WARN] $soname still missing after local install; attempting apt fallback" + apt-get update -qq || true + # apt package name = deb glob prefix before the first "_" (e.g. libsasl2-2_*_amd64.deb -> libsasl2-2), + # so every library passed to ensure_lib gets a fallback, not only libpq5 and libyaml-0-2 + local pkg="${pattern%%_*}" + apt-get install -y -qq "$pkg" || true + fi + ldconfig 2>/dev/null || true +} + +# Offline-first: satisfy runtime deps from local debs, fallback to apt only if necessary +ensure_lib "libpq.so.5" "libpq5_*_amd64.deb" +ensure_lib "libyaml-0.so.2" "libyaml-0-2_*_amd64.deb" +ensure_lib "libsasl2.so.2" "libsasl2-2_*_amd64.deb" +ensure_lib "libldap-2.5.so.0" "libldap-2.5-0_*_amd64.deb" + +# Install fluent-bit main package from local bundle +FLB_DEB="$(ls /tmp/flb/packages/fluent-bit_*_amd64.deb 2>/dev/null | head -n1 || true)" +if [[ -z "$FLB_DEB" ]]; then + echo "[ERROR] fluent-bit deb not found under /private/packages" >&2 + exit 1 +fi +echo "[INFO] Installing Fluent Bit: $(basename "$FLB_DEB")" +dpkg -i "$FLB_DEB" >/dev/null 2>&1 || true + +# If dpkg reported unresolved dependencies, try apt -f only as last resort +if ! 
command -v /opt/fluent-bit/bin/fluent-bit >/dev/null 2>&1; then + echo "[WARN] fluent-bit binary missing after dpkg; attempting apt --fix-broken" + apt-get install -f -y -qq || true +fi + +# Ensure runtime library dependencies are satisfied (libsasl2, libldap are required via libpq/curl) +MISSING=$(ldd /opt/fluent-bit/bin/fluent-bit 2>/dev/null | awk '/not found/{print $1}' | xargs -r echo || true) +if [[ -n "$MISSING" ]]; then + echo "[WARN] missing shared libs: $MISSING" + apt-get update -qq || true + apt-get install -y -qq libsasl2-2 libldap-2.5-0 || true + apt-get install -f -y -qq || true +fi -# 验证 Fluent Bit 可以运行 echo "[INFO] Fluent Bit version:" -/opt/fluent-bit/bin/fluent-bit --version +/opt/fluent-bit/bin/fluent-bit --version || { echo "[ERROR] fluent-bit not installed or libraries missing" >&2; exit 1; } -# 创建配置目录 +# Place configuration mkdir -p /etc/fluent-bit -cp -r /tmp/etc/* /etc/fluent-bit/ +cp -r /tmp/flb/etc/* /etc/fluent-bit/ -# 创建日志和缓冲区目录 +# Create logs/buffers dirs mkdir -p /logs/train /logs/infer /buffers chmod 755 /logs/train /logs/infer /buffers -# 等待 Elasticsearch 就绪 -echo "[INFO] Waiting for Elasticsearch to be ready..." -while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do - echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..." - sleep 5 +# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency +echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..." 
+for i in $(seq 1 120); do + if exec 3<>/dev/tcp/${ES_HOST}/${ES_PORT}; then + exec 3<&- 3>&- + echo "[INFO] Elasticsearch is ready" + break + fi + [[ $i -eq 120 ]] && { echo "[ERROR] ES not reachable" >&2; exit 1; } + sleep 1 done -echo "[INFO] Elasticsearch is ready" -# 启动 Fluent Bit echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/" echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf" -exec /opt/fluent-bit/bin/fluent-bit \ - --config=/etc/fluent-bit/fluent-bit.conf +exec /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf diff --git a/src/log/tests/scripts/01_bootstrap.sh b/src/log/tests/scripts/01_bootstrap.sh index 93898e0..fb322ab 100755 --- a/src/log/tests/scripts/01_bootstrap.sh +++ b/src/log/tests/scripts/01_bootstrap.sh @@ -32,3 +32,42 @@ fi echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}" echo "[INFO] Fluent-bit files should be in fluent-bit/ directory" + +# Prepare Fluent Bit offline dependencies (copy debs from metric all-in-one-full into ../fluent-bit/build/packages) +FLB_BUILD_PACKAGES_DIR="$root/../fluent-bit/build/packages" +mkdir -p "$FLB_BUILD_PACKAGES_DIR" +for deb in \ + "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \ + "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb ; do + if ls $deb >/dev/null 2>&1; then + for f in $deb; do + base="$(basename "$f")" + if [[ ! 
-f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then + cp "$f" "$FLB_BUILD_PACKAGES_DIR/" + echo " [+] copied $base" + fi + done + fi +done + +# Extra: unpack the dependencies needed for offline install (libsasl2/ldap) from the ubuntu22/curl.tar.gz shipped with all-in-one-full +CURLOPT_TAR="$project_root/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz" +if [[ -f "$CURLOPT_TAR" ]]; then + tmpdir=$(mktemp -d) + if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then + for p in \ + libsasl2-2_*_amd64.deb \ + libsasl2-modules-db_*_amd64.deb \ + libldap-2.5-0_*_amd64.deb \ + libidn2-0_*_amd64.deb \ + libbrotli1_*_amd64.deb \ + libssl3_*_amd64.deb ; do + src=$(ls "$tmpdir"/curl/$p 2>/dev/null | head -n1 || true) + if [[ -n "$src" ]]; then + base="$(basename "$src")" + [[ -f "$FLB_BUILD_PACKAGES_DIR/$base" ]] || { cp "$src" "$FLB_BUILD_PACKAGES_DIR/" && echo " [+] staged $base"; } # braces: report "staged" only when the copy actually ran (a || b && c parses as (a || b) && c) + fi + done + fi + rm -rf "$tmpdir" +fi diff --git a/src/metric/client-plugins/all-in-one-full/config/VERSION b/src/metric/client-plugins/all-in-one-full/config/VERSION index 034552a..7aa332e 100644 --- a/src/metric/client-plugins/all-in-one-full/config/VERSION +++ b/src/metric/client-plugins/all-in-one-full/config/VERSION @@ -1 +1 @@ -1.30.0 +1.33.0 diff --git a/src/sys/tests/README.md b/src/sys/tests/README.md index 4a21be5..c166625 100644 --- a/src/sys/tests/README.md +++ b/src/sys/tests/README.md @@ -32,7 +32,7 @@ - 一键执行 - `cd src/sys/tests` - - `./scripts/00_e2e_test.sh` + - `./scripts/00_e2e_test.sh`(CPU-only)或 `./scripts/00_e2e_test.sh --enable-gpu`(启用 GPU 流程) - 分步执行(推荐用于排查) - `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env` @@ -43,6 +43,11 @@ - `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点 - `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长 - `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新 + - `./scripts/10_metric_publish.sh` 发布 metric 客户端包到 FTP + - `./scripts/11_metric_node_install.sh` 在 CPU 节点安装并验证端点 + - 
`./scripts/12_metric_gpu_install.sh` 在 GPU 节点安装并等待 9100/9400 就绪(仅启用 GPU 时) + - `./scripts/13_metric_verify.sh` 对 master/Prometheus/数据面/Grafana 做综合校验(含 GPU 时校验 dcgm 指标) + - `./scripts/14_metric_cleanup.sh` 清理 FTP 产物 - `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/` - 重置环境 @@ -107,6 +112,7 @@ - 判定: - `private/argus/etc/master.argus.com` 存在且为 master IP - 在 node-a/node-b 内 `getent hosts master.argus.com` 成功解析到 master IP + - 在 metric CPU/GPU 节点内可解析 `master.argus.com` 与 `prom.metric.argus.com` - `05_agent_register.sh` - 目的:确认两个节点注册到 master 并持久化 `node.json` @@ -137,3 +143,16 @@ --- 如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。 + +--- + +## 可选:GPU 流程说明 +- 前置条件:宿主安装 NVIDIA 驱动与 `nvidia-container-toolkit`,`nvidia-smi` 在宿主可用。 +- 启用方式: + - 一键:`./scripts/00_e2e_test.sh --enable-gpu` + - 分步:设置 `ARGUS_SYS_ENABLE_GPU=true` 后执行 `01_bootstrap.sh`、`02_up.sh`;或直接在 `.env` 中将 `ENABLE_GPU=true` 后单独运行 `02_up.sh`。 +- `01_bootstrap.sh` 会写入: + - `METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001` + - `METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100` + - `METRIC_TEST_DCGM_GPU=172.31.0.51:9400` +- 验证点:`04_verify_dns_routing.sh` 增加对 metric 节点的域名解析;`12_metric_gpu_install.sh` 等待 9100/9400;`13_metric_verify_*` 校验 dcgm 指标与 Grafana 面板。 diff --git a/src/sys/tests/scripts/01_bootstrap.sh b/src/sys/tests/scripts/01_bootstrap.sh index f1a45f8..a4dd69e 100755 --- a/src/sys/tests/scripts/01_bootstrap.sh +++ b/src/sys/tests/scripts/01_bootstrap.sh @@ -104,6 +104,45 @@ ensure_image "argus-web-frontend:latest" ensure_image "argus-web-proxy:latest" ensure_image "argus-alertmanager:latest" +echo "[INFO] Preparing Fluent Bit local dependency packages..." 
+FLB_BUILD_PACKAGES_DIR="$REPO_ROOT/src/log/fluent-bit/build/packages" +mkdir -p "$FLB_BUILD_PACKAGES_DIR" +for deb in \ + "$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \ + "$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb ; do + if ls $deb >/dev/null 2>&1; then + for f in $deb; do + base="$(basename "$f")" + if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then + cp "$f" "$FLB_BUILD_PACKAGES_DIR/" + echo " [+] copied $base" + fi + done + fi +done + +# Extra: unpack the dependencies needed for offline install (libsasl2/ldap) from the ubuntu22/curl.tar.gz shipped with all-in-one-full +CURLOPT_TAR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz" +if [[ -f "$CURLOPT_TAR" ]]; then + tmpdir=$(mktemp -d) + if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then + for p in \ + libsasl2-2_*_amd64.deb \ + libsasl2-modules-db_*_amd64.deb \ + libldap-2.5-0_*_amd64.deb \ + libidn2-0_*_amd64.deb \ + libbrotli1_*_amd64.deb \ + libssl3_*_amd64.deb ; do + src=$(ls "$tmpdir"/curl/$p 2>/dev/null | head -n1 || true) + if [[ -n "$src" ]]; then + base="$(basename "$src")" + [[ -f "$FLB_BUILD_PACKAGES_DIR/$base" ]] || { cp "$src" "$FLB_BUILD_PACKAGES_DIR/" && echo " [+] staged $base"; } # braces: report "staged" only when the copy actually ran (a || b && c parses as (a || b) && c) + fi + done + fi + rm -rf "$tmpdir" +fi + echo "[INFO] Building agent binary..." 
pushd "$REPO_ROOT/src/agent" >/dev/null ./scripts/build_binary.sh @@ -217,6 +256,13 @@ ARGUS_BUILD_GID=$ARGUS_BUILD_GID # GPU 配置 ENABLE_GPU=$GPU_AVAILABLE +# 测试节点(CPU/GPU)默认标识与实例 +METRIC_TEST_HOSTNAME_CPU=test-metric-node-001 +METRIC_TEST_INSTANCE_CPU=172.31.0.50:9100 +METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001 +METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100 +METRIC_TEST_DCGM_GPU=172.31.0.51:9400 + # Master/日志/监控等服务的宿主机端口(自动分配) MASTER_PORT=$MASTER_PORT_VAL ES_HTTP_PORT=$ES_HTTP_PORT_VAL diff --git a/src/sys/tests/scripts/02_up.sh b/src/sys/tests/scripts/02_up.sh index bfc15cd..9879d58 100755 --- a/src/sys/tests/scripts/02_up.sh +++ b/src/sys/tests/scripts/02_up.sh @@ -20,7 +20,15 @@ if [[ -f "$TEST_ROOT/.env" ]]; then set -a; source "$TEST_ROOT/.env"; set +a fi -REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false} +# GPU switch precedence: explicit ARGUS_SYS_ENABLE_GPU (true/false) > ENABLE_GPU from .env > default false +if [[ "${ARGUS_SYS_ENABLE_GPU:-}" == "true" ]]; then + REQUEST_GPU=true +elif [[ "${ARGUS_SYS_ENABLE_GPU:-}" == "false" ]]; then + REQUEST_GPU=false +else + REQUEST_GPU=${ENABLE_GPU:-false} +fi + GPU_AVAILABLE=false GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh" diff --git a/src/sys/tests/scripts/04_verify_dns_routing.sh b/src/sys/tests/scripts/04_verify_dns_routing.sh index 3b389d7..1895131 100755 --- a/src/sys/tests/scripts/04_verify_dns_routing.sh +++ b/src/sys/tests/scripts/04_verify_dns_routing.sh @@ -34,6 +34,25 @@ else echo "[WARN] bind container not found; skip dig" fi +check_inside() { + local cname="$1"; shift + local domains=("$@") + CID="$(cid_by_name "$cname")" + if [[ -z "$CID" ]]; then + echo "[WARN] container $cname not found; skip" + return 0 + fi + for d in "${domains[@]}"; do + echo "[INFO] Checking resolution inside $cname for $d..." + if ! 
docker exec "$CID" getent hosts "$d" >/dev/null 2>&1; then + echo "[ERR] $cname cannot resolve $d" >&2 + return 1 + fi + RES="$(docker exec "$CID" getent hosts "$d" | awk '{print $1}' | head -n1)" + echo "[OK] $cname resolved $d -> $RES" + done +} + for node in argus-node-a argus-node-b; do CID="$(cid_by_name "$node")" echo "[INFO] Checking resolution inside $node..." @@ -47,4 +66,8 @@ done popd >/dev/null +# Additionally: verify that the master and prom domain names resolve inside the metric nodes +check_inside argus-metric-test-node master.argus.com prom.metric.argus.com || exit 1 +check_inside argus-metric-test-gpu-node master.argus.com prom.metric.argus.com || exit 1 + echo "[OK] DNS routing verified" diff --git a/src/sys/tests/scripts/12_metric_gpu_install.sh b/src/sys/tests/scripts/12_metric_gpu_install.sh index 917221a..c92bf4f 100755 --- a/src/sys/tests/scripts/12_metric_gpu_install.sh +++ b/src/sys/tests/scripts/12_metric_gpu_install.sh @@ -62,3 +62,21 @@ echo "[SYS-METRIC] GPU 节点安装完成" ' echo "[SYS-METRIC] Metric GPU 节点安装流程完成" + +# Readiness check: ports 9400 (dcgm) and 9100 (node) +echo "[SYS-METRIC] 等待 dcgm-exporter(9400) 与 node-exporter(9100) 就绪..." 
+retries=30 +until docker exec "$CONTAINER" bash -lc "curl -fsS --max-time 2 http://localhost:9400/metrics >/dev/null"; do + ((retries--)) || { echo "[ERR] dcgm-exporter 9400 未就绪" >&2; exit 1; } + sleep 2 +done +echo "[OK] dcgm-exporter 端点可访问" + +retries=30 +until docker exec "$CONTAINER" bash -lc "curl -fsS --max-time 2 http://localhost:9100/metrics >/dev/null"; do + ((retries--)) || { echo "[ERR] node-exporter 9100 未就绪" >&2; exit 1; } + sleep 2 +done +echo "[OK] node-exporter 端点可访问" + +mkdir -p "$TEST_ROOT/tmp" && touch "$TEST_ROOT/tmp/gpu_install_ready" diff --git a/src/sys/tests/scripts/13_metric_verify_dataplane.sh b/src/sys/tests/scripts/13_metric_verify_dataplane.sh index 4cb78d9..12342ec 100755 --- a/src/sys/tests/scripts/13_metric_verify_dataplane.sh +++ b/src/sys/tests/scripts/13_metric_verify_dataplane.sh @@ -51,3 +51,16 @@ PY awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; } echo "[OK] sample timestamp advanced" echo "[DONE] dataplane verify" + +# Additionally: GPU node endpoint connectivity check (only when GPU is enabled) +if [[ "${ENABLE_GPU:-false}" == "true" ]]; then + echo + echo "[VERIFY:DATA][GPU] curl endpoints on gpu node" + if ! docker exec argus-metric-test-gpu-node bash -lc 'curl -fsS --max-time 5 http://localhost:9100/metrics >/dev/null'; then + echo "[ERR] gpu node 9100 not reachable" >&2; exit 1 + fi + if ! 
docker exec argus-metric-test-gpu-node bash -lc 'curl -fsS --max-time 5 http://localhost:9400/metrics >/dev/null'; then + echo "[ERR] gpu node 9400 not reachable" >&2; exit 1 + fi + echo "[OK] gpu node endpoints reachable" +fi diff --git a/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh b/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh index 8cdc731..0b5b242 100755 --- a/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh +++ b/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh @@ -71,4 +71,17 @@ print('OK',val) PY echo "[OK] cluster node online count >= 1 via Grafana proxy" +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +# Optional: GPU panel query (only when GPU is enabled) +if [[ "${ENABLE_GPU:-false}" == "true" ]]; then + echo "[VERIFY:GRAF-PANELS] GPU Panels — DCGM GPU UTIL" + Q_GPU_UTIL='DCGM_FI_DEV_GPU_UTIL' + proxy_query "$Q_GPU_UTIL" "$TMP_DIR/graf_panel_dcgm_util.json" + assert_vector_recent_nonempty "$TMP_DIR/graf_panel_dcgm_util.json" 300 >/dev/null || { echo "[ERR] dcgm gpu util no recent sample via Grafana proxy" >&2; exit 1; } + echo "[OK] dcgm gpu util has recent samples via Grafana proxy" +fi + echo "[DONE] grafana panels verify" diff --git a/src/sys/tests/scripts/13_metric_verify_prometheus.sh b/src/sys/tests/scripts/13_metric_verify_prometheus.sh index 374c013..b5bd781 100755 --- a/src/sys/tests/scripts/13_metric_verify_prometheus.sh +++ b/src/sys/tests/scripts/13_metric_verify_prometheus.sh @@ -12,7 +12,7 @@ if [[ -f "$TEST_ROOT/.env" ]]; then fi PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1" -HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}" +HOSTNAME="${METRIC_TEST_HOSTNAME:-${METRIC_TEST_HOSTNAME_CPU:-test-metric-node-001}}" nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json" targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json" @@ -145,3 +145,54 @@ assert val>=1, f"count < 1: {val}" PY echo "[OK] up count satisfied" echo "[DONE] 
prometheus verify" + +# ========== GPU verification (optional) ========== +if [[ "${ENABLE_GPU:-false}" == "true" ]]; then + echo + echo "[VERIFY:PROM][GPU] dcgm targets & up metric" + GPU_IP_PORT="${METRIC_TEST_DCGM_GPU:-172.31.0.51:9400}" + GPU_IP="${GPU_IP_PORT%%:*}" + + # 1) the file_sd target exists (targets file generated inside the Prometheus container) + TARGETS_FILE="$TEST_ROOT/private/argus/metric/prometheus/targets/dcgm_exporter.json" + if [[ ! -f "$TARGETS_FILE" ]]; then + echo "[ERR] $TARGETS_FILE missing" >&2; exit 1 + fi + if ! jq -r '.[].targets[]' "$TARGETS_FILE" 2>/dev/null | grep -q "^${GPU_IP}:9400$"; then + echo "[ERR] dcgm target not found for ${GPU_IP}:9400" >&2 + exit 1 + fi + echo "[OK] dcgm target present in file_sd" + + # 2) up{job="dcgm", ip=GPU_IP} == 1 + curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"dcgm\",ip=\"$GPU_IP\"}==1" > "$TMP_DIR/prom_dcgm_up.json" + python3 - "$TMP_DIR/prom_dcgm_up.json" <<'PY' +import json,sys +j=json.load(open(sys.argv[1])) +res=j.get('data',{}).get('result',[]) +assert res, 'up==1 empty for dcgm' +val=float(res[0]['value'][1]) +assert val==1.0, f'up not 1: {val}' +print('OK') +PY + echo "[OK] up{job=dcgm,ip=$GPU_IP} == 1" + + # 3) at least one GPU metric exists (DCGM_FI_DEV_GPU_UTIL first, falling back to DCGM_FI_DEV_FB_USED) + query_one() { + local q="$1"; local out="$2" + curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=$q" > "$out" + python3 - "$out" <<'PY' +import json,sys +j=json.load(open(sys.argv[1])) +ok=(j.get('status')=='success' and len(j.get('data',{}).get('result',[]))>0) +raise SystemExit(0 if ok else 1) +PY + } + if query_one 'DCGM_FI_DEV_GPU_UTIL' "$TMP_DIR/prom_dcgm_util.json" || query_one 'DCGM_FI_DEV_FB_USED' "$TMP_DIR/prom_dcgm_fb.json"; then + echo "[OK] dcgm metrics present" + else + echo "[ERR] no dcgm metrics found" >&2; exit 1 + fi + + echo "[DONE] prometheus gpu verify" +fi