完成a6000测试系统构建、部署、测试整合 #35
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,47 +1,96 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
echo "[INFO] Starting Fluent Bit setup in Ubuntu container..."
|
echo "[INFO] Starting Fluent Bit setup in Ubuntu container (offline-first)..."
|
||||||
|
|
||||||
# 安装必要的工具
|
|
||||||
echo "[INFO] Installing required packages..."
|
|
||||||
export DEBIAN_FRONTEND=noninteractive
|
export DEBIAN_FRONTEND=noninteractive
|
||||||
apt-get update -qq
|
|
||||||
apt-get install -y -qq curl
|
|
||||||
|
|
||||||
# 解压bundle到/tmp
|
# Stage bundle to /tmp (read-only mount under /private)
|
||||||
echo "[INFO] Extracting fluent-bit bundle..."
|
echo "[INFO] Staging fluent-bit bundle..."
|
||||||
cp -r /private/etc /tmp
|
rm -rf /tmp/flb && mkdir -p /tmp/flb
|
||||||
cp -r /private/packages /tmp
|
cp -r /private/etc /tmp/flb/
|
||||||
cd /tmp
|
mkdir -p /tmp/flb/packages
|
||||||
|
cp -r /private/packages/* /tmp/flb/packages/ 2>/dev/null || true
|
||||||
|
|
||||||
# 安装 Fluent Bit 从 deb 包
|
# Helper: check and install a local deb if not already satisfied
|
||||||
echo "[INFO] Installing Fluent Bit from deb package..."
|
# Make sure a shared library is known to the dynamic linker, preferring the
# offline bundle and touching the network only as a last resort.
#
# Arguments:
#   $1 - soname to look for in the linker cache (e.g. libpq.so.5)
#   $2 - glob (relative to the package directory) of the .deb that provides it
# Environment:
#   FLB_PKG_DIR - directory holding the staged .deb files
#                 (defaults to /tmp/flb/packages)
# Outputs:
#   Progress / warning messages on stdout.
# Returns:
#   Best-effort: dpkg/apt/ldconfig failures are tolerated and are not reported
#   through the exit status.
ensure_lib() {
  local soname="$1"; shift
  local pattern="$1"; shift
  local pkg_dir="${FLB_PKG_DIR:-/tmp/flb/packages}"
  local cache deb candidate apt_pkg

  # Snapshot the linker cache into a variable instead of piping into
  # `grep -q`: under `set -o pipefail` grep's early exit can SIGPIPE the
  # writer and turn a successful match into a failed pipeline. -F matches the
  # soname literally, so its dots are not treated as regex wildcards.
  cache="$(ldconfig -p 2>/dev/null || true)"
  if grep -qF -- "$soname" <<<"$cache"; then
    echo "[OK] $soname already present"
    return 0
  fi

  # Pick the first bundled .deb matching the glob (no `ls` parsing).
  # $pattern is intentionally unquoted so the shell expands it.
  deb=""
  for candidate in "$pkg_dir"/$pattern; do
    if [[ -f "$candidate" ]]; then
      deb="$candidate"
      break
    fi
  done

  if [[ -n "$deb" ]]; then
    echo "[INFO] Installing local dependency: ${deb##*/}"
    dpkg -i "$deb" >/dev/null 2>&1 || true
  else
    echo "[WARN] Local deb for $soname not found (pattern=$pattern)"
  fi

  # Re-check after the local install; fall back to apt only if still missing.
  cache="$(ldconfig -p 2>/dev/null || true)"
  if ! grep -qF -- "$soname" <<<"$cache"; then
    echo "[WARN] $soname still missing after local install; attempting apt fallback"
    case "$soname" in
      libpq.so.5)       apt_pkg="libpq5" ;;
      libyaml-0.so.2)   apt_pkg="libyaml-0-2" ;;
      libsasl2.so.2)    apt_pkg="libsasl2-2" ;;
      libldap-2.5.so.0) apt_pkg="libldap-2.5-0" ;;
      *)                apt_pkg="" ;;
    esac
    if [[ -n "$apt_pkg" ]]; then
      apt-get update -qq || true
      apt-get install -y -qq "$apt_pkg" || true
    else
      echo "[WARN] No apt fallback known for $soname"
    fi
  fi

  # Refresh the linker cache so later checks see newly installed libraries.
  ldconfig 2>/dev/null || true
}
|
||||||
|
|
||||||
|
# Offline-first: satisfy runtime deps from local debs, fallback to apt only if necessary
|
||||||
|
ensure_lib "libpq.so.5" "libpq5_*_amd64.deb"
|
||||||
|
ensure_lib "libyaml-0.so.2" "libyaml-0-2_*_amd64.deb"
|
||||||
|
ensure_lib "libsasl2.so.2" "libsasl2-2_*_amd64.deb"
|
||||||
|
ensure_lib "libldap-2.5.so.0" "libldap-2.5-0_*_amd64.deb"
|
||||||
|
|
||||||
|
# Install fluent-bit main package from local bundle
|
||||||
|
FLB_DEB="$(ls /tmp/flb/packages/fluent-bit_*_amd64.deb 2>/dev/null | head -n1 || true)"
|
||||||
|
if [[ -z "$FLB_DEB" ]]; then
|
||||||
|
echo "[ERROR] fluent-bit deb not found under /private/packages" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "[INFO] Installing Fluent Bit: $(basename "$FLB_DEB")"
|
||||||
|
dpkg -i "$FLB_DEB" >/dev/null 2>&1 || true
|
||||||
|
|
||||||
|
# If dpkg reported unresolved dependencies, try apt -f only as last resort
|
||||||
|
if ! command -v /opt/fluent-bit/bin/fluent-bit >/dev/null 2>&1; then
|
||||||
|
echo "[WARN] fluent-bit binary missing after dpkg; attempting apt --fix-broken"
|
||||||
|
apt-get install -f -y -qq || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Ensure runtime library dependencies are satisfied (libsasl2, libldap are required via libpq/curl)
|
||||||
|
MISSING=$(ldd /opt/fluent-bit/bin/fluent-bit 2>/dev/null | awk '/not found/{print $1}' | xargs -r echo || true)
|
||||||
|
if [[ -n "$MISSING" ]]; then
|
||||||
|
echo "[WARN] missing shared libs: $MISSING"
|
||||||
|
apt-get update -qq || true
|
||||||
|
apt-get install -y -qq libsasl2-2 libldap-2.5-0 || true
|
||||||
|
apt-get install -f -y -qq || true
|
||||||
|
fi
|
||||||
|
|
||||||
# 验证 Fluent Bit 可以运行
|
|
||||||
echo "[INFO] Fluent Bit version:"
|
echo "[INFO] Fluent Bit version:"
|
||||||
/opt/fluent-bit/bin/fluent-bit --version
|
/opt/fluent-bit/bin/fluent-bit --version || { echo "[ERROR] fluent-bit not installed or libraries missing" >&2; exit 1; }
|
||||||
|
|
||||||
# 创建配置目录
|
# Place configuration
|
||||||
mkdir -p /etc/fluent-bit
|
mkdir -p /etc/fluent-bit
|
||||||
cp -r /tmp/etc/* /etc/fluent-bit/
|
cp -r /tmp/flb/etc/* /etc/fluent-bit/
|
||||||
|
|
||||||
# 创建日志和缓冲区目录
|
# Create logs/buffers dirs
|
||||||
mkdir -p /logs/train /logs/infer /buffers
|
mkdir -p /logs/train /logs/infer /buffers
|
||||||
chmod 755 /logs/train /logs/infer /buffers
|
chmod 755 /logs/train /logs/infer /buffers
|
||||||
|
|
||||||
# 等待 Elasticsearch 就绪
|
# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency
|
||||||
echo "[INFO] Waiting for Elasticsearch to be ready..."
|
echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..."
|
||||||
while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do
|
for i in $(seq 1 120); do
|
||||||
echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..."
|
if exec 3<>/dev/tcp/${ES_HOST}/${ES_PORT}; then
|
||||||
sleep 5
|
exec 3<&- 3>&-
|
||||||
done
|
|
||||||
echo "[INFO] Elasticsearch is ready"
|
echo "[INFO] Elasticsearch is ready"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
[[ $i -eq 120 ]] && { echo "[ERROR] ES not reachable" >&2; exit 1; }
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
|
||||||
# 启动 Fluent Bit
|
|
||||||
echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
|
echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
|
||||||
echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
|
echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
|
||||||
exec /opt/fluent-bit/bin/fluent-bit \
|
exec /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf
|
||||||
--config=/etc/fluent-bit/fluent-bit.conf
|
|
||||||
|
|||||||
@ -32,3 +32,42 @@ fi
|
|||||||
|
|
||||||
echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}"
|
echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}"
|
||||||
echo "[INFO] Fluent-bit files should be in fluent-bit/ directory"
|
echo "[INFO] Fluent-bit files should be in fluent-bit/ directory"
|
||||||
|
|
||||||
|
# 准备 Fluent Bit 离线依赖(从 metric all-in-one-full 复制 deb 到 ../fluent-bit/build/packages)
|
||||||
|
FLB_BUILD_PACKAGES_DIR="$root/../fluent-bit/build/packages"
|
||||||
|
mkdir -p "$FLB_BUILD_PACKAGES_DIR"
|
||||||
|
for deb in \
|
||||||
|
"$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \
|
||||||
|
"$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb ; do
|
||||||
|
if ls $deb >/dev/null 2>&1; then
|
||||||
|
for f in $deb; do
|
||||||
|
base="$(basename "$f")"
|
||||||
|
if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
|
||||||
|
cp "$f" "$FLB_BUILD_PACKAGES_DIR/"
|
||||||
|
echo " [+] copied $base"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# 额外:从 all-in-one-full 的 ubuntu22/curl.tar.gz 解包必要依赖(libsasl2/ldap),便于离线安装
|
||||||
|
CURLOPT_TAR="$project_root/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz"

# Copy selected runtime .deb packages out of a curl dependency tarball into a
# package directory so they can be installed offline.
#
# Arguments:
#   $1 - path to the curl.tar.gz archive (expected to contain a curl/ directory)
#   $2 - destination directory for the staged .deb files
# Outputs:
#   One " [+] staged <file>" line per file that was actually copied.
# Returns:
#   0 when the archive is absent or after processing; non-zero only if the
#   temporary directory cannot be created.
stage_curl_deps() {
  local tarball="$1" dest="$2"
  local workdir pattern candidate src base
  # Quoted so the patterns are expanded only against the extracted tree,
  # never against whatever happens to be in the current directory.
  local -a patterns=(
    'libsasl2-2_*_amd64.deb'
    'libsasl2-modules-db_*_amd64.deb'
    'libldap-2.5-0_*_amd64.deb'
    'libidn2-0_*_amd64.deb'
    'libbrotli1_*_amd64.deb'
    'libssl3_*_amd64.deb'
  )

  [[ -f "$tarball" ]] || return 0

  workdir="$(mktemp -d)" || return 1
  if tar -xzf "$tarball" -C "$workdir" 2>/dev/null; then
    for pattern in "${patterns[@]}"; do
      # First match of the glob inside the extracted curl/ directory.
      src=""
      for candidate in "$workdir"/curl/$pattern; do
        if [[ -f "$candidate" ]]; then
          src="$candidate"
          break
        fi
      done
      [[ -n "$src" ]] || continue

      base="${src##*/}"
      # Explicit if/else instead of `A || B && C`, which reported "staged"
      # even when the file already existed and nothing was copied.
      if [[ ! -f "$dest/$base" ]]; then
        if cp "$src" "$dest/"; then
          echo " [+] staged $base"
        fi
      fi
    done
  fi
  rm -rf "$workdir"
}

stage_curl_deps "$CURLOPT_TAR" "$FLB_BUILD_PACKAGES_DIR"
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
1.30.0
|
1.33.0
|
||||||
|
|||||||
@ -32,7 +32,7 @@
|
|||||||
|
|
||||||
- 一键执行
|
- 一键执行
|
||||||
- `cd src/sys/tests`
|
- `cd src/sys/tests`
|
||||||
- `./scripts/00_e2e_test.sh`
|
- `./scripts/00_e2e_test.sh`(CPU-only)或 `./scripts/00_e2e_test.sh --enable-gpu`(启用 GPU 流程)
|
||||||
|
|
||||||
- 分步执行(推荐用于排查)
|
- 分步执行(推荐用于排查)
|
||||||
- `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env`
|
- `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env`
|
||||||
@ -43,6 +43,11 @@
|
|||||||
- `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点
|
- `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点
|
||||||
- `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长
|
- `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长
|
||||||
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
|
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
|
||||||
|
- `./scripts/10_metric_publish.sh` 发布 metric 客户端包到 FTP
|
||||||
|
- `./scripts/11_metric_node_install.sh` 在 CPU 节点安装并验证端点
|
||||||
|
- `./scripts/12_metric_gpu_install.sh` 在 GPU 节点安装并等待 9100/9400 就绪(仅启用 GPU 时)
|
||||||
|
- `./scripts/13_metric_verify.sh` 对 master/Prometheus/数据面/Grafana 做综合校验(含 GPU 时校验 dcgm 指标)
|
||||||
|
- `./scripts/14_metric_cleanup.sh` 清理 FTP 产物
|
||||||
- `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/`
|
- `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/`
|
||||||
|
|
||||||
- 重置环境
|
- 重置环境
|
||||||
@ -107,6 +112,7 @@
|
|||||||
- 判定:
|
- 判定:
|
||||||
- `private/argus/etc/master.argus.com` 存在且为 master IP
|
- `private/argus/etc/master.argus.com` 存在且为 master IP
|
||||||
- 在 node-a/node-b 内 `getent hosts master.argus.com` 成功解析到 master IP
|
- 在 node-a/node-b 内 `getent hosts master.argus.com` 成功解析到 master IP
|
||||||
|
- 在 metric CPU/GPU 节点内可解析 `master.argus.com` 与 `prom.metric.argus.com`
|
||||||
|
|
||||||
- `05_agent_register.sh`
|
- `05_agent_register.sh`
|
||||||
- 目的:确认两个节点注册到 master 并持久化 `node.json`
|
- 目的:确认两个节点注册到 master 并持久化 `node.json`
|
||||||
@ -137,3 +143,16 @@
|
|||||||
---
|
---
|
||||||
|
|
||||||
如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。
|
如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 可选:GPU 流程说明
|
||||||
|
- 前置条件:宿主安装 NVIDIA 驱动与 `nvidia-container-toolkit`,`nvidia-smi` 在宿主可用。
|
||||||
|
- 启用方式:
|
||||||
|
- 一键:`./scripts/00_e2e_test.sh --enable-gpu`
|
||||||
|
- 分步:设置 `ARGUS_SYS_ENABLE_GPU=true` 后执行 `01_bootstrap.sh`、`02_up.sh`;或直接在 `.env` 中设置 `ENABLE_GPU=true` 后单独运行 `02_up.sh`。
|
||||||
|
- `01_bootstrap.sh` 会写入:
|
||||||
|
- `METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001`
|
||||||
|
- `METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100`
|
||||||
|
- `METRIC_TEST_DCGM_GPU=172.31.0.51:9400`
|
||||||
|
- 验证点:`04_verify_dns_routing.sh` 增加对 metric 节点的域名解析;`12_metric_gpu_install.sh` 等待 9100/9400;`13_metric_verify_*` 校验 dcgm 指标与 Grafana 面板。
|
||||||
|
|||||||
@ -104,6 +104,45 @@ ensure_image "argus-web-frontend:latest"
|
|||||||
ensure_image "argus-web-proxy:latest"
|
ensure_image "argus-web-proxy:latest"
|
||||||
ensure_image "argus-alertmanager:latest"
|
ensure_image "argus-alertmanager:latest"
|
||||||
|
|
||||||
|
echo "[INFO] Preparing Fluent Bit local dependency packages..."
|
||||||
|
FLB_BUILD_PACKAGES_DIR="$REPO_ROOT/src/log/fluent-bit/build/packages"
|
||||||
|
mkdir -p "$FLB_BUILD_PACKAGES_DIR"
|
||||||
|
for deb in \
|
||||||
|
"$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \
|
||||||
|
"$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb ; do
|
||||||
|
if ls $deb >/dev/null 2>&1; then
|
||||||
|
for f in $deb; do
|
||||||
|
base="$(basename "$f")"
|
||||||
|
if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
|
||||||
|
cp "$f" "$FLB_BUILD_PACKAGES_DIR/"
|
||||||
|
echo " [+] copied $base"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# 额外:从 all-in-one-full 的 ubuntu22/curl.tar.gz 解包必要依赖(libsasl2/ldap),便于离线安装
|
||||||
|
CURLOPT_TAR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz"

# Copy selected runtime .deb packages out of a curl dependency tarball into a
# package directory so they can be installed offline.
#
# Arguments:
#   $1 - path to the curl.tar.gz archive (expected to contain a curl/ directory)
#   $2 - destination directory for the staged .deb files
# Outputs:
#   One " [+] staged <file>" line per file that was actually copied.
# Returns:
#   0 when the archive is absent or after processing; non-zero only if the
#   temporary directory cannot be created.
stage_curl_deps() {
  local tarball="$1" dest="$2"
  local workdir pattern candidate src base
  # Quoted so the patterns are expanded only against the extracted tree,
  # never against whatever happens to be in the current directory.
  local -a patterns=(
    'libsasl2-2_*_amd64.deb'
    'libsasl2-modules-db_*_amd64.deb'
    'libldap-2.5-0_*_amd64.deb'
    'libidn2-0_*_amd64.deb'
    'libbrotli1_*_amd64.deb'
    'libssl3_*_amd64.deb'
  )

  [[ -f "$tarball" ]] || return 0

  workdir="$(mktemp -d)" || return 1
  if tar -xzf "$tarball" -C "$workdir" 2>/dev/null; then
    for pattern in "${patterns[@]}"; do
      # First match of the glob inside the extracted curl/ directory.
      src=""
      for candidate in "$workdir"/curl/$pattern; do
        if [[ -f "$candidate" ]]; then
          src="$candidate"
          break
        fi
      done
      [[ -n "$src" ]] || continue

      base="${src##*/}"
      # Explicit if/else instead of `A || B && C`, which reported "staged"
      # even when the file already existed and nothing was copied.
      if [[ ! -f "$dest/$base" ]]; then
        if cp "$src" "$dest/"; then
          echo " [+] staged $base"
        fi
      fi
    done
  fi
  rm -rf "$workdir"
}

stage_curl_deps "$CURLOPT_TAR" "$FLB_BUILD_PACKAGES_DIR"
|
||||||
|
|
||||||
echo "[INFO] Building agent binary..."
|
echo "[INFO] Building agent binary..."
|
||||||
pushd "$REPO_ROOT/src/agent" >/dev/null
|
pushd "$REPO_ROOT/src/agent" >/dev/null
|
||||||
./scripts/build_binary.sh
|
./scripts/build_binary.sh
|
||||||
@ -217,6 +256,13 @@ ARGUS_BUILD_GID=$ARGUS_BUILD_GID
|
|||||||
# GPU 配置
|
# GPU 配置
|
||||||
ENABLE_GPU=$GPU_AVAILABLE
|
ENABLE_GPU=$GPU_AVAILABLE
|
||||||
|
|
||||||
|
# 测试节点(CPU/GPU)默认标识与实例
|
||||||
|
METRIC_TEST_HOSTNAME_CPU=test-metric-node-001
|
||||||
|
METRIC_TEST_INSTANCE_CPU=172.31.0.50:9100
|
||||||
|
METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001
|
||||||
|
METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100
|
||||||
|
METRIC_TEST_DCGM_GPU=172.31.0.51:9400
|
||||||
|
|
||||||
# Master/日志/监控等服务的宿主机端口(自动分配)
|
# Master/日志/监控等服务的宿主机端口(自动分配)
|
||||||
MASTER_PORT=$MASTER_PORT_VAL
|
MASTER_PORT=$MASTER_PORT_VAL
|
||||||
ES_HTTP_PORT=$ES_HTTP_PORT_VAL
|
ES_HTTP_PORT=$ES_HTTP_PORT_VAL
|
||||||
|
|||||||
@ -20,7 +20,15 @@ if [[ -f "$TEST_ROOT/.env" ]]; then
|
|||||||
set -a; source "$TEST_ROOT/.env"; set +a
|
set -a; source "$TEST_ROOT/.env"; set +a
|
||||||
fi
|
fi
|
||||||
|
|
||||||
REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
|
# GPU 开关优先级:显式环境变量 > .env 中的 ENABLE_GPU > 默认 false
|
||||||
|
if [[ "${ARGUS_SYS_ENABLE_GPU:-}" == "true" ]]; then
|
||||||
|
REQUEST_GPU=true
|
||||||
|
elif [[ "${ARGUS_SYS_ENABLE_GPU:-}" == "false" ]]; then
|
||||||
|
REQUEST_GPU=false
|
||||||
|
else
|
||||||
|
REQUEST_GPU=${ENABLE_GPU:-false}
|
||||||
|
fi
|
||||||
|
|
||||||
GPU_AVAILABLE=false
|
GPU_AVAILABLE=false
|
||||||
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
|
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
|
||||||
|
|
||||||
|
|||||||
@ -34,6 +34,25 @@ else
|
|||||||
echo "[WARN] bind container not found; skip dig"
|
echo "[WARN] bind container not found; skip dig"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Verify that a container can resolve each of the given domain names.
#
# Arguments:
#   $1   - container name, looked up through cid_by_name
#   $2.. - domain names to resolve inside the container with `getent hosts`
# Outputs:
#   Progress and result lines on stdout; the failing domain on stderr.
# Returns:
#   0 if the container is not found (check skipped) or every domain resolves;
#   1 as soon as one domain fails to resolve.
check_inside() {
  local cname="$1"; shift
  # Locals only: the surrounding script keeps its own CID variable, which the
  # previous implementation overwrote.
  local cid domain answer ip

  cid="$(cid_by_name "$cname" || true)"
  if [[ -z "$cid" ]]; then
    echo "[WARN] container $cname not found; skip"
    return 0
  fi

  for domain in "$@"; do
    echo "[INFO] Checking resolution inside $cname for $domain..."
    # One docker exec per domain: its exit status decides success and its
    # output supplies the address that is reported.
    if ! answer="$(docker exec "$cid" getent hosts "$domain" 2>/dev/null)"; then
      echo "[ERR] $cname cannot resolve $domain" >&2
      return 1
    fi
    # First field of the first output line is the resolved address.
    ip=""
    read -r ip _ <<<"$answer" || true
    echo "[OK] $cname resolved $domain -> $ip"
  done
  return 0
}
|
||||||
|
|
||||||
for node in argus-node-a argus-node-b; do
|
for node in argus-node-a argus-node-b; do
|
||||||
CID="$(cid_by_name "$node")"
|
CID="$(cid_by_name "$node")"
|
||||||
echo "[INFO] Checking resolution inside $node..."
|
echo "[INFO] Checking resolution inside $node..."
|
||||||
@ -47,4 +66,8 @@ done
|
|||||||
|
|
||||||
popd >/dev/null
|
popd >/dev/null
|
||||||
|
|
||||||
|
# 追加:在 metric 节点中验证 master 与 prom 域名解析
|
||||||
|
check_inside argus-metric-test-node master.argus.com prom.metric.argus.com || exit 1
|
||||||
|
check_inside argus-metric-test-gpu-node master.argus.com prom.metric.argus.com || exit 1
|
||||||
|
|
||||||
echo "[OK] DNS routing verified"
|
echo "[OK] DNS routing verified"
|
||||||
|
|||||||
@ -62,3 +62,21 @@ echo "[SYS-METRIC] GPU 节点安装完成"
|
|||||||
'
|
'
|
||||||
|
|
||||||
echo "[SYS-METRIC] Metric GPU 节点安装流程完成"
|
echo "[SYS-METRIC] Metric GPU 节点安装流程完成"
|
||||||
|
|
||||||
|
# 就绪性检测:9400(dcgm) 与 9100(node) 端口
|
||||||
|
echo "[SYS-METRIC] 等待 dcgm-exporter(9400) 与 node-exporter(9100) 就绪..."
|
||||||
|
retries=30
|
||||||
|
until docker exec "$CONTAINER" bash -lc "curl -fsS --max-time 2 http://localhost:9400/metrics >/dev/null"; do
|
||||||
|
((retries--)) || { echo "[ERR] dcgm-exporter 9400 未就绪" >&2; exit 1; }
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
echo "[OK] dcgm-exporter 端点可访问"
|
||||||
|
|
||||||
|
retries=30
|
||||||
|
until docker exec "$CONTAINER" bash -lc "curl -fsS --max-time 2 http://localhost:9100/metrics >/dev/null"; do
|
||||||
|
((retries--)) || { echo "[ERR] node-exporter 9100 未就绪" >&2; exit 1; }
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
echo "[OK] node-exporter 端点可访问"
|
||||||
|
|
||||||
|
mkdir -p "$TEST_ROOT/tmp" && touch "$TEST_ROOT/tmp/gpu_install_ready"
|
||||||
|
|||||||
@ -51,3 +51,16 @@ PY
|
|||||||
awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; }
|
awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; }
|
||||||
echo "[OK] sample timestamp advanced"
|
echo "[OK] sample timestamp advanced"
|
||||||
echo "[DONE] dataplane verify"
|
echo "[DONE] dataplane verify"
|
||||||
|
|
||||||
|
# 追加:GPU 节点端点连通性检查(启用 GPU 时)
|
||||||
|
if [[ "${ENABLE_GPU:-false}" == "true" ]]; then
|
||||||
|
echo
|
||||||
|
echo "[VERIFY:DATA][GPU] curl endpoints on gpu node"
|
||||||
|
if ! docker exec argus-metric-test-gpu-node bash -lc 'curl -fsS --max-time 5 http://localhost:9100/metrics >/dev/null'; then
|
||||||
|
echo "[ERR] gpu node 9100 not reachable" >&2; exit 1
|
||||||
|
fi
|
||||||
|
if ! docker exec argus-metric-test-gpu-node bash -lc 'curl -fsS --max-time 5 http://localhost:9400/metrics >/dev/null'; then
|
||||||
|
echo "[ERR] gpu node 9400 not reachable" >&2; exit 1
|
||||||
|
fi
|
||||||
|
echo "[OK] gpu node endpoints reachable"
|
||||||
|
fi
|
||||||
|
|||||||
@ -71,4 +71,17 @@ print('OK',val)
|
|||||||
PY
|
PY
|
||||||
echo "[OK] cluster node online count >= 1 via Grafana proxy"
|
echo "[OK] cluster node online count >= 1 via Grafana proxy"
|
||||||
|
|
||||||
|
if [[ -f "$TEST_ROOT/.env" ]]; then
|
||||||
|
set -a; source "$TEST_ROOT/.env"; set +a
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 可选:GPU 面板查询(当启用 GPU 时)
|
||||||
|
if [[ "${ENABLE_GPU:-false}" == "true" ]]; then
|
||||||
|
echo "[VERIFY:GRAF-PANELS] GPU Panels — DCGM GPU UTIL"
|
||||||
|
Q_GPU_UTIL='DCGM_FI_DEV_GPU_UTIL'
|
||||||
|
proxy_query "$Q_GPU_UTIL" "$TMP_DIR/graf_panel_dcgm_util.json"
|
||||||
|
assert_vector_recent_nonempty "$TMP_DIR/graf_panel_dcgm_util.json" 300 >/dev/null || { echo "[ERR] dcgm gpu util no recent sample via Grafana proxy" >&2; exit 1; }
|
||||||
|
echo "[OK] dcgm gpu util has recent samples via Grafana proxy"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "[DONE] grafana panels verify"
|
echo "[DONE] grafana panels verify"
|
||||||
|
|||||||
@ -12,7 +12,7 @@ if [[ -f "$TEST_ROOT/.env" ]]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1"
|
PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1"
|
||||||
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"
|
HOSTNAME="${METRIC_TEST_HOSTNAME:-${METRIC_TEST_HOSTNAME_CPU:-test-metric-node-001}}"
|
||||||
|
|
||||||
nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
|
nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
|
||||||
targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json"
|
targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json"
|
||||||
@ -145,3 +145,54 @@ assert val>=1, f"count < 1: {val}"
|
|||||||
PY
|
PY
|
||||||
echo "[OK] up count satisfied"
|
echo "[OK] up count satisfied"
|
||||||
echo "[DONE] prometheus verify"
|
echo "[DONE] prometheus verify"
|
||||||
|
|
||||||
|
# ========== GPU 验证(可选) ==========
|
||||||
|
if [[ "${ENABLE_GPU:-false}" == "true" ]]; then
|
||||||
|
echo
|
||||||
|
echo "[VERIFY:PROM][GPU] dcgm targets & up metric"
|
||||||
|
GPU_IP_PORT="${METRIC_TEST_DCGM_GPU:-172.31.0.51:9400}"
|
||||||
|
GPU_IP="${GPU_IP_PORT%%:*}"
|
||||||
|
|
||||||
|
# 1) file_sd 目标存在(在 Prometheus 容器内生成的 targets 文件)
|
||||||
|
TARGETS_FILE="$TEST_ROOT/private/argus/metric/prometheus/targets/dcgm_exporter.json"
|
||||||
|
if [[ ! -f "$TARGETS_FILE" ]]; then
|
||||||
|
echo "[ERR] $TARGETS_FILE missing" >&2; exit 1
|
||||||
|
fi
|
||||||
|
if ! jq -r '.[].targets[]' "$TARGETS_FILE" 2>/dev/null | grep -q "^${GPU_IP}:9400$"; then
|
||||||
|
echo "[ERR] dcgm target not found for ${GPU_IP}:9400" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "[OK] dcgm target present in file_sd"
|
||||||
|
|
||||||
|
# 2) up{job="dcgm", ip=GPU_IP} == 1
|
||||||
|
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"dcgm\",ip=\"$GPU_IP\"}==1" > "$TMP_DIR/prom_dcgm_up.json"
|
||||||
|
python3 - "$TMP_DIR/prom_dcgm_up.json" <<'PY'
|
||||||
|
import json,sys
|
||||||
|
j=json.load(open(sys.argv[1]))
|
||||||
|
res=j.get('data',{}).get('result',[])
|
||||||
|
assert res, 'up==1 empty for dcgm'
|
||||||
|
val=float(res[0]['value'][1])
|
||||||
|
assert val==1.0, f'up not 1: {val}'
|
||||||
|
print('OK')
|
||||||
|
PY
|
||||||
|
echo "[OK] up{job=dcgm,ip=$GPU_IP} == 1"
|
||||||
|
|
||||||
|
# 3) 至少一个 GPU 指标存在(优先 DCGM_FI_DEV_GPU_UTIL,若无则尝试 DCGM_FI_DEV_FB_USED)
|
||||||
|
# Run one instant query against Prometheus and report whether it produced data.
#
# Arguments:
#   $1 - PromQL expression
#   $2 - file that receives the raw JSON response
# Globals:
#   PROM_BASE (read) - base URL of the Prometheus HTTP API
# Returns:
#   0 if the response has status "success" and a non-empty result list;
#   1 if the request fails, the body is not valid JSON, or the result is empty.
query_one() {
  local q="$1"
  local out="$2"

  # Callers use this inside `if a || b`, where `set -e` is inactive, so a
  # failed request must be handled here instead of parsing an empty file.
  if ! curl -fsS --max-time 5 --get "$PROM_BASE/query" \
      --data-urlencode "query=$q" > "$out"; then
    return 1
  fi

  python3 - "$out" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        payload = json.load(fh)
    results = payload.get('data', {}).get('result', [])
    ok = payload.get('status') == 'success' and len(results) > 0
except (OSError, ValueError, AttributeError, TypeError):
    # Unreadable or malformed response counts as "no data", not a traceback.
    ok = False
raise SystemExit(0 if ok else 1)
PY
}
|
||||||
|
if query_one 'DCGM_FI_DEV_GPU_UTIL' "$TMP_DIR/prom_dcgm_util.json" || query_one 'DCGM_FI_DEV_FB_USED' "$TMP_DIR/prom_dcgm_fb.json"; then
|
||||||
|
echo "[OK] dcgm metrics present"
|
||||||
|
else
|
||||||
|
echo "[ERR] no dcgm metrics found" >&2; exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[DONE] prometheus gpu verify"
|
||||||
|
fi
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user