From c4582c99bcc5b3a3ad49cfb654f708e0bb1afe3a Mon Sep 17 00:00:00 2001
From: "sundapeng.sdp"
Date: Mon, 20 Oct 2025 15:30:07 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20metric=20e2e=20=E5=90=AF=E5=8A=A8?=
 =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E4=B8=AD=E5=88=A4=E6=96=AD=E6=98=AF=E5=90=A6?=
 =?UTF-8?q?=E5=BD=93=E5=89=8D=E6=9C=8D=E5=8A=A1=E5=99=A8=E5=AD=98=E5=9C=A8?=
 =?UTF-8?q?=E5=8F=AF=E7=94=A8GPU=EF=BC=8C=E5=A6=82=E6=97=A0=E5=88=99?=
 =?UTF-8?q?=E8=B7=B3=E8=BF=87=20test-gpu-node=20=E5=AE=B9=E5=99=A8?=
 =?UTF-8?q?=E5=88=9B=E5=BB=BA=E5=92=8C=E5=AE=89=E8=A3=85=E5=8C=85=E7=9A=84?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95=EF=BC=9B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

refs #29
---
Reviewer notes (this section is not part of the commit message):
* The line structure of this patch was rebuilt from a whitespace-collapsed
  copy. Indentation width and the position of blank context lines are a
  best-effort reconstruction; check them against the target files before
  applying.
* The "index" lines were dropped because the post-image blob ids of the
  original patch no longer describe the edited content.
* start-all.sh: GPU detection is an unnumbered pre-check, so the existing
  step numbers (2, 3, 4, ...) stay unique and unchanged.

 .../tests/scripts/04_test_gpu_node_install.sh | 26 ++++++---
 src/metric/tests/scripts/common/check-gpu.sh  | 65 +++++++++++++++++++
 src/metric/tests/scripts/common/start-all.sh  | 32 +++++++--
 3 files changed, 113 insertions(+), 10 deletions(-)
 create mode 100755 src/metric/tests/scripts/common/check-gpu.sh

diff --git a/src/metric/tests/scripts/04_test_gpu_node_install.sh b/src/metric/tests/scripts/04_test_gpu_node_install.sh
--- a/src/metric/tests/scripts/04_test_gpu_node_install.sh
+++ b/src/metric/tests/scripts/04_test_gpu_node_install.sh
@@ -1,6 +1,9 @@
 #!/bin/bash
 set -e
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+COMMON_DIR="$SCRIPT_DIR/common"
+
 FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
 FTP_USER="${FTP_USER:-ftpuser}"
 FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
@@ -8,26 +11,35 @@ FTP_PORT="${FTP_PORT:-21}"
 
 FTP_HOST="${FTP_SERVER}"
 
-echo "[03] 进入测试节点执行安装..."
-echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
+echo "[04] 检测GPU环境..."
+# start-all.sh does not start the test-gpu-node container when check-gpu.sh
+# fails, so a missing GPU is reported as a successful skip (exit 0).
+if ! bash "$COMMON_DIR/check-gpu.sh"; then
+    echo "[04] GPU环境不可用,跳过GPU节点安装"
+    exit 0
+fi
+echo "[04] GPU环境可用,继续执行GPU节点安装"
+
+echo "[04] 进入测试节点执行安装..."
+echo "[04] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
 
 docker exec argus-metric-test-gpu-node bash -c "
 set -e
 
 if ! command -v curl &>/dev/null; then
-    echo '[03] curl 未安装,正在安装...'
+    echo '[04] curl 未安装,正在安装...'
     apt-get update && apt-get install -y curl
 fi
 
 cd /tmp
-echo '[03] 下载 setup.sh...'
+echo '[04] 下载 setup.sh...'
 curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
 
-echo '[03] 执行安装...'
+echo '[04] 执行安装...'
 chmod +x setup.sh
 bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
 
-echo '[03] 安装完成'
+echo '[04] 安装完成'
 "
 
-echo "[03] 完成"
+echo "[04] 完成"
diff --git a/src/metric/tests/scripts/common/check-gpu.sh b/src/metric/tests/scripts/common/check-gpu.sh
new file mode 100755
--- /dev/null
+++ b/src/metric/tests/scripts/common/check-gpu.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# GPU environment probe.
+# Exits 0 when an NVIDIA GPU is detected on this host and 1 otherwise, so
+# callers can gate GPU-only steps with: if bash check-gpu.sh; then ...
+
+set -e
+
+# Report whether an NVIDIA GPU is present.
+# Tries three probes in order and stops at the first hit.
+# Outputs: progress and result messages on stdout.
+# Returns: 0 if any probe finds a GPU, 1 if none does.
+check_gpu_support() {
+    echo "检测GPU环境..."
+
+    # Probe 1: NVIDIA device nodes. compgen -G succeeds only when the glob
+    # matches at least one existing path.
+    if compgen -G '/dev/nvidia*' >/dev/null; then
+        echo "✓ 检测到NVIDIA GPU设备文件"
+        return 0
+    fi
+
+    # Probe 2: an NVIDIA entry in the lspci listing (skipped when lspci is
+    # not installed).
+    if command -v lspci &>/dev/null; then
+        if lspci | grep -qi nvidia; then
+            echo "✓ 检测到NVIDIA GPU硬件"
+            return 0
+        fi
+    fi
+
+    # Probe 3: nvidia-smi is installed and exits successfully.
+    if command -v nvidia-smi &>/dev/null; then
+        if nvidia-smi &>/dev/null; then
+            echo "✓ 检测到NVIDIA GPU硬件"
+            return 0
+        fi
+    fi
+
+    echo "✗ 未检测到NVIDIA GPU硬件"
+    return 1
+}
+
+# Print a banner, run the probe and exit with its verdict (0 = GPU, 1 = none).
+main() {
+    echo "=========================================="
+    echo " GPU环境检测"
+    echo "=========================================="
+    echo ""
+
+    if check_gpu_support; then
+        echo ""
+        echo "结果: GPU环境可用"
+        exit 0
+    else
+        echo ""
+        echo "结果: GPU环境不可用,将跳过GPU相关服务"
+        exit 1
+    fi
+}
+
+# Run main only when this file is executed directly, not when it is sourced.
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+    main "$@"
+fi
diff --git a/src/metric/tests/scripts/common/start-all.sh b/src/metric/tests/scripts/common/start-all.sh
--- a/src/metric/tests/scripts/common/start-all.sh
+++ b/src/metric/tests/scripts/common/start-all.sh
@@ -59,10 +59,27 @@ echo "1. 初始化目录结构..."
 bash "$SCRIPT_DIR/init-directories.sh"
 
 echo ""
+# Probe the host for an NVIDIA GPU. This is an unnumbered pre-check so the
+# numbered steps below keep their numbers. GPU_AVAILABLE gates the GPU node
+# image check and the GPU node service start further down.
+echo "检测GPU环境..."
+if bash "$SCRIPT_DIR/check-gpu.sh"; then
+    echo "GPU环境可用,将启动GPU节点"
+    GPU_AVAILABLE=true
+else
+    echo "GPU环境不可用,跳过GPU节点"
+    GPU_AVAILABLE=false
+fi
+
+echo ""
 echo "2. 检查 Docker 镜像..."
 
 # 检查必要的镜像是否存在
-IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest")
+IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest")
+# The GPU node image is required only when the GPU node will be started.
+if [ "$GPU_AVAILABLE" = true ]; then
+    IMAGES+=("argus-metric-test-gpu-node:latest")
+fi
 missing_images=()
 
 for image in "${IMAGES[@]}"; do
@@ -87,8 +104,17 @@
 echo ""
 echo "3. 启动基础服务..."
 cd "$TEST_DIR"
-# 启动除GPU节点外的所有服务
-docker compose up -d ftp prometheus grafana test-node test-gpu-node
+
+# The GPU node joins the compose service list only when a GPU was detected
+# earlier in this script.
+compose_services=(ftp prometheus grafana test-node)
+if [ "$GPU_AVAILABLE" = true ]; then
+    echo "启动所有服务(包括GPU节点)..."
+    compose_services+=(test-gpu-node)
+else
+    echo "启动基础服务(跳过GPU节点)..."
+fi
+docker compose up -d "${compose_services[@]}"
 
 echo ""
 echo "4. 等待服务启动..."