feat: metric e2e 启动服务中判断是否当前服务器存在可用GPU,如无则跳过 test-gpu-node 容器创建和安装包的测试;
refs #29
This commit is contained in:
parent
299765ed40
commit
c4582c99bc
@ -1,6 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
COMMON_DIR="$SCRIPT_DIR/common"
|
||||
|
||||
FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
|
||||
FTP_USER="${FTP_USER:-ftpuser}"
|
||||
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
|
||||
@ -8,26 +11,37 @@ FTP_PORT="${FTP_PORT:-21}"
|
||||
|
||||
FTP_HOST="${FTP_SERVER}"
|
||||
|
||||
echo "[03] 进入测试节点执行安装..."
|
||||
echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
|
||||
echo "[04] 检测GPU环境..."
|
||||
# 检测GPU环境
|
||||
if bash "$COMMON_DIR/check-gpu.sh"; then
|
||||
echo "[04] GPU环境可用,继续执行GPU节点安装"
|
||||
GPU_AVAILABLE=true
|
||||
else
|
||||
echo "[04] GPU环境不可用,跳过GPU节点安装"
|
||||
GPU_AVAILABLE=false
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "[04] 进入测试节点执行安装..."
|
||||
echo "[04] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
|
||||
|
||||
docker exec argus-metric-test-gpu-node bash -c "
|
||||
set -e
|
||||
|
||||
if ! command -v curl &>/dev/null; then
|
||||
echo '[03] curl 未安装,正在安装...'
|
||||
echo '[04] curl 未安装,正在安装...'
|
||||
apt-get update && apt-get install -y curl
|
||||
fi
|
||||
|
||||
cd /tmp
|
||||
echo '[03] 下载 setup.sh...'
|
||||
echo '[04] 下载 setup.sh...'
|
||||
curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
|
||||
|
||||
echo '[03] 执行安装...'
|
||||
echo '[04] 执行安装...'
|
||||
chmod +x setup.sh
|
||||
bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
|
||||
|
||||
echo '[03] 安装完成'
|
||||
echo '[04] 安装完成'
|
||||
"
|
||||
|
||||
echo "[03] 完成"
|
||||
echo "[04] 完成"
|
||||
|
||||
59
src/metric/tests/scripts/common/check-gpu.sh
Executable file
59
src/metric/tests/scripts/common/check-gpu.sh
Executable file
@ -0,0 +1,59 @@
|
||||
#!/bin/bash
|
||||
|
||||
# GPU环境检测脚本
|
||||
# 检测系统是否有NVIDIA GPU硬件
|
||||
|
||||
set -e
|
||||
|
||||
# Detect whether this host has an NVIDIA GPU.
#
# Probes run in order and the first hit wins:
#   1. NVIDIA device nodes matching /dev/nvidia*
#   2. an NVIDIA display-class PCI device listed by lspci
#   3. a successful nvidia-smi run
# Probes 2 and 3 are skipped when the tool is not installed.
#
# Globals:   none
# Arguments: none
# Outputs:   progress and result messages on stdout
# Returns:   0 if a GPU was found, 1 otherwise
check_gpu_support() {
    echo "检测GPU环境..."

    # Probe 1: device nodes. compgen -G tests the glob inside the shell, so
    # there is no ls output to parse and no extra process to spawn.
    if compgen -G '/dev/nvidia*' > /dev/null; then
        echo "✓ 检测到NVIDIA GPU设备文件"
        return 0
    fi

    # Probe 2: PCI bus (Linux). Only display-class entries count, so other
    # NVIDIA PCI functions (e.g. a chipset Ethernet or audio controller) are
    # not mistaken for a GPU. grep consumes all of lspci's output rather than
    # using -q, so lspci never writes into a closed pipe.
    if command -v lspci &> /dev/null; then
        if lspci | grep -Ei '(VGA compatible|3D|Display) controller.*nvidia' &> /dev/null; then
            echo "✓ 检测到NVIDIA GPU硬件"
            return 0
        fi
    fi

    # Probe 3: nvidia-smi exits non-zero when it cannot reach a GPU/driver.
    if command -v nvidia-smi &> /dev/null; then
        if nvidia-smi &> /dev/null; then
            echo "✓ 检测到NVIDIA GPU硬件"
            return 0
        fi
    fi

    echo "✗ 未检测到NVIDIA GPU硬件"
    return 1
}
|
||||
|
||||
# Entry point: print a banner, run the GPU probe and exit with its verdict.
#
# Arguments: none used
# Outputs:   banner, probe messages and a one-line result on stdout
# Exits:     0 when check_gpu_support succeeds, 1 otherwise
main() {
    local rule="=========================================="
    local rc=0

    printf '%s\n' "$rule" " GPU环境检测" "$rule" ""

    # Collapse any probe failure to 1; the caller only tests zero/non-zero.
    check_gpu_support || rc=1

    echo ""
    if [ "$rc" -eq 0 ]; then
        echo "结果: GPU环境可用"
    else
        echo "结果: GPU环境不可用,将跳过GPU相关服务"
    fi
    exit "$rc"
}
|
||||
|
||||
# Run main only when this file is executed directly, not when it is sourced.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
    main "$@"
fi
|
||||
@ -59,18 +59,40 @@ echo "1. 初始化目录结构..."
|
||||
bash "$SCRIPT_DIR/init-directories.sh"
|
||||
|
||||
echo ""
|
||||
echo "2. 检查 Docker 镜像..."
|
||||
echo "2. 检测GPU环境..."
|
||||
# 检测GPU环境
|
||||
if bash "$SCRIPT_DIR/check-gpu.sh"; then
|
||||
echo "GPU环境可用,将启动GPU节点"
|
||||
GPU_AVAILABLE=true
|
||||
else
|
||||
echo "GPU环境不可用,跳过GPU节点"
|
||||
GPU_AVAILABLE=false
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "3. 检查 Docker 镜像..."
|
||||
|
||||
# 检查必要的镜像是否存在
|
||||
IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest")
|
||||
missing_images=()
|
||||
BASE_IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest")
|
||||
GPU_IMAGES=("argus-metric-test-gpu-node:latest")
|
||||
|
||||
for image in "${IMAGES[@]}"; do
|
||||
# 先检查基础镜像
|
||||
missing_images=()
|
||||
for image in "${BASE_IMAGES[@]}"; do
|
||||
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
|
||||
missing_images+=("$image")
|
||||
fi
|
||||
done
|
||||
|
||||
# 检查GPU镜像(如果GPU环境可用)
|
||||
if [ "$GPU_AVAILABLE" = true ]; then
|
||||
for image in "${GPU_IMAGES[@]}"; do
|
||||
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
|
||||
missing_images+=("$image")
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${#missing_images[@]} -gt 0 ]; then
|
||||
echo "以下镜像缺失,请先运行 build/build_images.sh 构建镜像:"
|
||||
for image in "${missing_images[@]}"; do
|
||||
@ -85,10 +107,17 @@ else
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "3. 启动基础服务..."
|
||||
echo "4. 启动基础服务..."
|
||||
cd "$TEST_DIR"
|
||||
# 启动除GPU节点外的所有服务
|
||||
|
||||
# 根据GPU环境决定启动的服务
|
||||
if [ "$GPU_AVAILABLE" = true ]; then
|
||||
echo "启动所有服务(包括GPU节点)..."
|
||||
docker compose up -d ftp prometheus grafana test-node test-gpu-node
|
||||
else
|
||||
echo "启动基础服务(跳过GPU节点)..."
|
||||
docker compose up -d ftp prometheus grafana test-node
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "4. 等待服务启动..."
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user