From 7d1730f348abdec2804ef5582530db93e597662f Mon Sep 17 00:00:00 2001 From: Rob Armstrong Date: Wed, 2 Apr 2025 11:19:48 -0700 Subject: [PATCH] Remove outdated bandwidthTest sample --- CHANGELOG.md | 6 + .../.vscode/c_cpp_properties.json | 18 - .../bandwidthTest/.vscode/extensions.json | 7 - .../bandwidthTest/.vscode/launch.json | 10 - .../bandwidthTest/.vscode/tasks.json | 15 - .../1_Utilities/bandwidthTest/CMakeLists.txt | 30 - Samples/1_Utilities/bandwidthTest/README.md | 32 - .../bandwidthTest/bandwidthTest.cu | 1049 ----------------- 8 files changed, 6 insertions(+), 1161 deletions(-) delete mode 100644 Samples/1_Utilities/bandwidthTest/.vscode/c_cpp_properties.json delete mode 100644 Samples/1_Utilities/bandwidthTest/.vscode/extensions.json delete mode 100644 Samples/1_Utilities/bandwidthTest/.vscode/launch.json delete mode 100644 Samples/1_Utilities/bandwidthTest/.vscode/tasks.json delete mode 100644 Samples/1_Utilities/bandwidthTest/CMakeLists.txt delete mode 100644 Samples/1_Utilities/bandwidthTest/README.md delete mode 100644 Samples/1_Utilities/bandwidthTest/bandwidthTest.cu diff --git a/CHANGELOG.md b/CHANGELOG.md index 93d1500b..12e424fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ### CUDA 12.9 * Updated toolchain for cross-compilation for Tegra Linux platforms. +* Repository has been updated with consistent code formatting across all samples +* Many small code tweaks and bug fixes (see commit history for details) +* Removed the following outdated samples: + * `1_Utilities` + * `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth + testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth) ### CUDA 12.8 * Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles. diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/c_cpp_properties.json b/Samples/1_Utilities/bandwidthTest/.vscode/c_cpp_properties.json deleted file mode 100644 index f0066b0f..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "configurations": [ - { - "name": "Linux", - "includePath": [ - "${workspaceFolder}/**", - "${workspaceFolder}/../../../Common" - ], - "defines": [], - "compilerPath": "/usr/local/cuda/bin/nvcc", - "cStandard": "gnu17", - "cppStandard": "gnu++14", - "intelliSenseMode": "linux-gcc-x64", - "configurationProvider": "ms-vscode.makefile-tools" - } - ], - "version": 4 -} diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/extensions.json b/Samples/1_Utilities/bandwidthTest/.vscode/extensions.json deleted file mode 100644 index c7eb54dc..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/extensions.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "recommendations": [ - "nvidia.nsight-vscode-edition", - "ms-vscode.cpptools", - "ms-vscode.makefile-tools" - ] -} diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/launch.json b/Samples/1_Utilities/bandwidthTest/.vscode/launch.json deleted file mode 100644 index 81bb9bf1..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/launch.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "configurations": [ - { - "name": "CUDA C++: Launch", - "type": "cuda-gdb", - "request": "launch", - "program": "${workspaceFolder}/bandwidthTest" - } - ] -} diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/tasks.json b/Samples/1_Utilities/bandwidthTest/.vscode/tasks.json deleted file mode 100644 index 4509aeb1..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/tasks.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "version": "2.0.0", - "tasks": [ - { - "label": "sample", - "type": "shell", - "command": "make dbg=1", - "problemMatcher": ["$nvcc"], - "group": { - "kind": "build", - "isDefault": true - } - } - ] -} diff --git a/Samples/1_Utilities/bandwidthTest/CMakeLists.txt b/Samples/1_Utilities/bandwidthTest/CMakeLists.txt deleted file mode 100644 index 32c3bff1..00000000 --- a/Samples/1_Utilities/bandwidthTest/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -cmake_minimum_required(VERSION 3.20) - -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules") - -project(bandwidthTest LANGUAGES C CXX CUDA) - -find_package(CUDAToolkit REQUIRED) - -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -if(ENABLE_CUDA_DEBUG) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets) -else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option) -endif() - -# Include directories and libraries -include_directories(../../../Common) - -# Source file -# Add target for bandwidthTest -add_executable(bandwidthTest bandwidthTest.cu) - -target_compile_options(bandwidthTest PRIVATE $<$:--extended-lambda>) - -target_compile_features(bandwidthTest PRIVATE cxx_std_17 cuda_std_17) - -set_target_properties(bandwidthTest PROPERTIES CUDA_SEPARABLE_COMPILATION ON) diff --git a/Samples/1_Utilities/bandwidthTest/README.md b/Samples/1_Utilities/bandwidthTest/README.md deleted file mode 100644 index 0cfbd154..00000000 --- a/Samples/1_Utilities/bandwidthTest/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# bandwidthTest - Bandwidth Test - -## Description - -This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory. - -## Key Concepts - -CUDA Streams and Events, Performance Strategies - -## Supported SM Architectures - -[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) - -## Supported OSes - -Linux, Windows - -## Supported CPU Architecture - -x86_64, armv7l - -## CUDA APIs involved - -### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) -cudaHostAlloc, cudaMemcpy, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaGetErrorString, cudaMallocHost, cudaSetDevice, cudaGetDeviceProperties, cudaDeviceSynchronize, cudaEventRecord, cudaFreeHost, cudaEventDestroy, cudaEventElapsedTime, cudaGetDeviceCount, cudaEventCreate - -## Prerequisites - -Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. - -## References (for more details) diff --git a/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu b/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu deleted file mode 100644 index 56764d02..00000000 --- a/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu +++ /dev/null @@ -1,1049 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * This is a simple test program to measure the memcopy bandwidth of the GPU. - * It can measure device to device copy bandwidth, host to device copy bandwidth - * for pageable and pinned memory, and device to host copy bandwidth for - * pageable and pinned memory. - * - * Usage: - * ./bandwidthTest [option]... - */ - -// CUDA runtime -#include - -// includes -#include -#include -#include // helper functions for CUDA error checking and initialization -#include // helper for shared functions common to CUDA Samples -#include -#include - -static const char *sSDKsample = "CUDA Bandwidth Test"; - -// defines, project -#define MEMCOPY_ITERATIONS 100 -#define DEFAULT_SIZE (32 * (1e6)) // 32 M -#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M -#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M - -// shmoo mode defines -#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M -#define SHMOO_MEMSIZE_START (1e3) // 1 KB -#define SHMOO_INCREMENT_1KB (1e3) // 1 KB -#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB -#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB -#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB -#define SHMOO_INCREMENT_1MB (1e6) // 1 MB -#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB -#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB -#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB -#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB -#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB -#define SHMOO_LIMIT_1MB (1e6) // 1 MB -#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB -#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB - -// CPU cache flush -#define FLUSH_SIZE (256 * 1024 * 1024) -char *flush_buf; - -// enums, project -enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE }; -enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE }; -enum printMode { USER_READABLE, CSV }; -enum memoryMode { PINNED, PAGEABLE }; - -const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL}; - -const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; - -// if true, use CPU based timing for everything -static bool bDontUseGPUTiming; - -int *pArgc = NULL; -char **pArgv = NULL; - -//////////////////////////////////////////////////////////////////////////////// -// declaration, forward -int runTest(const int argc, const char **argv); -void testBandwidth(unsigned int start, - unsigned int end, - unsigned int increment, - testMode mode, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -void testBandwidthQuick(unsigned int size, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -void testBandwidthRange(unsigned int start, - unsigned int end, - unsigned int increment, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -void testBandwidthShmoo(memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc); -float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc); -float testDeviceToDeviceTransfer(unsigned int memSize); -void printResultsReadable(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc); -void printResultsCSV(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc); -void printHelp(void); - -//////////////////////////////////////////////////////////////////////////////// -// Program main -//////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) -{ - pArgc = &argc; - pArgv = argv; - - flush_buf = (char *)malloc(FLUSH_SIZE); - - // set logfile name and start logs - printf("[%s] - Starting...\n", sSDKsample); - - int iRetVal = runTest(argc, (const char **)argv); - - if (iRetVal < 0) { - checkCudaErrors(cudaSetDevice(0)); - } - - // finish - printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); - - printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n"); - - free(flush_buf); - - exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE); -} - -/////////////////////////////////////////////////////////////////////////////// -// Parse args, run the appropriate tests -/////////////////////////////////////////////////////////////////////////////// -int runTest(const int argc, const char **argv) -{ - int start = DEFAULT_SIZE; - int end = DEFAULT_SIZE; - int startDevice = 0; - int endDevice = 0; - int increment = DEFAULT_INCREMENT; - testMode mode = QUICK_MODE; - bool htod = false; - bool dtoh = false; - bool dtod = false; - bool wc = false; - char *modeStr; - char *device = NULL; - printMode printmode = USER_READABLE; - char *memModeStr = NULL; - memoryMode memMode = PINNED; - - // process command line args - if (checkCmdLineFlag(argc, argv, "help")) { - printHelp(); - return 0; - } - - if (checkCmdLineFlag(argc, argv, "csv")) { - printmode = CSV; - } - - if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { - if (strcmp(memModeStr, "pageable") == 0) { - memMode = PAGEABLE; - } - else if (strcmp(memModeStr, "pinned") == 0) { - memMode = PINNED; - } - else { - printf("Invalid memory mode - valid modes are pageable or pinned\n"); - printf("See --help for more information\n"); - return -1000; - } - } - else { - // default - pinned memory - memMode = PINNED; - } - - if (getCmdLineArgumentString(argc, argv, "device", &device)) { - int deviceCount; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); - exit(EXIT_FAILURE); - } - - if (deviceCount == 0) { - printf("!!!!!No devices found!!!!!\n"); - return -2000; - } - - if (strcmp(device, "all") == 0) { - printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices " - "!!!!!!\n\n"); - startDevice = 0; - endDevice = deviceCount - 1; - } - else { - startDevice = endDevice = atoi(device); - - if (startDevice >= deviceCount || startDevice < 0) { - printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be " - "used !!!!!\n", - startDevice, - 0); - startDevice = endDevice = 0; - } - } - } - - printf("Running on...\n\n"); - - for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { - cudaDeviceProp deviceProp; - cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); - - if (error_id == cudaSuccess) { - printf(" Device %d: %s\n", currentDevice, deviceProp.name); - - if (deviceProp.computeMode == cudaComputeModeProhibited) { - fprintf(stderr, - "Error: device is running in , no " - "threads can use ::cudaSetDevice().\n"); - checkCudaErrors(cudaSetDevice(currentDevice)); - - exit(EXIT_FAILURE); - } - } - else { - printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); - checkCudaErrors(cudaSetDevice(currentDevice)); - - exit(EXIT_FAILURE); - } - } - - if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) { - // figure out the mode - if (strcmp(modeStr, "quick") == 0) { - printf(" Quick Mode\n\n"); - mode = QUICK_MODE; - } - else if (strcmp(modeStr, "shmoo") == 0) { - printf(" Shmoo Mode\n\n"); - mode = SHMOO_MODE; - } - else if (strcmp(modeStr, "range") == 0) { - printf(" Range Mode\n\n"); - mode = RANGE_MODE; - } - else { - printf("Invalid mode - valid modes are quick, range, or shmoo\n"); - printf("See --help for more information\n"); - return -3000; - } - } - else { - // default mode - quick - printf(" Quick Mode\n\n"); - mode = QUICK_MODE; - } - - if (checkCmdLineFlag(argc, argv, "htod")) { - htod = true; - } - - if (checkCmdLineFlag(argc, argv, "dtoh")) { - dtoh = true; - } - - if (checkCmdLineFlag(argc, argv, "dtod")) { - dtod = true; - } - -#if CUDART_VERSION >= 2020 - - if (checkCmdLineFlag(argc, argv, "wc")) { - wc = true; - } - -#endif - - if (checkCmdLineFlag(argc, argv, "cputiming")) { - bDontUseGPUTiming = true; - } - - if (!htod && !dtoh && !dtod) { - // default: All - htod = true; - dtoh = true; - dtod = true; - } - - if (RANGE_MODE == mode) { - if (checkCmdLineFlag(argc, (const char **)argv, "start")) { - start = getCmdLineArgumentInt(argc, argv, "start"); - - if (start <= 0) { - printf("Illegal argument - start must be greater than zero\n"); - return -4000; - } - } - else { - printf("Must specify a starting size in range mode\n"); - printf("See --help for more information\n"); - return -5000; - } - - if (checkCmdLineFlag(argc, (const char **)argv, "end")) { - end = getCmdLineArgumentInt(argc, argv, "end"); - - if (end <= 0) { - printf("Illegal argument - end must be greater than zero\n"); - return -6000; - } - - if (start > end) { - printf("Illegal argument - start is greater than end\n"); - return -7000; - } - } - else { - printf("Must specify an end size in range mode.\n"); - printf("See --help for more information\n"); - return -8000; - } - - if (checkCmdLineFlag(argc, argv, "increment")) { - increment = getCmdLineArgumentInt(argc, argv, "increment"); - - if (increment <= 0) { - printf("Illegal argument - increment must be greater than zero\n"); - return -9000; - } - } - else { - printf("Must specify an increment in user mode\n"); - printf("See --help for more information\n"); - return -10000; - } - } - - if (htod) { - testBandwidth((unsigned int)start, - (unsigned int)end, - (unsigned int)increment, - mode, - HOST_TO_DEVICE, - printmode, - memMode, - startDevice, - endDevice, - wc); - } - - if (dtoh) { - testBandwidth((unsigned int)start, - (unsigned int)end, - (unsigned int)increment, - mode, - DEVICE_TO_HOST, - printmode, - memMode, - startDevice, - endDevice, - wc); - } - - if (dtod) { - testBandwidth((unsigned int)start, - (unsigned int)end, - (unsigned int)increment, - mode, - DEVICE_TO_DEVICE, - printmode, - memMode, - startDevice, - endDevice, - wc); - } - - // Ensure that we reset all CUDA Devices in question - for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) { - cudaSetDevice(nDevice); - } - - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -// Run a bandwidth test -/////////////////////////////////////////////////////////////////////////////// -void testBandwidth(unsigned int start, - unsigned int end, - unsigned int increment, - testMode mode, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - switch (mode) { - case QUICK_MODE: - testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc); - break; - - case RANGE_MODE: - testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc); - break; - - case SHMOO_MODE: - testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); - break; - - default: - break; - } -} - -////////////////////////////////////////////////////////////////////// -// Run a quick mode bandwidth test -////////////////////////////////////////////////////////////////////// -void testBandwidthQuick(unsigned int size, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc); -} - -/////////////////////////////////////////////////////////////////////// -// Run a range mode bandwidth test -////////////////////////////////////////////////////////////////////// -void testBandwidthRange(unsigned int start, - unsigned int end, - unsigned int increment, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - // count the number of copies we're going to run - unsigned int count = 1 + ((end - start) / increment); - - unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); - double *bandwidths = (double *)malloc(count * sizeof(double)); - - // Before calculating the cumulative bandwidth, initialize bandwidths array to - // NULL - for (unsigned int i = 0; i < count; i++) { - bandwidths[i] = 0.0; - } - - // Use the device asked by the user - for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { - cudaSetDevice(currentDevice); - - // run each of the copies - for (unsigned int i = 0; i < count; i++) { - memSizes[i] = start + i * increment; - - switch (kind) { - case DEVICE_TO_HOST: - bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); - break; - - case HOST_TO_DEVICE: - bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); - break; - - case DEVICE_TO_DEVICE: - bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); - break; - } - } - } // Complete the bandwidth computation on all the devices - - // print results - if (printmode == CSV) { - printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - else { - printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - - // clean up - free(memSizes); - free(bandwidths); -} - -////////////////////////////////////////////////////////////////////////////// -// Intense shmoo mode - covers a large range of values with varying increments -////////////////////////////////////////////////////////////////////////////// -void testBandwidthShmoo(memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - // count the number of copies to make - unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) - + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) - + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) - + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) - + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) - + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) - + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); - - unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); - double *bandwidths = (double *)malloc(count * sizeof(double)); - - // Before calculating the cumulative bandwidth, initialize bandwidths array to - // NULL - for (unsigned int i = 0; i < count; i++) { - bandwidths[i] = 0.0; - } - - // Use the device asked by the user - for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { - cudaSetDevice(currentDevice); - // Run the shmoo - int iteration = 0; - unsigned int memSize = 0; - - while (memSize <= SHMOO_MEMSIZE_MAX) { - if (memSize < SHMOO_LIMIT_20KB) { - memSize += SHMOO_INCREMENT_1KB; - } - else if (memSize < SHMOO_LIMIT_50KB) { - memSize += SHMOO_INCREMENT_2KB; - } - else if (memSize < SHMOO_LIMIT_100KB) { - memSize += SHMOO_INCREMENT_10KB; - } - else if (memSize < SHMOO_LIMIT_1MB) { - memSize += SHMOO_INCREMENT_100KB; - } - else if (memSize < SHMOO_LIMIT_16MB) { - memSize += SHMOO_INCREMENT_1MB; - } - else if (memSize < SHMOO_LIMIT_32MB) { - memSize += SHMOO_INCREMENT_2MB; - } - else { - memSize += SHMOO_INCREMENT_4MB; - } - - memSizes[iteration] = memSize; - - switch (kind) { - case DEVICE_TO_HOST: - bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc); - break; - - case HOST_TO_DEVICE: - bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc); - break; - - case DEVICE_TO_DEVICE: - bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]); - break; - } - - iteration++; - printf("."); - fflush(0); - } - } // Complete the bandwidth computation on all the devices - - // print results - printf("\n"); - - if (CSV == printmode) { - printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - else { - printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - - // clean up - free(memSizes); - free(bandwidths); -} - -/////////////////////////////////////////////////////////////////////////////// -// test the bandwidth of a device to host memcopy of a specific size -/////////////////////////////////////////////////////////////////////////////// -float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) -{ - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - unsigned char *h_idata = NULL; - unsigned char *h_odata = NULL; - cudaEvent_t start, stop; - - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // allocate host memory - if (PINNED == memMode) { - // pinned memory mode - use special function to get OS-pinned memory -#if CUDART_VERSION >= 2020 - checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); - checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); -#else - checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); - checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); -#endif - } - else { - // pageable memory mode - use malloc - h_idata = (unsigned char *)malloc(memSize); - h_odata = (unsigned char *)malloc(memSize); - - if (h_idata == 0 || h_odata == 0) { - fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); - exit(EXIT_FAILURE); - } - } - - // initialize the memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_idata[i] = (unsigned char)(i & 0xff); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - - // initialize the device memory - checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); - - // copy data from GPU to Host - if (PINNED == memMode) { - if (bDontUseGPUTiming) - sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0)); - } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - if (bDontUseGPUTiming) { - sdkStopTimer(&timer); - elapsedTimeInMs = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - } - } - else { - elapsedTimeInMs = 0; - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - sdkStartTimer(&timer); - checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); - sdkStopTimer(&timer); - elapsedTimeInMs += sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - memset(flush_buf, i, FLUSH_SIZE); - } - } - - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - // clean up memory - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - sdkDeleteTimer(&timer); - - if (PINNED == memMode) { - checkCudaErrors(cudaFreeHost(h_idata)); - checkCudaErrors(cudaFreeHost(h_odata)); - } - else { - free(h_idata); - free(h_odata); - } - - checkCudaErrors(cudaFree(d_idata)); - - return bandwidthInGBs; -} - -/////////////////////////////////////////////////////////////////////////////// -//! test the bandwidth of a host to device memcopy of a specific size -/////////////////////////////////////////////////////////////////////////////// -float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) -{ - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - cudaEvent_t start, stop; - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // allocate host memory - unsigned char *h_odata = NULL; - - if (PINNED == memMode) { -#if CUDART_VERSION >= 2020 - // pinned memory mode - use special function to get OS-pinned memory - checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); -#else - // pinned memory mode - use special function to get OS-pinned memory - checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); -#endif - } - else { - // pageable memory mode - use malloc - h_odata = (unsigned char *)malloc(memSize); - - if (h_odata == 0) { - fprintf(stderr, "Not enough memory available on host to run test!\n"); - exit(EXIT_FAILURE); - } - } - - unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); - unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); - - if (h_cacheClear1 == 0 || h_cacheClear2 == 0) { - fprintf(stderr, "Not enough memory available on host to run test!\n"); - exit(EXIT_FAILURE); - } - - // initialize the memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_odata[i] = (unsigned char)(i & 0xff); - } - - for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) { - h_cacheClear1[i] = (unsigned char)(i & 0xff); - h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff)); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - - // copy host memory to device memory - if (PINNED == memMode) { - if (bDontUseGPUTiming) - sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0)); - } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - if (bDontUseGPUTiming) { - sdkStopTimer(&timer); - elapsedTimeInMs = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - } - } - else { - elapsedTimeInMs = 0; - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - sdkStartTimer(&timer); - checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); - sdkStopTimer(&timer); - elapsedTimeInMs += sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - memset(flush_buf, i, FLUSH_SIZE); - } - } - - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - // clean up memory - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - sdkDeleteTimer(&timer); - - if (PINNED == memMode) { - checkCudaErrors(cudaFreeHost(h_odata)); - } - else { - free(h_odata); - } - - free(h_cacheClear1); - free(h_cacheClear2); - checkCudaErrors(cudaFree(d_idata)); - - return bandwidthInGBs; -} - -/////////////////////////////////////////////////////////////////////////////// -//! test the bandwidth of a device to device memcopy of a specific size -/////////////////////////////////////////////////////////////////////////////// -float testDeviceToDeviceTransfer(unsigned int memSize) -{ - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - cudaEvent_t start, stop; - - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // allocate host memory - unsigned char *h_idata = (unsigned char *)malloc(memSize); - - if (h_idata == 0) { - fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); - exit(EXIT_FAILURE); - } - - // initialize the host memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_idata[i] = (unsigned char)(i & 0xff); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - unsigned char *d_odata; - checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); - - // initialize memory - checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); - - // run the memcopy - sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); - } - - checkCudaErrors(cudaEventRecord(stop, 0)); - - // Since device to device memory copies are non-blocking, - // cudaDeviceSynchronize() is required in order to get - // proper timing. - checkCudaErrors(cudaDeviceSynchronize()); - - // get the total elapsed time in ms - sdkStopTimer(&timer); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - - if (bDontUseGPUTiming) { - elapsedTimeInMs = sdkGetTimerValue(&timer); - } - - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - - // clean up memory - sdkDeleteTimer(&timer); - free(h_idata); - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(d_odata)); - - return bandwidthInGBs; -} - -///////////////////////////////////////////////////////// -// print results in an easily read format -//////////////////////////////////////////////////////// -void printResultsReadable(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc) -{ - printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); - printf(" %s Memory Transfers\n", sMemoryMode[memMode]); - - if (wc) { - printf(" Write-Combined Memory Writes are Enabled"); - } - - printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n"); - unsigned int i; - - for (i = 0; i < (count - 1); i++) { - printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); - } - - printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); -} - -/////////////////////////////////////////////////////////////////////////// -// print results in a database format -/////////////////////////////////////////////////////////////////////////// -void printResultsCSV(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc) -{ - std::string sConfig; - - // log config information - if (kind == DEVICE_TO_DEVICE) { - sConfig += "D2D"; - } - else { - if (kind == DEVICE_TO_HOST) { - sConfig += "D2H"; - } - else if (kind == HOST_TO_DEVICE) { - sConfig += "H2D"; - } - - if (memMode == PAGEABLE) { - sConfig += "-Paged"; - } - else if (memMode == PINNED) { - sConfig += "-Pinned"; - - if (wc) { - sConfig += "-WriteCombined"; - } - } - } - - unsigned int i; - double dSeconds = 0.0; - - for (i = 0; i < count; i++) { - dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); - printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " - "bytes, NumDevsUsed = %d\n", - sConfig.c_str(), - bandwidths[i], - dSeconds, - memSizes[i], - iNumDevs); - } -} - -/////////////////////////////////////////////////////////////////////////// -// Print help screen -/////////////////////////////////////////////////////////////////////////// -void printHelp(void) -{ - printf("Usage: bandwidthTest [OPTION]...\n"); - printf("Test the bandwidth for device to host, host to device, and device to " - "device transfers\n"); - printf("\n"); - printf("Example: measure the bandwidth of device to host pinned memory copies " - "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); - printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " - "--increment=1024 --dtoh\n"); - - printf("\n"); - printf("Options:\n"); - printf("--help\tDisplay this help menu\n"); - printf("--csv\tPrint results as a CSV\n"); - printf("--device=[deviceno]\tSpecify the device device to be used\n"); - printf(" all - compute cumulative bandwidth on all the devices\n"); - printf(" 0,1,2,...,n - Specify any particular device to be used\n"); - printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); - printf(" pageable - pageable memory\n"); - printf(" pinned - non-pageable system memory\n"); - printf("--mode=[MODE]\tSpecify the mode to use\n"); - printf(" quick - performs a quick measurement\n"); - printf(" range - measures a user-specified range of values\n"); - printf(" shmoo - performs an intense shmoo of a large range of values\n"); - - printf("--htod\tMeasure host to device transfers\n"); - printf("--dtoh\tMeasure device to host transfers\n"); - printf("--dtod\tMeasure device to device transfers\n"); -#if CUDART_VERSION >= 2020 - printf("--wc\tAllocate pinned memory as write-combined\n"); -#endif - printf("--cputiming\tForce CPU-based timing always\n"); - - printf("Range mode options\n"); - printf("--start=[SIZE]\tStarting transfer size in bytes\n"); - printf("--end=[SIZE]\tEnding transfer size in bytes\n"); - printf("--increment=[SIZE]\tIncrement size in bytes\n"); -}