diff --git a/CHANGELOG.md b/CHANGELOG.md index 93d1500b..12e424fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ### CUDA 12.9 * Updated toolchain for cross-compilation for Tegra Linux platforms. +* Repository has been updated with consistent code formatting across all samples +* Many small code tweaks and bug fixes (see commit history for details) +* Removed the following outdated samples: + * `1_Utilities` + * `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth + testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth) ### CUDA 12.8 * Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..431410a8 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,103 @@ + +# Contributing to the CUDA Samples + +Thank you for your interest in contributing to the CUDA Samples! + + +## Getting Started + +1. **Fork & Clone the Repository**: + + Fork the repository and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository). + +## Making Changes + +1. **Create a New Branch**: + + ```bash + git checkout -b your-feature-branch + ``` + +2. **Make Changes**. + +3. **Build and Test**: + + Ensure changes don't break existing functionality by building and running tests. + + For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below. + +4. 
**Commit Changes**: + + ```bash + git commit -m "Brief description of the change" + ``` + +## Building and Testing + +For information on building and running tests on the samples, please refer to the main [README](README.md) + +## Creating a Pull Request + +1. Push changes to your fork +2. Create a pull request targeting the `master` branch of the original CUDA Samples repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request. +3. Describe the purpose and context of the changes in the pull request description. + +## Code Formatting (pre-commit hooks) + +The CUDA Samples repository uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These +tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter +versions and options are aligned for all developers. Additionally, there is a CI check in place to +enforce that committed code follows our standards. + +The linters used by the CUDA Samples are listed in `.pre-commit-config.yaml`. +For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html). + +To use `pre-commit`, install via `conda` or `pip`: + +```bash +conda config --add channels conda-forge +conda install pre-commit +``` + +```bash +pip install pre-commit +``` + +Then run pre-commit hooks before committing code: + +```bash +pre-commit run +``` + +By default, pre-commit runs on staged files (only changes and additions that will be committed). +To run pre-commit checks on all files, execute: + +```bash +pre-commit run --all-files +``` + +Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running: + +```bash +pre-commit install +``` + +Now code linters and formatters will be run each time you commit changes. 
 + +You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`, although please note +that this may result in pull requests being rejected if subsequent checks fail. + +## Review Process + +Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code. + +For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/). + +Further recommended reading for successful PR reviews: + +- [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/) +- [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/) + +## Thank You + +Your contributions enhance the CUDA Samples for the entire community. We appreciate your effort and collaboration! diff --git a/README.md b/README.md index d284ea4b..12f5d8fd 100644 --- a/README.md +++ b/README.md @@ -148,12 +148,14 @@ we provide a script to do so, `run_tests.py`. This Python3 script finds all executables in a subdirectory you choose, matching application names with command line arguments specified in `test_args.json`. 
It accepts the following command line arguments: -| Switch | Purpose | Example | -| -------- | -------------------------------------------------------------------------------------------------------------- | ----------------------- | -| --dir | Specify the root directory to search for executables (recursively) | --dir ./build/Samples | -| --config | JSON configuration file for executable arguments | --config test_args.json | -| --output | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist) | --output ./test | -| --args | Global arguments to pass to all executables (not currently used) | --args arg_1 arg_2 ... | +| Switch | Purpose | Example | +| ---------- | -------------------------------------------------------------------------------------------------------------- | ----------------------- | +| --dir | Specify the root directory to search for executables (recursively) | --dir ./build/Samples | +| --config | JSON configuration file for executable arguments | --config test_args.json | +| --output | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist) | --output ./test | +| --args | Global arguments to pass to all executables (not currently used) | --args arg_1 arg_2 ... | +| --parallel | Number of applications to execute in parallel. | --parallel 8 | + Application configurations are loaded from `test_args.json` and matched against executable names (discarding the `.exe` extension on Windows). @@ -281,18 +283,18 @@ and system configuration): ``` Test Summary: -Ran 181 tests -All tests passed! +Ran 199 test runs for 180 executables. +All test runs passed! ``` If some samples fail, you will see something like this: ``` Test Summary: -Ran 181 tests -Failed tests (2): - volumeFiltering: returned 1 - postProcessGL: returned 1 +Ran 199 test runs for 180 executables. 
+Failed runs (2): + bicubicTexture (run 1/5): Failed (code 1) + Mandelbrot (run 1/2): Failed (code 1) ``` You can inspect the stdout logs in the output directory (generally `APM_.txt` or `APM_.run.txt`) to help diff --git a/Samples/0_Introduction/simpleIPC/simpleIPC.cu b/Samples/0_Introduction/simpleIPC/simpleIPC.cu index 1e2a9a93..ab59fc4d 100644 --- a/Samples/0_Introduction/simpleIPC/simpleIPC.cu +++ b/Samples/0_Introduction/simpleIPC/simpleIPC.cu @@ -99,8 +99,21 @@ static void childProcess(int id) std::vector ptrs; std::vector events; std::vector verification_buffer(DATA_SIZE); + char pidString[20] = {0}; + char lshmName[40] = {0}; - if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + // Use parent process ID to create a unique shared memory name for Linux multi-process +#ifdef __linux__ + pid_t pid; + pid = getppid(); + snprintf(pidString, sizeof(pidString), "%d", pid); +#endif + strcat(lshmName, shmName); + strcat(lshmName, pidString); + + printf("CP: lshmName = %s\n", lshmName); + + if (sharedMemoryOpen(lshmName, sizeof(shmStruct), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } @@ -195,10 +208,23 @@ static void parentProcess(char *app) std::vector ptrs; std::vector events; std::vector processes; + char pidString[20] = {0}; + char lshmName[40] = {0}; + + // Use current process ID to create a unique shared memory name for Linux multi-process +#ifdef __linux__ + pid_t pid; + pid = getpid(); + snprintf(pidString, sizeof(pidString), "%d", pid); +#endif + strcat(lshmName, shmName); + strcat(lshmName, pidString); + + printf("PP: lshmName = %s\n", lshmName); checkCudaErrors(cudaGetDeviceCount(&devCount)); - if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + if (sharedMemoryCreate(lshmName, sizeof(*shm), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } diff --git a/Samples/1_Utilities/CMakeLists.txt b/Samples/1_Utilities/CMakeLists.txt index ee4d097c..1f3f481b 100644 
--- a/Samples/1_Utilities/CMakeLists.txt +++ b/Samples/1_Utilities/CMakeLists.txt @@ -1,4 +1,3 @@ -add_subdirectory(bandwidthTest) add_subdirectory(deviceQuery) add_subdirectory(deviceQueryDrv) add_subdirectory(topologyQuery) diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/c_cpp_properties.json b/Samples/1_Utilities/bandwidthTest/.vscode/c_cpp_properties.json deleted file mode 100644 index f0066b0f..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "configurations": [ - { - "name": "Linux", - "includePath": [ - "${workspaceFolder}/**", - "${workspaceFolder}/../../../Common" - ], - "defines": [], - "compilerPath": "/usr/local/cuda/bin/nvcc", - "cStandard": "gnu17", - "cppStandard": "gnu++14", - "intelliSenseMode": "linux-gcc-x64", - "configurationProvider": "ms-vscode.makefile-tools" - } - ], - "version": 4 -} diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/extensions.json b/Samples/1_Utilities/bandwidthTest/.vscode/extensions.json deleted file mode 100644 index c7eb54dc..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/extensions.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "recommendations": [ - "nvidia.nsight-vscode-edition", - "ms-vscode.cpptools", - "ms-vscode.makefile-tools" - ] -} diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/launch.json b/Samples/1_Utilities/bandwidthTest/.vscode/launch.json deleted file mode 100644 index 81bb9bf1..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/launch.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "configurations": [ - { - "name": "CUDA C++: Launch", - "type": "cuda-gdb", - "request": "launch", - "program": "${workspaceFolder}/bandwidthTest" - } - ] -} diff --git a/Samples/1_Utilities/bandwidthTest/.vscode/tasks.json b/Samples/1_Utilities/bandwidthTest/.vscode/tasks.json deleted file mode 100644 index 4509aeb1..00000000 --- a/Samples/1_Utilities/bandwidthTest/.vscode/tasks.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "version": 
"2.0.0", - "tasks": [ - { - "label": "sample", - "type": "shell", - "command": "make dbg=1", - "problemMatcher": ["$nvcc"], - "group": { - "kind": "build", - "isDefault": true - } - } - ] -} diff --git a/Samples/1_Utilities/bandwidthTest/CMakeLists.txt b/Samples/1_Utilities/bandwidthTest/CMakeLists.txt deleted file mode 100644 index a9535dc7..00000000 --- a/Samples/1_Utilities/bandwidthTest/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -cmake_minimum_required(VERSION 3.20) - -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules") - -project(bandwidthTest LANGUAGES C CXX CUDA) - -find_package(CUDAToolkit REQUIRED) - -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 101 120) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -if(ENABLE_CUDA_DEBUG) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets) -else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option) -endif() - -# Include directories and libraries -include_directories(../../../Common) - -# Source file -# Add target for bandwidthTest -add_executable(bandwidthTest bandwidthTest.cu) - -target_compile_options(bandwidthTest PRIVATE $<$:--extended-lambda>) - -target_compile_features(bandwidthTest PRIVATE cxx_std_17 cuda_std_17) - -set_target_properties(bandwidthTest PROPERTIES CUDA_SEPARABLE_COMPILATION ON) diff --git a/Samples/1_Utilities/bandwidthTest/README.md b/Samples/1_Utilities/bandwidthTest/README.md deleted file mode 100644 index 0cfbd154..00000000 --- a/Samples/1_Utilities/bandwidthTest/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# bandwidthTest - Bandwidth Test - -## Description - -This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. 
This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory. - -## Key Concepts - -CUDA Streams and Events, Performance Strategies - -## Supported SM Architectures - -[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) - -## Supported OSes - -Linux, Windows - -## Supported CPU Architecture - -x86_64, armv7l - -## CUDA APIs involved - -### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) -cudaHostAlloc, cudaMemcpy, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaGetErrorString, cudaMallocHost, cudaSetDevice, cudaGetDeviceProperties, cudaDeviceSynchronize, cudaEventRecord, cudaFreeHost, cudaEventDestroy, cudaEventElapsedTime, cudaGetDeviceCount, cudaEventCreate - -## Prerequisites - -Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. - -## References (for more details) diff --git a/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu b/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu deleted file mode 100644 index 56764d02..00000000 --- a/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu +++ /dev/null @@ -1,1049 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * This is a simple test program to measure the memcopy bandwidth of the GPU. - * It can measure device to device copy bandwidth, host to device copy bandwidth - * for pageable and pinned memory, and device to host copy bandwidth for - * pageable and pinned memory. - * - * Usage: - * ./bandwidthTest [option]... 
- */ - -// CUDA runtime -#include - -// includes -#include -#include -#include // helper functions for CUDA error checking and initialization -#include // helper for shared functions common to CUDA Samples -#include -#include - -static const char *sSDKsample = "CUDA Bandwidth Test"; - -// defines, project -#define MEMCOPY_ITERATIONS 100 -#define DEFAULT_SIZE (32 * (1e6)) // 32 M -#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M -#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M - -// shmoo mode defines -#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M -#define SHMOO_MEMSIZE_START (1e3) // 1 KB -#define SHMOO_INCREMENT_1KB (1e3) // 1 KB -#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB -#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB -#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB -#define SHMOO_INCREMENT_1MB (1e6) // 1 MB -#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB -#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB -#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB -#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB -#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB -#define SHMOO_LIMIT_1MB (1e6) // 1 MB -#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB -#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB - -// CPU cache flush -#define FLUSH_SIZE (256 * 1024 * 1024) -char *flush_buf; - -// enums, project -enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE }; -enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE }; -enum printMode { USER_READABLE, CSV }; -enum memoryMode { PINNED, PAGEABLE }; - -const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL}; - -const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; - -// if true, use CPU based timing for everything -static bool bDontUseGPUTiming; - -int *pArgc = NULL; -char **pArgv = NULL; - -//////////////////////////////////////////////////////////////////////////////// -// declaration, forward -int runTest(const int argc, const char **argv); -void testBandwidth(unsigned int start, - 
unsigned int end, - unsigned int increment, - testMode mode, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -void testBandwidthQuick(unsigned int size, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -void testBandwidthRange(unsigned int start, - unsigned int end, - unsigned int increment, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -void testBandwidthShmoo(memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc); -float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc); -float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc); -float testDeviceToDeviceTransfer(unsigned int memSize); -void printResultsReadable(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc); -void printResultsCSV(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc); -void printHelp(void); - -//////////////////////////////////////////////////////////////////////////////// -// Program main -//////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) -{ - pArgc = &argc; - pArgv = argv; - - flush_buf = (char *)malloc(FLUSH_SIZE); - - // set logfile name and start logs - printf("[%s] - Starting...\n", sSDKsample); - - int iRetVal = runTest(argc, (const char **)argv); - - if (iRetVal < 0) { - checkCudaErrors(cudaSetDevice(0)); - } - - // finish - printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); - - printf("\nNOTE: The CUDA Samples are not meant for performance measurements. 
" - "Results may vary when GPU Boost is enabled.\n"); - - free(flush_buf); - - exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE); -} - -/////////////////////////////////////////////////////////////////////////////// -// Parse args, run the appropriate tests -/////////////////////////////////////////////////////////////////////////////// -int runTest(const int argc, const char **argv) -{ - int start = DEFAULT_SIZE; - int end = DEFAULT_SIZE; - int startDevice = 0; - int endDevice = 0; - int increment = DEFAULT_INCREMENT; - testMode mode = QUICK_MODE; - bool htod = false; - bool dtoh = false; - bool dtod = false; - bool wc = false; - char *modeStr; - char *device = NULL; - printMode printmode = USER_READABLE; - char *memModeStr = NULL; - memoryMode memMode = PINNED; - - // process command line args - if (checkCmdLineFlag(argc, argv, "help")) { - printHelp(); - return 0; - } - - if (checkCmdLineFlag(argc, argv, "csv")) { - printmode = CSV; - } - - if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { - if (strcmp(memModeStr, "pageable") == 0) { - memMode = PAGEABLE; - } - else if (strcmp(memModeStr, "pinned") == 0) { - memMode = PINNED; - } - else { - printf("Invalid memory mode - valid modes are pageable or pinned\n"); - printf("See --help for more information\n"); - return -1000; - } - } - else { - // default - pinned memory - memMode = PINNED; - } - - if (getCmdLineArgumentString(argc, argv, "device", &device)) { - int deviceCount; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); - exit(EXIT_FAILURE); - } - - if (deviceCount == 0) { - printf("!!!!!No devices found!!!!!\n"); - return -2000; - } - - if (strcmp(device, "all") == 0) { - printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices " - "!!!!!!\n\n"); - startDevice = 0; - endDevice = deviceCount - 1; - } - else { - startDevice = endDevice = 
atoi(device); - - if (startDevice >= deviceCount || startDevice < 0) { - printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be " - "used !!!!!\n", - startDevice, - 0); - startDevice = endDevice = 0; - } - } - } - - printf("Running on...\n\n"); - - for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { - cudaDeviceProp deviceProp; - cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); - - if (error_id == cudaSuccess) { - printf(" Device %d: %s\n", currentDevice, deviceProp.name); - - if (deviceProp.computeMode == cudaComputeModeProhibited) { - fprintf(stderr, - "Error: device is running in , no " - "threads can use ::cudaSetDevice().\n"); - checkCudaErrors(cudaSetDevice(currentDevice)); - - exit(EXIT_FAILURE); - } - } - else { - printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); - checkCudaErrors(cudaSetDevice(currentDevice)); - - exit(EXIT_FAILURE); - } - } - - if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) { - // figure out the mode - if (strcmp(modeStr, "quick") == 0) { - printf(" Quick Mode\n\n"); - mode = QUICK_MODE; - } - else if (strcmp(modeStr, "shmoo") == 0) { - printf(" Shmoo Mode\n\n"); - mode = SHMOO_MODE; - } - else if (strcmp(modeStr, "range") == 0) { - printf(" Range Mode\n\n"); - mode = RANGE_MODE; - } - else { - printf("Invalid mode - valid modes are quick, range, or shmoo\n"); - printf("See --help for more information\n"); - return -3000; - } - } - else { - // default mode - quick - printf(" Quick Mode\n\n"); - mode = QUICK_MODE; - } - - if (checkCmdLineFlag(argc, argv, "htod")) { - htod = true; - } - - if (checkCmdLineFlag(argc, argv, "dtoh")) { - dtoh = true; - } - - if (checkCmdLineFlag(argc, argv, "dtod")) { - dtod = true; - } - -#if CUDART_VERSION >= 2020 - - if (checkCmdLineFlag(argc, argv, "wc")) { - wc = true; - } - -#endif - - if (checkCmdLineFlag(argc, argv, "cputiming")) { - bDontUseGPUTiming = true; - } - 
- if (!htod && !dtoh && !dtod) { - // default: All - htod = true; - dtoh = true; - dtod = true; - } - - if (RANGE_MODE == mode) { - if (checkCmdLineFlag(argc, (const char **)argv, "start")) { - start = getCmdLineArgumentInt(argc, argv, "start"); - - if (start <= 0) { - printf("Illegal argument - start must be greater than zero\n"); - return -4000; - } - } - else { - printf("Must specify a starting size in range mode\n"); - printf("See --help for more information\n"); - return -5000; - } - - if (checkCmdLineFlag(argc, (const char **)argv, "end")) { - end = getCmdLineArgumentInt(argc, argv, "end"); - - if (end <= 0) { - printf("Illegal argument - end must be greater than zero\n"); - return -6000; - } - - if (start > end) { - printf("Illegal argument - start is greater than end\n"); - return -7000; - } - } - else { - printf("Must specify an end size in range mode.\n"); - printf("See --help for more information\n"); - return -8000; - } - - if (checkCmdLineFlag(argc, argv, "increment")) { - increment = getCmdLineArgumentInt(argc, argv, "increment"); - - if (increment <= 0) { - printf("Illegal argument - increment must be greater than zero\n"); - return -9000; - } - } - else { - printf("Must specify an increment in user mode\n"); - printf("See --help for more information\n"); - return -10000; - } - } - - if (htod) { - testBandwidth((unsigned int)start, - (unsigned int)end, - (unsigned int)increment, - mode, - HOST_TO_DEVICE, - printmode, - memMode, - startDevice, - endDevice, - wc); - } - - if (dtoh) { - testBandwidth((unsigned int)start, - (unsigned int)end, - (unsigned int)increment, - mode, - DEVICE_TO_HOST, - printmode, - memMode, - startDevice, - endDevice, - wc); - } - - if (dtod) { - testBandwidth((unsigned int)start, - (unsigned int)end, - (unsigned int)increment, - mode, - DEVICE_TO_DEVICE, - printmode, - memMode, - startDevice, - endDevice, - wc); - } - - // Ensure that we reset all CUDA Devices in question - for (int nDevice = startDevice; nDevice <= 
endDevice; nDevice++) { - cudaSetDevice(nDevice); - } - - return 0; -} - -/////////////////////////////////////////////////////////////////////////////// -// Run a bandwidth test -/////////////////////////////////////////////////////////////////////////////// -void testBandwidth(unsigned int start, - unsigned int end, - unsigned int increment, - testMode mode, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - switch (mode) { - case QUICK_MODE: - testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc); - break; - - case RANGE_MODE: - testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc); - break; - - case SHMOO_MODE: - testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); - break; - - default: - break; - } -} - -////////////////////////////////////////////////////////////////////// -// Run a quick mode bandwidth test -////////////////////////////////////////////////////////////////////// -void testBandwidthQuick(unsigned int size, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc); -} - -/////////////////////////////////////////////////////////////////////// -// Run a range mode bandwidth test -////////////////////////////////////////////////////////////////////// -void testBandwidthRange(unsigned int start, - unsigned int end, - unsigned int increment, - memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - // count the number of copies we're going to run - unsigned int count = 1 + ((end - start) / increment); - - unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); - double *bandwidths = (double *)malloc(count * sizeof(double)); - - // Before calculating the 
cumulative bandwidth, initialize bandwidths array to - // NULL - for (unsigned int i = 0; i < count; i++) { - bandwidths[i] = 0.0; - } - - // Use the device asked by the user - for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { - cudaSetDevice(currentDevice); - - // run each of the copies - for (unsigned int i = 0; i < count; i++) { - memSizes[i] = start + i * increment; - - switch (kind) { - case DEVICE_TO_HOST: - bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); - break; - - case HOST_TO_DEVICE: - bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); - break; - - case DEVICE_TO_DEVICE: - bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); - break; - } - } - } // Complete the bandwidth computation on all the devices - - // print results - if (printmode == CSV) { - printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - else { - printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - - // clean up - free(memSizes); - free(bandwidths); -} - -////////////////////////////////////////////////////////////////////////////// -// Intense shmoo mode - covers a large range of values with varying increments -////////////////////////////////////////////////////////////////////////////// -void testBandwidthShmoo(memcpyKind kind, - printMode printmode, - memoryMode memMode, - int startDevice, - int endDevice, - bool wc) -{ - // count the number of copies to make - unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) - + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) - + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) - + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) - + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) - + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) - + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / 
SHMOO_INCREMENT_4MB); - - unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); - double *bandwidths = (double *)malloc(count * sizeof(double)); - - // Before calculating the cumulative bandwidth, initialize bandwidths array to - // NULL - for (unsigned int i = 0; i < count; i++) { - bandwidths[i] = 0.0; - } - - // Use the device asked by the user - for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { - cudaSetDevice(currentDevice); - // Run the shmoo - int iteration = 0; - unsigned int memSize = 0; - - while (memSize <= SHMOO_MEMSIZE_MAX) { - if (memSize < SHMOO_LIMIT_20KB) { - memSize += SHMOO_INCREMENT_1KB; - } - else if (memSize < SHMOO_LIMIT_50KB) { - memSize += SHMOO_INCREMENT_2KB; - } - else if (memSize < SHMOO_LIMIT_100KB) { - memSize += SHMOO_INCREMENT_10KB; - } - else if (memSize < SHMOO_LIMIT_1MB) { - memSize += SHMOO_INCREMENT_100KB; - } - else if (memSize < SHMOO_LIMIT_16MB) { - memSize += SHMOO_INCREMENT_1MB; - } - else if (memSize < SHMOO_LIMIT_32MB) { - memSize += SHMOO_INCREMENT_2MB; - } - else { - memSize += SHMOO_INCREMENT_4MB; - } - - memSizes[iteration] = memSize; - - switch (kind) { - case DEVICE_TO_HOST: - bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc); - break; - - case HOST_TO_DEVICE: - bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc); - break; - - case DEVICE_TO_DEVICE: - bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]); - break; - } - - iteration++; - printf("."); - fflush(0); - } - } // Complete the bandwidth computation on all the devices - - // print results - printf("\n"); - - if (CSV == printmode) { - printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - else { - printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); - } - - // clean up - free(memSizes); - free(bandwidths); -} - 
-/////////////////////////////////////////////////////////////////////////////// -// test the bandwidth of a device to host memcopy of a specific size -/////////////////////////////////////////////////////////////////////////////// -float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) -{ - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - unsigned char *h_idata = NULL; - unsigned char *h_odata = NULL; - cudaEvent_t start, stop; - - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // allocate host memory - if (PINNED == memMode) { - // pinned memory mode - use special function to get OS-pinned memory -#if CUDART_VERSION >= 2020 - checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); - checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); -#else - checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); - checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); -#endif - } - else { - // pageable memory mode - use malloc - h_idata = (unsigned char *)malloc(memSize); - h_odata = (unsigned char *)malloc(memSize); - - if (h_idata == 0 || h_odata == 0) { - fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); - exit(EXIT_FAILURE); - } - } - - // initialize the memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_idata[i] = (unsigned char)(i & 0xff); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - - // initialize the device memory - checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); - - // copy data from GPU to Host - if (PINNED == memMode) { - if (bDontUseGPUTiming) - sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; 
i++) { - checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0)); - } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - if (bDontUseGPUTiming) { - sdkStopTimer(&timer); - elapsedTimeInMs = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - } - } - else { - elapsedTimeInMs = 0; - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - sdkStartTimer(&timer); - checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); - sdkStopTimer(&timer); - elapsedTimeInMs += sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - memset(flush_buf, i, FLUSH_SIZE); - } - } - - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - // clean up memory - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - sdkDeleteTimer(&timer); - - if (PINNED == memMode) { - checkCudaErrors(cudaFreeHost(h_idata)); - checkCudaErrors(cudaFreeHost(h_odata)); - } - else { - free(h_idata); - free(h_odata); - } - - checkCudaErrors(cudaFree(d_idata)); - - return bandwidthInGBs; -} - -/////////////////////////////////////////////////////////////////////////////// -//! 
test the bandwidth of a host to device memcopy of a specific size -/////////////////////////////////////////////////////////////////////////////// -float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) -{ - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - cudaEvent_t start, stop; - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // allocate host memory - unsigned char *h_odata = NULL; - - if (PINNED == memMode) { -#if CUDART_VERSION >= 2020 - // pinned memory mode - use special function to get OS-pinned memory - checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); -#else - // pinned memory mode - use special function to get OS-pinned memory - checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); -#endif - } - else { - // pageable memory mode - use malloc - h_odata = (unsigned char *)malloc(memSize); - - if (h_odata == 0) { - fprintf(stderr, "Not enough memory available on host to run test!\n"); - exit(EXIT_FAILURE); - } - } - - unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); - unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); - - if (h_cacheClear1 == 0 || h_cacheClear2 == 0) { - fprintf(stderr, "Not enough memory available on host to run test!\n"); - exit(EXIT_FAILURE); - } - - // initialize the memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_odata[i] = (unsigned char)(i & 0xff); - } - - for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) { - h_cacheClear1[i] = (unsigned char)(i & 0xff); - h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff)); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - - // copy host memory to device memory - if (PINNED == memMode) { - if (bDontUseGPUTiming) - 
sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0)); - } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - if (bDontUseGPUTiming) { - sdkStopTimer(&timer); - elapsedTimeInMs = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - } - } - else { - elapsedTimeInMs = 0; - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - sdkStartTimer(&timer); - checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); - sdkStopTimer(&timer); - elapsedTimeInMs += sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - memset(flush_buf, i, FLUSH_SIZE); - } - } - - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - // clean up memory - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - sdkDeleteTimer(&timer); - - if (PINNED == memMode) { - checkCudaErrors(cudaFreeHost(h_odata)); - } - else { - free(h_odata); - } - - free(h_cacheClear1); - free(h_cacheClear2); - checkCudaErrors(cudaFree(d_idata)); - - return bandwidthInGBs; -} - -/////////////////////////////////////////////////////////////////////////////// -//! 
test the bandwidth of a device to device memcopy of a specific size -/////////////////////////////////////////////////////////////////////////////// -float testDeviceToDeviceTransfer(unsigned int memSize) -{ - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - cudaEvent_t start, stop; - - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // allocate host memory - unsigned char *h_idata = (unsigned char *)malloc(memSize); - - if (h_idata == 0) { - fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); - exit(EXIT_FAILURE); - } - - // initialize the host memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_idata[i] = (unsigned char)(i & 0xff); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - unsigned char *d_odata; - checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); - - // initialize memory - checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); - - // run the memcopy - sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); - } - - checkCudaErrors(cudaEventRecord(stop, 0)); - - // Since device to device memory copies are non-blocking, - // cudaDeviceSynchronize() is required in order to get - // proper timing. 
- checkCudaErrors(cudaDeviceSynchronize()); - - // get the total elapsed time in ms - sdkStopTimer(&timer); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - - if (bDontUseGPUTiming) { - elapsedTimeInMs = sdkGetTimerValue(&timer); - } - - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - - // clean up memory - sdkDeleteTimer(&timer); - free(h_idata); - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(d_odata)); - - return bandwidthInGBs; -} - -///////////////////////////////////////////////////////// -// print results in an easily read format -//////////////////////////////////////////////////////// -void printResultsReadable(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc) -{ - printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); - printf(" %s Memory Transfers\n", sMemoryMode[memMode]); - - if (wc) { - printf(" Write-Combined Memory Writes are Enabled"); - } - - printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n"); - unsigned int i; - - for (i = 0; i < (count - 1); i++) { - printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); - } - - printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? 
"\t" : "", bandwidths[i]); -} - -/////////////////////////////////////////////////////////////////////////// -// print results in a database format -/////////////////////////////////////////////////////////////////////////// -void printResultsCSV(unsigned int *memSizes, - double *bandwidths, - unsigned int count, - memcpyKind kind, - memoryMode memMode, - int iNumDevs, - bool wc) -{ - std::string sConfig; - - // log config information - if (kind == DEVICE_TO_DEVICE) { - sConfig += "D2D"; - } - else { - if (kind == DEVICE_TO_HOST) { - sConfig += "D2H"; - } - else if (kind == HOST_TO_DEVICE) { - sConfig += "H2D"; - } - - if (memMode == PAGEABLE) { - sConfig += "-Paged"; - } - else if (memMode == PINNED) { - sConfig += "-Pinned"; - - if (wc) { - sConfig += "-WriteCombined"; - } - } - } - - unsigned int i; - double dSeconds = 0.0; - - for (i = 0; i < count; i++) { - dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); - printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " - "bytes, NumDevsUsed = %d\n", - sConfig.c_str(), - bandwidths[i], - dSeconds, - memSizes[i], - iNumDevs); - } -} - -/////////////////////////////////////////////////////////////////////////// -// Print help screen -/////////////////////////////////////////////////////////////////////////// -void printHelp(void) -{ - printf("Usage: bandwidthTest [OPTION]...\n"); - printf("Test the bandwidth for device to host, host to device, and device to " - "device transfers\n"); - printf("\n"); - printf("Example: measure the bandwidth of device to host pinned memory copies " - "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); - printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " - "--increment=1024 --dtoh\n"); - - printf("\n"); - printf("Options:\n"); - printf("--help\tDisplay this help menu\n"); - printf("--csv\tPrint results as a CSV\n"); - printf("--device=[deviceno]\tSpecify the device device to be used\n"); - printf(" all - 
compute cumulative bandwidth on all the devices\n"); - printf(" 0,1,2,...,n - Specify any particular device to be used\n"); - printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); - printf(" pageable - pageable memory\n"); - printf(" pinned - non-pageable system memory\n"); - printf("--mode=[MODE]\tSpecify the mode to use\n"); - printf(" quick - performs a quick measurement\n"); - printf(" range - measures a user-specified range of values\n"); - printf(" shmoo - performs an intense shmoo of a large range of values\n"); - - printf("--htod\tMeasure host to device transfers\n"); - printf("--dtoh\tMeasure device to host transfers\n"); - printf("--dtod\tMeasure device to device transfers\n"); -#if CUDART_VERSION >= 2020 - printf("--wc\tAllocate pinned memory as write-combined\n"); -#endif - printf("--cputiming\tForce CPU-based timing always\n"); - - printf("Range mode options\n"); - printf("--start=[SIZE]\tStarting transfer size in bytes\n"); - printf("--end=[SIZE]\tEnding transfer size in bytes\n"); - printf("--increment=[SIZE]\tIncrement size in bytes\n"); -} diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu index 199a862c..3bfa9fe5 100644 --- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu +++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu @@ -102,13 +102,23 @@ static void childProcess(int id) int threads = 128; cudaDeviceProp prop; std::vector ptrs; + pid_t pid; + char pidString[20] = {0}; + char lshmName[40] = {0}; std::vector verification_buffer(DATA_SIZE); + pid = getppid(); + snprintf(pidString, sizeof(pidString), "%d", pid); + strcat(lshmName, shmName); + strcat(lshmName, pidString); + + printf("CP: lshmName = %s\n", lshmName); + ipcHandle *ipcChildHandle = NULL; checkIpcErrors(ipcOpenSocket(ipcChildHandle)); - if 
(sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + if (sharedMemoryOpen(lshmName, sizeof(shmStruct), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } @@ -245,6 +255,16 @@ static void parentProcess(char *app) std::vector ptrs; std::vector processes; cudaMemAllocationHandleType handleType = cudaMemHandleTypeNone; + pid_t pid; + char pidString[20] = {0}; + char lshmName[40] = {0}; + + pid = getpid(); + snprintf(pidString, sizeof(pidString), "%d", pid); + strcat(lshmName, shmName); + strcat(lshmName, pidString); + + printf("PP: lshmName = %s\n", lshmName); checkCudaErrors(cudaGetDeviceCount(&devCount)); std::vector devices(devCount); @@ -252,7 +272,7 @@ static void parentProcess(char *app) cuDeviceGet(&devices[i], i); } - if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + if (sharedMemoryCreate(lshmName, sizeof(*shm), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } diff --git a/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp b/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp index 7811a9c0..0fe208d2 100644 --- a/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp +++ b/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp @@ -310,10 +310,24 @@ static void childProcess(int devId, int id, char **argv) ipcHandle *ipcChildHandle = NULL; int blocks = 0; int threads = 128; + char pidString[20] = {0}; + char lshmName[40] = {0}; + + + // Use parent process ID to create a unique shared memory name for Linux multi-process +#ifdef __linux__ + pid_t pid; + pid = getppid(); + snprintf(pidString, sizeof(pidString), "%d", pid); +#endif + strcat(lshmName, shmName); + strcat(lshmName, pidString); + + printf("CP: lshmName = %s\n", lshmName); checkIpcErrors(ipcOpenSocket(ipcChildHandle)); - if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + if (sharedMemoryOpen(lshmName, sizeof(shmStruct), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } 
@@ -421,11 +435,24 @@ static void parentProcess(char *app) volatile shmStruct *shm = NULL; sharedMemoryInfo info; std::vector processes; + char pidString[20] = {0}; + char lshmName[40] = {0}; + + // Use current process ID to create a unique shared memory name for Linux multi-process +#ifdef __linux__ + pid_t pid; + pid = getpid(); + snprintf(pidString, sizeof(pidString), "%d", pid); +#endif + strcat(lshmName, shmName); + strcat(lshmName, pidString); + + printf("PP: lshmName = %s\n", lshmName); checkCudaErrors(cuDeviceGetCount(&devCount)); std::vector devices(devCount); - if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + if (sharedMemoryCreate(lshmName, sizeof(*shm), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp b/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp index 461fc906..b01dc430 100644 --- a/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp +++ b/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp @@ -25,8 +25,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include #include +#include #include #include #include diff --git a/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp b/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp index a4cc4890..8eeb9da8 100644 --- a/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp +++ b/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp @@ -45,13 +45,15 @@ #include #endif +// includes for OpenGL +#include + // includes #include #include #include #include #include -#include #include #include #include diff --git a/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp b/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp index 555fdde6..c6bcb450 100644 --- a/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp +++ b/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp @@ -86,12 +86,14 @@ #include #endif +// includes for OpenGL +#include + // includes #include #include #include // includes cuda.h and cuda_runtime_api.h #include -#include #include #include #include diff --git a/Samples/5_Domain_Specific/nbody/render_particles.cpp b/Samples/5_Domain_Specific/nbody/render_particles.cpp index 12432f9e..2c033c07 100644 --- a/Samples/5_Domain_Specific/nbody/render_particles.cpp +++ b/Samples/5_Domain_Specific/nbody/render_particles.cpp @@ -28,11 +28,15 @@ #include "render_particles.h" #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION + +// includes for OpenGL +#include + +// includes #include #include #include #include -#include #include #define GL_POINT_SPRITE_ARB 0x8861 diff --git a/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp b/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp index dad4828e..1596c022 100644 --- a/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp +++ b/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp @@ -31,9 +31,12 @@ #pragma warning(disable : 4312) -#include +// includes for Windows #include +// includes for multimedia +#include + // This header inclues all the necessary D3D11 and CUDA includes #include #include diff --git 
a/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp b/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp index ade0e2b1..113d1bd9 100644 --- a/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp +++ b/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp @@ -31,9 +31,12 @@ #pragma warning(disable : 4312) -#include +// includes for Windows #include +// includes for multimedia +#include + // This header inclues all the necessary D3D11 and CUDA includes #include #include diff --git a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp index 3952b7b6..c016a88d 100644 --- a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp +++ b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp @@ -33,11 +33,15 @@ #include #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION + +// includes for OpenGL +#include + +// includes #include #include #include #include -#include #include "ParticleSystem.cuh" #include "ParticleSystem.h" diff --git a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu index 09838164..69af1466 100644 --- a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu +++ b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu @@ -29,11 +29,15 @@ This file contains simple wrapper functions that call the CUDA kernels */ #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION + +// includes for OpenGL +#include + +// includes #include #include #include #include -#include #include #include "ParticleSystem.cuh" diff --git a/Samples/CMakeLists.txt b/Samples/CMakeLists.txt index 42424942..3f7a8f3c 100644 --- a/Samples/CMakeLists.txt +++ b/Samples/CMakeLists.txt @@ -1,11 +1,33 @@ +# This layer of CMakeLists.txt adds folders, for better organization in Visual Studio +# and other IDEs that support this feature. 
+ +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +set(CMAKE_FOLDER "0_Introduction") add_subdirectory(0_Introduction) + +set(CMAKE_FOLDER "1_Utilities") add_subdirectory(1_Utilities) + +set(CMAKE_FOLDER "2_Concepts_and_Techniques") add_subdirectory(2_Concepts_and_Techniques) + +set(CMAKE_FOLDER "3_CUDA_Features") add_subdirectory(3_CUDA_Features) + +set(CMAKE_FOLDER "4_CUDA_Libraries") add_subdirectory(4_CUDA_Libraries) + +set(CMAKE_FOLDER "5_Domain_Specific") add_subdirectory(5_Domain_Specific) + +set(CMAKE_FOLDER "6_Performance") add_subdirectory(6_Performance) + +set(CMAKE_FOLDER "7_libNVVM") add_subdirectory(7_libNVVM) + if(BUILD_TEGRA) + set(CMAKE_FOLDER "8_Platform_Specific/Tegra") add_subdirectory(8_Platform_Specific/Tegra) endif() diff --git a/run_tests.py b/run_tests.py index 50320c81..98b7dee1 100644 --- a/run_tests.py +++ b/run_tests.py @@ -33,6 +33,15 @@ import json import subprocess import argparse from pathlib import Path +import concurrent.futures +import threading + +print_lock = threading.Lock() + +def safe_print(*args, **kwargs): + """Thread-safe print function""" + with print_lock: + print(*args, **kwargs) def normalize_exe_name(name): """Normalize executable name across platforms by removing .exe if present""" @@ -78,96 +87,49 @@ def find_executables(root_dir): return executables -def run_test(executable, output_dir, args_config, global_args=None): - """Run a single test and capture output""" +def run_single_test_instance(executable, args, output_file, global_args, run_description): + """Run a single instance of a test executable with specific arguments.""" exe_path = str(executable) exe_name = executable.name - base_name = normalize_exe_name(exe_name) - # Check if this executable should be skipped - if base_name in args_config and args_config[base_name].get("skip", False): - print(f"Skipping {exe_name} (marked as skip in config)") - return 0 + safe_print(f"Starting {exe_name} {run_description}") - # Get argument sets for this executable - 
arg_sets = [] - if base_name in args_config: - config = args_config[base_name] - if "args" in config: - # Single argument set (backwards compatibility) - if isinstance(config["args"], list): - arg_sets.append(config["args"]) - else: - print(f"Warning: Arguments for {base_name} must be a list") - elif "runs" in config: - # Multiple argument sets - for run in config["runs"]: - if isinstance(run.get("args", []), list): - arg_sets.append(run.get("args", [])) - else: - print(f"Warning: Arguments for {base_name} run must be a list") + try: + cmd = [f"./{exe_name}"] + cmd.extend(args) + if global_args: + cmd.extend(global_args) - # If no specific args defined, run once with no args - if not arg_sets: - arg_sets.append([]) + safe_print(f" Command ({exe_name} {run_description}): {' '.join(cmd)}") - # Run for each argument set - failed = False - run_number = 1 - for args in arg_sets: - # Create output file name with run number if multiple runs - if len(arg_sets) > 1: - output_file = os.path.abspath(f"{output_dir}/APM_{exe_name}.run{run_number}.txt") - print(f"Running {exe_name} (run {run_number}/{len(arg_sets)})") - else: - output_file = os.path.abspath(f"{output_dir}/APM_{exe_name}.txt") - print(f"Running {exe_name}") + # Run the executable in its own directory using cwd + with open(output_file, 'w') as f: + result = subprocess.run( + cmd, + stdout=f, + stderr=subprocess.STDOUT, + timeout=300, # 5 minute timeout + cwd=os.path.dirname(exe_path) # Execute in the executable's directory + ) - try: - # Prepare command with arguments - cmd = [f"./{exe_name}"] - cmd.extend(args) + status = "Passed" if result.returncode == 0 else "Failed" + safe_print(f" Finished {exe_name} {run_description}: {status} (code {result.returncode})") + return {"name": exe_name, "description": run_description, "return_code": result.returncode, "status": status} - # Add global arguments if provided - if global_args: - cmd.extend(global_args) + except subprocess.TimeoutExpired: + safe_print(f"Error 
({exe_name} {run_description}): Timed out after 5 minutes") + return {"name": exe_name, "description": run_description, "return_code": -1, "status": "Timeout"} + except Exception as e: + safe_print(f"Error running {exe_name} {run_description}: {str(e)}") + return {"name": exe_name, "description": run_description, "return_code": -1, "status": f"Error: {str(e)}"} - print(f" Command: {' '.join(cmd)}") - - # Store current directory - original_dir = os.getcwd() - - try: - # Change to executable's directory - os.chdir(os.path.dirname(exe_path)) - - # Run the executable and capture output - with open(output_file, 'w') as f: - result = subprocess.run( - cmd, - stdout=f, - stderr=subprocess.STDOUT, - timeout=300 # 5 minute timeout - ) - - if result.returncode != 0: - failed = True - print(f" Test completed with return code {result.returncode}") - - finally: - # Always restore original directory - os.chdir(original_dir) - - except subprocess.TimeoutExpired: - print(f"Error: {exe_name} timed out after 5 minutes") - failed = True - except Exception as e: - print(f"Error running {exe_name}: {str(e)}") - failed = True - - run_number += 1 - - return 1 if failed else 0 +def run_test(executable, output_dir, args_config, global_args=None): + """Deprecated: This function is replaced by the parallel execution logic in main.""" + # This function is no longer called directly by the main logic. + # It remains here temporarily in case it's needed for reference or single-threaded debugging. + # The core logic is now in run_single_test_instance and managed by ThreadPoolExecutor. 
+ print("Warning: run_test function called directly - this indicates an issue in the refactoring.") + return 1 # Indicate failure if called def main(): parser = argparse.ArgumentParser(description='Run all executables and capture output') @@ -175,6 +137,7 @@ def main(): parser.add_argument('--config', help='JSON configuration file for executable arguments') parser.add_argument('--output', default='.', # Default to current directory help='Output directory for test results') + parser.add_argument('--parallel', type=int, default=1, help='Number of parallel tests to run') parser.add_argument('--args', nargs=argparse.REMAINDER, help='Global arguments to pass to all executables') args = parser.parse_args() @@ -192,23 +155,104 @@ def main(): return 1 print(f"Found {len(executables)} executables") + print(f"Running tests with up to {args.parallel} parallel tasks.") + + tasks = [] + for exe in executables: + exe_name = exe.name + base_name = normalize_exe_name(exe_name) + + # Check if this executable should be skipped globally + if base_name in args_config and args_config[base_name].get("skip", False): + safe_print(f"Skipping {exe_name} (marked as skip in config)") + continue + + arg_sets_configs = [] + if base_name in args_config: + config = args_config[base_name] + if "args" in config: + if isinstance(config["args"], list): + arg_sets_configs.append({"args": config["args"]}) # Wrap in dict for consistency + else: + safe_print(f"Warning: Arguments for {base_name} must be a list") + elif "runs" in config: + for i, run_config in enumerate(config["runs"]): + if run_config.get("skip", False): + safe_print(f"Skipping run {i+1} for {exe_name} (marked as skip in config)") + continue + if isinstance(run_config.get("args", []), list): + arg_sets_configs.append(run_config) + else: + safe_print(f"Warning: Arguments for {base_name} run {i+1} must be a list") + + # If no specific args defined, create one run with no args + if not arg_sets_configs: + arg_sets_configs.append({"args": 
[]}) + + # Create tasks for each run configuration + num_runs = len(arg_sets_configs) + for i, run_config in enumerate(arg_sets_configs): + current_args = run_config.get("args", []) + run_desc = f"(run {i+1}/{num_runs})" if num_runs > 1 else "" + + # Create output file name + if num_runs > 1: + output_file = os.path.abspath(f"{args.output}/APM_{exe_name}.run{i+1}.txt") + else: + output_file = os.path.abspath(f"{args.output}/APM_{exe_name}.txt") + + tasks.append({ + "executable": exe, + "args": current_args, + "output_file": output_file, + "global_args": args.args, + "description": run_desc + }) failed = [] - for exe in executables: - ret_code = run_test(exe, args.output, args_config, args.args) - if ret_code != 0: - failed.append((exe.name, ret_code)) + total_runs = len(tasks) + completed_runs = 0 + + with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor: + future_to_task = { + executor.submit(run_single_test_instance, + task["executable"], + task["args"], + task["output_file"], + task["global_args"], + task["description"]): task + for task in tasks + } + + for future in concurrent.futures.as_completed(future_to_task): + task_info = future_to_task[future] + completed_runs += 1 + safe_print(f"Progress: {completed_runs}/{total_runs} runs completed.") + try: + result = future.result() + if result["return_code"] != 0: + failed.append(result) + except Exception as exc: + safe_print(f'Task {task_info["executable"].name} {task_info["description"]} generated an exception: {exc}') + failed.append({ + "name": task_info["executable"].name, + "description": task_info["description"], + "return_code": -1, + "status": f"Execution Exception: {exc}" + }) # Print summary print("\nTest Summary:") - print(f"Ran {len(executables)} tests") + print(f"Ran {total_runs} test runs for {len(executables)} executables.") if failed: - print(f"Failed tests ({len(failed)}):") - for name, code in failed: - print(f" {name}: returned {code}") - return failed[0][1] # Return 
first failure code + print(f"Failed runs ({len(failed)}):") + for fail in failed: + print(f" {fail['name']} {fail['description']}: {fail['status']} (code {fail['return_code']})") + # Return the return code of the first failure, or 1 if only exceptions occurred + first_failure_code = next((f["return_code"] for f in failed if f["return_code"] != -1), 1) + return first_failure_code else: - print("All tests passed!") + print("All test runs passed!") return 0 if __name__ == '__main__':