/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // System includes #include #include // CUDA runtime #include // helper functions and utilities to work with CUDA #include #include #ifndef MAX #define MAX(a, b) (a > b ? a : b) #endif /* Add two vectors on the GPU */ __global__ void vectorAddGPU(float *a, float *b, float *c, int N) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < N) { c[idx] = a[idx] + b[idx]; } } // Allocate generic memory with malloc() and pin it laster instead of using // cudaHostAlloc() bool bPinGenericMemory = false; // Macro to aligned up to the memory size in question #define MEMORY_ALIGNMENT 4096 #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1))) int main(int argc, char **argv) { int n, nelem, deviceCount; int idev = 0; // use default device 0 char *device = NULL; unsigned int flags; size_t bytes; float *a, *b, *c; // Pinned memory allocated on the CPU float *a_UA, *b_UA, *c_UA; // Non-4K Aligned Pinned memory on the CPU float *d_a, *d_b, *d_c; // Device pointers for mapped memory float errorNorm, refNorm, ref, diff; cudaDeviceProp deviceProp; if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("Usage: simpleZeroCopy [OPTION]\n\n"); printf("Options:\n"); printf(" --device=[device #] Specify the device to be used\n"); printf( " --use_generic_memory (optional) use generic page-aligned for system " "memory\n"); return EXIT_SUCCESS; } /* Get the device selected by the user or default to 0, and then set it. */ if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) { cudaGetDeviceCount(&deviceCount); idev = atoi(device); if (idev >= deviceCount || idev < 0) { fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev); idev = 0; } } // if GPU found supports SM 1.2, then continue, otherwise we exit if (!checkCudaCapabilities(1, 2)) { exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) { #if defined(__APPLE__) || defined(MACOSX) bPinGenericMemory = false; // Generic Pinning of System Paged memory is not // currently supported on Mac OSX #else bPinGenericMemory = true; #endif } if (bPinGenericMemory) { printf("> Using Generic System Paged Memory (malloc)\n"); } else { printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); } checkCudaErrors(cudaSetDevice(idev)); /* Verify the selected device supports mapped memory and set the device flags for mapping host memory. */ checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev)); #if CUDART_VERSION >= 2020 if (!deviceProp.canMapHostMemory) { fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev); exit(EXIT_SUCCESS); } checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "CUDART version %d.%d does not support " " field\n", , CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); exit(EXIT_SUCCESS); #endif #if CUDART_VERSION < 4000 if (bPinGenericMemory) { fprintf( stderr, "CUDART version %d.%d does not support function\n", CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); exit(EXIT_SUCCESS); } #endif /* Allocate mapped CPU memory. */ nelem = 1048576; bytes = nelem * sizeof(float); if (bPinGenericMemory) { #if CUDART_VERSION >= 4000 a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); // We need to ensure memory is aligned to 4K (so we will need to padd memory // accordingly) a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT); c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT); checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped)); #endif } else { #if CUDART_VERSION >= 2020 flags = cudaHostAllocMapped; checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags)); #endif } /* Initialize the vectors. */ for (n = 0; n < nelem; n++) { a[n] = rand() / (float)RAND_MAX; b[n] = rand() / (float)RAND_MAX; } /* Get the device pointers for the pinned CPU memory mapped into the GPU memory space. */ #if CUDART_VERSION >= 2020 checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0)); #endif /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory. */ printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n"); dim3 block(256); dim3 grid((unsigned int)ceil(nelem / (float)block.x)); vectorAddGPU<<>>(d_a, d_b, d_c, nelem); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("vectorAddGPU() execution failed"); /* Compare the results */ printf("> Checking the results from vectorAddGPU() ...\n"); errorNorm = 0.f; refNorm = 0.f; for (n = 0; n < nelem; n++) { ref = a[n] + b[n]; diff = c[n] - ref; errorNorm += diff * diff; refNorm += ref * ref; } errorNorm = (float)sqrt((double)errorNorm); refNorm = (float)sqrt((double)refNorm); /* Memory clean up */ printf("> Releasing CPU memory...\n"); if (bPinGenericMemory) { #if CUDART_VERSION >= 4000 checkCudaErrors(cudaHostUnregister(a)); checkCudaErrors(cudaHostUnregister(b)); checkCudaErrors(cudaHostUnregister(c)); free(a_UA); free(b_UA); free(c_UA); #endif } else { #if CUDART_VERSION >= 2020 checkCudaErrors(cudaFreeHost(a)); checkCudaErrors(cudaFreeHost(b)); checkCudaErrors(cudaFreeHost(c)); #endif } exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE); }