/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#define THREADS_PER_BLOCK 512
#define ALLOWABLE_VARIANCE 1.e-6f
#define NUM_ELEMENTS 8000000

// Stores the square of each input element in the output array
__global__ void squareArray(const float *input, float *output,
                            int numElements) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < numElements) {
    output[idx] = input[idx] * input[idx];
  }
}

// Stores the negative of each input element in the output array
__global__ void negateArray(const float *input, float *output,
                            int numElements) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < numElements) {
    output[idx] = input[idx] * -1;
  }
}

struct negSquareArrays {
  float *input;
  float *square;
  float *negSquare;
  int numElements;
  size_t bytes;
  size_t numBlocks;
};

void fillRandomly(float *array, int numElements) {
  for (int n = 0; n < numElements; n++) {
    array[n] = rand() / (float)RAND_MAX;
  }
}

void resetOutputArrays(negSquareArrays *hostArrays) {
  fillRandomly(hostArrays->square, hostArrays->numElements);
  fillRandomly(hostArrays->negSquare, hostArrays->numElements);
}

void prepareHostArrays(negSquareArrays *hostArrays) {
  hostArrays->numElements = NUM_ELEMENTS;
  size_t bytes = hostArrays->numElements * sizeof(float);

  // Round the block count up so a partial block covers any remainder of
  // numElements that does not divide evenly by THREADS_PER_BLOCK.
  size_t numBlocks = hostArrays->numElements / (size_t)THREADS_PER_BLOCK;
  if ((hostArrays->numElements % THREADS_PER_BLOCK) != 0) {
    numBlocks++;
  }

  hostArrays->input = (float *)malloc(bytes);
  hostArrays->square = (float *)malloc(bytes);
  hostArrays->negSquare = (float *)malloc(bytes);
  hostArrays->bytes = bytes;
  hostArrays->numBlocks = numBlocks;

  fillRandomly(hostArrays->input, hostArrays->numElements);
  fillRandomly(hostArrays->square, hostArrays->numElements);
  fillRandomly(hostArrays->negSquare, hostArrays->numElements);
}

void createFreeGraph(cudaGraphExec_t *graphExec, float *dPtr) {
  cudaGraph_t graph;
  cudaGraphNode_t freeNode;

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  checkCudaErrors(
      cudaGraphAddMemFreeNode(&freeNode, graph, NULL, 0, (void *)dPtr));
  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}
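// Note: an allocation created by a graph memory node is not tied to the
// lifetime of its graph. As main() demonstrates below, it may be freed by a
// free node in the same graph, by a free node in a different graph (built
// with createFreeGraph above), by cudaFreeAsync, or by a synchronous
// cudaFree once no work is still using it.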
/**
 * Demonstrates explicitly creating a CUDA graph including memory nodes.
 * createNegateSquaresGraphWithStreamCapture constructs an equivalent graph
 * using stream capture.
 *
 * If d_negSquare_out is non-null, then:
 * 1) d_negSquare will not be freed;
 * 2) the value of d_negSquare_out will be set to d_negSquare.
 *
 * Diagram of the graph constructed by createNegateSquaresGraphExplicitly:
 *
 * alloc d_input
 *      |
 * alloc d_square
 *      |
 * Memcpy input to device
 *      |
 * launch kernel squareArray ------->---- Memcpy d_square to host
 *      |                                      |
 * free d_input                                |
 *      |                                      |
 * allocate d_negSquare                        |
 *      |                                      |
 * launch kernel negateArray -------->--- free d_square
 *      |
 * Memcpy d_negSquare to host
 *      |
 * free d_negSquare
 */
void createNegateSquaresGraphExplicitly(cudaGraphExec_t *graphExec, int device,
                                        negSquareArrays *hostArrays,
                                        float **d_negSquare_out = NULL) {
  // Array buffers on device
  float *d_input, *d_square, *d_negSquare;

  // Memory allocation parameters
  cudaMemAllocNodeParams allocParams;
  memset(&allocParams, 0, sizeof(allocParams));
  allocParams.bytesize = hostArrays->bytes;
  allocParams.poolProps.allocType = cudaMemAllocationTypePinned;
  allocParams.poolProps.location.id = device;
  allocParams.poolProps.location.type = cudaMemLocationTypeDevice;

  // Kernel launch parameters
  cudaKernelNodeParams kernelNodeParams = {0};
  kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1);
  kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.extra = NULL;

  cudaGraph_t graph;
  cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare;
  cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare;
  cudaGraphNode_t squareKernelNode, negateKernelNode;
  cudaGraphNode_t freeNodeInput, freeNodeSquare;

  // Buffer for storing graph node dependencies
  std::vector<cudaGraphNode_t> nodeDependencies;

  checkCudaErrors(cudaGraphCreate(&graph, 0));

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams));
  d_input = (float *)allocParams.dptr;

  // To keep the graph structure simple (fewer branching dependencies),
  // allocNodeSquare should depend on allocNodeInput.
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph,
                                           &allocNodeInput, 1, &allocParams));
  d_square = (float *)allocParams.dptr;

  // copyNodeInput needs to depend on allocNodeInput because copyNodeInput
  // writes to d_input. It does so here indirectly through allocNodeSquare.
  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeInput, graph, &allocNodeSquare, 1, d_input, hostArrays->input,
      hostArrays->bytes, cudaMemcpyHostToDevice));

  void *squareKernelArgs[3] = {(void *)&d_input, (void *)&d_square,
                               (void *)&(hostArrays->numElements)};
  kernelNodeParams.func = (void *)squareArray;
  kernelNodeParams.kernelParams = (void **)squareKernelArgs;

  // The square kernel depends on copyNodeInput to ensure all data is on the
  // device before kernel launch.
  checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph,
                                         &copyNodeInput, 1,
                                         &kernelNodeParams));

  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeSquare, graph, &squareKernelNode, 1, hostArrays->square,
      d_square, hostArrays->bytes, cudaMemcpyDeviceToHost));

  // The free of d_input depends on the square kernel to ensure that d_input
  // is not freed while being read by the kernel. It also depends on the
  // alloc of d_input via squareKernelNode > copyNodeInput > allocNodeSquare >
  // allocNodeInput.
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph,
                                          &squareKernelNode, 1, d_input));

  // The allocation of d_negSquare depends on the free of d_input so CUDA can
  // reuse the virtual address.
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph,
                                           &freeNodeInput, 1, &allocParams));
  d_negSquare = (float *)allocParams.dptr;

  if (d_negSquare == d_input) {
    printf(
        "Check verified that d_negSquare and d_input share a virtual "
        "address.\n");
  }

  void *negateKernelArgs[3] = {(void *)&d_square, (void *)&d_negSquare,
                               (void *)&(hostArrays->numElements)};
  kernelNodeParams.func = (void *)negateArray;
  kernelNodeParams.kernelParams = (void **)negateKernelArgs;

  checkCudaErrors(cudaGraphAddKernelNode(
      &negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams));

  nodeDependencies.push_back(copyNodeSquare);
  nodeDependencies.push_back(negateKernelNode);
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeSquare, graph,
                                          nodeDependencies.data(),
                                          nodeDependencies.size(), d_square));
  nodeDependencies.clear();

  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeNegSquare, graph, &negateKernelNode, 1, hostArrays->negSquare,
      d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost));

  if (d_negSquare_out == NULL) {
    cudaGraphNode_t freeNodeNegSquare;
    checkCudaErrors(cudaGraphAddMemFreeNode(
        &freeNodeNegSquare, graph, &copyNodeNegSquare, 1, d_negSquare));
  } else {
    *d_negSquare_out = d_negSquare;
  }

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}
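// Usage sketch for the builder above (illustrative only, mirroring what
// main() does later in this file):
//   cudaGraphExec_t graphExec;
//   createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays);
//   checkCudaErrors(cudaGraphLaunch(graphExec, stream));
//   checkCudaErrors(cudaStreamSynchronize(stream));
//   checkCudaErrors(cudaGraphExecDestroy(graphExec));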
/**
 * Adds work to a CUDA stream which negates the square of values in the input
 * array.
 *
 * If d_negSquare_out is non-null, then:
 * 1) d_negSquare will not be freed;
 * 2) the value of d_negSquare_out will be set to d_negSquare.
 *
 * Diagram of the stream operations in doNegateSquaresInStream:
 * ---------------------------------------------------------------------
 * |             STREAM1                 |           STREAM2            |
 * ---------------------------------------------------------------------
 *
 * alloc d_input
 *      |
 * alloc d_square
 *      |
 * Memcpy input to device
 *      |
 * launch kernel squareArray
 *      |
 * record squareKernelCompleteEvent -->-- wait squareKernelCompleteEvent
 *      |                                      |
 * free d_input                                |
 *      |                                      |
 * allocate d_negSquare                   Memcpy d_square to host
 *      |                                      |
 * launch kernel negateArray                   |
 *      |                                      |
 * record negateKernelCompleteEvent -->-- wait negateKernelCompleteEvent
 *      |                                      |
 * Memcpy d_negSquare to host                  |
 *      |                                 free d_square
 * free d_negSquare                            |
 *      |                                      |
 * wait squareFreeEvent --------------<-- record squareFreeEvent
 */
void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays,
                             float **d_negSquare_out = NULL) {
  float *d_input, *d_square, *d_negSquare;
  cudaStream_t stream2;
  cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent,
      squareFreeEvent;

  checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
  checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent));
  checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent));
  checkCudaErrors(cudaEventCreate(&squareFreeEvent));

  // Virtual addresses are assigned synchronously when cudaMallocAsync is
  // called, so there is no performance benefit to be gained by splitting the
  // allocations across two streams.
  checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1));
  checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1));

  checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input,
                                  hostArrays->bytes, cudaMemcpyHostToDevice,
                                  stream1));

  squareArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
      d_input, d_square, hostArrays->numElements);

  checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1));
  checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0));
  checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square,
                                  hostArrays->bytes, cudaMemcpyDeviceToHost,
                                  stream2));

  checkCudaErrors(cudaFreeAsync(d_input, stream1));
  checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1));
  negateArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
      d_square, d_negSquare, hostArrays->numElements);
  checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1));
  checkCudaErrors(cudaMemcpyAsync(hostArrays->negSquare, d_negSquare,
                                  hostArrays->bytes, cudaMemcpyDeviceToHost,
                                  stream1));

  if (d_negSquare_out == NULL) {
    checkCudaErrors(cudaFreeAsync(d_negSquare, stream1));
  } else {
    *d_negSquare_out = d_negSquare;
  }

  checkCudaErrors(cudaStreamWaitEvent(stream2, negateKernelCompleteEvent, 0));
  checkCudaErrors(cudaFreeAsync(d_square, stream2));
  checkCudaErrors(cudaEventRecord(squareFreeEvent, stream2));
  checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0));

  checkCudaErrors(cudaStreamDestroy(stream2));
  checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent));
  checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent));
  checkCudaErrors(cudaEventDestroy(squareFreeEvent));
}
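// Optional tuning (not part of the original sample): by default, the
// stream-ordered allocator may return memory from freed allocations back to
// the OS at stream synchronization points. Raising the device pool's release
// threshold lets repeated runs reuse the same physical memory, e.g.:
//   cudaMemPool_t memPool;
//   checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, device));
//   uint64_t threshold = UINT64_MAX;
//   checkCudaErrors(cudaMemPoolSetAttribute(
//       memPool, cudaMemPoolAttrReleaseThreshold, &threshold));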
/**
 * Demonstrates creating a CUDA graph including memory nodes using stream
 * capture. createNegateSquaresGraphExplicitly constructs an equivalent graph
 * without stream capture.
 */
void createNegateSquaresGraphWithStreamCapture(cudaGraphExec_t *graphExec,
                                               negSquareArrays *hostArrays,
                                               float **d_negSquare_out = NULL) {
  cudaGraph_t graph;
  cudaStream_t stream;

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  doNegateSquaresInStream(stream, hostArrays, d_negSquare_out);
  checkCudaErrors(cudaStreamEndCapture(stream, &graph));

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));

  checkCudaErrors(cudaStreamDestroy(stream));
  checkCudaErrors(cudaGraphDestroy(graph));
}
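// During capture, the cudaMallocAsync and cudaFreeAsync calls made by
// doNegateSquaresInStream are recorded as graph memory alloc/free nodes, so
// the captured graph is equivalent to the one built explicitly above.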
void prepareRefArrays(negSquareArrays *hostArrays,
                      negSquareArrays *deviceRefArrays,
                      bool **foundValidationFailure) {
  deviceRefArrays->bytes = hostArrays->bytes;
  deviceRefArrays->numElements = hostArrays->numElements;

  for (int i = 0; i < hostArrays->numElements; i++) {
    hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i];
    hostArrays->negSquare[i] = hostArrays->square[i] * -1;
  }

  checkCudaErrors(cudaMalloc((void **)&deviceRefArrays->negSquare,
                             deviceRefArrays->bytes));
  checkCudaErrors(cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare,
                             hostArrays->bytes, cudaMemcpyHostToDevice));

  checkCudaErrors(
      cudaMallocManaged((void **)foundValidationFailure, sizeof(bool)));
  // Initialize the managed flag so the first validation starts clean.
  **foundValidationFailure = false;
}

int checkValidationFailure(bool *foundValidationFailure) {
  if (*foundValidationFailure) {
    printf("Validation FAILURE!\n\n");
    *foundValidationFailure = false;
    return EXIT_FAILURE;
  } else {
    printf("Validation PASSED!\n\n");
    return EXIT_SUCCESS;
  }
}

__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays,
                            bool *foundValidationFailure) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  float ref, diff;

  if (idx < devRefArrays.numElements) {
    ref = devRefArrays.negSquare[idx];
    diff = d_negSquare[idx] - ref;
    diff *= diff;
    ref *= ref;

    if (diff / ref > ALLOWABLE_VARIANCE) {
      *foundValidationFailure = true;
    }
  }
}

void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure) {
  float ref, diff;

  for (int i = 0; i < hostArrays->numElements; i++) {
    ref = hostArrays->input[i] * hostArrays->input[i] * -1;
    diff = hostArrays->negSquare[i] - ref;
    diff *= diff;
    ref *= ref;

    if (diff / ref > ALLOWABLE_VARIANCE) {
      *foundValidationFailure = true;
    }
  }
}
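// Both validators accept a relative error of about 1e-3 per element: they
// compare the squared difference against the squared reference value scaled
// by ALLOWABLE_VARIANCE (1e-6), i.e. (diff/ref)^2 > 1e-6 flags a failure.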
int main(int argc, char **argv) {
  negSquareArrays hostArrays, deviceRefArrays;
  cudaStream_t stream;
  cudaGraphExec_t graphExec, graphExecFreeC;

  // Declare pointers for GPU buffers
  float *d_negSquare = NULL;
  bool *foundValidationFailure = NULL;

  srand(time(0));
  int device = findCudaDevice(argc, (const char **)argv);

  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;
  cudaDriverGetVersion(&driverVersion);
  printf("Driver version is: %d.%d\n", driverVersion / 1000,
         (driverVersion % 100) / 10);
  if (driverVersion < 11040) {
    printf("Waiving execution as driver does not support Graph Memory Nodes\n");
    exit(EXIT_WAIVED);
  }

  cudaDeviceGetAttribute(&deviceSupportsMemoryPools,
                         cudaDevAttrMemoryPoolsSupported, device);
  if (!deviceSupportsMemoryPools) {
    printf("Waiving execution as device does not support Memory Pools\n");
    exit(EXIT_WAIVED);
  } else {
    printf("Setting up sample.\n");
  }

  prepareHostArrays(&hostArrays);
  prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure);
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  printf("Setup complete.\n\n");

  printf("Running negateSquares in a stream.\n");
  doNegateSquaresInStream(stream, &hostArrays);
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf("Validating negateSquares in a stream...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  printf("Running negateSquares in a stream-captured graph.\n");
  createNegateSquaresGraphWithStreamCapture(&graphExec, &hostArrays);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf("Validating negateSquares in a stream-captured graph...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  printf("Running negateSquares in an explicitly constructed graph.\n");
  createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf("Validating negateSquares in an explicitly constructed graph...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  // Each of the three examples below frees d_negSquare outside the graph. As
  // demonstrated by validateGPU, d_negSquare can be accessed from outside the
  // graph before d_negSquare is freed.
  printf("Running negateSquares with d_negSquare freed outside the stream.\n");
  createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays,
                                     &d_negSquare);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
      d_negSquare, deviceRefArrays, foundValidationFailure);
  // Since cudaFree is synchronous, the stream must synchronize before freeing
  // d_negSquare to ensure d_negSquare is no longer being accessed.
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaFree(d_negSquare));
  printf(
      "Validating negateSquares with d_negSquare freed outside the "
      "stream...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  printf("Running negateSquares with d_negSquare freed outside the graph.\n");
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
      d_negSquare, deviceRefArrays, foundValidationFailure);
  checkCudaErrors(cudaFreeAsync(d_negSquare, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf(
      "Validating negateSquares with d_negSquare freed outside the "
      "graph...\n");
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  printf(
      "Running negateSquares with d_negSquare freed in a different graph.\n");
  createFreeGraph(&graphExecFreeC, d_negSquare);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
      d_negSquare, deviceRefArrays, foundValidationFailure);
  checkCudaErrors(cudaGraphLaunch(graphExecFreeC, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf(
      "Validating negateSquares with d_negSquare freed in a different "
      "graph...\n");
  checkValidationFailure(foundValidationFailure);

  printf("Cleaning up sample.\n");
  checkCudaErrors(cudaGraphExecDestroy(graphExec));
  checkCudaErrors(cudaGraphExecDestroy(graphExecFreeC));
  checkCudaErrors(cudaStreamDestroy(stream));
  checkCudaErrors(cudaFree(foundValidationFailure));
  checkCudaErrors(cudaFree(deviceRefArrays.negSquare));
  free(hostArrays.input);
  free(hostArrays.square);
  free(hostArrays.negSquare);
  printf("Cleanup complete. Exiting sample.\n");

  return EXIT_SUCCESS;
}