/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // System includes #include #include // helper functions and utilities to work with CUDA #include #include #define NUM_GRAPHS 8 #define THREADS_PER_BLOCK 512 void printMemoryFootprint(int device) { size_t footprint; checkCudaErrors(cudaDeviceGetGraphMemAttribute( device, (cudaGraphMemAttributeType)0, &footprint)); printf(" FOOTPRINT: %lu bytes\n", footprint); } void prepareAllocParams(cudaMemAllocNodeParams *allocParams, size_t bytes, int device) { memset(allocParams, 0, sizeof(*allocParams)); allocParams->bytesize = bytes; allocParams->poolProps.allocType = cudaMemAllocationTypePinned; allocParams->poolProps.location.id = device; allocParams->poolProps.location.type = cudaMemLocationTypeDevice; } void createVirtAddrReuseGraph(cudaGraphExec_t *graphExec, size_t bytes, int device) { cudaGraph_t graph; cudaGraphNode_t allocNodeA, allocNodeB, freeNodeA, freeNodeB; cudaMemAllocNodeParams allocParams; float *d_a, *d_b; checkCudaErrors(cudaGraphCreate(&graph, 0)); prepareAllocParams(&allocParams, bytes, device); checkCudaErrors( cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); d_a = (float *)allocParams.dptr; checkCudaErrors( cudaGraphAddMemFreeNode(&freeNodeA, graph, &allocNodeA, 1, (void *)d_a)); // The dependency between the allocation of d_b and the free of d_a allows d_b // to reuse the same VA. checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeB, graph, &freeNodeA, 1, &allocParams)); d_b = (float *)allocParams.dptr; if (d_a == d_b) { printf("Check confirms that d_a and d_b share a virtual address.\n"); } else { printf("Check shows that d_a and d_b DO NOT share a virtual address.\n"); } checkCudaErrors( cudaGraphAddMemFreeNode(&freeNodeB, graph, &allocNodeB, 1, (void *)d_b)); checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); checkCudaErrors(cudaGraphDestroy(graph)); } void virtualAddressReuseSingleGraph(size_t bytes, int device) { cudaStream_t stream; cudaGraphExec_t graphExec; printf("================================\n"); printf("Running virtual address reuse example.\n"); printf( "Sequential allocations & frees within a single graph enable CUDA to " "reuse virtual addresses.\n\n"); createVirtAddrReuseGraph(&graphExec, bytes, device); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaGraphLaunch(graphExec, stream)); checkCudaErrors(cudaStreamSynchronize(stream)); printMemoryFootprint(device); checkCudaErrors(cudaGraphExecDestroy(graphExec)); checkCudaErrors(cudaStreamDestroy(stream)); } // This is a kernel that does no real work but runs at least for a specified // number of clocks __global__ void clockBlock(clock_t clock_count) { unsigned int start_clock = (unsigned int)clock(); clock_t clock_offset = 0; while (clock_offset < clock_count) { unsigned int end_clock = (unsigned int)clock(); // The code below should work like // this (thanks to modular arithmetics): // // clock_offset = (clock_t) (end_clock > start_clock ? // end_clock - start_clock : // end_clock + (0xffffffffu - start_clock)); // // Indeed, let m = 2^32 then // end - start = end + m - start (mod m). clock_offset = (clock_t)(end_clock - start_clock); } } // A pointer to the allocated device buffer is returned in dPtr so the caller // can compare virtual addresses. The kernel node is added to increase the // graph's runtime. void createSimpleAllocFreeGraph(cudaGraphExec_t *graphExec, float **dPtr, size_t bytes, int device) { cudaGraph_t graph; cudaGraphNode_t allocNodeA, freeNodeA, blockDeviceNode; cudaMemAllocNodeParams allocParams; cudaKernelNodeParams blockDeviceNodeParams = {0}; int numElements = bytes / sizeof(float); float kernelTime = 5; // time for each thread to run in microseconds checkCudaErrors(cudaGraphCreate(&graph, 0)); prepareAllocParams(&allocParams, bytes, device); checkCudaErrors( cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); *dPtr = (float *)allocParams.dptr; cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * deviceProp.clockRate); void *blockDeviceArgs[1] = {(void *)&time_clocks}; size_t numBlocks = numElements / (size_t)THREADS_PER_BLOCK; blockDeviceNodeParams.gridDim = dim3(numBlocks, 1, 1); blockDeviceNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); blockDeviceNodeParams.sharedMemBytes = 0; blockDeviceNodeParams.extra = NULL; blockDeviceNodeParams.func = (void *)clockBlock; blockDeviceNodeParams.kernelParams = (void **)blockDeviceArgs; checkCudaErrors(cudaGraphAddKernelNode(&blockDeviceNode, graph, &allocNodeA, 1, &blockDeviceNodeParams)); checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeA, graph, &blockDeviceNode, 1, (void *)*dPtr)); checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); checkCudaErrors(cudaGraphDestroy(graph)); } void physicalMemoryReuseSingleStream(size_t bytes, int device) { cudaStream_t stream; cudaGraphExec_t graphExecs[NUM_GRAPHS]; float *dPtrs[NUM_GRAPHS]; bool virtualAddrDiffer = true; printf("================================\n"); printf("Running physical memory reuse example.\n"); printf( "CUDA reuses the same physical memory for allocations from separate " "graphs when the allocation lifetimes don't overlap.\n\n"); for (int i = 0; i < NUM_GRAPHS; i++) { createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); } printf("Creating the graph execs does not reserve any physical memory.\n"); printMemoryFootprint(device); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream)); printf("\nThe first graph launched reserves the memory it needs.\n"); printMemoryFootprint(device); checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream)); printf( "A subsequent launch of the same graph in the same stream reuses the " "same physical memory. "); printf("Thus the memory footprint does not grow here.\n"); printMemoryFootprint(device); printf( "\nSubsequent launches of other graphs in the same stream also reuse the " "physical memory. "); printf("Thus the memory footprint does not grow here.\n"); for (int i = 1; i < NUM_GRAPHS; i++) { checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream)); printf("%02d: ", i); printMemoryFootprint(device); } checkCudaErrors(cudaStreamSynchronize(stream)); for (int i = 0; i < NUM_GRAPHS; i++) { for (int j = i + 1; j < NUM_GRAPHS; j++) { if (dPtrs[i] == dPtrs[j]) { virtualAddrDiffer = false; printf("Error: Graph exec %d and %d have the same virtual address!\n", i - 1, i); } } checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); } if (virtualAddrDiffer) { printf("\nCheck confirms all graphs use a different virtual address.\n"); } else { printf( "\nAll graphs do NOT use different virtual addresses. Exiting test.\n"); exit(EXIT_FAILURE); } checkCudaErrors(cudaStreamDestroy(stream)); } void simultaneousStreams(size_t bytes, int device) { cudaStream_t streams[NUM_GRAPHS]; cudaGraphExec_t graphExecs[NUM_GRAPHS]; float *dPtrs[NUM_GRAPHS]; printf("================================\n"); printf("Running simultaneous streams example.\n"); printf("Graphs that can run concurrently need separate physical memory. "); printf( "In this example, each graph launched in a separate stream increases the " "total memory footprint.\n\n"); printf( "When launching a new graph, CUDA may reuse physical memory from a graph " "whose execution has already "); printf( "finished -- even if the new graph is being launched in a different " "stream from the completed graph. "); printf( "Therefore, a kernel node is added to the graphs to increase " "runtime.\n\n"); for (int i = 0; i < NUM_GRAPHS; i++) { createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); checkCudaErrors( cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking)); } printf("Initial footprint:\n"); printMemoryFootprint(device); printf( "\nEach graph launch in a seperate stream grows the memory footprint:\n"); for (int i = 1; i < NUM_GRAPHS; i++) { checkCudaErrors(cudaGraphLaunch(graphExecs[i], streams[i])); printf("%02d: ", i); printMemoryFootprint(device); } for (int i = 0; i < NUM_GRAPHS; i++) { checkCudaErrors(cudaStreamSynchronize(streams[i])); checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); checkCudaErrors(cudaStreamDestroy(streams[i])); } } void createSimpleAllocNoFreeGraph(cudaGraphExec_t *graphExec, float **dPtr, size_t bytes, int device) { cudaGraph_t graph; cudaGraphNode_t allocNodeA; cudaMemAllocNodeParams allocParams; checkCudaErrors(cudaGraphCreate(&graph, 0)); prepareAllocParams(&allocParams, bytes, device); checkCudaErrors( cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); *dPtr = (float *)allocParams.dptr; checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); checkCudaErrors(cudaGraphDestroy(graph)); } void unfreedAllocations(size_t bytes, int device) { cudaStream_t stream; cudaGraphExec_t graphExecs[NUM_GRAPHS]; float *dPtrs[NUM_GRAPHS]; printf("================================\n"); printf("Running unfreed streams example.\n"); printf( "CUDA cannot reuse phyiscal memory from graphs which do not free their " "allocations.\n\n"); for (int i = 0; i < NUM_GRAPHS; i++) { createSimpleAllocNoFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); } checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); printf( "Despite being launched in the same stream, each graph launch grows the " "memory footprint. "); printf( "Since the allocation is not freed, CUDA keeps the memory valid for " "use.\n"); for (int i = 0; i < NUM_GRAPHS; i++) { checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream)); printf("%02d: ", i); printMemoryFootprint(device); } checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaDeviceGraphMemTrim(device)); printf( "\nTrimming does not impact the memory footprint since the un-freed " "allocations are still holding onto the memory.\n"); printMemoryFootprint(device); for (int i = 0; i < NUM_GRAPHS; i++) { checkCudaErrors(cudaFree(dPtrs[i])); } printf("\nFreeing the allocations does not shrink the footprint.\n"); printMemoryFootprint(device); checkCudaErrors(cudaDeviceGraphMemTrim(device)); printf( "\nSince the allocations are now freed, trimming does reduce the " "footprint even when the graph execs are not yet destroyed.\n"); printMemoryFootprint(device); for (int i = 0; i < NUM_GRAPHS; i++) { checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); } checkCudaErrors(cudaStreamDestroy(stream)); } void cleanupMemory(int device) { checkCudaErrors(cudaDeviceGraphMemTrim(device)); printf("\nCleaning up example by trimming device memory.\n"); printMemoryFootprint(device); printf("\n"); } int main(int argc, char **argv) { size_t bytes = 64 * 1024 * 1024; int device = findCudaDevice(argc, (const char **)argv); int driverVersion = 0; int deviceSupportsMemoryPools = 0; cudaDriverGetVersion(&driverVersion); printf("Driver version is: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); if (driverVersion < 11040) { printf("Waiving execution as driver does not support Graph Memory Nodes\n"); exit(EXIT_WAIVED); } cudaDeviceGetAttribute(&deviceSupportsMemoryPools, cudaDevAttrMemoryPoolsSupported, device); if (!deviceSupportsMemoryPools) { printf("Waiving execution as device does not support Memory Pools\n"); exit(EXIT_WAIVED); } else { printf("Running sample.\n"); } virtualAddressReuseSingleGraph(bytes, device); cleanupMemory(device); physicalMemoryReuseSingleStream(bytes, device); cleanupMemory(device); simultaneousStreams(bytes, device); cleanupMemory(device); unfreedAllocations(bytes, device); cleanupMemory(device); printf("================================\n"); printf("Sample complete.\n"); }