// cuda-samples/Samples/graphMemoryFootprint/graphMemoryFootprint.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// System includes
#include <assert.h>
#include <stdio.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#define NUM_GRAPHS 8
#define THREADS_PER_BLOCK 512

// Queries one graph-memory attribute of `device` and prints it as the current
// footprint in bytes. Any CUDA error is handled by checkCudaErrors.
//
// NOTE(review): the attribute is selected with the raw value 0 cast to
// cudaGraphMemAttributeType instead of an enumerator name. Confirm which
// enumerator that maps to in the targeted CUDA headers before replacing it.
void printMemoryFootprint(int device) {
  // Graph memory attributes are documented as 64-bit values (cuuint64_t), so
  // query into a type that is 64 bits wide on every supported platform.
  unsigned long long footprint = 0;
  checkCudaErrors(cudaDeviceGetGraphMemAttribute(
      device, (cudaGraphMemAttributeType)0, &footprint));
  // %llu matches unsigned long long on both LP64 and LLP64 targets.
  printf("    FOOTPRINT: %llu bytes\n", footprint);
}

// Fills *allocParams so that it describes a `bytes`-sized allocation located
// on device `device`. Every field that is not explicitly assigned below is
// left zeroed.
void prepareAllocParams(cudaMemAllocNodeParams *allocParams, size_t bytes,
                        int device) {
  // Clear the whole descriptor before populating the fields we care about.
  memset(allocParams, 0, sizeof(cudaMemAllocNodeParams));

  // Where the memory lives: the given device.
  allocParams->poolProps.location.type = cudaMemLocationTypeDevice;
  allocParams->poolProps.location.id = device;

  // What kind of allocation, and how large.
  allocParams->poolProps.allocType = cudaMemAllocationTypePinned;
  allocParams->bytesize = bytes;
}

// Builds a four-node chain (alloc A -> free A -> alloc B -> free B), prints
// whether the two allocation nodes reported the same address, and instantiates
// the graph into *graphExec.
//
// The temporary cudaGraph_t is destroyed before returning; the caller is
// responsible for destroying *graphExec.
void createVirtAddrReuseGraph(cudaGraphExec_t *graphExec, size_t bytes,
                              int device) {
  cudaGraphNode_t allocNodeA, freeNodeA;
  cudaGraphNode_t allocNodeB, freeNodeB;
  cudaMemAllocNodeParams allocParams;
  cudaGraph_t graph;

  prepareAllocParams(&allocParams, bytes, device);
  checkCudaErrors(cudaGraphCreate(&graph, 0));

  // Buffer A: a root allocation node, immediately followed by its free node.
  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));
  float *const d_a = (float *)allocParams.dptr;
  checkCudaErrors(
      cudaGraphAddMemFreeNode(&freeNodeA, graph, &allocNodeA, 1, (void *)d_a));

  // Buffer B is allocated only after A has been freed (it depends on
  // freeNodeA), which is what lets it be handed the same virtual address.
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeB, graph, &freeNodeA, 1,
                                           &allocParams));
  float *const d_b = (float *)allocParams.dptr;

  // Report the outcome of the address comparison.
  const char *verdict =
      (d_a == d_b)
          ? "Check confirms that d_a and d_b share a virtual address.\n"
          : "Check shows that d_a and d_b DO NOT share a virtual address.\n";
  fputs(verdict, stdout);

  checkCudaErrors(
      cudaGraphAddMemFreeNode(&freeNodeB, graph, &allocNodeB, 1, (void *)d_b));

  // The executable graph is independent of the template graph from here on.
  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}

// Example 1: builds the alloc/free/alloc/free graph, runs it once in a
// freshly created non-blocking stream, waits for completion and prints the
// resulting footprint. The graph exec and the stream are destroyed before
// returning.
void virtualAddressReuseSingleGraph(size_t bytes, int device) {
  cudaGraphExec_t graphExec;
  cudaStream_t stream;

  // Section banner and a one-line explanation of what is being shown.
  printf(
      "================================\n"
      "Running virtual address reuse example.\n"
      "Sequential allocations & frees within a single graph enable CUDA to "
      "reuse virtual addresses.\n\n");

  createVirtAddrReuseGraph(&graphExec, bytes, device);
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // Run the graph to completion before sampling the footprint.
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printMemoryFootprint(device);

  // Tear down in the same order as the original: exec first, then stream.
  checkCudaErrors(cudaGraphExecDestroy(graphExec));
  checkCudaErrors(cudaStreamDestroy(stream));
}

// Delay kernel: it takes no pointers and produces no output. Each thread
// keeps polling clock() until the elapsed tick count since its own start is
// at least `clock_count`.
__global__ void clockBlock(clock_t clock_count) {
  const unsigned int start_clock = (unsigned int)clock();

  // The elapsed time is computed as an unsigned 32-bit difference. Unsigned
  // subtraction is modular, so with m = 2^32
  //
  //     now - start == now + m - start   (mod m)
  //
  // and the result stays correct even if the counter wrapped around between
  // the two samples; no explicit "now > start" case split is required.
  for (clock_t elapsed = 0; elapsed < clock_count;) {
    const unsigned int now = (unsigned int)clock();
    elapsed = (clock_t)(now - start_clock);
  }
}

// Builds and instantiates a three-node chain: allocate `bytes` on `device`,
// run the clockBlock delay kernel, then free the allocation.
//
//   graphExec  out: the instantiated graph; the caller must destroy it.
//   dPtr       out: the address reported by the allocation node, returned so
//                   the caller can compare virtual addresses across graphs.
//   bytes      size of the allocation; also determines the kernel grid size.
//   device     device the allocation is placed on and whose clock rate is
//              used to size the delay.
//
// The kernel node exists only to increase the graph's runtime. The temporary
// cudaGraph_t is destroyed before returning.
void createSimpleAllocFreeGraph(cudaGraphExec_t *graphExec, float **dPtr,
                                size_t bytes, int device) {
  cudaGraph_t graph;
  cudaGraphNode_t allocNodeA, freeNodeA, blockDeviceNode;
  cudaMemAllocNodeParams allocParams;
  cudaKernelNodeParams blockDeviceNodeParams = {0};
  const float kernelTime = 5;  // time for each thread to run in microseconds

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  prepareAllocParams(&allocParams, bytes, device);

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));
  *dPtr = (float *)allocParams.dptr;

  // Only two device attributes are needed, so query them individually rather
  // than fetching the complete property structure.
  int clockRateKHz = 0;
  int maxGridDimX = 0;
  checkCudaErrors(
      cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, device));
  checkCudaErrors(
      cudaDeviceGetAttribute(&maxGridDimX, cudaDevAttrMaxGridDimX, device));

  // microseconds / 1000 = milliseconds; milliseconds * kHz = clock ticks.
  clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * clockRateKHz);

  void *blockDeviceArgs[1] = {(void *)&time_clocks};

  // Keep the element count in size_t so large `bytes` values are not
  // truncated, and clamp the grid to [1, maxGridDimX]: a zero-sized grid
  // (bytes smaller than one block of floats) is not a valid launch
  // configuration.
  const size_t numElements = bytes / sizeof(float);
  size_t numBlocks = numElements / (size_t)THREADS_PER_BLOCK;
  if (numBlocks < 1) {
    numBlocks = 1;
  }
  if (maxGridDimX > 0 && numBlocks > (size_t)maxGridDimX) {
    numBlocks = (size_t)maxGridDimX;
  }

  blockDeviceNodeParams.gridDim = dim3((unsigned int)numBlocks, 1, 1);
  blockDeviceNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
  blockDeviceNodeParams.sharedMemBytes = 0;
  blockDeviceNodeParams.extra = NULL;
  blockDeviceNodeParams.func = (void *)clockBlock;
  blockDeviceNodeParams.kernelParams = (void **)blockDeviceArgs;
  checkCudaErrors(cudaGraphAddKernelNode(&blockDeviceNode, graph, &allocNodeA,
                                         1, &blockDeviceNodeParams));

  // The free depends on the kernel, so the allocation stays live while the
  // delay kernel runs.
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeA, graph, &blockDeviceNode,
                                          1, (void *)*dPtr));

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}

// Example 2: creates NUM_GRAPHS alloc -> kernel -> free graphs and launches
// them one after another in a single non-blocking stream, printing the
// footprint after every launch (graph 0 is launched twice).
//
// After the stream has been synchronized, the addresses that the allocation
// nodes reported at graph-build time are compared pairwise. If any two are
// equal, the offending pairs are printed and the process exits with
// EXIT_FAILURE. All graph execs and the stream are destroyed before the
// verdict is reported.
void physicalMemoryReuseSingleStream(size_t bytes, int device) {
  cudaStream_t stream;
  cudaGraphExec_t graphExecs[NUM_GRAPHS];
  float *dPtrs[NUM_GRAPHS];
  bool virtualAddrDiffer = true;

  printf("================================\n");
  printf("Running physical memory reuse example.\n");
  printf(
      "CUDA reuses the same physical memory for allocations from separate "
      "graphs when the allocation lifetimes don't overlap.\n\n");

  for (int i = 0; i < NUM_GRAPHS; i++) {
    createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);
  }

  printf("Creating the graph execs does not reserve any physical memory.\n");
  printMemoryFootprint(device);

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // The footprint is sampled right after each launch call, without waiting
  // for the graph to finish.
  checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream));
  printf("\nThe first graph launched reserves the memory it needs.\n");
  printMemoryFootprint(device);

  checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream));
  printf(
      "A subsequent launch of the same graph in the same stream reuses the "
      "same physical memory. ");
  printf("Thus the memory footprint does not grow here.\n");
  printMemoryFootprint(device);

  printf(
      "\nSubsequent launches of other graphs in the same stream also reuse the "
      "physical memory. ");
  printf("Thus the memory footprint does not grow here.\n");
  // Graph 0 was already launched above, so continue with graph 1.
  for (int i = 1; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream));
    printf("%02d: ", i);
    printMemoryFootprint(device);
  }

  checkCudaErrors(cudaStreamSynchronize(stream));

  // Compare every unordered pair (i, j) of reported addresses and report the
  // pair that actually collided.
  for (int i = 0; i < NUM_GRAPHS; i++) {
    for (int j = i + 1; j < NUM_GRAPHS; j++) {
      if (dPtrs[i] == dPtrs[j]) {
        virtualAddrDiffer = false;
        printf("Error: Graph exec %d and %d have the same virtual address!\n",
               i, j);
      }
    }
    checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));
  }

  // Release the stream before reporting, so the failure path below does not
  // skip it.
  checkCudaErrors(cudaStreamDestroy(stream));

  if (virtualAddrDiffer) {
    printf("\nCheck confirms all graphs use a different virtual address.\n");
  } else {
    printf(
        "\nAll graphs do NOT use different virtual addresses. Exiting test.\n");
    exit(EXIT_FAILURE);
  }
}

// Example 3: creates NUM_GRAPHS alloc -> kernel -> free graphs, each paired
// with its own non-blocking stream, launches every graph in its stream and
// prints the footprint after each launch. Finally each stream is
// synchronized, and the graph execs and streams are destroyed.
void simultaneousStreams(size_t bytes, int device) {
  cudaStream_t streams[NUM_GRAPHS];
  cudaGraphExec_t graphExecs[NUM_GRAPHS];
  // Addresses reported by the allocation nodes; required as an out-parameter
  // of createSimpleAllocFreeGraph but not inspected in this example.
  float *dPtrs[NUM_GRAPHS];

  printf("================================\n");
  printf("Running simultaneous streams example.\n");
  printf("Graphs that can run concurrently need separate physical memory. ");
  printf(
      "In this example, each graph launched in a separate stream increases the "
      "total memory footprint.\n\n");

  printf(
      "When launching a new graph, CUDA may reuse physical memory from a graph "
      "whose execution has already ");
  printf(
      "finished -- even if the new graph is being launched in a different "
      "stream from the completed graph. ");
  printf(
      "Therefore, a kernel node is added to the graphs to increase "
      "runtime.\n\n");

  for (int i = 0; i < NUM_GRAPHS; i++) {
    createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);
    checkCudaErrors(
        cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
  }

  printf("Initial footprint:\n");
  printMemoryFootprint(device);

  printf(
      "\nEach graph launch in a separate stream grows the memory footprint:\n");
  // Launch every graph, including graph 0, so that each created graph/stream
  // pair actually takes part in the demonstration.
  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphLaunch(graphExecs[i], streams[i]));
    printf("%02d: ", i);
    printMemoryFootprint(device);
  }

  // Wait for each stream before destroying the exec that was launched in it.
  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaStreamSynchronize(streams[i]));
    checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));
    checkCudaErrors(cudaStreamDestroy(streams[i]));
  }
}

// Builds and instantiates a graph consisting of a single allocation node of
// `bytes` bytes on `device`; the graph contains no matching free node.
//
//   graphExec  out: the instantiated graph; the caller must destroy it.
//   dPtr       out: the address reported by the allocation node.
//
// The temporary cudaGraph_t is destroyed before returning.
void createSimpleAllocNoFreeGraph(cudaGraphExec_t *graphExec, float **dPtr,
                                  size_t bytes, int device) {
  cudaMemAllocNodeParams allocParams;
  prepareAllocParams(&allocParams, bytes, device);

  cudaGraph_t graph;
  checkCudaErrors(cudaGraphCreate(&graph, 0));

  // The one and only node: a root allocation with no dependencies.
  cudaGraphNode_t allocNodeA;
  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));
  *dPtr = (float *)allocParams.dptr;

  // Turn the template into an executable graph, then drop the template.
  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}

// Example 4: creates NUM_GRAPHS graphs that allocate without freeing and
// launches them all in one non-blocking stream, printing the footprint after
// each launch. After synchronizing it then prints the footprint at three
// further points: after a trim, after cudaFree-ing every allocation, and
// after a second trim. The graph execs and the stream are destroyed last.
void unfreedAllocations(size_t bytes, int device) {
  cudaStream_t stream;
  cudaGraphExec_t graphExecs[NUM_GRAPHS];
  float *dPtrs[NUM_GRAPHS];

  printf("================================\n");
  printf("Running unfreed allocations example.\n");
  printf(
      "CUDA cannot reuse physical memory from graphs which do not free their "
      "allocations.\n\n");

  for (int i = 0; i < NUM_GRAPHS; i++) {
    createSimpleAllocNoFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);
  }

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  printf(
      "Despite being launched in the same stream, each graph launch grows the "
      "memory footprint. ");
  printf(
      "Since the allocation is not freed, CUDA keeps the memory valid for "
      "use.\n");
  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream));
    printf("%02d: ", i);
    printMemoryFootprint(device);
  }

  checkCudaErrors(cudaStreamSynchronize(stream));

  // First trim: issued while every allocation is still outstanding.
  checkCudaErrors(cudaDeviceGraphMemTrim(device));
  printf(
      "\nTrimming does not impact the memory footprint since the un-freed "
      "allocations are still holding onto the memory.\n");
  printMemoryFootprint(device);

  // Release the graph-made allocations from the host, using the addresses
  // recorded when the graphs were built.
  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaFree(dPtrs[i]));
  }
  printf("\nFreeing the allocations does not shrink the footprint.\n");
  printMemoryFootprint(device);

  // Second trim: issued after the allocations were freed, but before the
  // graph execs are destroyed.
  checkCudaErrors(cudaDeviceGraphMemTrim(device));
  printf(
      "\nSince the allocations are now freed, trimming does reduce the "
      "footprint even when the graph execs are not yet destroyed.\n");
  printMemoryFootprint(device);

  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));
  }
  checkCudaErrors(cudaStreamDestroy(stream));
}

// Runs between examples: trims the graph memory of `device`, then prints a
// notice, the resulting footprint and a blank separator line.
void cleanupMemory(int device) {
  // The trim happens before anything is printed.
  checkCudaErrors(cudaDeviceGraphMemTrim(device));

  fputs("\nCleaning up example by trimming device memory.\n", stdout);
  printMemoryFootprint(device);
  fputs("\n", stdout);
}

// Entry point. Selects a device, waives the run (EXIT_WAIVED) when the driver
// version is below 11040 or the device reports no memory-pool support, and
// otherwise runs the four examples in sequence, trimming graph memory after
// each one. Every example uses a 64 MiB allocation size.
int main(int argc, char **argv) {
  size_t bytes = 64 * 1024 * 1024;
  int device = findCudaDevice(argc, (const char **)argv);

  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;

  // Check the query itself, so a failed call is reported as an error rather
  // than being mistaken for "driver version 0".
  checkCudaErrors(cudaDriverGetVersion(&driverVersion));
  printf("Driver version is: %d.%d\n", driverVersion / 1000,
         (driverVersion % 100) / 10);

  if (driverVersion < 11040) {
    printf("Waiving execution as driver does not support Graph Memory Nodes\n");
    exit(EXIT_WAIVED);
  }

  // Likewise, a failed attribute query must not be read as "unsupported".
  checkCudaErrors(cudaDeviceGetAttribute(
      &deviceSupportsMemoryPools, cudaDevAttrMemoryPoolsSupported, device));
  if (!deviceSupportsMemoryPools) {
    printf("Waiving execution as device does not support Memory Pools\n");
    exit(EXIT_WAIVED);
  }
  printf("Running sample.\n");

  virtualAddressReuseSingleGraph(bytes, device);
  cleanupMemory(device);

  physicalMemoryReuseSingleStream(bytes, device);
  cleanupMemory(device);

  simultaneousStreams(bytes, device);
  cleanupMemory(device);

  unfreedAllocations(bytes, device);
  cleanupMemory(device);

  printf("================================\n");
  printf("Sample complete.\n");
  return EXIT_SUCCESS;
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`// System includes`
			`#include <assert.h>`
			`#include <stdio.h>`

			`// helper functions and utilities to work with CUDA`
			`#include <helper_cuda.h>`
			`#include <helper_functions.h>`

			`#define NUM_GRAPHS 8`
			`#define THREADS_PER_BLOCK 512`

			`void printMemoryFootprint(int device) {`
			`size_t footprint;`
			`checkCudaErrors(cudaDeviceGetGraphMemAttribute(`
			`device, (cudaGraphMemAttributeType)0, &footprint));`
			`printf(" FOOTPRINT: %lu bytes\n", footprint);`
			`}`

			`void prepareAllocParams(cudaMemAllocNodeParams *allocParams, size_t bytes,`
			`int device) {`
			`memset(allocParams, 0, sizeof(*allocParams));`

			`allocParams->bytesize = bytes;`
			`allocParams->poolProps.allocType = cudaMemAllocationTypePinned;`
			`allocParams->poolProps.location.id = device;`
			`allocParams->poolProps.location.type = cudaMemLocationTypeDevice;`
			`}`

			`void createVirtAddrReuseGraph(cudaGraphExec_t *graphExec, size_t bytes,`
			`int device) {`
			`cudaGraph_t graph;`
			`cudaGraphNode_t allocNodeA, allocNodeB, freeNodeA, freeNodeB;`
			`cudaMemAllocNodeParams allocParams;`
			`float d_a, d_b;`

			`checkCudaErrors(cudaGraphCreate(&graph, 0));`
			`prepareAllocParams(&allocParams, bytes, device);`

			`checkCudaErrors(`
			`cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));`
			`d_a = (float *)allocParams.dptr;`
			`checkCudaErrors(`
			`cudaGraphAddMemFreeNode(&freeNodeA, graph, &allocNodeA, 1, (void *)d_a));`

			`// The dependency between the allocation of d_b and the free of d_a allows d_b`
			`// to reuse the same VA.`
			`checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeB, graph, &freeNodeA, 1,`
			`&allocParams));`
			`d_b = (float *)allocParams.dptr;`

			`if (d_a == d_b) {`
			`printf("Check confirms that d_a and d_b share a virtual address.\n");`
			`} else {`
			`printf("Check shows that d_a and d_b DO NOT share a virtual address.\n");`
			`}`

			`checkCudaErrors(`
			`cudaGraphAddMemFreeNode(&freeNodeB, graph, &allocNodeB, 1, (void *)d_b));`

			`checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));`
			`checkCudaErrors(cudaGraphDestroy(graph));`
			`}`

			`void virtualAddressReuseSingleGraph(size_t bytes, int device) {`
			`cudaStream_t stream;`
			`cudaGraphExec_t graphExec;`

			`printf("================================\n");`
			`printf("Running virtual address reuse example.\n");`
			`printf(`
			`"Sequential allocations & frees within a single graph enable CUDA to "`
			`"reuse virtual addresses.\n\n");`

			`createVirtAddrReuseGraph(&graphExec, bytes, device);`
			`checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));`

			`checkCudaErrors(cudaGraphLaunch(graphExec, stream));`
			`checkCudaErrors(cudaStreamSynchronize(stream));`
			`printMemoryFootprint(device);`

			`checkCudaErrors(cudaGraphExecDestroy(graphExec));`
			`checkCudaErrors(cudaStreamDestroy(stream));`
			`}`

			`// This is a kernel that does no real work but runs at least for a specified`
			`// number of clocks`
			`__global__ void clockBlock(clock_t clock_count) {`
			`unsigned int start_clock = (unsigned int)clock();`

			`clock_t clock_offset = 0;`

			`while (clock_offset < clock_count) {`
			`unsigned int end_clock = (unsigned int)clock();`

			`// The code below should work like`
			`// this (thanks to modular arithmetics):`
			`//`
			`// clock_offset = (clock_t) (end_clock > start_clock ?`
			`// end_clock - start_clock :`
			`// end_clock + (0xffffffffu - start_clock));`
			`//`
			`// Indeed, let m = 2^32 then`
			`// end - start = end + m - start (mod m).`

			`clock_offset = (clock_t)(end_clock - start_clock);`
			`}`
			`}`

			`// A pointer to the allocated device buffer is returned in dPtr so the caller`
			`// can compare virtual addresses. The kernel node is added to increase the`
			`// graph's runtime.`
			`void createSimpleAllocFreeGraph(cudaGraphExec_t graphExec, float *dPtr,`
			`size_t bytes, int device) {`
			`cudaGraph_t graph;`
			`cudaGraphNode_t allocNodeA, freeNodeA, blockDeviceNode;`
			`cudaMemAllocNodeParams allocParams;`
			`cudaKernelNodeParams blockDeviceNodeParams = {0};`
			`int numElements = bytes / sizeof(float);`
			`float kernelTime = 5; // time for each thread to run in microseconds`

			`checkCudaErrors(cudaGraphCreate(&graph, 0));`
			`prepareAllocParams(&allocParams, bytes, device);`

			`checkCudaErrors(`
			`cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));`
			`dPtr = (float )allocParams.dptr;`

			`cudaDeviceProp deviceProp;`
			`checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));`
			`clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * deviceProp.clockRate);`

			`void blockDeviceArgs[1] = {(void )&time_clocks};`

			`size_t numBlocks = numElements / (size_t)THREADS_PER_BLOCK;`
			`blockDeviceNodeParams.gridDim = dim3(numBlocks, 1, 1);`
			`blockDeviceNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);`
			`blockDeviceNodeParams.sharedMemBytes = 0;`
			`blockDeviceNodeParams.extra = NULL;`
			`blockDeviceNodeParams.func = (void *)clockBlock;`
			`blockDeviceNodeParams.kernelParams = (void **)blockDeviceArgs;`
			`checkCudaErrors(cudaGraphAddKernelNode(&blockDeviceNode, graph, &allocNodeA,`
			`1, &blockDeviceNodeParams));`

			`checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeA, graph, &blockDeviceNode,`
			`1, (void )dPtr));`

			`checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));`
			`checkCudaErrors(cudaGraphDestroy(graph));`
			`}`

			`void physicalMemoryReuseSingleStream(size_t bytes, int device) {`
			`cudaStream_t stream;`
			`cudaGraphExec_t graphExecs[NUM_GRAPHS];`
			`float *dPtrs[NUM_GRAPHS];`
			`bool virtualAddrDiffer = true;`

			`printf("================================\n");`
			`printf("Running physical memory reuse example.\n");`
			`printf(`
			`"CUDA reuses the same physical memory for allocations from separate "`
			`"graphs when the allocation lifetimes don't overlap.\n\n");`

			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);`
			`}`

			`printf("Creating the graph execs does not reserve any physical memory.\n");`
			`printMemoryFootprint(device);`

			`checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));`

			`checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream));`
			`printf("\nThe first graph launched reserves the memory it needs.\n");`
			`printMemoryFootprint(device);`

			`checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream));`
			`printf(`
			`"A subsequent launch of the same graph in the same stream reuses the "`
			`"same physical memory. ");`
			`printf("Thus the memory footprint does not grow here.\n");`
			`printMemoryFootprint(device);`

			`printf(`
			`"\nSubsequent launches of other graphs in the same stream also reuse the "`
			`"physical memory. ");`
			`printf("Thus the memory footprint does not grow here.\n");`
			`for (int i = 1; i < NUM_GRAPHS; i++) {`
			`checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream));`
			`printf("%02d: ", i);`
			`printMemoryFootprint(device);`
			`}`

			`checkCudaErrors(cudaStreamSynchronize(stream));`

			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`for (int j = i + 1; j < NUM_GRAPHS; j++) {`
			`if (dPtrs[i] == dPtrs[j]) {`
			`virtualAddrDiffer = false;`
			`printf("Error: Graph exec %d and %d have the same virtual address!\n",`
			`i - 1, i);`
			`}`
			`}`
			`checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));`
			`}`
			`if (virtualAddrDiffer) {`
			`printf("\nCheck confirms all graphs use a different virtual address.\n");`
			`} else {`
			`printf(`
			`"\nAll graphs do NOT use different virtual addresses. Exiting test.\n");`
			`exit(EXIT_FAILURE);`
			`}`

			`checkCudaErrors(cudaStreamDestroy(stream));`
			`}`

			`void simultaneousStreams(size_t bytes, int device) {`
			`cudaStream_t streams[NUM_GRAPHS];`
			`cudaGraphExec_t graphExecs[NUM_GRAPHS];`
			`float *dPtrs[NUM_GRAPHS];`

			`printf("================================\n");`
			`printf("Running simultaneous streams example.\n");`
			`printf("Graphs that can run concurrently need separate physical memory. ");`
			`printf(`
			`"In this example, each graph launched in a separate stream increases the "`
			`"total memory footprint.\n\n");`

			`printf(`
			`"When launching a new graph, CUDA may reuse physical memory from a graph "`
			`"whose execution has already ");`
			`printf(`
			`"finished -- even if the new graph is being launched in a different "`
			`"stream from the completed graph. ");`
			`printf(`
			`"Therefore, a kernel node is added to the graphs to increase "`
			`"runtime.\n\n");`

			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);`
			`checkCudaErrors(`
			`cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));`
			`}`

			`printf("Initial footprint:\n");`
			`printMemoryFootprint(device);`

			`printf(`
			`"\nEach graph launch in a seperate stream grows the memory footprint:\n");`
			`for (int i = 1; i < NUM_GRAPHS; i++) {`
			`checkCudaErrors(cudaGraphLaunch(graphExecs[i], streams[i]));`
			`printf("%02d: ", i);`
			`printMemoryFootprint(device);`
			`}`

			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`checkCudaErrors(cudaStreamSynchronize(streams[i]));`
			`checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));`
			`checkCudaErrors(cudaStreamDestroy(streams[i]));`
			`}`
			`}`

			`void createSimpleAllocNoFreeGraph(cudaGraphExec_t graphExec, float *dPtr,`
			`size_t bytes, int device) {`
			`cudaGraph_t graph;`
			`cudaGraphNode_t allocNodeA;`
			`cudaMemAllocNodeParams allocParams;`

			`checkCudaErrors(cudaGraphCreate(&graph, 0));`
			`prepareAllocParams(&allocParams, bytes, device);`

			`checkCudaErrors(`
			`cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));`
			`dPtr = (float )allocParams.dptr;`

			`checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));`
			`checkCudaErrors(cudaGraphDestroy(graph));`
			`}`

			`void unfreedAllocations(size_t bytes, int device) {`
			`cudaStream_t stream;`
			`cudaGraphExec_t graphExecs[NUM_GRAPHS];`
			`float *dPtrs[NUM_GRAPHS];`

			`printf("================================\n");`
			`printf("Running unfreed streams example.\n");`
			`printf(`
			`"CUDA cannot reuse phyiscal memory from graphs which do not free their "`
			`"allocations.\n\n");`

			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`createSimpleAllocNoFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);`
			`}`

			`checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));`

			`printf(`
			`"Despite being launched in the same stream, each graph launch grows the "`
			`"memory footprint. ");`
			`printf(`
			`"Since the allocation is not freed, CUDA keeps the memory valid for "`
			`"use.\n");`
			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream));`
			`printf("%02d: ", i);`
			`printMemoryFootprint(device);`
			`}`

			`checkCudaErrors(cudaStreamSynchronize(stream));`

			`checkCudaErrors(cudaDeviceGraphMemTrim(device));`
			`printf(`
			`"\nTrimming does not impact the memory footprint since the un-freed "`
			`"allocations are still holding onto the memory.\n");`
			`printMemoryFootprint(device);`

			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`checkCudaErrors(cudaFree(dPtrs[i]));`
			`}`
			`printf("\nFreeing the allocations does not shrink the footprint.\n");`
			`printMemoryFootprint(device);`

			`checkCudaErrors(cudaDeviceGraphMemTrim(device));`
			`printf(`
			`"\nSince the allocations are now freed, trimming does reduce the "`
			`"footprint even when the graph execs are not yet destroyed.\n");`
			`printMemoryFootprint(device);`

			`for (int i = 0; i < NUM_GRAPHS; i++) {`
			`checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));`
			`}`
			`checkCudaErrors(cudaStreamDestroy(stream));`
			`}`

			`void cleanupMemory(int device) {`
			`checkCudaErrors(cudaDeviceGraphMemTrim(device));`
			`printf("\nCleaning up example by trimming device memory.\n");`
			`printMemoryFootprint(device);`
			`printf("\n");`
			`}`

			`int main(int argc, char **argv) {`
			`size_t bytes = 64 * 1024 * 1024;`
			`int device = findCudaDevice(argc, (const char **)argv);`

			`int driverVersion = 0;`
			`int deviceSupportsMemoryPools = 0;`

			`cudaDriverGetVersion(&driverVersion);`
			`printf("Driver version is: %d.%d\n", driverVersion / 1000,`
			`(driverVersion % 100) / 10);`

			`if (driverVersion < 11040) {`
			`printf("Waiving execution as driver does not support Graph Memory Nodes\n");`
			`exit(EXIT_WAIVED);`
			`}`

			`cudaDeviceGetAttribute(&deviceSupportsMemoryPools,`
			`cudaDevAttrMemoryPoolsSupported, device);`
			`if (!deviceSupportsMemoryPools) {`
			`printf("Waiving execution as device does not support Memory Pools\n");`
			`exit(EXIT_WAIVED);`
			`} else {`
			`printf("Running sample.\n");`
			`}`

			`virtualAddressReuseSingleGraph(bytes, device);`
			`cleanupMemory(device);`

			`physicalMemoryReuseSingleStream(bytes, device);`
			`cleanupMemory(device);`

			`simultaneousStreams(bytes, device);`
			`cleanupMemory(device);`

			`unfreedAllocations(bytes, device);`
			`cleanupMemory(device);`

			`printf("================================\n");`
			`printf("Sample complete.\n");`
			`}`