/* Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
// System includes
#include <assert.h>
#include <stdio.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#define NUM_GRAPHS 8
#define THREADS_PER_BLOCK 512
void printMemoryFootprint(int device) {
  size_t footprint;

  checkCudaErrors(cudaDeviceGetGraphMemAttribute(
      device, cudaGraphMemAttrUsedMemCurrent, &footprint));
  printf("    FOOTPRINT: %zu bytes\n", footprint);
}
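
// A minimal sketch, not part of the original sample: besides
// cudaGraphMemAttrUsedMemCurrent, the runtime exposes high-water-mark and
// reservation counters for graph memory. The helper name is hypothetical;
// the attribute enums are standard CUDA runtime values.
void printFullMemoryFootprint(int device) {
  size_t usedCur = 0, usedHigh = 0, reservedCur = 0, reservedHigh = 0;

  checkCudaErrors(cudaDeviceGetGraphMemAttribute(
      device, cudaGraphMemAttrUsedMemCurrent, &usedCur));
  checkCudaErrors(cudaDeviceGetGraphMemAttribute(
      device, cudaGraphMemAttrUsedMemHigh, &usedHigh));
  checkCudaErrors(cudaDeviceGetGraphMemAttribute(
      device, cudaGraphMemAttrReservedMemCurrent, &reservedCur));
  checkCudaErrors(cudaDeviceGetGraphMemAttribute(
      device, cudaGraphMemAttrReservedMemHigh, &reservedHigh));

  printf("    used: %zu (high: %zu), reserved: %zu (high: %zu) bytes\n",
         usedCur, usedHigh, reservedCur, reservedHigh);
}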
void prepareAllocParams(cudaMemAllocNodeParams *allocParams, size_t bytes,
                        int device) {
  memset(allocParams, 0, sizeof(*allocParams));

  allocParams->bytesize = bytes;
  allocParams->poolProps.allocType = cudaMemAllocationTypePinned;
  allocParams->poolProps.location.id = device;
  allocParams->poolProps.location.type = cudaMemLocationTypeDevice;
}
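
// A minimal sketch, not part of the original sample: the same alloc/free
// graph topology can also be produced via stream capture, where captured
// cudaMallocAsync/cudaFreeAsync calls become memory alloc/free nodes. The
// function name and the 1 MiB size are illustrative assumptions.
void captureAllocFreeGraph(cudaGraph_t *graph, cudaStream_t stream) {
  float *d_tmp = NULL;

  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  checkCudaErrors(cudaMallocAsync((void **)&d_tmp, 1 << 20, stream));
  checkCudaErrors(cudaFreeAsync(d_tmp, stream));
  checkCudaErrors(cudaStreamEndCapture(stream, graph));
}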
void createVirtAddrReuseGraph(cudaGraphExec_t *graphExec, size_t bytes,
                              int device) {
  cudaGraph_t graph;
  cudaGraphNode_t allocNodeA, allocNodeB, freeNodeA, freeNodeB;
  cudaMemAllocNodeParams allocParams;
  float *d_a, *d_b;

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  prepareAllocParams(&allocParams, bytes, device);

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));
  d_a = (float *)allocParams.dptr;
  checkCudaErrors(
      cudaGraphAddMemFreeNode(&freeNodeA, graph, &allocNodeA, 1, (void *)d_a));

  // The dependency between the allocation of d_b and the free of d_a allows
  // d_b to reuse the same VA.
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeB, graph, &freeNodeA, 1,
                                           &allocParams));
  d_b = (float *)allocParams.dptr;

  if (d_a == d_b) {
    printf("Check confirms that d_a and d_b share a virtual address.\n");
  } else {
    printf("Check shows that d_a and d_b DO NOT share a virtual address.\n");
  }

  checkCudaErrors(
      cudaGraphAddMemFreeNode(&freeNodeB, graph, &allocNodeB, 1, (void *)d_b));

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}
void virtualAddressReuseSingleGraph(size_t bytes, int device) {
  cudaStream_t stream;
  cudaGraphExec_t graphExec;

  printf("================================\n");
  printf("Running virtual address reuse example.\n");
  printf(
      "Sequential allocations & frees within a single graph enable CUDA to "
      "reuse virtual addresses.\n\n");

  createVirtAddrReuseGraph(&graphExec, bytes, device);
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printMemoryFootprint(device);

  checkCudaErrors(cudaGraphExecDestroy(graphExec));
  checkCudaErrors(cudaStreamDestroy(stream));
}
// This is a kernel that does no real work but runs at least for a specified
// number of clocks.
__global__ void clockBlock(clock_t clock_count) {
  unsigned int start_clock = (unsigned int)clock();

  clock_t clock_offset = 0;

  while (clock_offset < clock_count) {
    unsigned int end_clock = (unsigned int)clock();

    // The code below should work like
    // this (thanks to modular arithmetic):
    //
    // clock_offset = (clock_t)(end_clock > start_clock ?
    //                              end_clock - start_clock :
    //                              end_clock + (0xffffffffu - start_clock));
    //
    // Indeed, let m = 2^32 then
    // end - start = end + m - start (mod m).

    clock_offset = (clock_t)(end_clock - start_clock);
  }
}
// A pointer to the allocated device buffer is returned in dPtr so the caller
// can compare virtual addresses. The kernel node is added to increase the
// graph's runtime.
void createSimpleAllocFreeGraph(cudaGraphExec_t *graphExec, float **dPtr,
                                size_t bytes, int device) {
  cudaGraph_t graph;
  cudaGraphNode_t allocNodeA, freeNodeA, blockDeviceNode;
  cudaMemAllocNodeParams allocParams;
  cudaKernelNodeParams blockDeviceNodeParams = {0};
  int numElements = bytes / sizeof(float);
  float kernelTime = 5;  // time for each thread to run in microseconds

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  prepareAllocParams(&allocParams, bytes, device);

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));
  *dPtr = (float *)allocParams.dptr;

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));
  clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * deviceProp.clockRate);

  void *blockDeviceArgs[1] = {(void *)&time_clocks};

  size_t numBlocks = numElements / (size_t)THREADS_PER_BLOCK;
  blockDeviceNodeParams.gridDim = dim3(numBlocks, 1, 1);
  blockDeviceNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
  blockDeviceNodeParams.sharedMemBytes = 0;
  blockDeviceNodeParams.extra = NULL;
  blockDeviceNodeParams.func = (void *)clockBlock;
  blockDeviceNodeParams.kernelParams = (void **)blockDeviceArgs;
  checkCudaErrors(cudaGraphAddKernelNode(&blockDeviceNode, graph, &allocNodeA,
                                         1, &blockDeviceNodeParams));
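
  // Note: cudaGraphAddKernelNode copies the kernel parameter values when the
  // node is created, so it is safe for time_clocks and blockDeviceArgs to be
  // stack variables that go out of scope when this function returns.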
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeA, graph, &blockDeviceNode,
                                          1, (void *)*dPtr));

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}
void physicalMemoryReuseSingleStream(size_t bytes, int device) {
  cudaStream_t stream;
  cudaGraphExec_t graphExecs[NUM_GRAPHS];
  float *dPtrs[NUM_GRAPHS];
  bool virtualAddrDiffer = true;

  printf("================================\n");
  printf("Running physical memory reuse example.\n");
  printf(
      "CUDA reuses the same physical memory for allocations from separate "
      "graphs when the allocation lifetimes don't overlap.\n\n");

  for (int i = 0; i < NUM_GRAPHS; i++) {
    createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);
  }

  printf("Creating the graph execs does not reserve any physical memory.\n");
  printMemoryFootprint(device);

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream));
  printf("\nThe first graph launched reserves the memory it needs.\n");
  printMemoryFootprint(device);

  checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream));
  printf(
      "A subsequent launch of the same graph in the same stream reuses the "
      "same physical memory. ");
  printf("Thus the memory footprint does not grow here.\n");
  printMemoryFootprint(device);

  printf(
      "\nSubsequent launches of other graphs in the same stream also reuse "
      "the physical memory. ");
  printf("Thus the memory footprint does not grow here.\n");
  for (int i = 1; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream));
    printf("%02d: ", i);
    printMemoryFootprint(device);
  }

  checkCudaErrors(cudaStreamSynchronize(stream));

  for (int i = 0; i < NUM_GRAPHS; i++) {
    for (int j = i + 1; j < NUM_GRAPHS; j++) {
      if (dPtrs[i] == dPtrs[j]) {
        virtualAddrDiffer = false;
        printf("Error: Graph execs %d and %d have the same virtual address!\n",
               i, j);
      }
    }
    checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));
  }
  if (virtualAddrDiffer) {
    printf("\nCheck confirms all graphs use different virtual addresses.\n");
  } else {
    printf(
        "\nNot all graphs use different virtual addresses. Exiting test.\n");
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaStreamDestroy(stream));
}
void simultaneousStreams(size_t bytes, int device) {
  cudaStream_t streams[NUM_GRAPHS];
  cudaGraphExec_t graphExecs[NUM_GRAPHS];
  float *dPtrs[NUM_GRAPHS];

  printf("================================\n");
  printf("Running simultaneous streams example.\n");
  printf("Graphs that can run concurrently need separate physical memory. ");
  printf(
      "In this example, each graph launched in a separate stream increases "
      "the total memory footprint.\n\n");

  printf(
      "When launching a new graph, CUDA may reuse physical memory from a "
      "graph whose execution has already ");
  printf(
      "finished -- even if the new graph is being launched in a different "
      "stream from the completed graph. ");
  printf(
      "Therefore, a kernel node is added to the graphs to increase their "
      "runtime.\n\n");

  for (int i = 0; i < NUM_GRAPHS; i++) {
    createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);
    checkCudaErrors(
        cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
  }

  printf("Initial footprint:\n");
  printMemoryFootprint(device);

  printf(
      "\nEach graph launch in a separate stream grows the memory "
      "footprint:\n");
  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphLaunch(graphExecs[i], streams[i]));
    printf("%02d: ", i);
    printMemoryFootprint(device);
  }

  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaStreamSynchronize(streams[i]));
    checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));
    checkCudaErrors(cudaStreamDestroy(streams[i]));
  }
}
void createSimpleAllocNoFreeGraph(cudaGraphExec_t *graphExec, float **dPtr,
                                  size_t bytes, int device) {
  cudaGraph_t graph;
  cudaGraphNode_t allocNodeA;
  cudaMemAllocNodeParams allocParams;

  checkCudaErrors(cudaGraphCreate(&graph, 0));
  prepareAllocParams(&allocParams, bytes, device);

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams));
  *dPtr = (float *)allocParams.dptr;

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}
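
// A minimal sketch, not part of the original sample: instantiating with
// cudaGraphInstantiateFlagAutoFreeOnLaunch (CUDA 11.4+) makes CUDA free a
// graph's unfreed allocations implicitly on relaunch, which avoids the
// footprint growth demonstrated in unfreedAllocations() below. The helper
// name is hypothetical.
void instantiateAutoFreeGraph(cudaGraphExec_t *graphExec, cudaGraph_t graph) {
  checkCudaErrors(cudaGraphInstantiateWithFlags(
      graphExec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch));
}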
void unfreedAllocations(size_t bytes, int device) {
  cudaStream_t stream;
  cudaGraphExec_t graphExecs[NUM_GRAPHS];
  float *dPtrs[NUM_GRAPHS];

  printf("================================\n");
  printf("Running unfreed allocations example.\n");
  printf(
      "CUDA cannot reuse physical memory from graphs which do not free their "
      "allocations.\n\n");

  for (int i = 0; i < NUM_GRAPHS; i++) {
    createSimpleAllocNoFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device);
  }

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  printf(
      "Despite being launched in the same stream, each graph launch grows "
      "the memory footprint. ");
  printf(
      "Since the allocations are not freed, CUDA keeps the memory valid for "
      "use.\n");
  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream));
    printf("%02d: ", i);
    printMemoryFootprint(device);
  }

  checkCudaErrors(cudaStreamSynchronize(stream));

  checkCudaErrors(cudaDeviceGraphMemTrim(device));
  printf(
      "\nTrimming does not impact the memory footprint since the unfreed "
      "allocations are still holding onto the memory.\n");
  printMemoryFootprint(device);
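
  // Note: memory allocated by a graph can be freed from outside the graph
  // with an ordinary cudaFree (or cudaFreeAsync), as done below.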
  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaFree(dPtrs[i]));
  }
  printf("\nFreeing the allocations does not shrink the footprint.\n");
  printMemoryFootprint(device);

  checkCudaErrors(cudaDeviceGraphMemTrim(device));
  printf(
      "\nSince the allocations are now freed, trimming does reduce the "
      "footprint even when the graph execs are not yet destroyed.\n");
  printMemoryFootprint(device);

  for (int i = 0; i < NUM_GRAPHS; i++) {
    checkCudaErrors(cudaGraphExecDestroy(graphExecs[i]));
  }
  checkCudaErrors(cudaStreamDestroy(stream));
}
void cleanupMemory(int device) {
  checkCudaErrors(cudaDeviceGraphMemTrim(device));
  printf("\nCleaning up example by trimming device memory.\n");
  printMemoryFootprint(device);
  printf("\n");
}
int main(int argc, char **argv) {
  size_t bytes = 64 * 1024 * 1024;
  int device = findCudaDevice(argc, (const char **)argv);

  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;

  checkCudaErrors(cudaDriverGetVersion(&driverVersion));
  printf("Driver version is: %d.%d\n", driverVersion / 1000,
         (driverVersion % 100) / 10);

  if (driverVersion < 11040) {
    printf("Waiving execution as driver does not support Graph Memory Nodes\n");
    exit(EXIT_WAIVED);
  }

  checkCudaErrors(cudaDeviceGetAttribute(
      &deviceSupportsMemoryPools, cudaDevAttrMemoryPoolsSupported, device));
  if (!deviceSupportsMemoryPools) {
    printf("Waiving execution as device does not support Memory Pools\n");
    exit(EXIT_WAIVED);
  } else {
    printf("Running sample.\n");
  }

  virtualAddressReuseSingleGraph(bytes, device);
  cleanupMemory(device);

  physicalMemoryReuseSingleStream(bytes, device);
  cleanupMemory(device);

  simultaneousStreams(bytes, device);
  cleanupMemory(device);

  unfreedAllocations(bytes, device);
  cleanupMemory(device);

  printf("================================\n");
  printf("Sample complete.\n");

  return 0;
}