// cuda-samples/Samples/3_CUDA_Features/graphMemoryNodes/graphMemoryNodes.cu
// Snapshot: 2022-01-13 11:35:24 +05:30 (554 lines, 21 KiB)
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// System includes
#include <assert.h>
#include <stdio.h>
#include <climits>
#include <cstdlib>  // rand/srand/exit (previously reached only transitively)
#include <ctime>    // time(), used to seed srand in main
#include <vector>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
#define THREADS_PER_BLOCK 512
#define ALLOWABLE_VARIANCE 1.e-6f
#define NUM_ELEMENTS 8000000
// Stores the square of each input element in output array
__global__ void squareArray(const float *input, float *output,
int numElements) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < numElements) {
output[idx] = input[idx] * input[idx];
}
}
// Stores the negative of each input element in output array
__global__ void negateArray(const float *input, float *output,
int numElements) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < numElements) {
output[idx] = input[idx] * -1;
}
}
// Bundles the buffers and launch geometry for the negate-squares workload.
// The same struct is used for host-side arrays and (in prepareRefArrays)
// for a device-side reference copy, so the pointers' memory space depends
// on which code path filled them in.
struct negSquareArrays {
  float *input;      // source values
  float *square;     // input squared, element-wise
  float *negSquare;  // negated squares (the final result)
  int numElements;   // element count of each array
  size_t bytes;      // size in bytes of each array (numElements * sizeof(float))
  size_t numBlocks;  // grid size for THREADS_PER_BLOCK-wide blocks
};
// Populates the first numElements entries of array with pseudo-random
// floats in [0, 1], drawn from the C library rand() sequence.
void fillRandomly(float *array, int numElements) {
  for (int i = 0; i < numElements; ++i) {
    array[i] = (float)rand() / (float)RAND_MAX;
  }
}
// Re-randomizes both output arrays so a later run cannot accidentally
// "pass" validation using stale results from a previous run.
void resetOutputArrays(negSquareArrays *hostArrays) {
  const int count = hostArrays->numElements;
  fillRandomly(hostArrays->square, count);
  fillRandomly(hostArrays->negSquare, count);
}
// Allocates the three host arrays, seeds them with random data, and derives
// the launch geometry (bytes and grid size) for the sample's kernels.
//
// Fix: the original rounded the grid size up by testing
// (numBlocks % THREADS_PER_BLOCK) instead of
// (numElements % THREADS_PER_BLOCK) — the wrong operand. A proper ceiling
// division covers the tail exactly when numElements is not a multiple of
// THREADS_PER_BLOCK, without launching a spurious extra block when it is.
void prepareHostArrays(negSquareArrays *hostArrays) {
  hostArrays->numElements = NUM_ELEMENTS;
  size_t bytes = hostArrays->numElements * sizeof(float);
  // Ceiling division: one extra block picks up the tail elements.
  size_t numBlocks = ((size_t)hostArrays->numElements +
                      (size_t)THREADS_PER_BLOCK - 1) /
                     (size_t)THREADS_PER_BLOCK;
  hostArrays->input = (float *)malloc(bytes);
  hostArrays->square = (float *)malloc(bytes);
  hostArrays->negSquare = (float *)malloc(bytes);
  // malloc of ~32 MB per array can fail; fail fast rather than segfault in
  // fillRandomly below.
  if (hostArrays->input == NULL || hostArrays->square == NULL ||
      hostArrays->negSquare == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
    exit(EXIT_FAILURE);
  }
  hostArrays->bytes = bytes;
  hostArrays->numBlocks = numBlocks;
  fillRandomly(hostArrays->input, hostArrays->numElements);
  fillRandomly(hostArrays->square, hostArrays->numElements);
  fillRandomly(hostArrays->negSquare, hostArrays->numElements);
}
// Builds and instantiates a single-node graph whose only job is to free the
// device allocation at dPtr when the graph is launched. Used by main to
// demonstrate freeing graph-allocated memory from a *different* graph.
void createFreeGraph(cudaGraphExec_t *graphExec, float *dPtr) {
  cudaGraph_t freeGraph;
  checkCudaErrors(cudaGraphCreate(&freeGraph, 0));

  cudaGraphNode_t memFreeNode;
  checkCudaErrors(
      cudaGraphAddMemFreeNode(&memFreeNode, freeGraph, NULL, 0, (void *)dPtr));

  // The executable graph keeps what it needs; the template can be destroyed.
  checkCudaErrors(cudaGraphInstantiate(graphExec, freeGraph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(freeGraph));
}
/**
* Demonstrates explicitly creating a CUDA graph including memory nodes.
* createNegateSquaresGraphWithStreamCapture constructs an equivalent graph
* using stream capture.
*
* If d_negSquare_out is non null, then:
* 1) d_negSquare will not be freed;
* 2) the value of d_negSquare_out will be set to d_negSquare.
*
* Diagram of the graph constructed by createNegateSquaresGraphExplicitly:
*
* alloc d_input
* |
* alloc d_square
* |
* Memcpy a to device
* |
* launch kernel squareArray ------->---- Memcpy d_square to host
* | |
* free d_input |
* | |
* allocate d_negSquare |
* | |
* launch kernel negateArray -------->--- free d_square
* |
* Memcpy d_negSquare to host
* |
* free d_negSquare
*/
void createNegateSquaresGraphExplicitly(cudaGraphExec_t *graphExec, int device,
                                        negSquareArrays *hostArrays,
                                        float **d_negSquare_out = NULL) {
  // Array buffers on device. These pointers are assigned from the alloc
  // nodes' params below; the addresses are valid for use inside the graph
  // (and outside it, between launch and the corresponding free).
  float *d_input, *d_square, *d_negSquare;

  // Memory allocation parameters, reused for all three alloc nodes since
  // every array has the same size and resides on the same device.
  cudaMemAllocNodeParams allocParams;
  memset(&allocParams, 0, sizeof(allocParams));
  allocParams.bytesize = hostArrays->bytes;
  allocParams.poolProps.allocType = cudaMemAllocationTypePinned;
  allocParams.poolProps.location.id = device;
  allocParams.poolProps.location.type = cudaMemLocationTypeDevice;

  // Kernel launch parameters, reused for both kernel nodes; func and
  // kernelParams are filled in per kernel below.
  cudaKernelNodeParams kernelNodeParams = {0};
  kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1);
  kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.extra = NULL;

  cudaGraph_t graph;
  cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare;
  cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare;
  cudaGraphNode_t squareKernelNode, negateKernelNode;
  cudaGraphNode_t freeNodeInput, freeNodeSquare;

  // Buffer for storing graph node dependencies when a node has more than one.
  std::vector<cudaGraphNode_t> nodeDependencies;

  checkCudaErrors(cudaGraphCreate(&graph, 0));

  // The alloc node assigns the virtual address synchronously, at node
  // creation time, into allocParams.dptr — read it back immediately.
  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams));
  d_input = (float *)allocParams.dptr;

  // To keep the graph structure simple (fewer branching dependencies),
  // allocNodeSquare should depend on allocNodeInput.
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph,
                                           &allocNodeInput, 1, &allocParams));
  d_square = (float *)allocParams.dptr;

  // copyNodeInput needs to depend on allocNodeInput because copyNodeInput
  // writes to d_input. It does so here indirectly through allocNodeSquare.
  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeInput, graph, &allocNodeSquare, 1, d_input, hostArrays->input,
      hostArrays->bytes, cudaMemcpyHostToDevice));

  // kernelParams holds pointers to the argument storage; the pointed-to
  // values (d_input, d_square, numElements) are read at node creation.
  void *squareKernelArgs[3] = {(void *)&d_input, (void *)&d_square,
                               (void *)&(hostArrays->numElements)};
  kernelNodeParams.func = (void *)squareArray;
  kernelNodeParams.kernelParams = (void **)squareKernelArgs;

  // Square kernel depends on copyNodeInput to ensure all data is on the device
  // before kernel launch.
  checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph,
                                         &copyNodeInput, 1, &kernelNodeParams));

  // Copy the squares back to the host; runs on a branch parallel to the
  // free/alloc/negate chain below (see diagram above).
  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeSquare, graph, &squareKernelNode, 1, hostArrays->square,
      d_square, hostArrays->bytes, cudaMemcpyDeviceToHost));

  // Free of d_input depends on the square kernel to ensure that d_input is not
  // freed while being read by the kernel. It also depends on the alloc of
  // d_input via squareKernelNode > copyNodeInput > allocNodeSquare >
  // allocNodeInput.
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph,
                                          &squareKernelNode, 1, d_input));

  // Allocation of C depends on free of A so CUDA can reuse the virtual address.
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph,
                                           &freeNodeInput, 1, &allocParams));
  d_negSquare = (float *)allocParams.dptr;
  if (d_negSquare == d_input) {
    printf(
        "Check verified that d_negSquare and d_input share a virtual "
        "address.\n");
  }

  void *negateKernelArgs[3] = {(void *)&d_square, (void *)&d_negSquare,
                               (void *)&(hostArrays->numElements)};
  kernelNodeParams.func = (void *)negateArray;
  kernelNodeParams.kernelParams = (void **)negateKernelArgs;
  checkCudaErrors(cudaGraphAddKernelNode(
      &negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams));

  // d_square may only be freed after BOTH readers finish: the device-to-host
  // copy of the squares and the negate kernel.
  nodeDependencies.push_back(copyNodeSquare);
  nodeDependencies.push_back(negateKernelNode);
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeSquare, graph,
                                          nodeDependencies.data(),
                                          nodeDependencies.size(), d_square));
  nodeDependencies.clear();

  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeNegSquare, graph, &negateKernelNode, 1, hostArrays->negSquare,
      d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost));

  // When the caller wants d_negSquare to outlive the graph (to free it
  // outside, as main demonstrates), skip the in-graph free and hand the
  // pointer back instead.
  if (d_negSquare_out == NULL) {
    cudaGraphNode_t freeNodeNegSquare;
    checkCudaErrors(cudaGraphAddMemFreeNode(
        &freeNodeNegSquare, graph, &copyNodeNegSquare, 1, d_negSquare));
  } else {
    *d_negSquare_out = d_negSquare;
  }

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}
/**
* Adds work to a CUDA stream which negates the square of values in the input
* array.
*
* If d_negSquare_out is non null, then:
* 1) d_negSquare will not be freed;
* 2) the value of d_negSquare_out will be set to d_negSquare.
*
* Diagram of the stream operations in doNegateSquaresInStream
* ---------------------------------------------------------------------
* | STREAM | STREAM2 |
* ---------------------------------------------------------------------
*
* alloc d_input
* |
* alloc d_square
* |
* Memcpy a to device
* |
* launch kernel squareArray
* |
* record squareKernelCompleteEvent -->-- wait squareKernelCompleteEvent
* | |
* free d_input |
* | |
* allocate d_negSquare Memcpy d_square to host
* | |
* launch kernel negateArray |
* | |
* record negateKernelCompleteEvent -->-- wait negateKernelCompleteEvent
* | |
* Memcpy d_negSquare to host |
* | free d_square
* free d_negSquare |
* | |
* wait squareFreeEvent --------------<---- record squareFreeEvent
*/
void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays,
                             float **d_negSquare_out = NULL) {
  float *d_input, *d_square, *d_negSquare;
  cudaStream_t stream2;
  cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent,
      squareFreeEvent;

  // stream2 carries the work that can overlap the main chain: copying the
  // squares back to the host and freeing d_square (see diagram above).
  checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
  checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent));
  checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent));
  checkCudaErrors(cudaEventCreate(&squareFreeEvent));

  // Virtual addresses are assigned synchronously when cudaMallocAsync is
  // called, thus there is no performace benefit gained by separating the
  // allocations into two streams.
  checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1));
  checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1));
  checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input, hostArrays->bytes,
                                  cudaMemcpyHostToDevice, stream1));
  squareArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
      d_input, d_square, hostArrays->numElements);

  // Event hand-off: stream2 must not read d_square until squareArray has
  // finished writing it.
  checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1));
  checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0));
  checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square,
                                  hostArrays->bytes, cudaMemcpyDeviceToHost,
                                  stream2));

  // d_input is no longer needed after squareArray; freeing it before
  // allocating d_negSquare lets the allocator reuse the virtual address.
  checkCudaErrors(cudaFreeAsync(d_input, stream1));
  checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1));
  negateArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
      d_square, d_negSquare, hostArrays->numElements);
  checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1));
  checkCudaErrors(cudaMemcpyAsync(hostArrays->negSquare, d_negSquare,
                                  hostArrays->bytes, cudaMemcpyDeviceToHost,
                                  stream1));

  // When the caller wants d_negSquare to outlive this call, skip the free
  // and return the pointer instead.
  if (d_negSquare_out == NULL) {
    checkCudaErrors(cudaFreeAsync(d_negSquare, stream1));
  } else {
    *d_negSquare_out = d_negSquare;
  }

  // d_square may only be freed after BOTH readers are done: the D2H copy
  // already ordered in stream2 and the negate kernel in stream1.
  checkCudaErrors(cudaStreamWaitEvent(stream2, negateKernelCompleteEvent, 0));
  checkCudaErrors(cudaFreeAsync(d_square, stream2));

  // Join stream2 back into stream1 so the caller can treat stream1 as the
  // single completion point for all of the above work.
  checkCudaErrors(cudaEventRecord(squareFreeEvent, stream2));
  checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0));

  // Destroying streams/events only unblocks resource cleanup once the
  // queued (or captured) work has completed; pending operations are kept.
  checkCudaErrors(cudaStreamDestroy(stream2));
  checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent));
  checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent));
  checkCudaErrors(cudaEventDestroy(squareFreeEvent));
}
/**
* Demonstrates creating a CUDA graph including memory nodes using stream
* capture. createNegateSquaresGraphExplicitly constructs an equivalent graph
* without stream capture.
*/
/**
 * Builds the negate-squares graph by recording the stream-based workload
 * with stream capture, then instantiates it into *graphExec.
 * createNegateSquaresGraphExplicitly constructs the same topology by hand.
 *
 * If d_negSquare_out is non null, d_negSquare is not freed inside the
 * captured work and the pointer is returned through d_negSquare_out.
 */
void createNegateSquaresGraphWithStreamCapture(cudaGraphExec_t *graphExec,
                                               negSquareArrays *hostArrays,
                                               float **d_negSquare_out = NULL) {
  cudaStream_t captureStream;
  checkCudaErrors(
      cudaStreamCreateWithFlags(&captureStream, cudaStreamNonBlocking));

  // Work issued into captureStream between Begin/EndCapture is recorded
  // into a graph instead of executing.
  checkCudaErrors(
      cudaStreamBeginCapture(captureStream, cudaStreamCaptureModeGlobal));
  doNegateSquaresInStream(captureStream, hostArrays, d_negSquare_out);

  cudaGraph_t capturedGraph;
  checkCudaErrors(cudaStreamEndCapture(captureStream, &capturedGraph));
  checkCudaErrors(
      cudaGraphInstantiate(graphExec, capturedGraph, NULL, NULL, 0));

  checkCudaErrors(cudaStreamDestroy(captureStream));
  checkCudaErrors(cudaGraphDestroy(capturedGraph));
}
// Computes the CPU reference results (square and negSquare) into hostArrays,
// uploads the reference negSquare to the device for validateGPU, and
// allocates the managed validation flag.
//
// Fix: cudaMallocManaged does not guarantee zero-initialized memory, but the
// original never wrote *foundValidationFailure before checkValidationFailure
// first read it (validation only ever sets it to true on failure). A stray
// nonzero byte would report a spurious FAILURE; initialize it to false.
void prepareRefArrays(negSquareArrays *hostArrays,
                      negSquareArrays *deviceRefArrays,
                      bool **foundValidationFailure) {
  deviceRefArrays->bytes = hostArrays->bytes;
  deviceRefArrays->numElements = hostArrays->numElements;

  // CPU reference: square[i] = input[i]^2, negSquare[i] = -square[i].
  for (int i = 0; i < hostArrays->numElements; i++) {
    hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i];
    hostArrays->negSquare[i] = hostArrays->square[i] * -1;
  }

  checkCudaErrors(
      cudaMalloc((void **)&deviceRefArrays->negSquare, deviceRefArrays->bytes));
  checkCudaErrors(cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare,
                             hostArrays->bytes, cudaMemcpyHostToDevice));

  checkCudaErrors(
      cudaMallocManaged((void **)foundValidationFailure, sizeof(bool)));
  // Managed memory is host-accessible; start from a known-clean state.
  **foundValidationFailure = false;
}
// Reports the shared validation flag and clears it for the next pass.
// Returns EXIT_FAILURE if a failure was recorded, EXIT_SUCCESS otherwise.
int checkValidationFailure(bool *foundValidationFailure) {
  const bool failed = *foundValidationFailure;
  if (failed) {
    printf("Validation FAILURE!\n\n");
    *foundValidationFailure = false;  // reset so later runs start clean
    return EXIT_FAILURE;
  }
  printf("Validation PASSED!\n\n");
  return EXIT_SUCCESS;
}
// Device-side validation: compares d_negSquare element-wise against the
// reference copy in devRefArrays (the struct is passed by value so the
// kernel receives its own copy of the descriptor). Sets
// *foundValidationFailure when the squared error relative to the squared
// reference exceeds ALLOWABLE_VARIANCE; the flag is never cleared here.
__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays,
                            bool *foundValidationFailure) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= devRefArrays.numElements) {
    return;
  }
  float expected = devRefArrays.negSquare[tid];
  float delta = d_negSquare[tid] - expected;
  delta *= delta;
  expected *= expected;
  if (delta / expected > ALLOWABLE_VARIANCE) {
    *foundValidationFailure = true;
  }
}
// Host-side validation: recomputes -input[i]^2 on the CPU and compares it
// against negSquare[i]. Sets *foundValidationFailure when the squared error
// relative to the squared reference exceeds ALLOWABLE_VARIANCE; the flag is
// never cleared here (checkValidationFailure does that).
void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure) {
  const int count = hostArrays->numElements;
  for (int i = 0; i < count; i++) {
    float expected = hostArrays->input[i] * hostArrays->input[i] * -1;
    float delta = hostArrays->negSquare[i] - expected;
    delta *= delta;
    expected *= expected;
    if (delta / expected > ALLOWABLE_VARIANCE) {
      *foundValidationFailure = true;
    }
  }
}
// Driver: runs the negate-squares workload four ways (plain stream,
// stream-captured graph, explicitly built graph, and graphs whose output
// buffer is freed outside the graph) and validates each result.
int main(int argc, char **argv) {
  negSquareArrays hostArrays, deviceRefArrays;
  cudaStream_t stream;
  cudaGraphExec_t graphExec, graphExecFreeC;
  // Declare pointers for GPU buffers
  float *d_negSquare = NULL;
  bool *foundValidationFailure = NULL;

  srand(time(0));
  int device = findCudaDevice(argc, (const char **)argv);

  // Graph memory nodes require driver >= 11.4 and a device with
  // stream-ordered memory pool support; waive (not fail) otherwise.
  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;
  cudaDriverGetVersion(&driverVersion);
  printf("Driver version is: %d.%d\n", driverVersion / 1000,
         (driverVersion % 100) / 10);
  if (driverVersion < 11040) {
    printf("Waiving execution as driver does not support Graph Memory Nodes\n");
    exit(EXIT_WAIVED);
  }
  cudaDeviceGetAttribute(&deviceSupportsMemoryPools,
                         cudaDevAttrMemoryPoolsSupported, device);
  if (!deviceSupportsMemoryPools) {
    printf("Waiving execution as device does not support Memory Pools\n");
    exit(EXIT_WAIVED);
  } else {
    printf("Setting up sample.\n");
  }

  prepareHostArrays(&hostArrays);
  prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure);
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  printf("Setup complete.\n\n");

  // Variant 1: plain stream-ordered allocation (cudaMallocAsync/cudaFreeAsync).
  printf("Running negateSquares in a stream.\n");
  doNegateSquaresInStream(stream, &hostArrays);
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf("Validating negateSquares in a stream...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  // Variant 2: the same work recorded into a graph via stream capture.
  printf("Running negateSquares in a stream-captured graph.\n");
  createNegateSquaresGraphWithStreamCapture(&graphExec, &hostArrays);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf("Validating negateSquares in a stream-captured graph...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  // Variant 3: graph built node-by-node with the explicit graph API.
  printf("Running negateSquares in an explicitly constructed graph.\n");
  createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf("Validating negateSquares in an explicitly constructed graph...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  // Each of the three examples below frees d_negSquare outside the graph. As
  // demonstrated by validateGPU, d_negSquare can be accessed outside the
  // graph before d_negSquare is freed.
  printf("Running negateSquares with d_negSquare freed outside the stream.\n");
  createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays,
                                     &d_negSquare);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
      d_negSquare, deviceRefArrays, foundValidationFailure);
  // Since cudaFree is synchronous, the stream must synchronize before freeing
  // d_negSquare to ensure d_negSquare is no longer being accessed.
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaFree(d_negSquare));
  printf(
      "Validating negateSquares with d_negSquare freed outside the "
      "stream...\n");
  validateHost(&hostArrays, foundValidationFailure);
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  // Re-launching graphExec re-allocates d_negSquare at the same virtual
  // address; here it is freed with a stream-ordered cudaFreeAsync instead.
  printf("Running negateSquares with d_negSquare freed outside the graph.\n");
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
      d_negSquare, deviceRefArrays, foundValidationFailure);
  checkCudaErrors(cudaFreeAsync(d_negSquare, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf(
      "Validating negateSquares with d_negSquare freed outside the graph...\n");
  checkValidationFailure(foundValidationFailure);
  resetOutputArrays(&hostArrays);

  // Finally, free d_negSquare from a second, separate graph launched into
  // the same stream after the validation kernel.
  printf(
      "Running negateSquares with d_negSquare freed in a different graph.\n");
  createFreeGraph(&graphExecFreeC, d_negSquare);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
      d_negSquare, deviceRefArrays, foundValidationFailure);
  checkCudaErrors(cudaGraphLaunch(graphExecFreeC, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf(
      "Validating negateSquares with d_negSquare freed in a different "
      "graph...\n");
  checkValidationFailure(foundValidationFailure);

  printf("Cleaning up sample.\n");
  checkCudaErrors(cudaGraphExecDestroy(graphExec));
  checkCudaErrors(cudaGraphExecDestroy(graphExecFreeC));
  checkCudaErrors(cudaStreamDestroy(stream));
  checkCudaErrors(cudaFree(foundValidationFailure));
  checkCudaErrors(cudaFree(deviceRefArrays.negSquare));
  free(hostArrays.input);
  free(hostArrays.square);
  free(hostArrays.negSquare);
  printf("Cleanup complete. Exiting sample.\n");
}