/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// System includes
#include <assert.h>
#include <stdio.h>

#include <climits>
#include <cstdlib>
#include <ctime>
#include <vector>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#define THREADS_PER_BLOCK 512
#define ALLOWABLE_VARIANCE 1.e-6f
#define NUM_ELEMENTS 8000000

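// ALLOWABLE_VARIANCE bounds the squared relative error (diff^2 / ref^2)
// accepted by the host- and device-side validators defined below.
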
// Stores the square of each input element in the output array
__global__ void squareArray(const float *input, float *output,
                            int numElements) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // The grid is rounded up to cover all elements, so guard against threads
  // past the end of the array.
  if (idx < numElements) {
    output[idx] = input[idx] * input[idx];
  }
}

// Stores the negative of each input element in the output array
__global__ void negateArray(const float *input, float *output,
                            int numElements) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < numElements) {
    output[idx] = input[idx] * -1;
  }
}

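// Bundles the data buffers and launch configuration shared by the examples.
// main() keeps one instance for host buffers and a second one whose negSquare
// member points at device-side reference data (see prepareRefArrays).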
struct negSquareArrays {
  float *input;
  float *square;
  float *negSquare;
  int numElements;
  size_t bytes;
  size_t numBlocks;
};

void fillRandomly(float *array, int numElements) {
  for (int n = 0; n < numElements; n++) {
    array[n] = rand() / (float)RAND_MAX;
  }
}

void resetOutputArrays(negSquareArrays *hostArrays) {
  fillRandomly(hostArrays->square, hostArrays->numElements);
  fillRandomly(hostArrays->negSquare, hostArrays->numElements);
}

void prepareHostArrays(negSquareArrays *hostArrays) {
  hostArrays->numElements = NUM_ELEMENTS;
  size_t bytes = hostArrays->numElements * sizeof(float);

  // Round the block count up so the grid covers every element when
  // numElements is not a multiple of THREADS_PER_BLOCK.
  size_t numBlocks = hostArrays->numElements / (size_t)THREADS_PER_BLOCK;
  if ((hostArrays->numElements % (size_t)THREADS_PER_BLOCK) != 0) {
    numBlocks++;
  }

  hostArrays->input = (float *)malloc(bytes);
  hostArrays->square = (float *)malloc(bytes);
  hostArrays->negSquare = (float *)malloc(bytes);
  hostArrays->bytes = bytes;
  hostArrays->numBlocks = numBlocks;

  fillRandomly(hostArrays->input, hostArrays->numElements);
  fillRandomly(hostArrays->square, hostArrays->numElements);
  fillRandomly(hostArrays->negSquare, hostArrays->numElements);
}

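// Builds a one-node graph whose only job is to free the device allocation at
// dPtr. Graph-ordered allocations may outlive the graph that allocated them;
// main() uses this helper to free d_negSquare from a separate graph.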
void createFreeGraph(cudaGraphExec_t *graphExec, float *dPtr) {
  cudaGraph_t graph;
  cudaGraphNode_t freeNode;

  checkCudaErrors(cudaGraphCreate(&graph, 0));

  checkCudaErrors(
      cudaGraphAddMemFreeNode(&freeNode, graph, NULL, 0, (void *)dPtr));

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}

/**
 * Demonstrates explicitly creating a CUDA graph including memory nodes.
 * createNegateSquaresGraphWithStreamCapture constructs an equivalent graph
 * using stream capture.
 *
 * If d_negSquare_out is non-null, then:
 * 1) d_negSquare will not be freed;
 * 2) the value of d_negSquare_out will be set to d_negSquare.
 *
 * Diagram of the graph constructed by createNegateSquaresGraphExplicitly:
 *
 *        alloc d_input
 *              |
 *        alloc d_square
 *              |
 *     Memcpy input to device
 *              |
 * launch kernel squareArray ------->---- Memcpy d_square to host
 *              |                                  |
 *        free d_input                             |
 *              |                                  |
 *     allocate d_negSquare                        |
 *              |                                  |
 * launch kernel negateArray -------->--- free d_square
 *              |
 *  Memcpy d_negSquare to host
 *              |
 *       free d_negSquare
 */
void createNegateSquaresGraphExplicitly(cudaGraphExec_t *graphExec, int device,
                                        negSquareArrays *hostArrays,
                                        float **d_negSquare_out = NULL) {
  // Array buffers on device
  float *d_input, *d_square, *d_negSquare;

  // Memory allocation parameters
  cudaMemAllocNodeParams allocParams;
  memset(&allocParams, 0, sizeof(allocParams));
  allocParams.bytesize = hostArrays->bytes;
  allocParams.poolProps.allocType = cudaMemAllocationTypePinned;
  allocParams.poolProps.location.id = device;
  allocParams.poolProps.location.type = cudaMemLocationTypeDevice;

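  /*
   * The same allocParams struct is reused for all three allocation nodes
   * below: every buffer has the same size and lives on the same device. Each
   * cudaGraphAddMemAllocNode call writes the assigned virtual address back
   * into allocParams.dptr, so the pointer must be read out before the struct
   * is reused. As a hypothetical variation (not needed by this sample), peer
   * access for another device could be requested by pointing
   * allocParams.accessDescs at an array of cudaMemAccessDesc entries and
   * setting allocParams.accessDescCount before adding the node.
   */
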
  // Kernel launch parameters
  cudaKernelNodeParams kernelNodeParams = {0};
  kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1);
  kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.extra = NULL;

  cudaGraph_t graph;
  cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare;
  cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare;
  cudaGraphNode_t squareKernelNode, negateKernelNode;
  cudaGraphNode_t freeNodeInput, freeNodeSquare;

  // Buffer for storing graph node dependencies
  std::vector<cudaGraphNode_t> nodeDependencies;

  checkCudaErrors(cudaGraphCreate(&graph, 0));

  checkCudaErrors(
      cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams));
  d_input = (float *)allocParams.dptr;

  // To keep the graph structure simple (fewer branching dependencies),
  // allocNodeSquare should depend on allocNodeInput
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph,
                                           &allocNodeInput, 1, &allocParams));
  d_square = (float *)allocParams.dptr;

  // copyNodeInput needs to depend on allocNodeInput because copyNodeInput
  // writes to d_input. It does so here indirectly through allocNodeSquare.
  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeInput, graph, &allocNodeSquare, 1, d_input, hostArrays->input,
      hostArrays->bytes, cudaMemcpyHostToDevice));

  void *squareKernelArgs[3] = {(void *)&d_input, (void *)&d_square,
                               (void *)&(hostArrays->numElements)};
  kernelNodeParams.func = (void *)squareArray;
  kernelNodeParams.kernelParams = (void **)squareKernelArgs;

  // Square kernel depends on copyNodeInput to ensure all data is on the device
  // before kernel launch.
  checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph,
                                         &copyNodeInput, 1, &kernelNodeParams));

  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeSquare, graph, &squareKernelNode, 1, hostArrays->square,
      d_square, hostArrays->bytes, cudaMemcpyDeviceToHost));

  // Free of d_input depends on the square kernel to ensure that d_input is not
  // freed while being read by the kernel. It also depends on the alloc of
  // d_input via squareKernelNode > copyNodeInput > allocNodeSquare >
  // allocNodeInput.
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph,
                                          &squareKernelNode, 1, d_input));

  // Allocation of d_negSquare depends on the free of d_input so CUDA can
  // reuse the same virtual address.
  checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph,
                                           &freeNodeInput, 1, &allocParams));
  d_negSquare = (float *)allocParams.dptr;

  if (d_negSquare == d_input) {
    printf(
        "Check verified that d_negSquare and d_input share a virtual "
        "address.\n");
  }

  void *negateKernelArgs[3] = {(void *)&d_square, (void *)&d_negSquare,
                               (void *)&(hostArrays->numElements)};
  kernelNodeParams.func = (void *)negateArray;
  kernelNodeParams.kernelParams = (void **)negateKernelArgs;

  checkCudaErrors(cudaGraphAddKernelNode(
      &negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams));

  // d_square can only be freed once both of its readers have finished: the
  // negate kernel and the copy of d_square back to the host.
  nodeDependencies.push_back(copyNodeSquare);
  nodeDependencies.push_back(negateKernelNode);
  checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeSquare, graph,
                                          nodeDependencies.data(),
                                          nodeDependencies.size(), d_square));
  nodeDependencies.clear();

  checkCudaErrors(cudaGraphAddMemcpyNode1D(
      &copyNodeNegSquare, graph, &negateKernelNode, 1, hostArrays->negSquare,
      d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost));

  if (d_negSquare_out == NULL) {
    cudaGraphNode_t freeNodeNegSquare;
    checkCudaErrors(cudaGraphAddMemFreeNode(
        &freeNodeNegSquare, graph, &copyNodeNegSquare, 1, d_negSquare));
  } else {
    *d_negSquare_out = d_negSquare;
  }

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphDestroy(graph));
}

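/*
 * A hypothetical variation: a graph that allocates memory without freeing it
 * can be instantiated with
 *
 *   checkCudaErrors(cudaGraphInstantiateWithFlags(
 *       graphExec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch));
 *
 * so that unfreed graph allocations are released automatically when the graph
 * is relaunched. This sample frees its allocations explicitly instead.
 */
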
/**
 * Adds work to a CUDA stream which negates the square of values in the input
 * array.
 *
 * If d_negSquare_out is non-null, then:
 * 1) d_negSquare will not be freed;
 * 2) the value of d_negSquare_out will be set to d_negSquare.
 *
 * Diagram of the stream operations in doNegateSquaresInStream:
 * ---------------------------------------------------------------------
 * |             STREAM1              |             STREAM2             |
 * ---------------------------------------------------------------------
 *
 *         alloc d_input
 *               |
 *         alloc d_square
 *               |
 *     Memcpy input to device
 *               |
 *   launch kernel squareArray
 *               |
 * record squareKernelCompleteEvent -->-- wait squareKernelCompleteEvent
 *               |                                   |
 *         free d_input                              |
 *               |                                   |
 *      allocate d_negSquare               Memcpy d_square to host
 *               |                                   |
 *   launch kernel negateArray                       |
 *               |                                   |
 * record negateKernelCompleteEvent -->-- wait negateKernelCompleteEvent
 *               |                                   |
 *   Memcpy d_negSquare to host                      |
 *               |                             free d_square
 *        free d_negSquare                           |
 *               |                                   |
 *      wait squareFreeEvent ---------<---- record squareFreeEvent
 */
void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays,
                             float **d_negSquare_out = NULL) {
  float *d_input, *d_square, *d_negSquare;
  cudaStream_t stream2;
  cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent,
      squareFreeEvent;

  checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));

  checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent));
  checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent));
  checkCudaErrors(cudaEventCreate(&squareFreeEvent));

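  // Cross-stream ordering must be expressed with events: when this function
  // runs under stream capture, the record/wait pairs below become the
  // dependency edges of the captured graph.
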
  // Virtual addresses are assigned synchronously when cudaMallocAsync is
  // called, thus there is no performance benefit gained by separating the
  // allocations into two streams.
  checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1));
  checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1));

  checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input, hostArrays->bytes,
                                  cudaMemcpyHostToDevice, stream1));
  squareArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
      d_input, d_square, hostArrays->numElements);
  checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1));

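  // From here, the copy of d_square back to the host proceeds in stream2
  // while stream1 continues with the free/alloc/negate pipeline, so the two
  // branches overlap as shown in the diagram above.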
  checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0));
  checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square,
                                  hostArrays->bytes, cudaMemcpyDeviceToHost,
                                  stream2));

  checkCudaErrors(cudaFreeAsync(d_input, stream1));
  checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1));
  negateArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
      d_square, d_negSquare, hostArrays->numElements);
  checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1));
  checkCudaErrors(cudaMemcpyAsync(hostArrays->negSquare, d_negSquare,
                                  hostArrays->bytes, cudaMemcpyDeviceToHost,
                                  stream1));
  if (d_negSquare_out == NULL) {
    checkCudaErrors(cudaFreeAsync(d_negSquare, stream1));
  } else {
    *d_negSquare_out = d_negSquare;
  }

  checkCudaErrors(cudaStreamWaitEvent(stream2, negateKernelCompleteEvent, 0));
  checkCudaErrors(cudaFreeAsync(d_square, stream2));
  checkCudaErrors(cudaEventRecord(squareFreeEvent, stream2));

  // Join stream2 back into stream1 so all remaining work is ordered behind
  // the free of d_square before stream2 is destroyed.
  checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0));

  checkCudaErrors(cudaStreamDestroy(stream2));
  checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent));
  checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent));
  checkCudaErrors(cudaEventDestroy(squareFreeEvent));
}

/**
 * Demonstrates creating a CUDA graph including memory nodes using stream
 * capture. createNegateSquaresGraphExplicitly constructs an equivalent graph
 * without stream capture.
 */
void createNegateSquaresGraphWithStreamCapture(cudaGraphExec_t *graphExec,
                                               negSquareArrays *hostArrays,
                                               float **d_negSquare_out = NULL) {
  cudaGraph_t graph;
  cudaStream_t stream;

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  doNegateSquaresInStream(stream, hostArrays, d_negSquare_out);
  checkCudaErrors(cudaStreamEndCapture(stream, &graph));

  checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaStreamDestroy(stream));
  checkCudaErrors(cudaGraphDestroy(graph));
}

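// Computes reference results on the host, uploads the expected negSquare
// values to the device for validateGPU, and allocates the managed flag used
// to report validation failures from both host and device code.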
void prepareRefArrays(negSquareArrays *hostArrays,
                      negSquareArrays *deviceRefArrays,
                      bool **foundValidationFailure) {
  deviceRefArrays->bytes = hostArrays->bytes;
  deviceRefArrays->numElements = hostArrays->numElements;

  for (int i = 0; i < hostArrays->numElements; i++) {
    hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i];
    hostArrays->negSquare[i] = hostArrays->square[i] * -1;
  }

  checkCudaErrors(
      cudaMalloc((void **)&deviceRefArrays->negSquare, deviceRefArrays->bytes));
  checkCudaErrors(cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare,
                             hostArrays->bytes, cudaMemcpyHostToDevice));

  checkCudaErrors(
      cudaMallocManaged((void **)foundValidationFailure, sizeof(bool)));
  // Managed memory is not guaranteed to be zero-initialized, so clear the
  // failure flag before its first use.
  **foundValidationFailure = false;
}

int checkValidationFailure(bool *foundValidationFailure) {
  if (*foundValidationFailure) {
    printf("Validation FAILURE!\n\n");
    *foundValidationFailure = false;
    return EXIT_FAILURE;
  } else {
    printf("Validation PASSED!\n\n");
    return EXIT_SUCCESS;
  }
}

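// Device-side check: compares d_negSquare against the reference copy in
// devRefArrays with a squared relative-error test. main() launches it outside
// the graphs to show that graph-allocated memory remains accessible to
// outside work until it is freed.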
__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays,
                            bool *foundValidationFailure) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  float ref, diff;

  if (idx < devRefArrays.numElements) {
    ref = devRefArrays.negSquare[idx];
    diff = d_negSquare[idx] - ref;
    diff *= diff;
    ref *= ref;
    if (diff / ref > ALLOWABLE_VARIANCE) {
      *foundValidationFailure = true;
    }
  }
}

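// Host-side counterpart of validateGPU: recomputes -input[i]^2 and applies
// the same squared relative-error test to the results copied back from the
// device.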
void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure) {
  float ref, diff;

  for (int i = 0; i < hostArrays->numElements; i++) {
    ref = hostArrays->input[i] * hostArrays->input[i] * -1;
    diff = hostArrays->negSquare[i] - ref;
    diff *= diff;
    ref *= ref;
    if (diff / ref > ALLOWABLE_VARIANCE) {
      *foundValidationFailure = true;
    }
  }
}

int main(int argc, char **argv) {
  negSquareArrays hostArrays, deviceRefArrays;
  cudaStream_t stream;
  cudaGraphExec_t graphExec, graphExecFreeC;

  // Declare pointers for GPU buffers
  float *d_negSquare = NULL;
  bool *foundValidationFailure = NULL;

  srand(time(0));
  int device = findCudaDevice(argc, (const char **)argv);

  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;

  cudaDriverGetVersion(&driverVersion);
  printf("Driver version is: %d.%d\n", driverVersion / 1000,
         (driverVersion % 100) / 10);

  if (driverVersion < 11040) {
    printf("Waiving execution as driver does not support Graph Memory Nodes\n");
    exit(EXIT_WAIVED);
  }

  cudaDeviceGetAttribute(&deviceSupportsMemoryPools,
                         cudaDevAttrMemoryPoolsSupported, device);
  if (!deviceSupportsMemoryPools) {
    printf("Waiving execution as device does not support Memory Pools\n");
    exit(EXIT_WAIVED);
  } else {
    printf("Setting up sample.\n");
  }

  prepareHostArrays(&hostArrays);
  prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure);
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  printf("Setup complete.\n\n");

printf("Running negateSquares in a stream.\n");
|
|
doNegateSquaresInStream(stream, &hostArrays);
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
printf("Validating negateSquares in a stream...\n");
|
|
validateHost(&hostArrays, foundValidationFailure);
|
|
checkValidationFailure(foundValidationFailure);
|
|
resetOutputArrays(&hostArrays);
|
|
|
|
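  /*
   * A hypothetical variation for the graph launches below: an instantiated
   * graph can be prefetched to the device with
   *
   *   checkCudaErrors(cudaGraphUpload(graphExec, stream));
   *
   * so the first cudaGraphLaunch does not pay the one-time upload cost. This
   * sample simply launches directly.
   */
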
printf("Running negateSquares in a stream-captured graph.\n");
|
|
createNegateSquaresGraphWithStreamCapture(&graphExec, &hostArrays);
|
|
checkCudaErrors(cudaGraphLaunch(graphExec, stream));
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
printf("Validating negateSquares in a stream-captured graph...\n");
|
|
validateHost(&hostArrays, foundValidationFailure);
|
|
checkValidationFailure(foundValidationFailure);
|
|
resetOutputArrays(&hostArrays);
|
|
|
|
printf("Running negateSquares in an explicitly constructed graph.\n");
|
|
createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays);
|
|
checkCudaErrors(cudaGraphLaunch(graphExec, stream));
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
printf("Validating negateSquares in an explicitly constructed graph...\n");
|
|
validateHost(&hostArrays, foundValidationFailure);
|
|
checkValidationFailure(foundValidationFailure);
|
|
resetOutputArrays(&hostArrays);
|
|
|
|
  // Each of the three examples below frees d_negSquare outside the graph. As
  // demonstrated by validateGPU, d_negSquare can be accessed outside the
  // graph before d_negSquare is freed.

printf("Running negateSquares with d_negSquare freed outside the stream.\n");
|
|
createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays,
|
|
&d_negSquare);
|
|
checkCudaErrors(cudaGraphLaunch(graphExec, stream));
|
|
validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
|
|
d_negSquare, deviceRefArrays, foundValidationFailure);
|
|
// Since cudaFree is synchronous, the stream must synchronize before freeing
|
|
// d_negSquare to ensure d_negSquare no longer being accessed.
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
checkCudaErrors(cudaFree(d_negSquare));
|
|
printf(
|
|
"Validating negateSquares with d_negSquare freed outside the "
|
|
"stream...\n");
|
|
validateHost(&hostArrays, foundValidationFailure);
|
|
checkValidationFailure(foundValidationFailure);
|
|
resetOutputArrays(&hostArrays);
|
|
|
|
printf("Running negateSquares with d_negSquare freed outside the graph.\n");
|
|
checkCudaErrors(cudaGraphLaunch(graphExec, stream));
|
|
validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
|
|
d_negSquare, deviceRefArrays, foundValidationFailure);
|
|
checkCudaErrors(cudaFreeAsync(d_negSquare, stream));
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
printf(
|
|
"Validating negateSquares with d_negSquare freed outside the graph...\n");
|
|
checkValidationFailure(foundValidationFailure);
|
|
resetOutputArrays(&hostArrays);
|
|
|
|
  printf(
      "Running negateSquares with d_negSquare freed in a different graph.\n");
  createFreeGraph(&graphExecFreeC, d_negSquare);
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>(
      d_negSquare, deviceRefArrays, foundValidationFailure);
  checkCudaErrors(cudaGraphLaunch(graphExecFreeC, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf(
      "Validating negateSquares with d_negSquare freed in a different "
      "graph...\n");
  checkValidationFailure(foundValidationFailure);

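  /*
   * A hedged aside: the footprint of graph-ordered allocations can be
   * inspected, and unused reserved memory returned to the OS, with the
   * graph-memory APIs introduced alongside memory nodes in CUDA 11.4, e.g.:
   *
   *   unsigned long long reservedMem = 0;  // attribute values are 64-bit
   *   checkCudaErrors(cudaDeviceGetGraphMemAttribute(
   *       device, cudaGraphMemAttrReservedMemCurrent, &reservedMem));
   *   checkCudaErrors(cudaDeviceGraphMemTrim(device));
   *
   * Neither call is required for this sample to run correctly.
   */
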
printf("Cleaning up sample.\n");
|
|
checkCudaErrors(cudaGraphExecDestroy(graphExec));
|
|
checkCudaErrors(cudaGraphExecDestroy(graphExecFreeC));
|
|
checkCudaErrors(cudaStreamDestroy(stream));
|
|
checkCudaErrors(cudaFree(foundValidationFailure));
|
|
checkCudaErrors(cudaFree(deviceRefArrays.negSquare));
|
|
free(hostArrays.input);
|
|
free(hostArrays.square);
|
|
free(hostArrays.negSquare);
|
|
printf("Cleanup complete. Exiting sample.\n");
|
|
} |