cuda-samples/Samples/simpleCudaGraphs/simpleCudaGraphs.cu
2021-10-21 16:34:49 +05:30

414 lines
15 KiB
Plaintext

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cooperative_groups.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <vector>
namespace cg = cooperative_groups;
#define THREADS_PER_BLOCK 512
#define GRAPH_LAUNCH_ITERATIONS 3
typedef struct callBackData {
const char *fn_name;
double *data;
} callBackData_t;
__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize,
size_t outputSize) {
__shared__ double tmp[THREADS_PER_BLOCK];
cg::thread_block cta = cg::this_thread_block();
size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
double temp_sum = 0.0;
for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) {
temp_sum += (double)inputVec[i];
}
tmp[cta.thread_rank()] = temp_sum;
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
double beta = temp_sum;
double temp;
for (int i = tile32.size() / 2; i > 0; i >>= 1) {
if (tile32.thread_rank() < i) {
temp = tmp[cta.thread_rank() + i];
beta += temp;
tmp[cta.thread_rank()] = beta;
}
cg::sync(tile32);
}
cg::sync(cta);
if (cta.thread_rank() == 0 && blockIdx.x < outputSize) {
beta = 0.0;
for (int i = 0; i < cta.size(); i += tile32.size()) {
beta += tmp[i];
}
outputVec[blockIdx.x] = beta;
}
}
__global__ void reduceFinal(double *inputVec, double *result,
size_t inputSize) {
__shared__ double tmp[THREADS_PER_BLOCK];
cg::thread_block cta = cg::this_thread_block();
size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
double temp_sum = 0.0;
for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) {
temp_sum += (double)inputVec[i];
}
tmp[cta.thread_rank()] = temp_sum;
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
// do reduction in shared mem
if ((blockDim.x >= 512) && (cta.thread_rank() < 256)) {
tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 256];
}
cg::sync(cta);
if ((blockDim.x >= 256) && (cta.thread_rank() < 128)) {
tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 128];
}
cg::sync(cta);
if ((blockDim.x >= 128) && (cta.thread_rank() < 64)) {
tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 64];
}
cg::sync(cta);
if (cta.thread_rank() < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockDim.x >= 64) temp_sum += tmp[cta.thread_rank() + 32];
// Reduce final warp using shuffle
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
temp_sum += tile32.shfl_down(temp_sum, offset);
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0) result[0] = temp_sum;
}
void init_input(float *a, size_t size) {
for (size_t i = 0; i < size; i++) a[i] = (rand() & 0xFF) / (float)RAND_MAX;
}
void CUDART_CB myHostNodeCallback(void *data) {
// Check status of GPU after stream operations are done
callBackData_t *tmp = (callBackData_t *)(data);
// checkCudaErrors(tmp->status);
double *result = (double *)(tmp->data);
char *function = (char *)(tmp->fn_name);
printf("[%s] Host callback final reduced sum = %lf\n", function, *result);
*result = 0.0; // reset the result
}
void cudaGraphsManual(float *inputVec_h, float *inputVec_d, double *outputVec_d,
double *result_d, size_t inputSize, size_t numOfBlocks) {
cudaStream_t streamForGraph;
cudaGraph_t graph;
std::vector<cudaGraphNode_t> nodeDependencies;
cudaGraphNode_t memcpyNode, kernelNode, memsetNode;
double result_h = 0.0;
checkCudaErrors(cudaStreamCreate(&streamForGraph));
cudaKernelNodeParams kernelNodeParams = {0};
cudaMemcpy3DParms memcpyParams = {0};
cudaMemsetParams memsetParams = {0};
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr =
make_cudaPitchedPtr(inputVec_h, sizeof(float) * inputSize, inputSize, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr =
make_cudaPitchedPtr(inputVec_d, sizeof(float) * inputSize, inputSize, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float) * inputSize, 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;
memsetParams.dst = (void *)outputVec_d;
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(float); // elementSize can be max 4 bytes
memsetParams.width = numOfBlocks * 2;
memsetParams.height = 1;
checkCudaErrors(cudaGraphCreate(&graph, 0));
checkCudaErrors(
cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams));
checkCudaErrors(
cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams));
nodeDependencies.push_back(memsetNode);
nodeDependencies.push_back(memcpyNode);
void *kernelArgs[4] = {(void *)&inputVec_d, (void *)&outputVec_d, &inputSize,
&numOfBlocks};
kernelNodeParams.func = (void *)reduce;
kernelNodeParams.gridDim = dim3(numOfBlocks, 1, 1);
kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = (void **)kernelArgs;
kernelNodeParams.extra = NULL;
checkCudaErrors(
cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(),
nodeDependencies.size(), &kernelNodeParams));
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode);
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = result_d;
memsetParams.value = 0;
memsetParams.elementSize = sizeof(float);
memsetParams.width = 2;
memsetParams.height = 1;
checkCudaErrors(
cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams));
nodeDependencies.push_back(memsetNode);
memset(&kernelNodeParams, 0, sizeof(kernelNodeParams));
kernelNodeParams.func = (void *)reduceFinal;
kernelNodeParams.gridDim = dim3(1, 1, 1);
kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
void *kernelArgs2[3] = {(void *)&outputVec_d, (void *)&result_d,
&numOfBlocks};
kernelNodeParams.kernelParams = kernelArgs2;
kernelNodeParams.extra = NULL;
checkCudaErrors(
cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(),
nodeDependencies.size(), &kernelNodeParams));
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode);
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(result_d, sizeof(double), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(&result_h, sizeof(double), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1);
memcpyParams.kind = cudaMemcpyDeviceToHost;
checkCudaErrors(
cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(),
nodeDependencies.size(), &memcpyParams));
nodeDependencies.clear();
nodeDependencies.push_back(memcpyNode);
cudaGraphNode_t hostNode;
cudaHostNodeParams hostParams = {0};
hostParams.fn = myHostNodeCallback;
callBackData_t hostFnData;
hostFnData.data = &result_h;
hostFnData.fn_name = "cudaGraphsManual";
hostParams.userData = &hostFnData;
checkCudaErrors(cudaGraphAddHostNode(&hostNode, graph,
nodeDependencies.data(),
nodeDependencies.size(), &hostParams));
cudaGraphNode_t *nodes = NULL;
size_t numNodes = 0;
checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes));
printf("\nNum of nodes in the graph created manually = %zu\n", numNodes);
cudaGraphExec_t graphExec;
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
cudaGraph_t clonedGraph;
cudaGraphExec_t clonedGraphExec;
checkCudaErrors(cudaGraphClone(&clonedGraph, graph));
checkCudaErrors(
cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0));
for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) {
checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
printf("Cloned Graph Output.. \n");
for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) {
checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
checkCudaErrors(cudaGraphExecDestroy(graphExec));
checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec));
checkCudaErrors(cudaGraphDestroy(graph));
checkCudaErrors(cudaGraphDestroy(clonedGraph));
checkCudaErrors(cudaStreamDestroy(streamForGraph));
}
void cudaGraphsUsingStreamCapture(float *inputVec_h, float *inputVec_d,
double *outputVec_d, double *result_d,
size_t inputSize, size_t numOfBlocks) {
cudaStream_t stream1, stream2, stream3, streamForGraph;
cudaEvent_t forkStreamEvent, memsetEvent1, memsetEvent2;
cudaGraph_t graph;
double result_h = 0.0;
checkCudaErrors(cudaStreamCreate(&stream1));
checkCudaErrors(cudaStreamCreate(&stream2));
checkCudaErrors(cudaStreamCreate(&stream3));
checkCudaErrors(cudaStreamCreate(&streamForGraph));
checkCudaErrors(cudaEventCreate(&forkStreamEvent));
checkCudaErrors(cudaEventCreate(&memsetEvent1));
checkCudaErrors(cudaEventCreate(&memsetEvent2));
checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal));
checkCudaErrors(cudaEventRecord(forkStreamEvent, stream1));
checkCudaErrors(cudaStreamWaitEvent(stream2, forkStreamEvent, 0));
checkCudaErrors(cudaStreamWaitEvent(stream3, forkStreamEvent, 0));
checkCudaErrors(cudaMemcpyAsync(inputVec_d, inputVec_h,
sizeof(float) * inputSize, cudaMemcpyDefault,
stream1));
checkCudaErrors(
cudaMemsetAsync(outputVec_d, 0, sizeof(double) * numOfBlocks, stream2));
checkCudaErrors(cudaEventRecord(memsetEvent1, stream2));
checkCudaErrors(cudaMemsetAsync(result_d, 0, sizeof(double), stream3));
checkCudaErrors(cudaEventRecord(memsetEvent2, stream3));
checkCudaErrors(cudaStreamWaitEvent(stream1, memsetEvent1, 0));
reduce<<<numOfBlocks, THREADS_PER_BLOCK, 0, stream1>>>(
inputVec_d, outputVec_d, inputSize, numOfBlocks);
checkCudaErrors(cudaStreamWaitEvent(stream1, memsetEvent2, 0));
reduceFinal<<<1, THREADS_PER_BLOCK, 0, stream1>>>(outputVec_d, result_d,
numOfBlocks);
checkCudaErrors(cudaMemcpyAsync(&result_h, result_d, sizeof(double),
cudaMemcpyDefault, stream1));
callBackData_t hostFnData = {0};
hostFnData.data = &result_h;
hostFnData.fn_name = "cudaGraphsUsingStreamCapture";
cudaHostFn_t fn = myHostNodeCallback;
checkCudaErrors(cudaLaunchHostFunc(stream1, fn, &hostFnData));
checkCudaErrors(cudaStreamEndCapture(stream1, &graph));
cudaGraphNode_t *nodes = NULL;
size_t numNodes = 0;
checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes));
printf("\nNum of nodes in the graph created using stream capture API = %zu\n",
numNodes);
cudaGraphExec_t graphExec;
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
cudaGraph_t clonedGraph;
cudaGraphExec_t clonedGraphExec;
checkCudaErrors(cudaGraphClone(&clonedGraph, graph));
checkCudaErrors(
cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0));
for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) {
checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
printf("Cloned Graph Output.. \n");
for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) {
checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
checkCudaErrors(cudaGraphExecDestroy(graphExec));
checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec));
checkCudaErrors(cudaGraphDestroy(graph));
checkCudaErrors(cudaGraphDestroy(clonedGraph));
checkCudaErrors(cudaStreamDestroy(stream1));
checkCudaErrors(cudaStreamDestroy(stream2));
checkCudaErrors(cudaStreamDestroy(streamForGraph));
}
int main(int argc, char **argv) {
size_t size = 1 << 24; // number of elements to reduce
size_t maxBlocks = 512;
// This will pick the best possible CUDA capable device
int devID = findCudaDevice(argc, (const char **)argv);
printf("%zu elements\n", size);
printf("threads per block = %d\n", THREADS_PER_BLOCK);
printf("Graph Launch iterations = %d\n", GRAPH_LAUNCH_ITERATIONS);
float *inputVec_d = NULL, *inputVec_h = NULL;
double *outputVec_d = NULL, *result_d;
checkCudaErrors(cudaMallocHost(&inputVec_h, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&inputVec_d, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&outputVec_d, sizeof(double) * maxBlocks));
checkCudaErrors(cudaMalloc(&result_d, sizeof(double)));
init_input(inputVec_h, size);
cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size,
maxBlocks);
cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d,
size, maxBlocks);
checkCudaErrors(cudaFree(inputVec_d));
checkCudaErrors(cudaFree(outputVec_d));
checkCudaErrors(cudaFree(result_d));
checkCudaErrors(cudaFreeHost(inputVec_h));
return EXIT_SUCCESS;
}