diff --git a/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu b/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu index 839cbdbe..5552ce18 100644 --- a/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu +++ b/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu @@ -82,11 +82,12 @@ void simpleIfGraph(void) cudaGraphNode_t node; void *kernelArgs[2]; - char *dPtr; // Pointer to device memory location // Allocate a byte of device memory to use as input + char *dPtr; checkCudaErrors(cudaMalloc((void**)&dPtr, 1)); + printf("simpleIfGraph: Building graph...\n"); cudaGraphCreate(&graph, 0); // Create conditional handle. @@ -135,8 +136,195 @@ void simpleIfGraph(void) checkCudaErrors(cudaGraphExecDestroy(graphExec)); checkCudaErrors(cudaGraphDestroy(graph)); checkCudaErrors(cudaFree(dPtr)); + + printf("simpleIfGraph: Complete\n\n"); } +/* + * Create a graph containing a single conditional while node. + * The default value of the conditional variable is set to true, so this + * effectively becomes a do-while loop as the conditional body will always + * execute at least once. The body of the conditional contains 3 kernel nodes: + * A [ B -> C -> D ] + * Nodes B and C are just dummy nodes for demonstrative purposes. Node D + * will decrement a device memory location and set the condition value to false + * when the value reaches zero, terminating the loop. + * In this example, stream capture is used to populate the conditional body. + */ + +// This kernel will only be executed if the condition is true +__global__ void doWhileEmptyKernel(void) +{ + printf("GPU: doWhileEmptyKernel()\n"); + return; +} + +__global__ void doWhileLoopKernel(char *dPtr, cudaGraphConditionalHandle handle) +{ + if (--(*dPtr) == 0) { + cudaGraphSetConditional(handle, 0); + } + printf("GPU: counter = %d\n", *dPtr); +} + +void simpleDoWhileGraph(void) +{ + cudaGraph_t graph; + cudaGraphExec_t graphExec; + cudaGraphNode_t node; + + // Allocate a byte of device memory to use as input + char *dPtr; + checkCudaErrors(cudaMalloc((void**)&dPtr, 1)); + + printf("simpleDoWhileGraph: Building graph...\n"); + checkCudaErrors(cudaGraphCreate(&graph, 0)); + + cudaGraphConditionalHandle handle; + checkCudaErrors(cudaGraphConditionalHandleCreate(&handle, graph, 1, cudaGraphCondAssignDefault)); + + cudaGraphNodeParams cParams = { cudaGraphNodeTypeConditional }; + cParams.conditional.handle = handle; + cParams.conditional.type = cudaGraphCondTypeWhile; + cParams.conditional.size = 1; + checkCudaErrors(cudaGraphAddNode(&node, graph, NULL, 0, &cParams)); + + cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; + + cudaStream_t captureStream; + checkCudaErrors(cudaStreamCreate(&captureStream)); + + checkCudaErrors(cudaStreamBeginCaptureToGraph(captureStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeRelaxed)); + doWhileEmptyKernel<<<1, 1, 0, captureStream>>>(); + doWhileEmptyKernel<<<1, 1, 0, captureStream>>>(); + doWhileLoopKernel<<<1, 1, 0, captureStream>>>(dPtr, handle); + checkCudaErrors(cudaStreamEndCapture(captureStream, nullptr)); + checkCudaErrors(cudaStreamDestroy(captureStream)); + + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + + // Initialize device memory and launch the graph + checkCudaErrors(cudaMemset(dPtr, 10, 1)); // Set dPtr to 10 + printf("Host: Launching graph with loop counter set to 10\n"); + checkCudaErrors(cudaGraphLaunch(graphExec, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + + // Cleanup + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaFree(dPtr)); + + printf("simpleDoWhileGraph: Complete\n\n"); +} + + +/* + * Create a graph containing a conditional while loop using stream capture. + * This demonstrates how to insert a conditional node into a stream which is + * being captured. The graph consists of a kernel node followed by a conditional + * while node which contains a single kernel node: + * + * A -> B [ C ] + * + * The same kernel will be used for both nodes A and C. This kernel will test + * a device memory location and set the condition when the location is non-zero. + * We must run the kernel before the loop as well as inside the loop in order + * to behave like a while loop. We need to evaluate the device memory location + * before the conditional node is evaluated in order to set the condition variable + * properly. Because we're using a kernel upstream of the conditional node, + * there is no need to use the handle default value to initialize the conditional + * value. + */ + +__global__ void capturedWhileKernel(char *dPtr, cudaGraphConditionalHandle handle) +{ + printf("GPU: counter = %d\n", *dPtr); + if (*dPtr) { + (*dPtr)--; + } + cudaGraphSetConditional(handle, *dPtr); +} + +void capturedWhileGraph(void) +{ + cudaGraph_t graph; + cudaGraphExec_t graphExec; + + cudaStreamCaptureStatus status; + const cudaGraphNode_t *dependencies; + size_t numDependencies; + + // Allocate a byte of device memory to use as input + char *dPtr; + checkCudaErrors(cudaMalloc((void**)&dPtr, 1)); + + printf("capturedWhileGraph: Building graph...\n"); + cudaStream_t captureStream; + checkCudaErrors(cudaStreamCreate(&captureStream)); + + checkCudaErrors(cudaStreamBeginCapture(captureStream, cudaStreamCaptureModeRelaxed)); + + // Obtain the handle of the graph + checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies)); + + // Create the conditional handle + cudaGraphConditionalHandle handle; + checkCudaErrors(cudaGraphConditionalHandleCreate(&handle, graph)); + + // Insert kernel node A + capturedWhileKernel<<<1, 1, 0, captureStream>>>(dPtr, handle); + + // Obtain the handle for node A + checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies)); + + // Insert conditional node B + cudaGraphNode_t node; + cudaGraphNodeParams cParams = { cudaGraphNodeTypeConditional }; + cParams.conditional.handle = handle; + cParams.conditional.type = cudaGraphCondTypeWhile; + cParams.conditional.size = 1; + checkCudaErrors(cudaGraphAddNode(&node, graph, dependencies, numDependencies, &cParams)); + + cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; + + checkCudaErrors(cudaStreamEndCapture(captureStream, &graph)); + checkCudaErrors(cudaStreamDestroy(captureStream)); + + // Populate conditional body graph using stream capture + cudaStream_t bodyStream; + checkCudaErrors(cudaStreamCreate(&bodyStream)); + + checkCudaErrors(cudaStreamBeginCaptureToGraph(bodyStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeRelaxed)); + + // Insert kernel node C + capturedWhileKernel<<<1, 1, 0, bodyStream>>>(dPtr, handle); + checkCudaErrors(cudaStreamEndCapture(bodyStream, nullptr)); + checkCudaErrors(cudaStreamDestroy(bodyStream)); + + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + + // Initialize device memory and launch the graph + // Device memory is zero, so the conditional node will not execute + checkCudaErrors(cudaMemset(dPtr, 0, 1)); // Set dPtr to 0 + printf("Host: Launching graph with loop counter set to 0\n"); + checkCudaErrors(cudaGraphLaunch(graphExec, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + + // Initialize device memory and launch the graph + checkCudaErrors(cudaMemset(dPtr, 10, 1)); // Set dPtr to 10 + printf("Host: Launching graph with loop counter set to 10\n"); + checkCudaErrors(cudaGraphLaunch(graphExec, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + + // Cleanup + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaFree(dPtr)); + + printf("capturedWhileGraph: Complete\n\n"); +} + + int main(int argc, char **argv) { int device = findCudaDevice(argc, (const char **)argv); @@ -152,6 +340,8 @@ int main(int argc, char **argv) { } simpleIfGraph(); + simpleDoWhileGraph(); + capturedWhileGraph(); return 0; }