/* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This is a simple application showing the performance characteristics of CUDA graphs.
 */

#define USE_NVTX

#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <vector>

typedef volatile int LatchType;

std::chrono::high_resolution_clock::time_point getCpuTime() {
    return std::chrono::high_resolution_clock::now();
}

template <typename T>
float getMicroSecondDuration(T start, T end) {
    // nanoseconds * .001 = microseconds
    return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * .001f;
}

float getAsyncMicroSecondDuration(cudaEvent_t start, cudaEvent_t end) {
    float ms;
    cudaEventElapsedTime(&ms, start, end);
    return ms * 1000;  // milliseconds -> microseconds
}

#ifdef USE_NVTX
#include <nvtx3/nvToolsExt.h>
class Tracer {
public:
    Tracer(const char* name) { nvtxRangePushA(name); }
    ~Tracer() { nvtxRangePop(); }
};
#define RANGE(name) Tracer uniq_name_using_macros(name);
#define RANGE_PUSH(name) nvtxRangePushA(name)
#define RANGE_POP() nvtxRangePop();
#else
#define RANGE(name)
#define RANGE_PUSH(name)
#define RANGE_POP()
#endif

std::vector<cudaStream_t> stream;
cudaEvent_t event[1];
cudaEvent_t timingEvent[2];

struct hostData {
    long long timeElapsed;
    bool timeoutDetected;
    long long timeElapsed2;
    bool timeoutDetected2;
    LatchType latch;
    LatchType latch2;
};
struct hostData *hostData;

__global__ void empty() {}

// Function to read the GPU nanosecond timer in a kernel
__device__ __forceinline__ unsigned long long __globaltimer() {
    unsigned long long globaltimer;
    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(globaltimer));
    return globaltimer;
}

// Busy-wait on the GPU for `ticks` clock ticks
__global__ void delay(long long ticks) {
    long long endTime = clock64() + ticks;
    while (clock64() < endTime);
}

// Spin on the GPU until either `nanoseconds` elapse or the host releases
// `latch`, and report which of the two happened.
__global__ void waitWithTimeout(long long nanoseconds, bool *timeoutDetected, long long *timeElapsed, LatchType *latch) {
    long long startTime = __globaltimer();
    long long endTime = startTime + nanoseconds;
    long long time = 0;
    do {
        time = __globaltimer();
    } while (time < endTime && (latch == NULL || *latch == 0));
    if (timeElapsed != NULL) {
        *timeElapsed = time - startTime;
    }
    if (timeoutDetected) {
        // report a timeout if the latch was never observed
        *timeoutDetected = (latch == NULL || *latch == 0);
    }
}

// Empty kernels used purely as profiler annotations around the upload
__global__ void preUploadAnnotation() {}
__global__ void postUploadAnnotation() {}
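/*
 * createParallelChain builds a graph by capturing a fork/join pattern of
 * empty kernels: stream[0] forks to `width` streams via event waits, each
 * branch runs a straight chain of `length` empty kernels, and all branches
 * join back into stream[0] before capture ends. With pattern 1
 * (singleEntry), an extra root kernel precedes the fork so the captured
 * graph has a single entry node.
 */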
cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false) {
    RANGE_PUSH(__func__);  // popped by the RANGE_POP() at the end of runDemo
    RANGE("capture");
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream[0], cudaStreamCaptureModeGlobal);
    int streamIdx = 0;
    if (singleEntry) {
        // extra root node before the initial fork (pattern 1)
        empty<<<1, 1, 0, stream[streamIdx]>>>();
    }
    // fork: make every other stream wait on stream[0]
    cudaEventRecord(event[0], stream[0]);
    for (int i = 1; i < width; i++) {
        cudaStreamWaitEvent(stream[i], event[0]);
    }
    // body: a straight chain of `length` empty kernels per branch
    for (int i = 0; i < width; i++) {
        streamIdx = i;
        for (int j = 0; j < length; j++) {
            empty<<<1, 1, 0, stream[streamIdx]>>>();
        }
    }
    // join: stream[0] waits on every other branch
    for (int i = 1; i < width; i++) {
        cudaEventRecord(event[0], stream[i]);
        cudaStreamWaitEvent(stream[0], event[0]);
    }
    cudaStreamEndCapture(stream[0], &graph);
    return graph;
}

std::vector<const char *> metricName;
std::vector<float> metricValue;
int counter2 = 0;

void runDemo(cudaGraph_t graph, int length, int width) {
    cudaGraphExec_t graphExec;

    {
        auto start = getCpuTime();
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        auto end = getCpuTime();
        metricName.push_back("instantiation");
        metricValue.push_back(getMicroSecondDuration(start, end));
    }

    {
        RANGE("launch including upload");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("first_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("first_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }

    {
        RANGE("repeat launch in empty stream");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("repeat_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("repeat_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }

    {
        // re-instantiate the exec to simulate a first launch into a busy stream
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        long long maxTimeoutNanoSeconds = 4000 + 500 * length * width;
        waitWithTimeout<<<1, 1, 0, stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        RANGE("launch including upload in busy stream");
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);
        hostData->latch = 1;
        cudaStreamSynchronize(stream[0]);
        metricName.push_back("first_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }

    {
        RANGE("repeat launch in busy stream");
        long long maxTimeoutNanoSeconds = 4000 + 500 * length * width;
        waitWithTimeout<<<1, 1, 0, stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);
        hostData->latch = 1;
        cudaStreamSynchronize(stream[0]);
        metricName.push_back("repeat_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }

    {
        // re-instantiate the exec so the upload has work to do
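        // How this experiment is orchestrated: stream[0] plays the critical
        // path and is blocked by a kernel spinning on latch2, while stream[1]
        // is blocked by a kernel spinning on latch. The upload is enqueued to
        // stream[1] behind its blocker, so releasing latch lets the upload
        // run while the critical path is still busy; once the upload has
        // finished, releasing latch2 lets the graph launch run without
        // paying the upload cost on the critical path.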
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        long long maxTimeoutNanoSeconds = 4000 + 1000 * length * width;
        waitWithTimeout<<<1, 1, 0, stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected2, &hostData->timeElapsed2, &hostData->latch2);
        maxTimeoutNanoSeconds = 2000 + 500 * length * width;
        waitWithTimeout<<<1, 1, 0, stream[1]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        RANGE("uploading a graph off of the critical path");
        preUploadAnnotation<<<1, 1, 0, stream[1]>>>();
        cudaEventRecord(timingEvent[0], stream[0]);
        auto start = getCpuTime();
        cudaGraphUpload(graphExec, stream[1]);
        auto apiReturn = getCpuTime();
        cudaEventRecord(event[0], stream[1]);
        cudaEventRecord(timingEvent[1], stream[0]);
        postUploadAnnotation<<<1, 1, 0, stream[1]>>>();
        hostData->latch = 1;  // release the blocking kernel for the upload
        cudaStreamWaitEvent(stream[0], event[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventSynchronize(event[0]);
        // upload done; simulate the critical path being ready for the graph
        // to run by releasing the second latch
        hostData->latch2 = 1;  // release the work
        cudaStreamSynchronize(stream[0]);
        metricName.push_back("upload_api_time");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("upload_device_time");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->latch2 = 0;
        hostData->timeoutDetected = 0;
        hostData->timeoutDetected2 = 0;
    }

    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    RANGE_POP();
}

void usage() {
    printf("programName [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength]\n");
    printf("\toutputFmt - program output, default=3 (see below)\n");
    printf("\tnumTrials - number of trials per length\n");
    printf("\tlength - starting length of the topology\n");
    printf("\twidth - width of the graph topology\n");
    printf("\tpattern - structure of the graph, default=0 (see below)\n");
    printf("\tstride - how much to grow the length between each set of trials\n");
    printf("\tmaxLength - maximum length to try\n");
    printf("\n");
    printf("outputFmt can be:\n");
    printf("\t0: this help message\n");
    printf("\t1: csv data headers\n");
    printf("\t2: per trial csv data\n");
    printf("\t3: csv data & headers\n");
    printf("\t4: csv data is printed and trials are averaged for each length\n");
    printf("\t5: csv data is printed and trials are averaged for each length and headers are printed\n");
    printf("\n");
    printf("pattern can be:\n");
    printf("\t0: no interconnect between branches\n");
    printf("\t1: adds an extra root node before the initial fork\n");
}

int main(int argc, char **argv) {
    if (argc < 1) {
        usage();
        return 0;
    }

    int numTrials = 1, length = 20, width = 1, outputFmt = 3, pattern = 0, stride = 1;
    if (argc > 1) outputFmt = atoi(argv[1]);
    if (argc > 2) numTrials = atoi(argv[2]);
    if (argc > 3) length = atoi(argv[3]);
    if (argc > 4) width = atoi(argv[4]);
    if (argc > 5) pattern = atoi(argv[5]);
    if (argc > 6) stride = atoi(argv[6]);

    int maxLength = length;
    if (argc > 7) maxLength = atoi(argv[7]);
    if (maxLength < length) {
        maxLength = length;
    }

    if ((outputFmt & 4) && (outputFmt & 2)) {
        printf("printing average and all samples doesn't make sense\n");
    }

    if (length == 0 || width == 0 || outputFmt == 0 || outputFmt > 5 || pattern > 1) {
        usage();
        return 0;
    }

    bool singleEntry = (pattern == 1);
    cudaGraph_t graph;

    cudaFree(0);  // force CUDA context creation up front
    cudaMallocHost(&hostData, sizeof(*hostData));
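    // hostData lives in pinned (page-locked) host memory so the spinning
    // waitWithTimeout kernels can observe host-side latch writes while they
    // run: with unified virtual addressing, memory from cudaMallocHost is
    // directly dereferenceable from device code.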
    // runDemo's upload experiment uses stream[1], so create at least two
    // streams even when width == 1
    stream.resize(width < 2 ? 2 : width);
    for (size_t i = 0; i < stream.size(); i++) {
        cudaStreamCreate(&stream[i]);
    }
    cudaEventCreate(&event[0], cudaEventDisableTiming);
    cudaEventCreate(&timingEvent[0], 0);
    cudaEventCreate(&timingEvent[1], 0);

    {
        RANGE("warmup");
        for (int i = 0; i < width; i++) {
            empty<<<1, 1, 0, stream[i]>>>();
        }
        cudaStreamSynchronize(stream[0]);

        auto start = getCpuTime();
        graph = createParallelChain(length, width, singleEntry);
        auto end = getCpuTime();
        metricValue.push_back(getMicroSecondDuration(start, end));
        metricName.push_back("capture");
        runDemo(graph, length, width);
    }

    if (outputFmt & 1) {
        printf("length, width, pattern, ");
        for (size_t i = 0; i < metricName.size(); i++) {
            printf("%s, ", metricName[i]);
        }
        printf("\r\n");
    }

    if (!(outputFmt & 6)) {
        printf("skipping trials since no output is expected\n");
        return 0;
    }

    std::vector<float> metricTotal;
    metricTotal.resize(metricValue.size());

    while (length <= maxLength) {
        for (int i = 0; i < numTrials; i++) {
            metricName.clear();
            metricValue.clear();

            auto start = getCpuTime();
            graph = createParallelChain(length, width, singleEntry);
            auto end = getCpuTime();
            metricValue.push_back(getMicroSecondDuration(start, end));
            runDemo(graph, length, width);

            if (outputFmt & 2) {
                printf("%d, %d, %d, ", length, width, pattern);
                for (size_t j = 0; j < metricValue.size(); j++) {
                    printf("%0.3f, ", metricValue[j]);
                }
                printf("\r\n");
            }
            if (outputFmt & 4) {
                for (size_t j = 0; j < metricTotal.size(); j++) {
                    metricTotal[j] += metricValue[j];
                }
            }
        }
        if (outputFmt & 4) {
            printf("%d, %d, %d, ", length, width, pattern);
            for (size_t i = 0; i < metricTotal.size(); i++) {
                printf("%0.3f, ", metricTotal[i] / numTrials);
                metricTotal[i] = 0;
            }
            printf("\r\n");
        }
        length += stride;
    }
    printf("\n");

    return 0;
}
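/*
 * Example invocation (hypothetical file and binary names; assumes the NVTX3
 * headers shipped with the CUDA toolkit are on the include path -- NVTX3 is
 * header-only, so no extra library needs to be linked):
 *
 *   nvcc -O2 -o cudaGraphPerf cudaGraphPerf.cu
 *   ./cudaGraphPerf 5 10 20 2 0 10 100
 *
 * The second line prints csv headers plus per-length averages over 10
 * trials (outputFmt=5) for fork/join graphs of width 2 with no extra root
 * node, growing the length from 20 to 100 in steps of 10.
 */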