cuda-samples/Samples/6_Performance/cudaGraphsPerfScaling/cudaGraphPerfScaling.cu

/* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This is a simple application showing the performance characteristics of cudaGraphs.
 */

// Build-time switch: when defined, <nvtx3/nvToolsExt.h> is included below and
// the RANGE macros emit NVTX ranges; when undefined, RANGE() expands to nothing.
#define USE_NVTX

#include <cstdio>
#include <cuda_runtime.h>
#include <vector>
#include <chrono>

// Flag type for the latches that the host writes and waitWithTimeout() polls;
// volatile so that each poll performs a fresh read.
typedef volatile int LatchType;

// Returns the current host timestamp taken from std::chrono::high_resolution_clock.
std::chrono::time_point<std::chrono::high_resolution_clock> getCpuTime()
{
    using HostClock = std::chrono::high_resolution_clock;
    const HostClock::time_point now = HostClock::now();
    return now;
}

// Converts the interval between two std::chrono time points into microseconds.
// The interval is first expressed as a whole number of nanoseconds and then
// scaled to a float, so sub-nanosecond resolution is not retained.
template <typename T>
float getMicroSecondDuration(T start, T end)
{
    const auto elapsedNs = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
    const float nsToUs = .001f;
    return elapsedNs.count() * nsToUs;
}

// Returns the device-side time elapsed between two recorded CUDA events, in
// microseconds.
//
// Both events must have been created with timing enabled and must already
// have completed. If cudaEventElapsedTime() reports an error, the error is
// printed to stderr and 0 is returned instead of an indeterminate value.
float getAsyncMicroSecondDuration(cudaEvent_t start, cudaEvent_t end)
{
    float ms = 0.0f;
    const cudaError_t status = cudaEventElapsedTime(&ms, start, end);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaEventElapsedTime failed: %s\n", cudaGetErrorString(status));
        return 0.0f;
    }
    return ms * 1000.0f; // milliseconds -> microseconds
}

#ifdef USE_NVTX
#include <nvtx3/nvToolsExt.h>

// Scoped NVTX range: pushes a named range when constructed and pops it when
// destroyed, so the range covers the enclosing C++ scope.
class Tracer {
public:
    explicit Tracer(const char* name) {
        nvtxRangePushA(name);
    }
    ~Tracer() {
        nvtxRangePop();
    }
    // Non-copyable: a copy would pop the same range a second time.
    Tracer(const Tracer&) = delete;
    Tracer& operator=(const Tracer&) = delete;
};
// Two-step concatenation so that __LINE__ is expanded before pasting; this
// gives every RANGE() its own variable name and allows several per scope.
#define RANGE_CONCAT_IMPL(a, b) a##b
#define RANGE_CONCAT(a, b) RANGE_CONCAT_IMPL(a, b)
// RANGE(name): scoped range, closed automatically at the end of the scope.
#define RANGE(name) Tracer RANGE_CONCAT(uniq_name_using_macros_, __LINE__)(name);
// RANGE_PUSH / RANGE_POP: manually paired range, may span function boundaries.
#define RANGE_PUSH(name) nvtxRangePushA(name)
#define RANGE_POP() nvtxRangePop();
#else
// NVTX disabled: every macro must still exist so that call sites compile.
#define RANGE(name)
#define RANGE_PUSH(name) ((void)0)
#define RANGE_POP() ((void)0)
#endif

// Streams used for graph capture and for all launches; main() sizes this from
// the `width` argument and creates every entry with cudaStreamCreate().
std::vector<cudaStream_t> stream;
// Event created with cudaEventDisableTiming in main(); used for the
// cross-stream dependencies (fork/join during capture, upload completion).
cudaEvent_t event[1];
// Timing-enabled event pair passed to getAsyncMicroSecondDuration():
// [0] marks the start and [1] the end of a device-side measurement.
cudaEvent_t timingEvent[2];

// State shared between the host and the waitWithTimeout() kernels. main()
// allocates one instance with cudaMallocHost() and runDemo() passes pointers
// to individual fields as kernel arguments.
struct hostData {
    // Written by waitWithTimeout(): difference of two __globaltimer() readings
    // taken at kernel start and at loop exit.
    long long timeElapsed;
    // Written by waitWithTimeout(): true when the latch was still 0 at exit.
    bool timeoutDetected;
    // Same as the two fields above, for the second blocking kernel used in the
    // upload scenario of runDemo().
    long long timeElapsed2;
    bool timeoutDetected2;
    // Set to 1 by the host to release the kernel polling it; reset to 0 by the
    // host after each measurement.
    LatchType latch;
    LatchType latch2;
};

// Single shared instance; note that the variable has the same name as the type.
struct hostData *hostData;

// No-op kernel. It is the node type of the captured graph chains and is also
// launched once per stream as a warmup in main().
__global__ void empty() {}

// Device helper that returns the current value of the GPU's %globaltimer
// special register (the GPU nanosecond timer), read through inline PTX.
__device__ __forceinline__ unsigned long long __globaltimer()
{
    unsigned long long nowNs;
    asm volatile ("mov.u64 %0, %globaltimer;" : "=l"(nowNs));
    return nowNs;
}

// Busy-waits until clock64() has advanced by `ticks` from the value read at
// kernel entry. Every launched thread spins independently.
__global__ void delay(long long ticks)
{
    const long long deadline = clock64() + ticks;
    while (clock64() < deadline) {
        // spin
    }
}

// Blocks the stream it is launched into until either `*latch` becomes non-zero
// or `nanoseconds` of __globaltimer() time have passed, whichever comes first.
//
//   nanoseconds     - maximum time to spin, in __globaltimer() units.
//   timeoutDetected - optional output; set to true when the latch is NULL or
//                     still 0 when re-read after the loop has exited.
//   timeElapsed     - optional output; last timer reading minus the reading
//                     taken at kernel entry.
//   latch           - optional flag to poll; with NULL only the timeout ends
//                     the wait.
__global__ void waitWithTimeout(long long nanoseconds, bool* timeoutDetected, long long *timeElapsed, LatchType* latch)
{
    const long long begin = __globaltimer();
    const long long deadline = begin + nanoseconds;
    long long now = 0;
    for (;;) {
        now = __globaltimer();
        if (!(now < deadline)) {
            break; // timed out; the latch is deliberately not read on this path
        }
        if (latch != NULL && *latch != 0) {
            break; // released by the host
        }
    }
    if (timeElapsed != NULL) {
        *timeElapsed = now - begin;
    }
    if (timeoutDetected) {
        // The latch is read again here, so a release that lands after the
        // deadline but before this read is still reported as "no timeout".
        *timeoutDetected = (latch == NULL || *latch == 0);
    }
}

// No-op kernel that runDemo() launches on the upload stream right before
// cudaGraphUpload(); presumably a marker for profiler timelines.
__global__ void preUploadAnnotation() {}

// No-op kernel that runDemo() launches on the upload stream after the upload
// has been enqueued; presumably a marker for profiler timelines.
__global__ void postUploadAnnotation() {}

// Builds a fork/join graph by stream capture: `width` parallel branches, each
// a chain of `length` empty kernels, forked from and joined back into stream[0].
//
//   length      - number of empty-kernel nodes per branch.
//   width       - number of branches; branch i is captured on stream[i], so
//                 `stream` must hold at least `width` streams.
//   singleEntry - when true, one extra empty kernel is captured on stream[0]
//                 before the fork so the graph has a single root node.
//
// Returns the captured graph, or nullptr (with a message on stderr) when the
// arguments do not fit `stream` or when capture could not be started/finished.
//
// The NVTX range pushed by RANGE_PUSH here is intentionally left open; it is
// closed by the RANGE_POP() at the end of runDemo().
cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false)
{
    RANGE_PUSH(__func__);
    RANGE("capture");
    cudaGraph_t graph = nullptr;

    if (width < 1 || static_cast<size_t>(width) > stream.size()) {
        fprintf(stderr, "createParallelChain: width %d does not fit the %zu available streams\n",
                width, stream.size());
        return nullptr;
    }

    cudaError_t status = cudaStreamBeginCapture(stream[0], cudaStreamCaptureModeGlobal);
    if (status != cudaSuccess) {
        fprintf(stderr, "createParallelChain: cudaStreamBeginCapture failed: %s\n",
                cudaGetErrorString(status));
        return nullptr;
    }

    if (singleEntry) {
        empty<<<1,1,0,stream[0]>>>();
    }

    // Fork: every other stream joins the capture by waiting on stream[0].
    cudaEventRecord(event[0], stream[0]);
    for (int i = 1; i < width; i++) {
        cudaStreamWaitEvent(stream[i], event[0]);
    }

    // One chain of `length` nodes per branch.
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < length; j++) {
            empty<<<1,1,0,stream[i]>>>();
        }
    }

    // Join: stream[0] waits for the tail of every other branch.
    for (int i = 1; i < width; i++) {
        cudaEventRecord(event[0], stream[i]);
        cudaStreamWaitEvent(stream[0], event[0]);
    }

    status = cudaStreamEndCapture(stream[0], &graph);
    if (status != cudaSuccess) {
        fprintf(stderr, "createParallelChain: cudaStreamEndCapture failed: %s\n",
                cudaGetErrorString(status));
        graph = nullptr;
    }
    return graph;
}

// Names and values of the collected measurements. runDemo() and main() append
// to them; main() prints the names as the CSV header and the values as rows.
std::vector<const char*> metricName;
std::vector<float> metricValue;

// NOTE(review): not referenced by any code visible in this file.
int counter2 = 0;
// Instantiates `graph` and measures it in several launch scenarios, appending
// one metricName/metricValue entry per measurement, in this order:
//   instantiation, first/repeat launch into an idle stream (API and total
//   time), first/repeat launch behind a blocking kernel (device time plus a
//   timeout flag), and an explicit cudaGraphUpload on a second stream (API
//   time, device time, timeout flag).
//
//   graph         - captured graph to benchmark; destroyed before returning.
//   length, width - graph dimensions, used only to scale the timeouts of the
//                   blocking kernels.
//
// Also destroys the executable graph and closes the NVTX range that
// createParallelChain() left open with RANGE_PUSH.
//
// NOTE(review): the metric name "blockingKernelTimeoutDetected" is pushed three
// times and "updoad_device_time" is misspelled; both are kept unchanged because
// they are emitted as CSV column headers.
void runDemo(cudaGraph_t graph, int length, int width)
{
    // Computed in 64 bits so that large length*width cannot overflow int.
    const long long nodeCount = static_cast<long long>(length) * width;

    cudaGraphExec_t graphExec = nullptr;
    {
        auto start = getCpuTime();
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        auto end = getCpuTime();
        metricName.push_back("instantiation");
        metricValue.push_back(getMicroSecondDuration(start, end));
    }
    {
        RANGE("launch including upload");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("first_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("first_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }
    {
        RANGE("repeat lauch in empty stream");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("repeat_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("repeat_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }
    {
        // re-instantiating the exec to simulate first launch into a busy stream.
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);

        // The blocking kernel keeps stream[0] busy until the host sets the latch
        // (or the timeout expires), so the launch is enqueued behind it.
        long long maxTimeoutNanoSeconds = 4000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);

        RANGE("launch including upload in busy stream");
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);

        hostData->latch = 1;
        cudaStreamSynchronize(stream[0]);

        metricName.push_back("first_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }
    {
        RANGE("repeat lauch in busy stream");
        long long maxTimeoutNanoSeconds = 4000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);

        hostData->latch = 1;
        cudaStreamSynchronize(stream[0]);

        metricName.push_back("repeat_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }
    {
        // re-instantiating the exec to provide upload with work to do.
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);

        // The upload runs on a second stream. `stream` may hold only one entry
        // (width == 1), so a temporary stream is created instead of indexing
        // past the end of the vector.
        cudaStream_t uploadStream = nullptr;
        bool ownsUploadStream = false;
        if (stream.size() > 1) {
            uploadStream = stream[1];
        } else {
            const cudaError_t status = cudaStreamCreate(&uploadStream);
            if (status == cudaSuccess) {
                ownsUploadStream = true;
            } else {
                // Falls back to the default stream (nullptr) so the metric list
                // keeps its shape; the numbers of this section are then unreliable.
                fprintf(stderr, "runDemo: cudaStreamCreate failed: %s\n", cudaGetErrorString(status));
                uploadStream = nullptr;
            }
        }

        long long maxTimeoutNanoSeconds = 4000 + 1000 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected2, &hostData->timeElapsed2, &hostData->latch2);
        maxTimeoutNanoSeconds = 2000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,uploadStream>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);

        RANGE("uploading a graph off of the critical path");
        preUploadAnnotation<<<1,1,0,uploadStream>>>();
        // The timing events bracket the upload on the stream that performs it.
        // Recording them on stream[0] would place them back-to-back behind the
        // blocking kernel and measure nothing.
        cudaEventRecord(timingEvent[0], uploadStream);
        auto start = getCpuTime();
        cudaGraphUpload(graphExec, uploadStream);
        auto apiReturn = getCpuTime();
        cudaEventRecord(event[0], uploadStream);
        cudaEventRecord(timingEvent[1], uploadStream);
        postUploadAnnotation<<<1,1,0,uploadStream>>>();

        hostData->latch = 1; // release the blocking kernel for the upload
        cudaStreamWaitEvent(stream[0],event[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventSynchronize(event[0]); // upload done, simulate critical path being ready for the graph to run by the release of the second latch

        hostData->latch2 = 1; // release the work
        cudaStreamSynchronize(stream[0]);
        // timingEvent[1] is recorded after event[0], so wait for it explicitly
        // before its timestamp is queried.
        cudaEventSynchronize(timingEvent[1]);

        metricName.push_back("upload_api_time");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("updoad_device_time");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);

        hostData->latch = 0;
        hostData->latch2 = 0;
        hostData->timeoutDetected = 0;
        hostData->timeoutDetected2 = 0;

        if (ownsUploadStream) {
            cudaStreamSynchronize(uploadStream);
            cudaStreamDestroy(uploadStream);
        }
    }
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    RANGE_POP();
}

// Prints the command-line help (argument order, output formats and graph
// patterns) to stdout.
void usage() {
    static const char* const helpLines[] = {
        "programName [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength] \n",
        "\toutputFmt - program output, default=3 (see below)\n",
        "\tnumTrials (per length)\n",
        "\tstarting length of the topology\n",
        "\twidth - width of the graph topology\n",
        "\tpattern - Structure of graph, default=0 (see below)\n",
        "\tstride - how to grow the length between each set of trials \n",
        "\tmaxLength - maximum lenght to try \n",
        "\n",
        "outputFmt can be:\n",
        "\t0: this help message\n",
        "\t1: csv data headers\n",
        "\t2: per trial csv data\n",
        "\t3: csv data & headers\n",
        "\t4: csv data is printed and trials are averaged for each length\n",
        "\t5: csv data is printed and trials are averaged for each length and headers are printed\n",
        "\n",
        "Pattern can be:\n",
        "\t0: No interconnect between branches\n",
        "\t1: Adds an extra root node before the initial fork\n",
    };
    for (const char* line : helpLines) {
        fputs(line, stdout);
    }
}

// Entry point. Parses the optional positional arguments
//   [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength]
// creates the streams, events and pinned host state, performs one warmup run
// (which also supplies the CSV column names), and then runs `numTrials` trials
// for every length from `length` to `maxLength` in steps of `stride`, printing
// per-trial and/or averaged CSV rows according to the bits of `outputFmt`.
//
// Returns 0 on success or after printing the help text, and 1 when the pinned
// host allocation fails.
int main(int argc, char **argv)
{
    if(argc < 1) {
        usage();
        return 0;
    }

    int numTrials=1, length=20, width=1, outputFmt=3, pattern=0, stride = 1;
    if(argc > 1) outputFmt = atoi(argv[1]);
    if(argc > 2) numTrials = atoi(argv[2]);
    if(argc > 3) length= atoi(argv[3]);
    if(argc > 4) width= atoi(argv[4]);
    if(argc > 5) pattern = atoi(argv[5]);
    if(argc > 6) stride = atoi(argv[6]);
    int maxLength = length;
    if(argc > 7) maxLength = atoi(argv[7]);
    if (maxLength < length) {
        maxLength = length;
    }

    if((outputFmt & 4) && (outputFmt & 2)) {
        printf("printing average and all samples doesn't make sense\n");
    }

    // Negative sizes would turn into huge unsigned values, and a non-positive
    // stride would never advance the length loop, so all of them get the help
    // text just like the zero values do.
    if(length <= 0 ||
       width <= 0 ||
       stride <= 0 ||
       outputFmt <= 0 ||
       outputFmt > 5 ||
       pattern < 0 ||
       pattern > 1)
    {
        usage();
        return 0;
    }

    bool singleEntry = (pattern == 1);

    cudaGraph_t graph;

    cudaFree(0);
    const cudaError_t allocStatus = cudaMallocHost(&hostData, sizeof(*hostData));
    if (allocStatus != cudaSuccess) {
        fprintf(stderr, "cudaMallocHost failed: %s\n", cudaGetErrorString(allocStatus));
        return 1;
    }
    // cudaMallocHost() does not clear the allocation; the latches must start at
    // 0 or the blocking kernels would be released immediately.
    hostData->timeElapsed = 0;
    hostData->timeoutDetected = false;
    hostData->timeElapsed2 = 0;
    hostData->timeoutDetected2 = false;
    hostData->latch = 0;
    hostData->latch2 = 0;

    // runDemo() uploads the graph on a second stream, so at least two streams
    // are created even when the graph itself has a single branch.
    const int numStreams = (width > 1) ? width : 2;
    stream.resize(numStreams);
    for (int i = 0; i < numStreams; i++)
    {
        cudaStreamCreate(&stream[i]);
    }

    cudaEventCreate(&event[0], cudaEventDisableTiming);
    cudaEventCreate(&timingEvent[0], 0);
    cudaEventCreate(&timingEvent[1], 0);

    {
        RANGE("warmup");
        for (int i = 0; i < width; i++)
        {
            empty<<<1,1,0,stream[i]>>>();
        }
        cudaStreamSynchronize(stream[0]);

        auto start = getCpuTime();
        graph = createParallelChain(length, width, singleEntry);
        auto end = getCpuTime();
        metricValue.push_back(getMicroSecondDuration(start, end));
        metricName.push_back("capture");
        runDemo(graph, length, width);
    }

    if (outputFmt & 1) {
        printf("length, width, pattern, ");
        for (size_t col = 0; col < metricName.size(); col++) {
            printf("%s, ", metricName[col]);
        }
        printf("\r\n");
    }

    if (!(outputFmt & 6)) {
        printf("skipping trials since no output is expected\n");
    } else {
        // One accumulator per metric column; the warmup run fixes the column count.
        std::vector<double> metricTotal;
        metricTotal.resize(metricValue.size());

        while (length <= maxLength) {
            for (int trial = 0; trial < numTrials; trial++) {
                metricName.clear();
                metricValue.clear();
                auto start = getCpuTime();
                graph = createParallelChain(length, width, singleEntry);
                auto end = getCpuTime();
                metricValue.push_back(getMicroSecondDuration(start, end));

                runDemo(graph, length, width);

                if (outputFmt & 2) {
                    printf("%d, %d, %d, ",length, width, pattern);
                    for (size_t col = 0; col < metricValue.size(); col++) {
                        printf("%0.3f, ", metricValue[col]);
                    }
                    printf("\r\n");
                }
                if (outputFmt & 4) {
                    for (size_t col = 0; col < metricTotal.size() && col < metricValue.size(); col++) {
                        metricTotal[col] += metricValue[col];
                    }
                }
            }

            // Without any trial there is nothing to average.
            if ((outputFmt & 4) && numTrials > 0) {
                printf("%d, %d, %d, ",length, width, pattern);
                for (size_t col = 0; col < metricTotal.size(); col++) {
                    printf("%0.3f, ", metricTotal[col]/numTrials);
                    metricTotal[col] = 0;
                }
                printf("\r\n");
            }

            length += stride;
        }

        printf("\n");
    }

    // Release everything that was created above.
    cudaEventDestroy(timingEvent[1]);
    cudaEventDestroy(timingEvent[0]);
    cudaEventDestroy(event[0]);
    for (size_t i = 0; i < stream.size(); i++) {
        cudaStreamDestroy(stream[i]);
    }
    stream.clear();
    cudaFreeHost(hostData);
    hostData = nullptr;

    return 0;
}
Updating samples for CUDA 12.5 2024-07-26 00:30:13 +08:00			`/* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`* This is a simple application showing the performance characteristics of cudaGraphs.`
			`*/`

			`#define USE_NVTX`

			`#include <cstdio>`
			`#include <cuda_runtime.h>`
			`#include <vector>`
			`#include <chrono>`

			`typedef volatile int LatchType;`

			`std::chrono::time_point<std::chrono::high_resolution_clock> getCpuTime()`
			`{`
			`return std::chrono::high_resolution_clock::now();`
			`}`

			`template <typename T>`
			`float getMicroSecondDuration(T start, T end)`
			`{`
			`return std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count() *.001f;`
			`}`

			`float getAsyncMicroSecondDuration(cudaEvent_t start, cudaEvent_t end)`
			`{`
			`float ms;`
			`cudaEventElapsedTime(&ms, start, end);`
			`return ms*1000;`
			`}`

			`#ifdef USE_NVTX`
			`#include <nvtx3/nvToolsExt.h>`

			`class Tracer {`
			`public:`
			`Tracer(const char* name) {`
			`nvtxRangePushA(name);`
			`}`
			`~Tracer() {`
			`nvtxRangePop();`
			`}`
			`};`
			`#define RANGE(name) Tracer uniq_name_using_macros(name);`
			`#define RANGE_PUSH(name) nvtxRangePushA(name)`
			`#define RANGE_POP() nvtxRangePop();`
			`#else`
			`#define RANGE(name)`
			`#endif`

			`std::vector<cudaStream_t> stream;`
			`cudaEvent_t event[1];`
			`cudaEvent_t timingEvent[2];`

			`struct hostData {`
			`long long timeElapsed;`
			`bool timeoutDetected;`
			`long long timeElapsed2;`
			`bool timeoutDetected2;`
			`LatchType latch;`
			`LatchType latch2;`
			`};`

			`struct hostData *hostData;`

			`__global__ void empty()`
			`{`
			`}`

			`// Function to read the GPU nanosecond timer in a kernel`
			`__device__ __forceinline__ unsigned long long __globaltimer() {`
			`unsigned long long globaltimer;`
			`asm volatile ("mov.u64 %0, %globaltimer;" : "=l"(globaltimer));`
			`return globaltimer;`
			`}`

			`__global__ void delay(long long ticks)`
			`{`
			`long long endTime = clock64() + ticks;`
			`while (clock64() < endTime);`
			`}`

			`__global__ void waitWithTimeout(long long nanoseconds, bool* timeoutDetected, long long *timeElapsed, LatchType* latch)`
			`{`
			`long long startTime = __globaltimer();`
			`long long endTime = startTime + nanoseconds;`
			`long long time = 0;`
			`do {`
			`time = __globaltimer();`
			`} while (time < endTime && (latch == NULL \|\| *latch == 0));`
			`if (timeElapsed != NULL) {`
			`*timeElapsed = time - startTime;`
			`}`
			`if (timeoutDetected) {`
			`// report timeout if latch not detected`
			`*timeoutDetected = (latch == NULL \|\| *latch == 0);`
			`}`
			`}`

			`__global__ void preUploadAnnotation()`
			`{`
			`}`

			`__global__ void postUploadAnnotation()`
			`{`
			`}`

			`cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false)`
			`{`
			`RANGE_PUSH(__func__);`
			`RANGE("capture");`
			`cudaGraph_t graph;`
			`cudaStreamBeginCapture(stream[0], cudaStreamCaptureModeGlobal);`
			`int streamIdx = 0;`
			`if (singleEntry) {`
			`empty<<<1,1,0,stream[streamIdx]>>>();`
			`}`

			`cudaEventRecord(event[0], stream[0]);`
			`for (int i = 1; i < width; i++) {`
			`cudaStreamWaitEvent(stream[i], event[0]);`
			`}`

			`for (int i = 0; i < width; i++) {`
			`streamIdx = i;`
			`for (int j = 0; j < length; j++) {`
			`empty<<<1,1,0,stream[streamIdx]>>>();`
			`}`
			`}`

			`for (int i = 1; i < width; i++) {`
			`cudaEventRecord(event[0], stream[i]);`
			`cudaStreamWaitEvent(stream[0], event[0]);`
			`}`

			`cudaStreamEndCapture(stream[0], &graph);`
			`return graph;`
			`}`

			`std::vector<const char*> metricName;`
			`std::vector<float> metricValue;`

			`int counter2 = 0;`
			`void runDemo(cudaGraph_t graph, int length, int width)`
			`{`
			`cudaGraphExec_t graphExec;`
			`{`
			`auto start = getCpuTime();`
			`cudaGraphInstantiateWithFlags(&graphExec, graph, 0);`
			`auto end = getCpuTime();`
			`metricName.push_back("instantiation");`
			`metricValue.push_back(getMicroSecondDuration(start, end));`
			`}`
			`{`
			`RANGE("launch including upload");`
			`auto start = getCpuTime();`
			`cudaGraphLaunch(graphExec, stream[0]);`
			`auto apiReturn = getCpuTime();`
			`cudaStreamSynchronize(stream[0]);`
			`auto streamSync = getCpuTime();`
			`metricName.push_back("first_launch_api");`
			`metricValue.push_back(getMicroSecondDuration(start, apiReturn));`
			`metricName.push_back("first_launch_total");`
			`metricValue.push_back(getMicroSecondDuration(start, streamSync));`
			`}`
			`{`
			`RANGE("repeat lauch in empty stream");`
			`auto start = getCpuTime();`
			`cudaGraphLaunch(graphExec, stream[0]);`
			`auto apiReturn = getCpuTime();`
			`cudaStreamSynchronize(stream[0]);`
			`auto streamSync = getCpuTime();`
			`metricName.push_back("repeat_launch_api");`
			`metricValue.push_back(getMicroSecondDuration(start, apiReturn));`
			`metricName.push_back("repeat_launch_total");`
			`metricValue.push_back(getMicroSecondDuration(start, streamSync));`
			`}`
			`{`
			`// re-instantiating the exec to simulate first launch into a busy stream.`
			`cudaGraphExecDestroy(graphExec);`
			`cudaGraphInstantiateWithFlags(&graphExec, graph, 0);`

			`long long maxTimeoutNanoSeconds = 4000 + 500*length*width;`
			`waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);`

			`RANGE("launch including upload in busy stream");`
			`cudaEventRecord(timingEvent[0], stream[0]);`
			`cudaGraphLaunch(graphExec, stream[0]);`
			`cudaEventRecord(timingEvent[1], stream[0]);`

			`hostData->latch = 1;`
			`cudaStreamSynchronize(stream[0]);`

			`metricName.push_back("first_launch_device");`
			`metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));`
			`metricName.push_back("blockingKernelTimeoutDetected");`
			`metricValue.push_back(hostData->timeoutDetected);`
			`hostData->latch = 0;`
			`hostData->timeoutDetected = 0;`
			`}`
			`{`
			`RANGE("repeat lauch in busy stream");`
			`long long maxTimeoutNanoSeconds = 4000 + 500*length*width;`
			`waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);`
			`cudaEventRecord(timingEvent[0], stream[0]);`
			`cudaGraphLaunch(graphExec, stream[0]);`
			`cudaEventRecord(timingEvent[1], stream[0]);`

			`hostData->latch = 1;`
			`cudaStreamSynchronize(stream[0]);`

			`metricName.push_back("repeat_launch_device");`
			`metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));`
			`metricName.push_back("blockingKernelTimeoutDetected");`
			`metricValue.push_back(hostData->timeoutDetected);`
			`hostData->latch = 0;`
			`hostData->timeoutDetected = 0;`
			`}`
			`{`
			`// re-instantiating the exec to provide upload with work to do.`
			`cudaGraphExecDestroy(graphExec);`
			`cudaGraphInstantiateWithFlags(&graphExec, graph, 0);`
			`long long maxTimeoutNanoSeconds = 4000 + 1000*length*width;`
			`waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected2, &hostData->timeElapsed2, &hostData->latch2);`
			`maxTimeoutNanoSeconds = 2000 + 500*length*width;`
			`waitWithTimeout<<<1,1,0,stream[1]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);`

			`RANGE("uploading a graph off of the critical path");`
			`preUploadAnnotation<<<1,1,0,stream[1]>>>();`
			`cudaEventRecord(timingEvent[0], stream[0]);`
			`auto start = getCpuTime();`
			`cudaGraphUpload(graphExec, stream[1]);`
			`auto apiReturn = getCpuTime();`
			`cudaEventRecord(event[0],stream[1]);`
			`cudaEventRecord(timingEvent[1], stream[0]);`
			`postUploadAnnotation<<<1,1,0,stream[1]>>>();`

			`hostData->latch = 1; // release the blocking kernel for the upload`
			`cudaStreamWaitEvent(stream[0],event[0]);`
			`cudaGraphLaunch(graphExec, stream[0]);`
			`cudaEventSynchronize(event[0]); // upload done, similuate critical path being ready for the graph to run by the release of the second latch`

			`hostData->latch2 = 1; // release the work`
			`cudaStreamSynchronize(stream[0]);`

			`metricName.push_back("upload_api_time");`
			`metricValue.push_back(getMicroSecondDuration(start, apiReturn));`
			`metricName.push_back("updoad_device_time");`
			`metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));`
			`metricName.push_back("blockingKernelTimeoutDetected");`
			`metricValue.push_back(hostData->timeoutDetected);`

			`hostData->latch = 0;`
			`hostData->latch2 = 0;`
			`hostData->timeoutDetected = 0;`
			`hostData->timeoutDetected2 = 0;`
			`}`
			`cudaGraphExecDestroy(graphExec);`
			`cudaGraphDestroy(graph);`
			`RANGE_POP();`
			`}`

			`void usage() {`
			`printf("programName [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength] \n");`
			`printf("\toutputFmt - program output, default=3 (see below)\n");`
			`printf("\tnumTrials (per length)\n");`
			`printf("\tstarting length of the topology\n");`
			`printf("\twidth - width of the graph topology\n");`
			`printf("\tpattern - Structure of graph, default=0 (see below)\n");`
			`printf("\tstride - how to grow the length between each set of trials \n");`
			`printf("\tmaxLength - maximum lenght to try \n");`
			`printf("\n");`
			`printf("outputFmt can be:\n");`
			`printf("\t0: this help message\n");`
			`printf("\t1: csv data headers\n");`
			`printf("\t2: per trial csv data\n");`
			`printf("\t3: csv data & headers\n");`
			`printf("\t4: csv data is printed and trials are averaged for each length\n");`
			`printf("\t5: csv data is printed and trials are averaged for each length and headers are printed\n");`
			`printf("\n");`
			`printf("Pattern can be:\n");`
			`printf("\t0: No interconnect between branches\n");`
			`printf("\t1: Adds an extra root node before the initial fork\n");`
			`}`

			`int main(int argc, char **argv)`
			`{`
			`if(argc < 1) {`
			`usage();`
			`return 0;`
			`}`

			`int numTrials=1, length=20, width=1, outputFmt=3, pattern=0, stride = 1;`
			`if(argc > 1) outputFmt = atoi(argv[1]);`
			`if(argc > 2) numTrials = atoi(argv[2]);`
			`if(argc > 3) length= atoi(argv[3]);`
			`if(argc > 4) width= atoi(argv[4]);`
			`if(argc > 5) pattern = atoi(argv[5]);`
			`if(argc > 6) stride = atoi(argv[6]);`
			`int maxLength = length;`
			`if(argc > 7) maxLength = atoi(argv[7]);`
			`if (maxLength < length) {`
			`maxLength = length;`
			`}`

			`if((outputFmt & 4) && (outputFmt & 2)) {`
			`printf("printing average and all samples doesn't make sense\n");`
			`}`

			`if(length == 0 \|\|`
			`width == 0 \|\|`
			`outputFmt == 0 \|\|`
			`outputFmt > 5 \|\|`
			`pattern > 1)`
			`{`
			`usage();`
			`return 0;`
			`}`

			`bool singleEntry = (pattern == 1);`

			`cudaGraph_t graph;`

			`cudaFree(0);`
			`cudaMallocHost(&hostData, sizeof(*hostData));`
			`stream.resize(width);`
			`for (int i = 0; i < width; i++)`
			`{`
			`cudaStreamCreate(&stream[i]);`
			`}`

			`cudaEventCreate(&event[0], cudaEventDisableTiming);`
			`cudaEventCreate(&timingEvent[0], 0);`
			`cudaEventCreate(&timingEvent[1], 0);`

			`{`
			`RANGE("warmup");`
			`for (int i = 0; i < width; i++)`
			`{`
			`empty<<<1,1,0,stream[i]>>>();`
			`}`
			`cudaStreamSynchronize(stream[0]);`

			`auto start = getCpuTime();`
			`graph = createParallelChain(length, width, singleEntry);`
			`auto end = getCpuTime();`
			`metricValue.push_back(getMicroSecondDuration(start, end));`
			`metricName.push_back("capture");`
			`runDemo(graph, length, width);`
			`}`

			`if (outputFmt & 1) {`
			`printf("length, width, pattern, ");`
			`for (int i = 0; i < metricName.size(); i++) {`
			`printf("%s, ", metricName[i]);`
			`}`
			`printf("\r\n");`
			`}`

			`if (!(outputFmt & 6)) {`
			`printf("skipping trials since no output is expected\n");`
			`return;`
			`}`

			`std::vector<double> metricTotal;`
			`metricTotal.resize(metricValue.size());`

			`while (length <= maxLength) {`
			`for (int i = 0; i < numTrials; i++) {`
			`metricName.clear();`
			`metricValue.clear();`
			`auto start = getCpuTime();`
			`graph = createParallelChain(length, width, singleEntry);`
			`auto end = getCpuTime();`
			`metricValue.push_back(getMicroSecondDuration(start, end));`

			`runDemo(graph, length, width);`

			`if (outputFmt & 2) {`
			`printf("%d, %d, %d, ",length, width, pattern);`
			`for (int i = 0; i < metricValue.size(); i++) {`
			`printf("%0.3f, ", metricValue[i]);`
			`}`
			`printf("\r\n");`
			`}`
			`if (outputFmt & 4) {`
			`for (int i = 0; i < metricTotal.size(); i++) {`
			`metricTotal[i] += metricValue[i];`
			`}`
			`}`
			`}`

			`if (outputFmt & 4) {`
			`printf("%d, %d, %d, ",length, width, pattern);`
			`for (int i = 0; i < metricTotal.size(); i++) {`
			`printf("%0.3f, ", metricTotal[i]/numTrials);`
			`metricTotal[i] = 0;`
			`}`
			`printf("\r\n");`
			`}`

			`length += stride;`
			`}`

			`printf("\n");`
			`}`