cuda-samples/Samples/6_Performance/cudaGraphsPerfScaling/cudaGraphPerfScaling.cu

435 lines
14 KiB
Plaintext
Raw Normal View History

2024-07-26 00:30:13 +08:00
/* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is a simple application showing the performance characteristics of cudaGraphs.
*/
#define USE_NVTX
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
// Flag type for the host/device latches: written by the host, polled by a spinning kernel.
// volatile so that every access is performed as an actual memory access.
using LatchType = volatile int;
// Returns the current host timestamp taken from std::chrono::high_resolution_clock.
std::chrono::time_point<std::chrono::high_resolution_clock> getCpuTime()
{
    using HostClock = std::chrono::high_resolution_clock;
    return HostClock::now();
}
// Returns the interval [start, end] in microseconds as a float.
// T is any std::chrono time_point type; the difference is truncated to whole
// nanoseconds first and then scaled by 0.001.
template <typename T>
float getMicroSecondDuration(T start, T end)
{
    const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
    return elapsed.count() * .001f;
}
// Returns the device-side time between two recorded CUDA events, in microseconds.
//
// start/end must be timing-enabled events that have both completed.
// cudaEventElapsedTime reports milliseconds; the result is scaled by 1000.
// If the query fails, the error is reported on stderr and 0 is returned instead
// of an indeterminate value.
float getAsyncMicroSecondDuration(cudaEvent_t start, cudaEvent_t end)
{
    float ms = 0.0f;
    const cudaError_t status = cudaEventElapsedTime(&ms, start, end);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaEventElapsedTime failed: %s\n", cudaGetErrorString(status));
        return 0.0f;
    }
    return ms * 1000;
}
// NVTX annotation helpers.
//
//   RANGE(name)      - scoped range: pushed here, popped when the enclosing scope ends.
//   RANGE_PUSH(name) - manually push a range (must be balanced by RANGE_POP()).
//   RANGE_POP()      - pop the most recently pushed range.
//
// When USE_NVTX is not defined all three macros compile to nothing, so the rest
// of the file builds without the NVTX headers.
#ifdef USE_NVTX
#include <nvtx3/nvToolsExt.h>
// RAII guard: pushes an NVTX range on construction and pops it on destruction.
class Tracer {
public:
    Tracer(const char* name) {
        nvtxRangePushA(name);
    }
    ~Tracer() {
        nvtxRangePop();
    }
    // Copying a guard would pop the same range twice.
    Tracer(const Tracer&) = delete;
    Tracer& operator=(const Tracer&) = delete;
};
// Two-level concatenation so __LINE__ is expanded before pasting; this gives
// each RANGE() its own variable name and allows several in one scope.
#define RANGE_CONCAT_IMPL(a, b) a##b
#define RANGE_CONCAT(a, b) RANGE_CONCAT_IMPL(a, b)
#define RANGE(name) Tracer RANGE_CONCAT(uniq_name_using_macros_, __LINE__)(name);
#define RANGE_PUSH(name) nvtxRangePushA(name)
#define RANGE_POP() nvtxRangePop();
#else
#define RANGE(name)
#define RANGE_PUSH(name) ((void)0)
#define RANGE_POP() ((void)0);
#endif
// Streams created in main(). stream[0] is the origin stream for graph capture and the
// stream every graph launch is issued into; the remaining entries carry the parallel
// branches during capture, and stream[1] is additionally used for the graph upload.
std::vector<cudaStream_t> stream;
// Event used only for cross-stream dependencies (fork/join while capturing, and to signal
// that the upload has finished); main() creates it with cudaEventDisableTiming.
cudaEvent_t event[1];
// Start/stop pair handed to cudaEventElapsedTime for the device-side measurements.
cudaEvent_t timingEvent[2];
// Block of host memory (allocated in main() with cudaMallocHost) through which the host
// and the waitWithTimeout kernels exchange results and release flags.
// The fields without a suffix and the fields ending in "2" form two independent sets so
// that two blocking kernels can be in flight at the same time.
struct hostData {
long long timeElapsed; // written by waitWithTimeout: time spent spinning, in timer units
bool timeoutDetected; // written by waitWithTimeout: true when the latch was still 0 on exit
long long timeElapsed2; // same as timeElapsed, for the second blocking kernel
bool timeoutDetected2; // same as timeoutDetected, for the second blocking kernel
LatchType latch; // set to 1 by the host to release the first blocking kernel
LatchType latch2; // set to 1 by the host to release the second blocking kernel
};
// Global pointer to the single instance. Note that the variable has the same name as the
// struct type, so the type must be spelled "struct hostData" wherever the variable is visible.
struct hostData *hostData;
// No-op kernel. Serves as the node payload when graphs are captured and as the
// warmup launch for each stream.
__global__ void empty() {}
// Function to read the GPU nanosecond timer in a kernel
// Implemented as inline PTX that moves the %globaltimer special register into a 64-bit
// output operand (constraint "=l"). The asm statement is marked volatile so that repeated
// calls, such as the polling loop in waitWithTimeout, each perform a fresh read.
__device__ __forceinline__ unsigned long long __globaltimer() {
unsigned long long globaltimer;
asm volatile ("mov.u64 %0, %globaltimer;" : "=l"(globaltimer));
return globaltimer;
}
// Busy-waits on the device until clock64() has advanced by at least `ticks`
// relative to the value read at kernel entry.
__global__ void delay(long long ticks)
{
    const long long deadline = clock64() + ticks;
    while (clock64() < deadline) {
        // spin
    }
}
// Spins until either `*latch` becomes non-zero or `nanoseconds` have elapsed on the
// global timer, whichever happens first.
//
//   nanoseconds     - maximum time to spin, measured with __globaltimer()
//   timeoutDetected - optional output: set to true when the latch is absent or still 0
//                     at the time the result is written (the latch is re-read here)
//   timeElapsed     - optional output: last timer sample minus the starting sample
//   latch           - optional release flag; with NULL the kernel always runs to the timeout
//
// The timer is always sampled at least once, and the latch is only consulted while the
// deadline has not been reached.
__global__ void waitWithTimeout(long long nanoseconds, bool* timeoutDetected, long long *timeElapsed, LatchType* latch)
{
    const long long startTime = __globaltimer();
    const long long deadline = startTime + nanoseconds;
    long long now = 0;

    for (;;) {
        now = __globaltimer();
        if (now >= deadline) {
            break;  // timed out
        }
        if (latch != NULL && *latch != 0) {
            break;  // released by the host
        }
    }

    if (timeElapsed != NULL) {
        *timeElapsed = now - startTime;
    }

    if (timeoutDetected) {
        // report timeout if latch not detected
        *timeoutDetected = (latch == NULL || *latch == 0);
    }
}
// Empty marker kernel; runDemo() launches it on the upload stream immediately
// before the cudaGraphUpload call.
__global__ void preUploadAnnotation() {}
// Empty marker kernel; runDemo() launches it on the upload stream after the
// cudaGraphUpload call and its completion event have been enqueued.
__global__ void postUploadAnnotation() {}
// Captures a graph made of `width` independent chains of `length` empty kernels each.
//
//   length      - number of kernel nodes per chain
//   width       - number of parallel chains; chain i is captured on stream[i], so the
//                 global `stream` vector must hold at least `width` streams
//   singleEntry - when true, one extra empty kernel is captured on stream[0] before the fork
//
// Returns the captured graph, or nullptr if stream capture could not be started or
// finished (the failure is reported on stderr). The caller owns the returned graph.
//
// Note: the RANGE_PUSH issued here is intentionally left open; it is closed by the
// RANGE_POP at the end of runDemo(), so the two functions must be called in pairs.
cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false)
{
    RANGE_PUSH(__func__);
    RANGE("capture");

    // Start from a defined value so a failed capture never returns garbage.
    cudaGraph_t graph = nullptr;

    cudaError_t status = cudaStreamBeginCapture(stream[0], cudaStreamCaptureModeGlobal);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaStreamBeginCapture failed: %s\n", cudaGetErrorString(status));
        return nullptr;
    }

    if (singleEntry) {
        empty<<<1,1,0,stream[0]>>>();
    }

    // Fork: every other stream joins the capture by waiting on an event from stream[0].
    cudaEventRecord(event[0], stream[0]);
    for (int branch = 1; branch < width; branch++) {
        cudaStreamWaitEvent(stream[branch], event[0]);
    }

    // One chain of `length` kernels per stream.
    for (int branch = 0; branch < width; branch++) {
        for (int node = 0; node < length; node++) {
            empty<<<1,1,0,stream[branch]>>>();
        }
    }

    // Join: stream[0] waits for the tail of every other chain.
    for (int branch = 1; branch < width; branch++) {
        cudaEventRecord(event[0], stream[branch]);
        cudaStreamWaitEvent(stream[0], event[0]);
    }

    status = cudaStreamEndCapture(stream[0], &graph);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaStreamEndCapture failed: %s\n", cudaGetErrorString(status));
        graph = nullptr;
    }
    return graph;
}
// Column names of the collected metrics; runDemo() and the warmup pass in main() append
// one entry per measurement.
std::vector<const char*> metricName;
// Measured values, appended in the same order as they are taken: timings in microseconds
// and the kernel timeout flags converted from bool.
std::vector<float> metricValue;
// Not referenced anywhere else in the visible part of this file.
int counter2 = 0;
// Runs the full set of measurements for one captured graph and appends the results to
// the global metricName / metricValue vectors.
//
//   graph  - graph to measure; it is destroyed before this function returns
//   length - chain length the graph was built with (only used to size the kernel timeouts)
//   width  - chain count the graph was built with (only used to size the kernel timeouts)
//
// Measurements, in order: instantiation, first launch, repeat launch, first launch into a
// busy stream, repeat launch into a busy stream, and upload off the critical path.
// "Busy" streams are produced with waitWithTimeout kernels that spin until the host sets
// the corresponding latch in hostData (or until their timeout expires).
//
// The RANGE_POP at the end closes the NVTX range opened by createParallelChain().
void runDemo(cudaGraph_t graph, int length, int width)
{
    // Widen before multiplying: length*width*1000 can exceed INT_MAX for large topologies.
    const long long nodeCount = static_cast<long long>(length) * width;

    // The upload measurement needs a second stream. Use stream[1] when it exists;
    // otherwise (e.g. width == 1) create a temporary one instead of indexing past the
    // end of the vector.
    cudaStream_t uploadStream = nullptr;
    bool ownsUploadStream = false;
    if (stream.size() > 1) {
        uploadStream = stream[1];
    } else {
        const cudaError_t status = cudaStreamCreate(&uploadStream);
        if (status == cudaSuccess) {
            ownsUploadStream = true;
        } else {
            fprintf(stderr, "cudaStreamCreate for the upload stream failed: %s\n", cudaGetErrorString(status));
            uploadStream = nullptr;
        }
    }

    cudaGraphExec_t graphExec = nullptr;
    {
        auto start = getCpuTime();
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        auto end = getCpuTime();
        metricName.push_back("instantiation");
        metricValue.push_back(getMicroSecondDuration(start, end));
    }
    {
        RANGE("launch including upload");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("first_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("first_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }
    {
        RANGE("repeat lauch in empty stream");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("repeat_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("repeat_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }
    {
        // re-instantiating the exec to simulate first launch into a busy stream.
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        long long maxTimeoutNanoSeconds = 4000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        RANGE("launch including upload in busy stream");
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);
        hostData->latch = 1;  // release the blocking kernel
        cudaStreamSynchronize(stream[0]);
        metricName.push_back("first_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }
    {
        RANGE("repeat lauch in busy stream");
        long long maxTimeoutNanoSeconds = 4000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);
        hostData->latch = 1;  // release the blocking kernel
        cudaStreamSynchronize(stream[0]);
        metricName.push_back("repeat_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }
    {
        // re-instantiating the exec to provide upload with work to do.
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        // stream[0] (the "critical path") is held by latch2, the upload stream by latch.
        long long maxTimeoutNanoSeconds = 4000 + 1000 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected2, &hostData->timeElapsed2, &hostData->latch2);
        maxTimeoutNanoSeconds = 2000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,uploadStream>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        RANGE("uploading a graph off of the critical path");
        preUploadAnnotation<<<1,1,0,uploadStream>>>();
        // NOTE(review): both timing events are recorded on stream[0], which is still held by
        // the latch2 kernel at this point, while the upload is enqueued on the upload stream.
        // Confirm that this pair is meant to bracket the upload on the device.
        cudaEventRecord(timingEvent[0], stream[0]);
        auto start = getCpuTime();
        cudaGraphUpload(graphExec, uploadStream);
        auto apiReturn = getCpuTime();
        cudaEventRecord(event[0], uploadStream);
        cudaEventRecord(timingEvent[1], stream[0]);
        postUploadAnnotation<<<1,1,0,uploadStream>>>();
        hostData->latch = 1; // release the blocking kernel for the upload
        cudaStreamWaitEvent(stream[0], event[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventSynchronize(event[0]); // upload done, simulate critical path being ready for the graph to run by the release of the second latch
        hostData->latch2 = 1; // release the work
        cudaStreamSynchronize(stream[0]);
        metricName.push_back("upload_api_time");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        // The spelling of the next metric name is kept as-is so existing CSV consumers keep working.
        metricName.push_back("updoad_device_time");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->latch2 = 0;
        hostData->timeoutDetected = 0;
        hostData->timeoutDetected2 = 0;
    }
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    if (ownsUploadStream) {
        cudaStreamDestroy(uploadStream);
    }
    RANGE_POP();
}
// Prints the command-line help (argument list, output formats, graph patterns) to stdout.
void usage() {
    static const char* const helpLines[] = {
        "programName [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength] \n",
        "\toutputFmt - program output, default=3 (see below)\n",
        "\tnumTrials (per length)\n",
        "\tstarting length of the topology\n",
        "\twidth - width of the graph topology\n",
        "\tpattern - Structure of graph, default=0 (see below)\n",
        "\tstride - how to grow the length between each set of trials \n",
        "\tmaxLength - maximum lenght to try \n",
        "\n",
        "outputFmt can be:\n",
        "\t0: this help message\n",
        "\t1: csv data headers\n",
        "\t2: per trial csv data\n",
        "\t3: csv data & headers\n",
        "\t4: csv data is printed and trials are averaged for each length\n",
        "\t5: csv data is printed and trials are averaged for each length and headers are printed\n",
        "\n",
        "Pattern can be:\n",
        "\t0: No interconnect between branches\n",
        "\t1: Adds an extra root node before the initial fork\n",
    };
    for (const char* line : helpLines) {
        printf("%s", line);
    }
}
// Reports a failed CUDA runtime call on stderr. Returns true when `status` is cudaSuccess.
static bool perfScalingCudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Releases the events, streams and pinned host block created by main().
static void perfScalingReleaseResources()
{
    cudaEventDestroy(timingEvent[1]);
    cudaEventDestroy(timingEvent[0]);
    cudaEventDestroy(event[0]);
    for (size_t s = 0; s < stream.size(); s++) {
        cudaStreamDestroy(stream[s]);
    }
    stream.clear();
    cudaFreeHost(hostData);
    hostData = nullptr;
}

// Entry point: programName [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength]
//
// Parses the arguments, creates the streams/events/pinned memory the measurements need,
// performs one warmup pass (which also establishes the metric names used for the CSV
// header) and then runs `numTrials` trials for every length from `length` to `maxLength`
// in steps of `stride`, printing per-trial and/or averaged CSV rows according to outputFmt.
//
// Returns 0 on success and after printing the usage text; EXIT_FAILURE if a CUDA
// resource could not be created.
int main(int argc, char **argv)
{
    if (argc < 1) {
        usage();
        return 0;
    }

    int numTrials = 1, length = 20, width = 1, outputFmt = 3, pattern = 0, stride = 1;
    if (argc > 1) outputFmt = atoi(argv[1]);
    if (argc > 2) numTrials = atoi(argv[2]);
    if (argc > 3) length = atoi(argv[3]);
    if (argc > 4) width = atoi(argv[4]);
    if (argc > 5) pattern = atoi(argv[5]);
    if (argc > 6) stride = atoi(argv[6]);
    int maxLength = length;
    if (argc > 7) maxLength = atoi(argv[7]);
    if (maxLength < length) {
        maxLength = length;
    }

    if ((outputFmt & 4) && (outputFmt & 2)) {
        printf("printing average and all samples doesn't make sense\n");
    }

    // Reject values that cannot work: non-positive sizes would produce empty or invalid
    // topologies, and a non-positive stride would make the length sweep never terminate.
    if (length <= 0 ||
        width <= 0 ||
        numTrials <= 0 ||
        stride <= 0 ||
        outputFmt <= 0 ||
        outputFmt > 5 ||
        pattern < 0 ||
        pattern > 1)
    {
        usage();
        return 0;
    }

    const bool singleEntry = (pattern == 1);
    cudaGraph_t graph;

    cudaFree(0);  // touch the runtime up front so initialization is not part of any measurement

    if (!perfScalingCudaOk(cudaMallocHost(&hostData, sizeof(*hostData)), "cudaMallocHost")) {
        return EXIT_FAILURE;
    }
    // cudaMallocHost does not clear the allocation; the latches must start at 0 or the
    // blocking kernels would be released immediately.
    hostData->timeElapsed = 0;
    hostData->timeoutDetected = false;
    hostData->timeElapsed2 = 0;
    hostData->timeoutDetected2 = false;
    hostData->latch = 0;
    hostData->latch2 = 0;

    // runDemo() uses stream[1] for the graph upload, so always create at least two
    // streams even when the topology itself only needs one.
    const int numStreams = (width < 2) ? 2 : width;
    stream.resize(numStreams);
    for (int s = 0; s < numStreams; s++) {
        if (!perfScalingCudaOk(cudaStreamCreate(&stream[s]), "cudaStreamCreate")) {
            return EXIT_FAILURE;
        }
    }

    if (!perfScalingCudaOk(cudaEventCreate(&event[0], cudaEventDisableTiming), "cudaEventCreate") ||
        !perfScalingCudaOk(cudaEventCreate(&timingEvent[0], 0), "cudaEventCreate") ||
        !perfScalingCudaOk(cudaEventCreate(&timingEvent[1], 0), "cudaEventCreate")) {
        return EXIT_FAILURE;
    }

    {
        RANGE("warmup");
        for (int s = 0; s < numStreams; s++) {
            empty<<<1,1,0,stream[s]>>>();
        }
        cudaStreamSynchronize(stream[0]);

        // The warmup pass also records the metric names that form the CSV header.
        auto start = getCpuTime();
        graph = createParallelChain(length, width, singleEntry);
        auto end = getCpuTime();
        metricValue.push_back(getMicroSecondDuration(start, end));
        metricName.push_back("capture");
        runDemo(graph, length, width);
    }

    if (outputFmt & 1) {
        printf("length, width, pattern, ");
        for (size_t m = 0; m < metricName.size(); m++) {
            printf("%s, ", metricName[m]);
        }
        printf("\r\n");
    }

    if (!(outputFmt & 6)) {
        printf("skipping trials since no output is expected\n");
        perfScalingReleaseResources();
        return 0;
    }

    std::vector<double> metricTotal;
    metricTotal.resize(metricValue.size());

    while (length <= maxLength) {
        for (int trial = 0; trial < numTrials; trial++) {
            metricName.clear();
            metricValue.clear();

            auto start = getCpuTime();
            graph = createParallelChain(length, width, singleEntry);
            auto end = getCpuTime();
            metricValue.push_back(getMicroSecondDuration(start, end));
            runDemo(graph, length, width);

            if (outputFmt & 2) {
                printf("%d, %d, %d, ", length, width, pattern);
                for (size_t m = 0; m < metricValue.size(); m++) {
                    printf("%0.3f, ", metricValue[m]);
                }
                printf("\r\n");
            }
            if (outputFmt & 4) {
                for (size_t m = 0; m < metricTotal.size() && m < metricValue.size(); m++) {
                    metricTotal[m] += metricValue[m];
                }
            }
        }

        if (outputFmt & 4) {
            printf("%d, %d, %d, ", length, width, pattern);
            for (size_t m = 0; m < metricTotal.size(); m++) {
                printf("%0.3f, ", metricTotal[m] / numTrials);
                metricTotal[m] = 0;
            }
            printf("\r\n");
        }

        length += stride;
    }
    printf("\n");

    perfScalingReleaseResources();
    return 0;
}