// Mirror of https://github.com/NVIDIA/cuda-samples.git (synced 2024-11-28 21:49:15 +08:00)
// 435 lines, 14 KiB, plaintext
|
/* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
||
|
|
||
|
/*
 * This is a simple application showing the performance characteristics of cudaGraphs.
 */
|
||
|
|
||
|
// Enables the NVTX range annotations used throughout this file.
// Guarded so that also passing -DUSE_NVTX on the compiler command line does not
// trigger a macro-redefinition diagnostic.
#ifndef USE_NVTX
#define USE_NVTX
#endif
|
||
|
|
||
|
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
|
||
|
|
||
|
// Flag type used for the host-written latches polled by waitWithTimeout().
// volatile forces every poll to re-read the value from memory.
using LatchType = volatile int;
|
||
|
|
||
|
// Returns the current host time as reported by std::chrono::high_resolution_clock.
std::chrono::time_point<std::chrono::high_resolution_clock> getCpuTime()
{
    using HostClock = std::chrono::high_resolution_clock;
    const HostClock::time_point now = HostClock::now();
    return now;
}
|
||
|
|
||
|
// Returns the interval between two std::chrono time points in microseconds.
// The interval is first converted to whole nanoseconds and then scaled by 0.001,
// so sub-microsecond resolution survives in the fractional part of the result.
template <typename T>
float getMicroSecondDuration(T start, T end)
{
    const std::chrono::nanoseconds elapsed =
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
    const float microsPerNano = .001f;
    return elapsed.count() * microsPerNano;
}
|
||
|
|
||
|
// Returns the device-side time between two CUDA events, in microseconds.
//
// cudaEventElapsedTime() reports milliseconds; the value is scaled by 1000 here.
// If the query fails, the error is printed to stderr and -1 is returned instead
// of an uninitialized value.
float getAsyncMicroSecondDuration(cudaEvent_t start, cudaEvent_t end)
{
    float ms = 0.0f;
    const cudaError_t status = cudaEventElapsedTime(&ms, start, end);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaEventElapsedTime failed: %s\n", cudaGetErrorString(status));
        return -1.0f;
    }
    return ms * 1000;
}
|
||
|
|
||
|
#ifdef USE_NVTX
#include <nvtx3/nvToolsExt.h>

// Scoped NVTX range: pushes a named range on construction and pops it on destruction.
class Tracer {
public:
    Tracer(const char* name) {
        nvtxRangePushA(name);
    }
    ~Tracer() {
        nvtxRangePop();
    }
    // A copy would pop the same range a second time when destroyed.
    Tracer(const Tracer&) = delete;
    Tracer& operator=(const Tracer&) = delete;
};
// RANGE(name): opens a range that is closed at the end of the enclosing scope.
#define RANGE(name) Tracer uniq_name_using_macros(name);
// RANGE_PUSH / RANGE_POP: manual push/pop for ranges that outlive a single scope.
#define RANGE_PUSH(name) nvtxRangePushA(name)
#define RANGE_POP() nvtxRangePop();
#else
// NVTX disabled: every range macro must still exist so call sites compile;
// each one expands to nothing.
#define RANGE(name)
#define RANGE_PUSH(name)
#define RANGE_POP()
#endif
|
||
|
|
||
|
// Streams shared by graph capture and by the timed launches; main() sizes and creates them.
std::vector<cudaStream_t> stream;
// Dependency event used for the fork/join edges during capture and for ordering
// the graph launch after the upload; main() creates it with timing disabled.
cudaEvent_t event[1];
// Start/stop event pair used for device-side timing of launches and uploads.
cudaEvent_t timingEvent[2];
|
||
|
|
||
|
// State exchanged between the host and the waitWithTimeout() kernels.
// The kernels receive pointers to individual members; main() allocates the
// single instance with cudaMallocHost().
struct hostData {
    long long timeElapsed;    // nanoseconds the first blocking kernel spent waiting
    bool timeoutDetected;     // set by the first blocking kernel if its latch was not seen
    long long timeElapsed2;   // same as timeElapsed, for the second blocking kernel
    bool timeoutDetected2;    // same as timeoutDetected, for the second blocking kernel
    LatchType latch;          // host writes 1 to release the first blocking kernel
    LatchType latch2;         // host writes 1 to release the second blocking kernel
};

// The one shared instance; note the variable deliberately reuses the struct's name.
struct hostData *hostData;
|
||
|
|
||
|
// No-op kernel: used as the node payload of the captured graphs and for stream warmup.
__global__ void empty() {}
|
||
|
|
||
|
// Reads the GPU nanosecond timer (the %globaltimer special register) from device code.
__device__ __forceinline__ unsigned long long __globaltimer() {
    unsigned long long nowNs;
    asm volatile ("mov.u64 %0, %globaltimer;" : "=l"(nowNs));
    return nowNs;
}
|
||
|
|
||
|
// Busy-waits on the device until clock64() has advanced by at least `ticks`.
__global__ void delay(long long ticks)
{
    const long long deadline = clock64() + ticks;
    for (;;) {
        if (clock64() >= deadline) {
            break;
        }
    }
}
|
||
|
|
||
|
// Spins on the device until either `*latch` becomes non-zero or `nanoseconds`
// have passed on the GPU global timer, whichever happens first.
//
//   nanoseconds     - maximum time to wait
//   timeoutDetected - optional; receives true when the latch is absent or still 0
//                     once the wait has ended
//   timeElapsed     - optional; receives the nanoseconds spent waiting
//   latch           - optional; when NULL the kernel simply waits out the full timeout
__global__ void waitWithTimeout(long long nanoseconds, bool* timeoutDetected, long long *timeElapsed, LatchType* latch)
{
    const long long begin = __globaltimer();
    const long long deadline = begin + nanoseconds;
    long long now = 0;

    while (true) {
        now = __globaltimer();
        if (now >= deadline) {
            break;  // timed out
        }
        if (latch != NULL && *latch != 0) {
            break;  // released by the host
        }
    }

    if (timeElapsed != NULL) {
        *timeElapsed = now - begin;
    }

    if (timeoutDetected) {
        // The latch is re-read here: a timeout is reported only if it is still unset.
        const bool released = (latch != NULL && *latch != 0);
        *timeoutDetected = !released;
    }
}
|
||
|
|
||
|
// Empty marker kernel; runDemo() launches it immediately before cudaGraphUpload().
__global__ void preUploadAnnotation() {}
|
||
|
|
||
|
// Empty marker kernel; runDemo() launches it right after the upload has been enqueued.
__global__ void postUploadAnnotation() {}
|
||
|
|
||
|
// Captures a graph made of `width` parallel chains, each `length` empty kernels long.
//
// Chain i is captured on stream[i]; stream[0] forks to the other streams through
// event[0] before the chains and joins them again afterwards.
//
//   length      - number of kernel nodes in each chain
//   width       - number of parallel chains; must be between 1 and stream.size()
//   singleEntry - when true, one extra kernel node is added before the fork
//
// Returns the captured graph, or nullptr if the arguments are invalid or the
// capture could not be started/finished (the reason is printed to stderr).
//
// NOTE: the RANGE_PUSH below has no matching RANGE_POP in this function; the
// range stays open on return and runDemo() ends with the RANGE_POP that closes it.
cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false)
{
    RANGE_PUSH(__func__);
    RANGE("capture");
    cudaGraph_t graph = nullptr;

    // Every chain needs its own stream; refuse to index past the ones that exist.
    if (width < 1 || static_cast<size_t>(width) > stream.size()) {
        fprintf(stderr, "createParallelChain: width %d requires between 1 and %zu streams\n",
                width, stream.size());
        return graph;
    }

    cudaError_t status = cudaStreamBeginCapture(stream[0], cudaStreamCaptureModeGlobal);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaStreamBeginCapture failed: %s\n", cudaGetErrorString(status));
        return graph;
    }

    if (singleEntry) {
        empty<<<1,1,0,stream[0]>>>();
    }

    // Fork: pull the remaining streams into the capture.
    cudaEventRecord(event[0], stream[0]);
    for (int i = 1; i < width; i++) {
        cudaStreamWaitEvent(stream[i], event[0]);
    }

    // One chain of `length` kernels per stream.
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < length; j++) {
            empty<<<1,1,0,stream[i]>>>();
        }
    }

    // Join: every side stream must rejoin stream[0] before the capture can end.
    for (int i = 1; i < width; i++) {
        cudaEventRecord(event[0], stream[i]);
        cudaStreamWaitEvent(stream[0], event[0]);
    }

    status = cudaStreamEndCapture(stream[0], &graph);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaStreamEndCapture failed: %s\n", cudaGetErrorString(status));
        graph = nullptr;
    }
    return graph;
}
|
||
|
|
||
|
// Parallel arrays: metricName[i] labels metricValue[i]. main() and runDemo()
// append to both and main() prints them as CSV.
std::vector<const char*> metricName;
std::vector<float> metricValue;

// Not referenced by any code visible in this file.
int counter2 = 0;
|
||
|
// Runs every timed scenario against `graph` and appends one entry per
// measurement to metricName / metricValue (all values in microseconds, except
// the blockingKernelTimeoutDetected flags which are 0 or 1).
//
//   graph  - graph to measure; it is destroyed before this function returns
//   length - chain length used to build the graph (only used to scale timeouts)
//   width  - chain count used to build the graph (only used to scale timeouts)
//
// The function ends with a RANGE_POP that has no matching push in this
// function; it closes the range createParallelChain() leaves open.
void runDemo(cudaGraph_t graph, int length, int width)
{
    // The blocking-kernel timeouts scale with the node count. Widen before
    // multiplying so that large topologies cannot overflow int.
    const long long nodeCount = static_cast<long long>(length) * width;

    cudaGraphExec_t graphExec = nullptr;
    {
        auto start = getCpuTime();
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        auto end = getCpuTime();
        metricName.push_back("instantiation");
        metricValue.push_back(getMicroSecondDuration(start, end));
    }
    {
        RANGE("launch including upload");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("first_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("first_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }
    {
        RANGE("repeat lauch in empty stream");
        auto start = getCpuTime();
        cudaGraphLaunch(graphExec, stream[0]);
        auto apiReturn = getCpuTime();
        cudaStreamSynchronize(stream[0]);
        auto streamSync = getCpuTime();
        metricName.push_back("repeat_launch_api");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("repeat_launch_total");
        metricValue.push_back(getMicroSecondDuration(start, streamSync));
    }
    {
        // re-instantiating the exec to simulate first launch into a busy stream.
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);

        // Keep the stream busy with a kernel that spins until the host sets the latch.
        long long maxTimeoutNanoSeconds = 4000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);

        RANGE("launch including upload in busy stream");
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);

        hostData->latch = 1;  // release the blocking kernel
        cudaStreamSynchronize(stream[0]);

        metricName.push_back("first_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }
    {
        RANGE("repeat lauch in busy stream");
        long long maxTimeoutNanoSeconds = 4000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);
        cudaEventRecord(timingEvent[0], stream[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventRecord(timingEvent[1], stream[0]);

        hostData->latch = 1;  // release the blocking kernel
        cudaStreamSynchronize(stream[0]);

        metricName.push_back("repeat_launch_device");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);
        hostData->latch = 0;
        hostData->timeoutDetected = 0;
    }
    {
        // The upload is issued on a second stream. When the global stream list
        // holds only one stream (e.g. a width of 1), create a temporary stream
        // for this phase instead of indexing past the end of the vector.
        const bool ownsUploadStream = stream.size() < 2;
        cudaStream_t uploadStream = nullptr;
        if (ownsUploadStream) {
            const cudaError_t status = cudaStreamCreate(&uploadStream);
            if (status != cudaSuccess) {
                fprintf(stderr, "cudaStreamCreate for the upload stream failed: %s\n",
                        cudaGetErrorString(status));
            }
        } else {
            uploadStream = stream[1];
        }

        // re-instantiating the exec to provide upload with work to do.
        cudaGraphExecDestroy(graphExec);
        cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
        long long maxTimeoutNanoSeconds = 4000 + 1000 * nodeCount;
        waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected2, &hostData->timeElapsed2, &hostData->latch2);
        maxTimeoutNanoSeconds = 2000 + 500 * nodeCount;
        waitWithTimeout<<<1,1,0,uploadStream>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch);

        RANGE("uploading a graph off of the critical path");
        preUploadAnnotation<<<1,1,0,uploadStream>>>();
        cudaEventRecord(timingEvent[0], stream[0]);
        auto start = getCpuTime();
        cudaGraphUpload(graphExec, uploadStream);
        auto apiReturn = getCpuTime();
        cudaEventRecord(event[0], uploadStream);
        cudaEventRecord(timingEvent[1], stream[0]);
        postUploadAnnotation<<<1,1,0,uploadStream>>>();

        hostData->latch = 1; // release the blocking kernel for the upload
        cudaStreamWaitEvent(stream[0], event[0]);
        cudaGraphLaunch(graphExec, stream[0]);
        cudaEventSynchronize(event[0]); // upload done, simulate critical path being ready for the graph to run by the release of the second latch

        hostData->latch2 = 1; // release the work
        cudaStreamSynchronize(stream[0]);

        metricName.push_back("upload_api_time");
        metricValue.push_back(getMicroSecondDuration(start, apiReturn));
        metricName.push_back("updoad_device_time");
        metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1]));
        metricName.push_back("blockingKernelTimeoutDetected");
        metricValue.push_back(hostData->timeoutDetected);

        hostData->latch = 0;
        hostData->latch2 = 0;
        hostData->timeoutDetected = 0;
        hostData->timeoutDetected2 = 0;

        if (ownsUploadStream && uploadStream != nullptr) {
            cudaStreamDestroy(uploadStream);
        }
    }
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    RANGE_POP();
}
|
||
|
|
||
|
// Prints the command-line help to stdout: positional arguments with their
// defaults, followed by the accepted outputFmt and pattern values.
void usage() {
    static const char* const helpLines[] = {
        "programName [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength] \n",
        "\toutputFmt - program output, default=3 (see below)\n",
        "\tnumTrials - number of trials (per length), default=1\n",
        "\tlength - starting length of the topology, default=20\n",
        "\twidth - width of the graph topology, default=1\n",
        "\tpattern - Structure of graph, default=0 (see below)\n",
        "\tstride - how to grow the length between each set of trials, default=1\n",
        "\tmaxLength - maximum length to try, default=length\n",
        "\n",
        "outputFmt can be:\n",
        "\t0: this help message\n",
        "\t1: csv data headers\n",
        "\t2: per trial csv data\n",
        "\t3: csv data & headers\n",
        "\t4: csv data is printed and trials are averaged for each length\n",
        "\t5: csv data is printed and trials are averaged for each length and headers are printed\n",
        "\n",
        "Pattern can be:\n",
        "\t0: No interconnect between branches\n",
        "\t1: Adds an extra root node before the initial fork\n",
    };
    for (const char* line : helpLines) {
        fputs(line, stdout);
    }
}
|
||
|
|
||
|
// Entry point: parses the positional arguments described by usage(), creates
// the shared streams/events/pinned host state, performs one warmup pass (whose
// metric names double as the CSV header), then runs numTrials trials for every
// length from `length` to `maxLength` in steps of `stride` and prints the
// results as CSV.
//
// Returns 0 on success or when only the help text is printed, 1 if the pinned
// host allocation fails.
int main(int argc, char **argv)
{
    if(argc < 1) {
        usage();
        return 0;
    }

    int numTrials=1, length=20, width=1, outputFmt=3, pattern=0, stride = 1;
    if(argc > 1) outputFmt = atoi(argv[1]);
    if(argc > 2) numTrials = atoi(argv[2]);
    if(argc > 3) length= atoi(argv[3]);
    if(argc > 4) width= atoi(argv[4]);
    if(argc > 5) pattern = atoi(argv[5]);
    if(argc > 6) stride = atoi(argv[6]);
    int maxLength = length;
    if(argc > 7) maxLength = atoi(argv[7]);
    if (maxLength < length) {
        maxLength = length;
    }

    if((outputFmt & 4) && (outputFmt & 2)) {
        printf("printing average and all samples doesn't make sense\n");
    }

    // Reject values that would index out of range (non-positive width), build
    // an empty/negative topology, or never terminate the length sweep
    // (non-positive stride).
    if(length <= 0 ||
       width <= 0 ||
       stride <= 0 ||
       outputFmt <= 0 ||
       outputFmt > 5 ||
       pattern < 0 ||
       pattern > 1)
    {
        usage();
        return 0;
    }

    bool singleEntry = (pattern == 1);

    cudaGraph_t graph;

    cudaFree(0);
    const cudaError_t allocStatus = cudaMallocHost(&hostData, sizeof(*hostData));
    if (allocStatus != cudaSuccess || hostData == NULL) {
        fprintf(stderr, "cudaMallocHost failed: %s\n", cudaGetErrorString(allocStatus));
        return 1;
    }
    // The allocation is not zeroed; the blocking kernels poll the latches, so
    // every field must start from a known state.
    hostData->timeElapsed = 0;
    hostData->timeoutDetected = false;
    hostData->timeElapsed2 = 0;
    hostData->timeoutDetected2 = false;
    hostData->latch = 0;
    hostData->latch2 = 0;

    // runDemo() uses stream[1] for the off-critical-path upload, so at least
    // two streams are needed even when the graph itself is one chain wide.
    const int numStreams = (width < 2) ? 2 : width;
    stream.resize(numStreams);
    for (int i = 0; i < numStreams; i++)
    {
        cudaStreamCreate(&stream[i]);
    }

    cudaEventCreate(&event[0], cudaEventDisableTiming);
    cudaEventCreate(&timingEvent[0], 0);
    cudaEventCreate(&timingEvent[1], 0);

    // Releases everything created above; called on every exit path below.
    auto releaseResources = []() {
        cudaEventDestroy(timingEvent[1]);
        cudaEventDestroy(timingEvent[0]);
        cudaEventDestroy(event[0]);
        for (cudaStream_t s : stream) {
            cudaStreamDestroy(s);
        }
        stream.clear();
        cudaFreeHost(hostData);
        hostData = NULL;
    };

    {
        RANGE("warmup");
        for (int i = 0; i < numStreams; i++)
        {
            empty<<<1,1,0,stream[i]>>>();
        }
        // Wait for every warmup kernel, not just the one on stream[0].
        for (int i = 0; i < numStreams; i++)
        {
            cudaStreamSynchronize(stream[i]);
        }

        auto start = getCpuTime();
        graph = createParallelChain(length, width, singleEntry);
        auto end = getCpuTime();
        metricValue.push_back(getMicroSecondDuration(start, end));
        metricName.push_back("capture");
        runDemo(graph, length, width);
    }

    if (outputFmt & 1) {
        printf("length, width, pattern, ");
        for (size_t m = 0; m < metricName.size(); m++) {
            printf("%s, ", metricName[m]);
        }
        printf("\r\n");
    }

    if (!(outputFmt & 6)) {
        printf("skipping trials since no output is expected\n");
        releaseResources();
        return 0;
    }

    // One running total per metric column, sized from the warmup pass.
    std::vector<double> metricTotal(metricValue.size(), 0.0);

    while (length <= maxLength) {
        for (int trial = 0; trial < numTrials; trial++) {
            metricName.clear();
            metricValue.clear();
            auto start = getCpuTime();
            graph = createParallelChain(length, width, singleEntry);
            auto end = getCpuTime();
            metricValue.push_back(getMicroSecondDuration(start, end));

            runDemo(graph, length, width);

            if (outputFmt & 2) {
                printf("%d, %d, %d, ",length, width, pattern);
                for (size_t m = 0; m < metricValue.size(); m++) {
                    printf("%0.3f, ", metricValue[m]);
                }
                printf("\r\n");
            }
            if (outputFmt & 4) {
                // Bounded by both sizes so a short trial can never read past metricValue.
                for (size_t m = 0; m < metricTotal.size() && m < metricValue.size(); m++) {
                    metricTotal[m] += metricValue[m];
                }
            }
        }

        if (outputFmt & 4) {
            printf("%d, %d, %d, ",length, width, pattern);
            for (size_t m = 0; m < metricTotal.size(); m++) {
                printf("%0.3f, ", metricTotal[m]/numTrials);
                metricTotal[m] = 0;
            }
            printf("\r\n");
        }

        length += stride;
    }

    printf("\n");
    releaseResources();
    return 0;
}
|
||
|
|