cuda-samples/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu

/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <cstdio>
#include <vector>

#include <helper_cuda.h>
#include <helper_timer.h>

using namespace std;

const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test";

typedef enum {
  P2P_WRITE = 0,
  P2P_READ = 1,
} P2PDataTransfer;

typedef enum {
  CE = 0,
  SM = 1,
} P2PEngine;

P2PEngine p2p_mechanism = CE;  // By default use Copy Engine

// Macro for checking cuda errors following a cuda launch or api call
#define cudaCheckError()                                       \
  {                                                            \
    cudaError_t e = cudaGetLastError();                        \
    if (e != cudaSuccess) {                                    \
      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
             cudaGetErrorString(e));                           \
      exit(EXIT_FAILURE);                                      \
    }                                                          \
  }
__global__ void delay(volatile int *flag,
                      unsigned long long timeout_clocks = 10000000) {
  // Wait until the application notifies us that it has completed queuing up the
  // experiment, or timeout and exit, allowing the application to make progress
  long long int start_clock, sample_clock;
  start_clock = clock64();

  while (!*flag) {
    sample_clock = clock64();

    if (sample_clock - start_clock > timeout_clocks) {
      break;
    }
  }
}

// This kernel is for demonstration purposes only, not a performant kernel for
// p2p transfers.
__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src,
                        size_t num_elems) {
  size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
  size_t gridSize = blockDim.x * gridDim.x;

#pragma unroll(5)
  for (size_t i = globalId; i < num_elems; i += gridSize) {
    dest[i] = src[i];
  }
}

///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
  printf("Usage:  p2pBandwidthLatencyTest [OPTION]...\n");
  printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
  printf("\n");

  printf("Options:\n");
  printf("--help\t\tDisplay this help menu\n");
  printf(
      "--p2p_read\tUse P2P reads for data transfers between GPU pairs and show "
      "corresponding results.\n \t\tDefault used is P2P write operation.\n");
  printf("--sm_copy\tUse SM intiated p2p transfers instead of Copy Engine\n");
}

void checkP2Paccess(int numGPUs) {
  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);
    cudaCheckError();

    for (int j = 0; j < numGPUs; j++) {
      int access;
      if (i != j) {
        cudaDeviceCanAccessPeer(&access, i, j);
        cudaCheckError();
        printf("Device=%d %s Access Peer Device=%d\n", i,
               access ? "CAN" : "CANNOT", j);
      }
    }
  }
  printf(
      "\n***NOTE: In case a device doesn't have P2P access to other one, it "
      "falls back to normal memcopy procedure.\nSo you can see lesser "
      "Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n");
}

void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice,
                    int num_elems, int repeat, bool p2paccess,
                    cudaStream_t streamToRun) {
  int blockSize = 0;
  int numBlocks = 0;

  cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);
  cudaCheckError();

  if (p2p_mechanism == SM && p2paccess) {
    for (int r = 0; r < repeat; r++) {
      copyp2p<<<numBlocks, blockSize, 0, streamToRun>>>(
          (int4 *)dest, (int4 *)src, num_elems / 4);
    }
  } else {
    for (int r = 0; r < repeat; r++) {
      cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice,
                          sizeof(int) * num_elems, streamToRun);
    }
  }
}

void outputBandwidthMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
  int numElems = 10000000;
  int repeat = 5;
  volatile int *flag = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);
  vector<cudaStream_t> stream(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
    cudaMalloc(&buffers[d], numElems * sizeof(int));
    cudaCheckError();
    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
  }

  vector<double> bandwidthMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaCheckError();
          cudaDeviceEnablePeerAccess(i, 0);
          cudaCheckError();
          cudaSetDevice(i);
          cudaCheckError();
        }
      }

      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repeatitions
      // relatively low.  Higher repeatitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      delay<<<1, 1, 0, stream[i]>>>(flag);
      cudaCheckError();
      cudaEventRecord(start[i], stream[i]);
      cudaCheckError();

      if (i == j) {
        // Perform intra-GPU, D2D copies
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream[i]);

      } else {
        if (p2p_method == P2P_WRITE) {
          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                         stream[i]);
        } else {
          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                         stream[i]);
        }
      }

      cudaEventRecord(stop[i], stream[i]);
      cudaCheckError();

      // Release the queued events
      *flag = 1;
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      float time_ms;
      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
      double time_s = time_ms / 1e3;

      double gb = numElems * sizeof(int) * repeat / (double)1e9;
      if (i == j) {
        gb *= 2;  // must count both the read and the write here
      }
      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
      if (p2p && access) {
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaCheckError();
      }
    }
  }

  printf("   D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream[d]);
    cudaCheckError();
  }

  cudaFreeHost((void *)flag);
  cudaCheckError();
}

void outputBidirectionalBandwidthMatrix(int numGPUs, bool p2p) {
  int numElems = 10000000;
  int repeat = 5;
  volatile int *flag = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);
  vector<cudaStream_t> stream0(numGPUs);
  vector<cudaStream_t> stream1(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaMalloc(&buffers[d], numElems * sizeof(int));
    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
    cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking);
    cudaCheckError();
    cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking);
    cudaCheckError();
  }

  vector<double> bandwidthMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaSetDevice(i);
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaDeviceEnablePeerAccess(i, 0);
          cudaCheckError();
        }
      }

      cudaSetDevice(i);
      cudaStreamSynchronize(stream0[i]);
      cudaStreamSynchronize(stream1[j]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repeatitions
      // relatively low.  Higher repeatitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      cudaSetDevice(i);
      // No need to block stream1 since it'll be blocked on stream0's event
      delay<<<1, 1, 0, stream0[i]>>>(flag);
      cudaCheckError();

      // Force stream1 not to start until stream0 does, in order to ensure
      // the events on stream0 fully encompass the time needed for all
      // operations
      cudaEventRecord(start[i], stream0[i]);
      cudaStreamWaitEvent(stream1[j], start[i], 0);

      if (i == j) {
        // For intra-GPU perform 2 memcopies buffersD2D <-> buffers
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream0[i]);
        performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat,
                       access, stream1[i]);
      } else {
        if (access && p2p_mechanism == SM) {
          cudaSetDevice(j);
        }
        performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                       stream1[j]);
        if (access && p2p_mechanism == SM) {
          cudaSetDevice(i);
        }
        performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                       stream0[i]);
      }

      // Notify stream0 that stream1 is complete and record the time of
      // the total transaction
      cudaEventRecord(stop[j], stream1[j]);
      cudaStreamWaitEvent(stream0[i], stop[j], 0);
      cudaEventRecord(stop[i], stream0[i]);

      // Release the queued operations
      *flag = 1;
      cudaStreamSynchronize(stream0[i]);
      cudaStreamSynchronize(stream1[j]);
      cudaCheckError();

      float time_ms;
      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
      double time_s = time_ms / 1e3;

      double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9;
      if (i == j) {
        gb *= 2;  // must count both the read and the write here
      }
      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
      if (p2p && access) {
        cudaSetDevice(i);
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
      }
    }
  }

  printf("   D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream0[d]);
    cudaCheckError();
    cudaStreamDestroy(stream1[d]);
    cudaCheckError();
  }

  cudaFreeHost((void *)flag);
  cudaCheckError();
}

void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
  int repeat = 100;
  volatile int *flag = NULL;
  StopWatchInterface *stopWatch = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
  vector<cudaStream_t> stream(numGPUs);
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  if (!sdkCreateTimer(&stopWatch)) {
    printf("Failed to create stop watch\n");
    exit(EXIT_FAILURE);
  }
  sdkStartTimer(&stopWatch);

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
    cudaMalloc(&buffers[d], sizeof(int));
    cudaMalloc(&buffersD2D[d], sizeof(int));
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
  }

  vector<double> gpuLatencyMatrix(numGPUs * numGPUs);
  vector<double> cpuLatencyMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaDeviceEnablePeerAccess(i, 0);
          cudaSetDevice(i);
          cudaCheckError();
        }
      }
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repeatitions
      // relatively low.  Higher repeatitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      delay<<<1, 1, 0, stream[i]>>>(flag);
      cudaCheckError();
      cudaEventRecord(start[i], stream[i]);

      sdkResetTimer(&stopWatch);
      if (i == j) {
        // Perform intra-GPU, D2D copies
        performP2PCopy(buffers[i], i, buffersD2D[i], i, 1, repeat, access,
                       stream[i]);
      } else {
        if (p2p_method == P2P_WRITE) {
          performP2PCopy(buffers[j], j, buffers[i], i, 1, repeat, access,
                         stream[i]);
        } else {
          performP2PCopy(buffers[i], i, buffers[j], j, 1, repeat, access,
                         stream[i]);
        }
      }
      float cpu_time_ms = sdkGetTimerValue(&stopWatch);

      cudaEventRecord(stop[i], stream[i]);
      // Now that the work has been queued up, release the stream
      *flag = 1;
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      float gpu_time_ms;
      cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]);

      gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat;
      cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat;
      if (p2p && access) {
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaCheckError();
      }
    }
  }

  printf("   GPU");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  printf("\n   CPU");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream[d]);
    cudaCheckError();
  }

  sdkDeleteTimer(&stopWatch);

  cudaFreeHost((void *)flag);
  cudaCheckError();
}

int main(int argc, char **argv) {
  int numGPUs;
  P2PDataTransfer p2p_method = P2P_WRITE;

  cudaGetDeviceCount(&numGPUs);
  cudaCheckError();

  // process command line args
  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printHelp();
    return 0;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) {
    p2p_method = P2P_READ;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) {
    p2p_mechanism = SM;
  }

  printf("[%s]\n", sSampleName);

  // output devices
  for (int i = 0; i < numGPUs; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    cudaCheckError();
    printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i,
           prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);
  }

  checkP2Paccess(numGPUs);

  // Check peer-to-peer connectivity
  printf("P2P Connectivity Matrix\n");
  printf("     D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d", j);
  }
  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d\t", i);
    for (int j = 0; j < numGPUs; j++) {
      if (i != j) {
        int access;
        cudaDeviceCanAccessPeer(&access, i, j);
        cudaCheckError();
        printf("%6d", (access) ? 1 : 0);
      } else {
        printf("%6d", 1);
      }
    }
    printf("\n");
  }

  printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
  outputBandwidthMatrix(numGPUs, false, P2P_WRITE);
  printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n");
  outputBandwidthMatrix(numGPUs, true, P2P_WRITE);
  if (p2p_method == P2P_READ) {
    printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n");
    outputBandwidthMatrix(numGPUs, true, p2p_method);
  }
  printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
  outputBidirectionalBandwidthMatrix(numGPUs, false);
  printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n");
  outputBidirectionalBandwidthMatrix(numGPUs, true);

  printf("P2P=Disabled Latency Matrix (us)\n");
  outputLatencyMatrix(numGPUs, false, P2P_WRITE);
  printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n");
  outputLatencyMatrix(numGPUs, true, P2P_WRITE);
  if (p2p_method == P2P_READ) {
    printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n");
    outputLatencyMatrix(numGPUs, true, p2p_method);
  }

  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");

  exit(EXIT_SUCCESS);
}
Add and Update samples for CUDA 10.0 2018-08-25 01:05:15 +08:00			`/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`#include <cstdio>`
			`#include <vector>`

			`#include <helper_cuda.h>`
			`#include <helper_timer.h>`

			`using namespace std;`

			`const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test";`

			`typedef enum {`
			`P2P_WRITE = 0,`
			`P2P_READ = 1,`
			`} P2PDataTransfer;`

			`typedef enum {`
			`CE = 0,`
			`SM = 1,`
			`} P2PEngine;`

			`P2PEngine p2p_mechanism = CE; // By default use Copy Engine`

			`// Macro for checking cuda errors following a cuda launch or api call`
			`#define cudaCheckError() \`
			`{ \`
			`cudaError_t e = cudaGetLastError(); \`
			`if (e != cudaSuccess) { \`
			`printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \`
			`cudaGetErrorString(e)); \`
			`exit(EXIT_FAILURE); \`
			`} \`
			`}`
			`__global__ void delay(volatile int *flag,`
			`unsigned long long timeout_clocks = 10000000) {`
			`// Wait until the application notifies us that it has completed queuing up the`
			`// experiment, or timeout and exit, allowing the application to make progress`
			`long long int start_clock, sample_clock;`
			`start_clock = clock64();`

			`while (!*flag) {`
			`sample_clock = clock64();`

			`if (sample_clock - start_clock > timeout_clocks) {`
			`break;`
			`}`
			`}`
			`}`

			`// This kernel is for demonstration purposes only, not a performant kernel for`
			`// p2p transfers.`
			`__global__ void copyp2p(int4 __restrict__ dest, int4 const __restrict__ src,`
			`size_t num_elems) {`
			`size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;`
			`size_t gridSize = blockDim.x * gridDim.x;`

			`#pragma unroll(5)`
			`for (size_t i = globalId; i < num_elems; i += gridSize) {`
			`dest[i] = src[i];`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////`
			`// Print help screen`
			`///////////////////////////////////////////////////////////////////////////`
			`void printHelp(void) {`
			`printf("Usage: p2pBandwidthLatencyTest [OPTION]...\n");`
			`printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");`
			`printf("\n");`

			`printf("Options:\n");`
			`printf("--help\t\tDisplay this help menu\n");`
			`printf(`
			`"--p2p_read\tUse P2P reads for data transfers between GPU pairs and show "`
			`"corresponding results.\n \t\tDefault used is P2P write operation.\n");`
			`printf("--sm_copy\tUse SM intiated p2p transfers instead of Copy Engine\n");`
			`}`

			`void checkP2Paccess(int numGPUs) {`
			`for (int i = 0; i < numGPUs; i++) {`
			`cudaSetDevice(i);`
			`cudaCheckError();`

			`for (int j = 0; j < numGPUs; j++) {`
			`int access;`
			`if (i != j) {`
			`cudaDeviceCanAccessPeer(&access, i, j);`
			`cudaCheckError();`
			`printf("Device=%d %s Access Peer Device=%d\n", i,`
			`access ? "CAN" : "CANNOT", j);`
			`}`
			`}`
			`}`
			`printf(`
			`"\n***NOTE: In case a device doesn't have P2P access to other one, it "`
			`"falls back to normal memcopy procedure.\nSo you can see lesser "`
			`"Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n");`
			`}`

			`void performP2PCopy(int dest, int destDevice, int src, int srcDevice,`
			`int num_elems, int repeat, bool p2paccess,`
			`cudaStream_t streamToRun) {`
			`int blockSize = 0;`
			`int numBlocks = 0;`

			`cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);`
			`cudaCheckError();`

			`if (p2p_mechanism == SM && p2paccess) {`
			`for (int r = 0; r < repeat; r++) {`
			`copyp2p<<<numBlocks, blockSize, 0, streamToRun>>>(`
			`(int4 )dest, (int4 )src, num_elems / 4);`
			`}`
			`} else {`
			`for (int r = 0; r < repeat; r++) {`
			`cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice,`
			`sizeof(int) * num_elems, streamToRun);`
			`}`
			`}`
			`}`

			`void outputBandwidthMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {`
			`int numElems = 10000000;`
			`int repeat = 5;`
			`volatile int *flag = NULL;`
			`vector<int *> buffers(numGPUs);`
			`vector<int *> buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy`
			`vector<cudaEvent_t> start(numGPUs);`
			`vector<cudaEvent_t> stop(numGPUs);`
			`vector<cudaStream_t> stream(numGPUs);`

			`cudaHostAlloc((void *)&flag, sizeof(flag), cudaHostAllocPortable);`
			`cudaCheckError();`

			`for (int d = 0; d < numGPUs; d++) {`
			`cudaSetDevice(d);`
			`cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);`
			`cudaMalloc(&buffers[d], numElems * sizeof(int));`
			`cudaCheckError();`
			`cudaMalloc(&buffersD2D[d], numElems * sizeof(int));`
			`cudaCheckError();`
			`cudaEventCreate(&start[d]);`
			`cudaCheckError();`
			`cudaEventCreate(&stop[d]);`
			`cudaCheckError();`
			`}`

			`vector<double> bandwidthMatrix(numGPUs * numGPUs);`

			`for (int i = 0; i < numGPUs; i++) {`
			`cudaSetDevice(i);`

			`for (int j = 0; j < numGPUs; j++) {`
			`int access = 0;`
			`if (p2p) {`
			`cudaDeviceCanAccessPeer(&access, i, j);`
			`if (access) {`
			`cudaDeviceEnablePeerAccess(j, 0);`
			`cudaCheckError();`
			`cudaSetDevice(j);`
			`cudaCheckError();`
			`cudaDeviceEnablePeerAccess(i, 0);`
			`cudaCheckError();`
			`cudaSetDevice(i);`
			`cudaCheckError();`
			`}`
			`}`

			`cudaStreamSynchronize(stream[i]);`
			`cudaCheckError();`

			`// Block the stream until all the work is queued up`
			`// DANGER! - cudaMemcpy*Async may infinitely block waiting for`
			`// room to push the operation, so keep the number of repeatitions`
			`// relatively low. Higher repeatitions will cause the delay kernel`
			`// to timeout and lead to unstable results.`
			`*flag = 0;`
			`delay<<<1, 1, 0, stream[i]>>>(flag);`
			`cudaCheckError();`
			`cudaEventRecord(start[i], stream[i]);`
			`cudaCheckError();`

			`if (i == j) {`
			`// Perform intra-GPU, D2D copies`
			`performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,`
			`access, stream[i]);`

			`} else {`
			`if (p2p_method == P2P_WRITE) {`
			`performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,`
			`stream[i]);`
			`} else {`
			`performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,`
			`stream[i]);`
			`}`
			`}`

			`cudaEventRecord(stop[i], stream[i]);`
			`cudaCheckError();`

			`// Release the queued events`
			`*flag = 1;`
			`cudaStreamSynchronize(stream[i]);`
			`cudaCheckError();`

			`float time_ms;`
			`cudaEventElapsedTime(&time_ms, start[i], stop[i]);`
			`double time_s = time_ms / 1e3;`

			`double gb = numElems * sizeof(int) * repeat / (double)1e9;`
			`if (i == j) {`
			`gb *= 2; // must count both the read and the write here`
			`}`
			`bandwidthMatrix[i * numGPUs + j] = gb / time_s;`
			`if (p2p && access) {`
			`cudaDeviceDisablePeerAccess(j);`
			`cudaSetDevice(j);`
			`cudaDeviceDisablePeerAccess(i);`
			`cudaSetDevice(i);`
			`cudaCheckError();`
			`}`
			`}`
			`}`

			`printf(" D\\D");`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6d ", j);`
			`}`

			`printf("\n");`

			`for (int i = 0; i < numGPUs; i++) {`
			`printf("%6d ", i);`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);`
			`}`

			`printf("\n");`
			`}`

			`for (int d = 0; d < numGPUs; d++) {`
			`cudaSetDevice(d);`
			`cudaFree(buffers[d]);`
			`cudaFree(buffersD2D[d]);`
			`cudaCheckError();`
			`cudaEventDestroy(start[d]);`
			`cudaCheckError();`
			`cudaEventDestroy(stop[d]);`
			`cudaCheckError();`
			`cudaStreamDestroy(stream[d]);`
			`cudaCheckError();`
			`}`

			`cudaFreeHost((void *)flag);`
			`cudaCheckError();`
			`}`

			`void outputBidirectionalBandwidthMatrix(int numGPUs, bool p2p) {`
			`int numElems = 10000000;`
			`int repeat = 5;`
			`volatile int *flag = NULL;`
			`vector<int *> buffers(numGPUs);`
			`vector<int *> buffersD2D(numGPUs);`
			`vector<cudaEvent_t> start(numGPUs);`
			`vector<cudaEvent_t> stop(numGPUs);`
			`vector<cudaStream_t> stream0(numGPUs);`
			`vector<cudaStream_t> stream1(numGPUs);`

			`cudaHostAlloc((void *)&flag, sizeof(flag), cudaHostAllocPortable);`
			`cudaCheckError();`

			`for (int d = 0; d < numGPUs; d++) {`
			`cudaSetDevice(d);`
			`cudaMalloc(&buffers[d], numElems * sizeof(int));`
			`cudaMalloc(&buffersD2D[d], numElems * sizeof(int));`
			`cudaCheckError();`
			`cudaEventCreate(&start[d]);`
			`cudaCheckError();`
			`cudaEventCreate(&stop[d]);`
			`cudaCheckError();`
			`cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking);`
			`cudaCheckError();`
			`cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking);`
			`cudaCheckError();`
			`}`

			`vector<double> bandwidthMatrix(numGPUs * numGPUs);`

			`for (int i = 0; i < numGPUs; i++) {`
			`cudaSetDevice(i);`

			`for (int j = 0; j < numGPUs; j++) {`
			`int access = 0;`
			`if (p2p) {`
			`cudaDeviceCanAccessPeer(&access, i, j);`
			`if (access) {`
			`cudaSetDevice(i);`
			`cudaDeviceEnablePeerAccess(j, 0);`
			`cudaCheckError();`
			`cudaSetDevice(j);`
			`cudaDeviceEnablePeerAccess(i, 0);`
			`cudaCheckError();`
			`}`
			`}`

			`cudaSetDevice(i);`
			`cudaStreamSynchronize(stream0[i]);`
			`cudaStreamSynchronize(stream1[j]);`
			`cudaCheckError();`

			`// Block the stream until all the work is queued up`
			`// DANGER! - cudaMemcpy*Async may infinitely block waiting for`
			`// room to push the operation, so keep the number of repeatitions`
			`// relatively low. Higher repeatitions will cause the delay kernel`
			`// to timeout and lead to unstable results.`
			`*flag = 0;`
			`cudaSetDevice(i);`
			`// No need to block stream1 since it'll be blocked on stream0's event`
			`delay<<<1, 1, 0, stream0[i]>>>(flag);`
			`cudaCheckError();`

			`// Force stream1 not to start until stream0 does, in order to ensure`
			`// the events on stream0 fully encompass the time needed for all`
			`// operations`
			`cudaEventRecord(start[i], stream0[i]);`
			`cudaStreamWaitEvent(stream1[j], start[i], 0);`

			`if (i == j) {`
			`// For intra-GPU perform 2 memcopies buffersD2D <-> buffers`
			`performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,`
			`access, stream0[i]);`
			`performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat,`
			`access, stream1[i]);`
			`} else {`
			`if (access && p2p_mechanism == SM) {`
			`cudaSetDevice(j);`
			`}`
			`performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,`
			`stream1[j]);`
			`if (access && p2p_mechanism == SM) {`
			`cudaSetDevice(i);`
			`}`
			`performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,`
			`stream0[i]);`
			`}`

			`// Notify stream0 that stream1 is complete and record the time of`
			`// the total transaction`
			`cudaEventRecord(stop[j], stream1[j]);`
			`cudaStreamWaitEvent(stream0[i], stop[j], 0);`
			`cudaEventRecord(stop[i], stream0[i]);`

			`// Release the queued operations`
			`*flag = 1;`
			`cudaStreamSynchronize(stream0[i]);`
			`cudaStreamSynchronize(stream1[j]);`
			`cudaCheckError();`

			`float time_ms;`
			`cudaEventElapsedTime(&time_ms, start[i], stop[i]);`
			`double time_s = time_ms / 1e3;`

			`double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9;`
			`if (i == j) {`
			`gb *= 2; // must count both the read and the write here`
			`}`
			`bandwidthMatrix[i * numGPUs + j] = gb / time_s;`
			`if (p2p && access) {`
			`cudaSetDevice(i);`
			`cudaDeviceDisablePeerAccess(j);`
			`cudaSetDevice(j);`
			`cudaDeviceDisablePeerAccess(i);`
			`}`
			`}`
			`}`

			`printf(" D\\D");`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6d ", j);`
			`}`

			`printf("\n");`

			`for (int i = 0; i < numGPUs; i++) {`
			`printf("%6d ", i);`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);`
			`}`

			`printf("\n");`
			`}`

			`for (int d = 0; d < numGPUs; d++) {`
			`cudaSetDevice(d);`
			`cudaFree(buffers[d]);`
			`cudaFree(buffersD2D[d]);`
			`cudaCheckError();`
			`cudaEventDestroy(start[d]);`
			`cudaCheckError();`
			`cudaEventDestroy(stop[d]);`
			`cudaCheckError();`
			`cudaStreamDestroy(stream0[d]);`
			`cudaCheckError();`
			`cudaStreamDestroy(stream1[d]);`
			`cudaCheckError();`
			`}`

			`cudaFreeHost((void *)flag);`
			`cudaCheckError();`
			`}`

			`void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {`
			`int repeat = 100;`
			`volatile int *flag = NULL;`
			`StopWatchInterface *stopWatch = NULL;`
			`vector<int *> buffers(numGPUs);`
			`vector<int *> buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy`
			`vector<cudaStream_t> stream(numGPUs);`
			`vector<cudaEvent_t> start(numGPUs);`
			`vector<cudaEvent_t> stop(numGPUs);`

			`cudaHostAlloc((void *)&flag, sizeof(flag), cudaHostAllocPortable);`
			`cudaCheckError();`

			`if (!sdkCreateTimer(&stopWatch)) {`
			`printf("Failed to create stop watch\n");`
			`exit(EXIT_FAILURE);`
			`}`
			`sdkStartTimer(&stopWatch);`

			`for (int d = 0; d < numGPUs; d++) {`
			`cudaSetDevice(d);`
			`cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);`
			`cudaMalloc(&buffers[d], sizeof(int));`
			`cudaMalloc(&buffersD2D[d], sizeof(int));`
			`cudaCheckError();`
			`cudaEventCreate(&start[d]);`
			`cudaCheckError();`
			`cudaEventCreate(&stop[d]);`
			`cudaCheckError();`
			`}`

			`vector<double> gpuLatencyMatrix(numGPUs * numGPUs);`
			`vector<double> cpuLatencyMatrix(numGPUs * numGPUs);`

			`for (int i = 0; i < numGPUs; i++) {`
			`cudaSetDevice(i);`

			`for (int j = 0; j < numGPUs; j++) {`
			`int access = 0;`
			`if (p2p) {`
			`cudaDeviceCanAccessPeer(&access, i, j);`
			`if (access) {`
			`cudaDeviceEnablePeerAccess(j, 0);`
			`cudaCheckError();`
			`cudaSetDevice(j);`
			`cudaDeviceEnablePeerAccess(i, 0);`
			`cudaSetDevice(i);`
			`cudaCheckError();`
			`}`
			`}`
			`cudaStreamSynchronize(stream[i]);`
			`cudaCheckError();`

			`// Block the stream until all the work is queued up`
			`// DANGER! - cudaMemcpy*Async may infinitely block waiting for`
			`// room to push the operation, so keep the number of repeatitions`
			`// relatively low. Higher repeatitions will cause the delay kernel`
			`// to timeout and lead to unstable results.`
			`*flag = 0;`
			`delay<<<1, 1, 0, stream[i]>>>(flag);`
			`cudaCheckError();`
			`cudaEventRecord(start[i], stream[i]);`

			`sdkResetTimer(&stopWatch);`
			`if (i == j) {`
			`// Perform intra-GPU, D2D copies`
			`performP2PCopy(buffers[i], i, buffersD2D[i], i, 1, repeat, access,`
			`stream[i]);`
			`} else {`
			`if (p2p_method == P2P_WRITE) {`
			`performP2PCopy(buffers[j], j, buffers[i], i, 1, repeat, access,`
			`stream[i]);`
			`} else {`
			`performP2PCopy(buffers[i], i, buffers[j], j, 1, repeat, access,`
			`stream[i]);`
			`}`
			`}`
			`float cpu_time_ms = sdkGetTimerValue(&stopWatch);`

			`cudaEventRecord(stop[i], stream[i]);`
			`// Now that the work has been queued up, release the stream`
			`*flag = 1;`
			`cudaStreamSynchronize(stream[i]);`
			`cudaCheckError();`

			`float gpu_time_ms;`
			`cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]);`

			`gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat;`
			`cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat;`
			`if (p2p && access) {`
			`cudaDeviceDisablePeerAccess(j);`
			`cudaSetDevice(j);`
			`cudaDeviceDisablePeerAccess(i);`
			`cudaSetDevice(i);`
			`cudaCheckError();`
			`}`
			`}`
			`}`

			`printf(" GPU");`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6d ", j);`
			`}`

			`printf("\n");`

			`for (int i = 0; i < numGPUs; i++) {`
			`printf("%6d ", i);`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]);`
			`}`

			`printf("\n");`
			`}`

			`printf("\n CPU");`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6d ", j);`
			`}`

			`printf("\n");`

			`for (int i = 0; i < numGPUs; i++) {`
			`printf("%6d ", i);`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]);`
			`}`

			`printf("\n");`
			`}`

			`for (int d = 0; d < numGPUs; d++) {`
			`cudaSetDevice(d);`
			`cudaFree(buffers[d]);`
			`cudaFree(buffersD2D[d]);`
			`cudaCheckError();`
			`cudaEventDestroy(start[d]);`
			`cudaCheckError();`
			`cudaEventDestroy(stop[d]);`
			`cudaCheckError();`
			`cudaStreamDestroy(stream[d]);`
			`cudaCheckError();`
			`}`

			`sdkDeleteTimer(&stopWatch);`

			`cudaFreeHost((void *)flag);`
			`cudaCheckError();`
			`}`

			`int main(int argc, char **argv) {`
			`int numGPUs;`
			`P2PDataTransfer p2p_method = P2P_WRITE;`

			`cudaGetDeviceCount(&numGPUs);`
			`cudaCheckError();`

			`// process command line args`
			`if (checkCmdLineFlag(argc, (const char **)argv, "help")) {`
			`printHelp();`
			`return 0;`
			`}`

			`if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) {`
			`p2p_method = P2P_READ;`
			`}`

			`if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) {`
			`p2p_mechanism = SM;`
			`}`

			`printf("[%s]\n", sSampleName);`

			`// output devices`
			`for (int i = 0; i < numGPUs; i++) {`
			`cudaDeviceProp prop;`
			`cudaGetDeviceProperties(&prop, i);`
			`cudaCheckError();`
			`printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i,`
			`prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);`
			`}`

			`checkP2Paccess(numGPUs);`

			`// Check peer-to-peer connectivity`
			`printf("P2P Connectivity Matrix\n");`
			`printf(" D\\D");`

			`for (int j = 0; j < numGPUs; j++) {`
			`printf("%6d", j);`
			`}`
			`printf("\n");`

			`for (int i = 0; i < numGPUs; i++) {`
			`printf("%6d\t", i);`
			`for (int j = 0; j < numGPUs; j++) {`
			`if (i != j) {`
			`int access;`
			`cudaDeviceCanAccessPeer(&access, i, j);`
			`cudaCheckError();`
			`printf("%6d", (access) ? 1 : 0);`
			`} else {`
			`printf("%6d", 1);`
			`}`
			`}`
			`printf("\n");`
			`}`

			`printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");`
			`outputBandwidthMatrix(numGPUs, false, P2P_WRITE);`
			`printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n");`
			`outputBandwidthMatrix(numGPUs, true, P2P_WRITE);`
			`if (p2p_method == P2P_READ) {`
			`printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n");`
			`outputBandwidthMatrix(numGPUs, true, p2p_method);`
			`}`
			`printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");`
			`outputBidirectionalBandwidthMatrix(numGPUs, false);`
			`printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n");`
			`outputBidirectionalBandwidthMatrix(numGPUs, true);`

			`printf("P2P=Disabled Latency Matrix (us)\n");`
			`outputLatencyMatrix(numGPUs, false, P2P_WRITE);`
			`printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n");`
			`outputLatencyMatrix(numGPUs, true, P2P_WRITE);`
			`if (p2p_method == P2P_READ) {`
			`printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n");`
			`outputLatencyMatrix(numGPUs, true, p2p_method);`
			`}`

			`printf(`
			`"\nNOTE: The CUDA Samples are not meant for performance measurements. "`
			`"Results may vary when GPU Boost is enabled.\n");`

			`exit(EXIT_SUCCESS);`
			`}`