/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This sample demonstrates a combination of Peer-to-Peer (P2P) and
 * Unified Virtual Address Space (UVA) features new to SDK 4.0
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>

// CUDA includes
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper for shared that are common to CUDA Samples

__global__ void SimpleKernel(float *src, float *dst) {
  // Just a dummy kernel, doing enough for us to verify that everything
  // worked
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  dst[idx] = src[idx] * 2.0f;
}

inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }

int main(int argc, char **argv) {
  printf("[%s] - Starting...\n", argv[0]);

  if (!IsAppBuiltAs64()) {
    printf(
        "%s is only supported with on 64-bit OSs and the application must be "
        "built as a 64-bit target.  Test is being waived.\n",
        argv[0]);
    exit(EXIT_WAIVED);
  }

  // Number of GPUs
  printf("Checking for multiple GPUs...\n");
  int gpu_n;
  checkCudaErrors(cudaGetDeviceCount(&gpu_n));
  printf("CUDA-capable device count: %i\n", gpu_n);

  if (gpu_n < 2) {
    printf(
        "Two or more GPUs with Peer-to-Peer access capability are required for "
        "%s.\n",
        argv[0]);
    printf("Waiving test.\n");
    exit(EXIT_WAIVED);
  }

  // Query device properties
  cudaDeviceProp prop[64];
  int gpuid[2];  // we want to find the first two GPU's that can support P2P

  for (int i = 0; i < gpu_n; i++) {
    checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
  }
  // Check possibility for peer access
  printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

  int can_access_peer;
  int p2pCapableGPUs[2];  // We take only 1 pair of P2P capable GPUs
  p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;

  // Show all the combinations of supported P2P GPUs
  for (int i = 0; i < gpu_n; i++) {
    for (int j = 0; j < gpu_n; j++) {
      if (i == j) {
        continue;
      }
      checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
      printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name,
             i, prop[j].name, j, can_access_peer ? "Yes" : "No");
      if (can_access_peer && p2pCapableGPUs[0] == -1) {
        p2pCapableGPUs[0] = i;
        p2pCapableGPUs[1] = j;
      }
    }
  }

  if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
    printf(
        "Two or more GPUs with Peer-to-Peer access capability are required for "
        "%s.\n",
        argv[0]);
    printf(
        "Peer to Peer access is not available amongst GPUs in the system, "
        "waiving test.\n");

    exit(EXIT_WAIVED);
  }

  // Use first pair of p2p capable GPUs detected.
  gpuid[0] = p2pCapableGPUs[0];
  gpuid[1] = p2pCapableGPUs[1];

  // Enable peer access
  printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0],
         gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

  // Allocate buffers
  const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
  printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n",
         int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  float *g0;
  checkCudaErrors(cudaMalloc(&g0, buf_size));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  float *g1;
  checkCudaErrors(cudaMalloc(&g1, buf_size));
  float *h0;
  checkCudaErrors(
      cudaMallocHost(&h0, buf_size));  // Automatically portable with UVA

  // Create CUDA event handles
  printf("Creating event handles...\n");
  cudaEvent_t start_event, stop_event;
  float time_memcpy;
  int eventflags = cudaEventBlockingSync;
  checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
  checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

  // P2P memcopy() benchmark
  checkCudaErrors(cudaEventRecord(start_event, 0));

  for (int i = 0; i < 100; i++) {
    // With UVA we don't need to specify source and target devices, the
    // runtime figures this out by itself from the pointers
    // Ping-pong copy between GPUs
    if (i % 2 == 0) {
      checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
    } else {
      checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    }
  }

  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
  printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
         gpuid[0], gpuid[1],
         (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f /
             1024.0f / 1024.0f);

  // Prepare host buffer and copy to GPU 0
  printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

  for (int i = 0; i < buf_size / sizeof(float); i++) {
    h0[i] = float(i % 4096);
  }

  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

  // Kernel launch configuration
  const dim3 threads(512, 1);
  const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

  // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
  // output to the GPU 1 buffer
  printf(
      "Run kernel on GPU%d, taking source data from GPU%d and writing to "
      "GPU%d...\n",
      gpuid[1], gpuid[0], gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  SimpleKernel<<<blocks, threads>>>(g0, g1);

  checkCudaErrors(cudaDeviceSynchronize());

  // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
  // output to the GPU 0 buffer
  printf(
      "Run kernel on GPU%d, taking source data from GPU%d and writing to "
      "GPU%d...\n",
      gpuid[0], gpuid[1], gpuid[0]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  SimpleKernel<<<blocks, threads>>>(g1, g0);

  checkCudaErrors(cudaDeviceSynchronize());

  // Copy data back to host and verify
  printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
  checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

  int error_count = 0;

  for (int i = 0; i < buf_size / sizeof(float); i++) {
    // Re-generate input data and apply 2x '* 2.0f' computation of both
    // kernel runs
    if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
      printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i],
             (float(i % 4096) * 2.0f * 2.0f));

      if (error_count++ > 10) {
        break;
      }
    }
  }

  // Disable peer access (also unregisters memory for non-UVA cases)
  printf("Disabling peer access...\n");
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));

  // Cleanup and shutdown
  printf("Shutting down...\n");
  checkCudaErrors(cudaEventDestroy(start_event));
  checkCudaErrors(cudaEventDestroy(stop_event));
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaFree(g0));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaFree(g1));
  checkCudaErrors(cudaFreeHost(h0));

  for (int i = 0; i < gpu_n; i++) {
    checkCudaErrors(cudaSetDevice(i));
  }

  if (error_count != 0) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  } else {
    printf("Test passed\n");
    exit(EXIT_SUCCESS);
  }
}