/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two
 * memcopies with kernel execution. This sample illustrates the usage of CUDA
 * streams to overlap kernel execution with copying data to and from the
 * device.
 *
 * Additionally, this sample uses CUDA events to measure elapsed time for
 * CUDA calls. Events are part of the CUDA API and provide a system-independent
 * way to measure execution times on CUDA devices with approximately 0.5
 * microsecond precision.
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 */
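/*
 * In outline, each cycle of processWithStreams() below issues work on a
 * ring of STREAM_COUNT streams:
 *
 *   incKernel<<<grid, block, 0, stream[cur]>>>(...);             // process frame cur
 *   cudaMemcpyAsync(..., cudaMemcpyHostToDevice, stream[next]);  // upload frame cur+1
 *   cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, stream[cur]);   // download frame cur
 *
 * On hardware with two copy engines (asyncEngineCount > 1), the upload and
 * the download can both proceed while the kernel runs, so the steady-state
 * cycle time approaches max(h2d, d2h, kernel) rather than their sum.
 */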
const char *sSDKname = "simpleMultiCopy";

// includes, system
#include <stdio.h>

// include CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper functions shared across the CUDA Samples

// includes, kernels
// The CUDA kernel below provides the compute workload; main() contains the
// host code needed to launch it.
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < N) {
    // Repeat the increment inner_reps times so the kernel has a measurable
    // runtime relative to the copies.
    for (int i = 0; i < inner_reps; ++i) {
      g_out[idx] = g_in[idx] + 1;
    }
  }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
//#define SIMULATE_IO

int *h_data_source;
int *h_data_sink;

int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;

int N = 1 << 22;
int nreps = 10;  // number of times each experiment is repeated
int inner_reps = 5;

int memsize;

dim3 block(512);
dim3 grid;

int thread_blocks;

float processWithStreams(int streams_used);
void init();
bool test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[]) {
  int cuda_device = 0;
  float scale_factor;
  cudaDeviceProp deviceProp;

  printf("[%s] - Starting...\n", sSDKname);

  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
    cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");

    if (cuda_device < 0) {
      printf("Invalid command line parameters\n");
      exit(EXIT_FAILURE);
    } else {
      printf("cuda_device = %d\n", cuda_device);
      cuda_device = gpuDeviceInit(cuda_device);

      if (cuda_device < 0) {
        printf("No CUDA Capable devices found, exiting...\n");
        exit(EXIT_SUCCESS);
      }
    }
  } else {
    // Otherwise pick the device with the highest Gflops/s
    cuda_device = gpuGetMaxGflopsDeviceId();
    checkCudaErrors(cudaSetDevice(cuda_device));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
    printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
  }

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
         deviceProp.multiProcessorCount,
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
             deviceProp.multiProcessorCount);

  // Any device with fewer than 32 cores gets a scaled-down workload
  scale_factor =
      max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
                    (float)deviceProp.multiProcessorCount)),
          1.0f);
  N = (int)((float)N / scale_factor);

  printf("> Device name: %s\n", deviceProp.name);
  printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
  printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
  printf("> array_size = %d\n\n", N);

  memsize = N * sizeof(int);

  thread_blocks = N / block.x;

  grid.x = thread_blocks % 65535;
  grid.y = (thread_blocks / 65535 + 1);

  // Allocate resources
  h_data_source = (int *)malloc(memsize);
  h_data_sink = (int *)malloc(memsize);

  for (int i = 0; i < STREAM_COUNT; ++i) {
    checkCudaErrors(
        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
    checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

    checkCudaErrors(
        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
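    // Note: overlapping cudaMemcpyAsync with kernel execution requires
    // page-locked host memory, which is why the staging buffers above are
    // allocated with cudaHostAlloc rather than malloc.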
    checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

    checkCudaErrors(cudaStreamCreate(&stream[i]));
    checkCudaErrors(cudaEventCreate(&cycleDone[i]));

    cudaEventRecord(cycleDone[i], stream[i]);
  }

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  init();

  // Kernel warmup
  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

  // Time copies and kernel
  cudaEventRecord(start, 0);
  checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
                                  cudaMemcpyHostToDevice, 0));
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float memcpy_h2d_time;
  cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

  cudaEventRecord(start, 0);
  checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
                                  cudaMemcpyDeviceToHost, 0));
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float memcpy_d2h_time;
  cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

  cudaEventRecord(start, 0);
  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float kernel_time;
  cudaEventElapsedTime(&kernel_time, start, stop);

  printf("\n");
  printf("Relevant properties of this CUDA device\n");
  printf(
      "(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
      "(device property \"deviceOverlap\")\n",
      deviceProp.deviceOverlap ? "X" : " ");
  // printf("(%s) Can execute several GPU kernels simultaneously (compute
  // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
  printf(
      "(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
      "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
      "4000/5000/6000/K5000))\n",
      (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

  printf("\n");
  printf("Measured timings (throughput):\n");
  printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
         (memsize * 1e-6) / memcpy_h2d_time);
  printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
         (memsize * 1e-6) / memcpy_d2h_time);
  printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
         (inner_reps * memsize * 2e-6) / kernel_time);

  printf("\n");
  printf(
      "Theoretical limits for speedup gained from overlapped data "
      "transfers:\n");
  printf("No overlap at all (transfer-kernel-transfer): %f ms\n",
         memcpy_h2d_time + memcpy_d2h_time + kernel_time);
  printf("Compute can overlap with one transfer: %f ms\n",
         max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
  printf("Compute can overlap with both data transfers: %f ms\n",
         max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
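  // The three limits above follow directly from the pipeline structure:
  // fully serialized work costs h2d + kernel + d2h; with a single copy
  // engine the two transfers serialize with each other but can hide behind
  // the kernel, giving max(h2d + d2h, kernel); with dual copy engines every
  // stage can proceed concurrently and the slowest stage dominates, giving
  // max(h2d, d2h, kernel).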
  // Process pipelined work
  float serial_time = processWithStreams(1);
  float overlap_time = processWithStreams(STREAM_COUNT);

  printf("\nAverage measured timings over %d repetitions:\n", nreps);
  printf(" Avg. time when execution fully serialized\t: %f ms\n",
         serial_time / nreps);
  printf(" Avg. time when overlapped using %d streams\t: %f ms\n",
         STREAM_COUNT, overlap_time / nreps);
  printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
         (serial_time - overlap_time) / nreps);

  printf("\nMeasured throughput:\n");
  printf(" Fully serialized execution\t\t: %f GB/s\n",
         (nreps * (memsize * 2e-6)) / serial_time);
  printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
         (nreps * (memsize * 2e-6)) / overlap_time);

  // Verify the results; the outcome determines the final exit code
  bool bResults = test();

  // Free resources
  free(h_data_source);
  free(h_data_sink);

  for (int i = 0; i < STREAM_COUNT; ++i) {
    cudaFreeHost(h_data_in[i]);
    cudaFree(d_data_in[i]);

    cudaFreeHost(h_data_out[i]);
    cudaFree(d_data_out[i]);

    cudaStreamDestroy(stream[i]);
    cudaEventDestroy(cycleDone[i]);
  }

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // Test result
  exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

float processWithStreams(int streams_used) {
  int current_stream = 0;

  float time;

  // Do processing in a loop.
  //
  // Note: All memory commands are processed in the order they are issued,
  // independent of the stream they are enqueued in. Hence the pattern by
  // which the copy and kernel commands are enqueued in the streams
  // has an influence on the achieved overlap.

  cudaEventRecord(start, 0);

  for (int i = 0; i < nreps; ++i) {
    int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
    // Store the result
    memcpy(h_data_sink, h_data_out[current_stream], memsize);

    // Read new input
    memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

    // Ensure that processing and copying of the last cycle has finished
    cudaEventSynchronize(cycleDone[next_stream]);

    // Process current frame
    incKernel<<<grid, block, 0, stream[current_stream]>>>(
        d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

    // Upload next frame
    checkCudaErrors(
        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream],
                        memsize, cudaMemcpyHostToDevice, stream[next_stream]));

    // Download current frame
    checkCudaErrors(cudaMemcpyAsync(
        h_data_out[current_stream], d_data_out[current_stream], memsize,
        cudaMemcpyDeviceToHost, stream[current_stream]));

    checkCudaErrors(
        cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

    current_stream = next_stream;
  }

  cudaEventRecord(stop, 0);

  cudaDeviceSynchronize();

  cudaEventElapsedTime(&time, start, stop);

  return time;
}

void init() {
  for (int i = 0; i < N; ++i) {
    h_data_source[i] = 0;
  }

  for (int i = 0; i < STREAM_COUNT; ++i) {
    memcpy(h_data_in[i], h_data_source, memsize);
  }
}

bool test() {
  bool passed = true;

  for (int j = 0; j < STREAM_COUNT; ++j) {
    for (int i = 0; i < N; ++i) {
      passed &= (h_data_out[j][i] == 1);
    }
  }

  return passed;
}
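/*
 * Build/run sketch (paths are illustrative and assume the CUDA Samples
 * common headers, which provide helper_cuda.h and helper_functions.h, are
 * on the include path):
 *
 *   nvcc -I<cuda-samples>/Common simpleMultiCopy.cu -o simpleMultiCopy
 *   ./simpleMultiCopy --device=0
 */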