/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This sample illustrates the usage of CUDA streams for overlapping
 * kernel execution with device/host memcopies.  The kernel is used to
 * initialize an array to a specific value, after which the array is
 * copied to the host (CPU) memory.  To increase performance, multiple
 * kernel/memcopy pairs are launched asynchronously, each pair in its
 * own stream.  Devices with Compute Capability 1.1 can overlap a kernel
 * and a memcopy as long as they are issued in different streams.  Kernels
 * are serialized.  Thus, if n pairs are launched, streamed approach
 * can reduce the memcopy cost to the (1/n)th of a single copy of the entire
 * data set.
 *
 * Additionally, this sample uses CUDA events to measure elapsed time for
 * CUDA calls.  Events are a part of CUDA API and provide a system independent
 * way to measure execution times on CUDA devices with approximately 0.5
 * microsecond precision.
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
*/

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync",
                                  "cudaEventDisableTiming", NULL};

const char *sDeviceSyncMethod[] = {
    "cudaDeviceScheduleAuto",         "cudaDeviceScheduleSpin",
    "cudaDeviceScheduleYield",        "INVALID",
    "cudaDeviceScheduleBlockingSync", NULL};

// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>

#ifndef WIN32
#include <sys/mman.h>  // for mmap() / munmap()
#endif

// Macro to aligned up to the memory size in question
#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))

__global__ void init_array(int *g_data, int *factor, int num_iterations) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  for (int i = 0; i < num_iterations; i++) {
    g_data[idx] += *factor;  // non-coalesced on purpose, to burn time
  }
}

bool correct_data(int *a, const int n, const int c) {
  for (int i = 0; i < n; i++) {
    if (a[i] != c) {
      printf("%d: %d %d\n", i, a[i], c);
      return false;
    }
  }

  return true;
}

inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a,
                               int **ppAligned_a, int nbytes) {
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
  if (bPinGenericMemory) {
// allocate a generic page-aligned chunk of system memory
#ifdef WIN32
    printf(
        "> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
        "system memory)\n",
        (float)nbytes / 1048576.0f);
    *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT),
                                MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
    printf(
        "> mmap() allocating %4.2f Mbytes (generic page-aligned system "
        "memory)\n",
        (float)nbytes / 1048576.0f);
    *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT),
                        PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
#endif

    *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

    printf(
        "> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
        "system memory\n",
        (float)nbytes / 1048576.0f);
    // pin allocate memory
    checkCudaErrors(
        cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
  } else
#endif
#endif
  {
    printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n",
           (float)nbytes / 1048576.0f);
    // allocate host memory (pinned is required for achieve asynchronicity)
    checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
    *ppAligned_a = *pp_a;
  }
}

inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
                           int **ppAligned_a, int nbytes) {
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
  // CUDA 4.0 support pinning of generic host memory
  if (bPinGenericMemory) {
    // unpin and delete host memory
    checkCudaErrors(cudaHostUnregister(*ppAligned_a));
#ifdef WIN32
    VirtualFree(*pp_a, 0, MEM_RELEASE);
#else
    munmap(*pp_a, nbytes);
#endif
  } else
#endif
#endif
  {
    cudaFreeHost(*pp_a);
  }
}

static const char *sSyncMethod[] = {
    "0 (Automatic Blocking)",
    "1 (Spin Blocking)",
    "2 (Yield Blocking)",
    "3 (Undefined Blocking Method)",
    "4 (Blocking Sync Event) = low CPU utilization",
    NULL};

void printHelp() {
  printf("Usage: %s [options below]\n", sSDKsample);
  printf("\t--sync_method=n for CPU/GPU synchronization\n");
  printf("\t             n=%s\n", sSyncMethod[0]);
  printf("\t             n=%s\n", sSyncMethod[1]);
  printf("\t             n=%s\n", sSyncMethod[2]);
  printf("\t   <Default> n=%s\n", sSyncMethod[4]);
  printf(
      "\t--use_generic_memory (default) use generic page-aligned for system "
      "memory\n");
  printf(
      "\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
      "system memory\n");
}

#if defined(__APPLE__) || defined(MACOSX)
#define DEFAULT_PINNED_GENERIC_MEMORY false
#else
#define DEFAULT_PINNED_GENERIC_MEMORY true
#endif

int main(int argc, char **argv) {
  int cuda_device = 0;
  int nstreams = 4;              // number of streams for CUDA calls
  int nreps = 10;                // number of times each experiment is repeated
  int n = 16 * 1024 * 1024;      // number of ints in the data set
  int nbytes = n * sizeof(int);  // number of data bytes
  dim3 threads, blocks;          // kernel launch configuration
  float elapsed_time, time_memcpy, time_kernel;  // timing variables
  float scale_factor = 1.0f;

  // allocate generic memory and pin it laster instead of using cudaHostAlloc()

  bool bPinGenericMemory =
      DEFAULT_PINNED_GENERIC_MEMORY;  // we want this to be the default behavior
  int device_sync_method =
      cudaDeviceBlockingSync;  // by default we use BlockingSync

  int niterations;  // number of iterations for the loop inside the kernel

  printf("[ %s ]\n\n", sSDKsample);

  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printHelp();
    return EXIT_SUCCESS;
  }

  if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv,
                                                  "sync_method")) >= 0) {
    if (device_sync_method == 0 || device_sync_method == 1 ||
        device_sync_method == 2 || device_sync_method == 4) {
      printf("Device synchronization method set to = %s\n",
             sSyncMethod[device_sync_method]);
      printf("Setting reps to 100 to demonstrate steady state\n");
      nreps = 100;
    } else {
      printf("Invalid command line option sync_method=\"%d\"\n",
             device_sync_method);
      return EXIT_FAILURE;
    }
  } else {
    printHelp();
    return EXIT_SUCCESS;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX)
    bPinGenericMemory = false;  // Generic Pinning of System Paged memory not
                                // currently supported on Mac OSX
#else
    bPinGenericMemory = true;
#endif
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) {
    bPinGenericMemory = false;
  }

  printf("\n> ");
  cuda_device = findCudaDevice(argc, (const char **)argv);

  // check the compute capability of the device
  int num_devices = 0;
  checkCudaErrors(cudaGetDeviceCount(&num_devices));

  if (0 == num_devices) {
    printf(
        "your system does not have a CUDA capable device, waiving test...\n");
    return EXIT_WAIVED;
  }

  // check if the command-line chosen device ID is within range, exit if not
  if (cuda_device >= num_devices) {
    printf(
        "cuda_device=%d is invalid, must choose device ID between 0 and %d\n",
        cuda_device, num_devices - 1);
    return EXIT_FAILURE;
  }

  checkCudaErrors(cudaSetDevice(cuda_device));

  // Checking for compute capabilities
  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

  niterations = 5;

  // Check if GPU can map host memory (Generic Method), if not then we override
  // bPinGenericMemory to be false
  if (bPinGenericMemory) {
    printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name,
           deviceProp.canMapHostMemory ? "Yes" : "No");

    if (deviceProp.canMapHostMemory == 0) {
      printf(
          "Using cudaMallocHost, CUDA device does not support mapping of "
          "generic host memory\n");
      bPinGenericMemory = false;
    }
  }

  // Anything that is less than 32 Cores will have scaled down workload
  scale_factor =
      max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
                    (float)deviceProp.multiProcessorCount)),
          1.0f);
  n = (int)rint((float)n / scale_factor);

  printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major,
         deviceProp.minor);
  printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
         deviceProp.multiProcessorCount,
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
             deviceProp.multiProcessorCount);

  printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
  printf("> array_size   = %d\n\n", n);

  // enable use of blocking sync, to reduce CPU usage
  printf("> Using CPU/GPU Device Synchronization method (%s)\n",
         sDeviceSyncMethod[device_sync_method]);
  checkCudaErrors(cudaSetDeviceFlags(
      device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));

  // allocate host memory
  int c = 5;            // value to which the array will be initialized
  int *h_a = 0;         // pointer to the array data in host memory
  int *hAligned_a = 0;  // pointer to the array data in host memory (aligned to
                        // MEMORY_ALIGNMENT)

  // Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if
  // using the new CUDA 4.0 features
  AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

  // allocate device memory
  int *d_a = 0,
      *d_c = 0;  // pointers to data and init value in the device memory
  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
  checkCudaErrors(cudaMemset(d_a, 0x0, nbytes));
  checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int)));
  checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));

  printf("\nStarting Test\n");

  // allocate and initialize an array of stream handles
  cudaStream_t *streams =
      (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

  for (int i = 0; i < nstreams; i++) {
    checkCudaErrors(cudaStreamCreate(&(streams[i])));
  }

  // create CUDA event handles
  // use blocking sync
  cudaEvent_t start_event, stop_event;
  int eventflags =
      ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync
                                                      : cudaEventDefault);

  checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
  checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

  // time memcopy from device
  checkCudaErrors(cudaEventRecord(start_event, 0));  // record in stream-0, to
                                                     // ensure that all previous
                                                     // CUDA calls have
                                                     // completed
  checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes,
                                  cudaMemcpyDeviceToHost, streams[0]));
  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(
      stop_event));  // block until the event is actually recorded
  checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
  printf("memcopy:\t%.2f\n", time_memcpy);

  // time kernel
  threads = dim3(512, 1);
  blocks = dim3(n / threads.x, 1);
  checkCudaErrors(cudaEventRecord(start_event, 0));
  init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
  printf("kernel:\t\t%.2f\n", time_kernel);

  //////////////////////////////////////////////////////////////////////
  // time non-streamed execution for reference
  threads = dim3(512, 1);
  blocks = dim3(n / threads.x, 1);
  checkCudaErrors(cudaEventRecord(start_event, 0));

  for (int k = 0; k < nreps; k++) {
    init_array<<<blocks, threads>>>(d_a, d_c, niterations);
    checkCudaErrors(
        cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
  }

  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
  printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

  //////////////////////////////////////////////////////////////////////
  // time execution with nstreams streams
  threads = dim3(512, 1);
  blocks = dim3(n / (nstreams * threads.x), 1);
  memset(hAligned_a, 255,
         nbytes);  // set host memory bits to all 1s, for testing correctness
  checkCudaErrors(cudaMemset(
      d_a, 0, nbytes));  // set device memory to all 0s, for testing correctness
  checkCudaErrors(cudaEventRecord(start_event, 0));

  for (int k = 0; k < nreps; k++) {
    // asynchronously launch nstreams kernels, each operating on its own portion
    // of data
    for (int i = 0; i < nstreams; i++) {
      init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams,
                                                     d_c, niterations);
    }

    // asynchronously launch nstreams memcopies.  Note that memcopy in stream x
    // will only
    //   commence executing when all previous CUDA calls in stream x have
    //   completed
    for (int i = 0; i < nstreams; i++) {
      checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
                                      d_a + i * n / nstreams, nbytes / nstreams,
                                      cudaMemcpyDeviceToHost, streams[i]));
    }
  }

  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
  printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);

  // check whether the output is correct
  printf("-------------------------------\n");
  bool bResults = correct_data(hAligned_a, n, c * nreps * niterations);

  // release resources
  for (int i = 0; i < nstreams; i++) {
    checkCudaErrors(cudaStreamDestroy(streams[i]));
  }

  checkCudaErrors(cudaEventDestroy(start_event));
  checkCudaErrors(cudaEventDestroy(stop_event));

  // Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
  FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

  checkCudaErrors(cudaFree(d_a));
  checkCudaErrors(cudaFree(d_c));

  return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
}