cuda-samples/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_kernel.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

////////////////////////////////////////////////////////////////////////////////
// Global types
////////////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <curand_kernel.h>
#include "MonteCarlo_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper reduction template
// Please see the "reduction" CUDA Sample for more information
////////////////////////////////////////////////////////////////////////////////
#include "MonteCarlo_reduction.cuh"

////////////////////////////////////////////////////////////////////////////////
// Internal GPU-side data structures
////////////////////////////////////////////////////////////////////////////////
#define MAX_OPTIONS (1024 * 1024)

// Preprocessed input option data
typedef struct {
  real S;
  real X;
  real MuByT;
  real VBySqrtT;
} __TOptionData;

////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut payoff functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
__device__ inline float endCallValue(float S, float X, float r, float MuByT,
                                     float VBySqrtT) {
  float callValue = S * __expf(MuByT + VBySqrtT * r) - X;
  return (callValue > 0.0F) ? callValue : 0.0F;
}

__device__ inline double endCallValue(double S, double X, double r,
                                      double MuByT, double VBySqrtT) {
  double callValue = S * exp(MuByT + VBySqrtT * r) - X;
  return (callValue > 0.0) ? callValue : 0.0;
}

#define THREAD_N 256

////////////////////////////////////////////////////////////////////////////////
// This kernel computes the integral over all paths using a single thread block
// per option. It is fastest when the number of thread blocks times the work per
// block is high enough to keep the GPU busy.
////////////////////////////////////////////////////////////////////////////////
static __global__ void MonteCarloOneBlockPerOption(
    curandState *__restrict rngStates,
    const __TOptionData *__restrict d_OptionData,
    __TOptionValue *__restrict d_CallValue, int pathN, int optionN) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  const int SUM_N = THREAD_N;
  __shared__ real s_SumCall[SUM_N];
  __shared__ real s_Sum2Call[SUM_N];

  // determine global thread id
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Copy random number state to local memory for efficiency
  curandState localState = rngStates[tid];
  for (int optionIndex = blockIdx.x; optionIndex < optionN;
       optionIndex += gridDim.x) {
    const real S = d_OptionData[optionIndex].S;
    const real X = d_OptionData[optionIndex].X;
    const real MuByT = d_OptionData[optionIndex].MuByT;
    const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;

    // Cycle through the entire samples array:
    // derive end stock price for each path
    // accumulate partial integrals into intermediate shared memory buffer
    for (int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x) {
      __TOptionValue sumCall = {0, 0};

#pragma unroll 8
      for (int i = iSum; i < pathN; i += SUM_N) {
        real r = curand_normal(&localState);
        real callValue = endCallValue(S, X, r, MuByT, VBySqrtT);
        sumCall.Expected += callValue;
        sumCall.Confidence += callValue * callValue;
      }

      s_SumCall[iSum] = sumCall.Expected;
      s_Sum2Call[iSum] = sumCall.Confidence;
    }

    // Reduce shared memory accumulators
    // and write final result to global memory
    cg::sync(cta);
    sumReduce<real, SUM_N, THREAD_N>(s_SumCall, s_Sum2Call, cta, tile32,
                                     &d_CallValue[optionIndex]);
  }
}

static __global__ void rngSetupStates(curandState *rngState, int device_id) {
  // determine global thread id
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  // Each threadblock gets different seed,
  // Threads within a threadblock get different sequence numbers
  curand_init(blockIdx.x + gridDim.x * device_id, threadIdx.x, 0,
              &rngState[tid]);
}

////////////////////////////////////////////////////////////////////////////////
// Host-side interface to GPU Monte Carlo
////////////////////////////////////////////////////////////////////////////////

extern "C" void initMonteCarloGPU(TOptionPlan *plan) {
  checkCudaErrors(cudaMalloc(&plan->d_OptionData,
                             sizeof(__TOptionData) * (plan->optionCount)));
  checkCudaErrors(cudaMalloc(&plan->d_CallValue,
                             sizeof(__TOptionValue) * (plan->optionCount)));
  checkCudaErrors(cudaMallocHost(&plan->h_OptionData,
                                 sizeof(__TOptionData) * (plan->optionCount)));
  // Allocate internal device memory
  checkCudaErrors(cudaMallocHost(&plan->h_CallValue,
                                 sizeof(__TOptionValue) * (plan->optionCount)));
  // Allocate states for pseudo random number generators
  checkCudaErrors(cudaMalloc((void **)&plan->rngStates,
                             plan->gridSize * THREAD_N * sizeof(curandState)));
  checkCudaErrors(cudaMemset(plan->rngStates, 0,
                             plan->gridSize * THREAD_N * sizeof(curandState)));

  // place each device pathN random numbers apart on the random number sequence
  rngSetupStates<<<plan->gridSize, THREAD_N>>>(plan->rngStates, plan->device);
  getLastCudaError("rngSetupStates kernel failed.\n");
}

// Compute statistics and deallocate internal device memory
extern "C" void closeMonteCarloGPU(TOptionPlan *plan) {
  for (int i = 0; i < plan->optionCount; i++) {
    const double RT = plan->optionData[i].R * plan->optionData[i].T;
    const double sum = plan->h_CallValue[i].Expected;
    const double sum2 = plan->h_CallValue[i].Confidence;
    const double pathN = plan->pathN;
    // Derive average from the total sum and discount by riskfree rate
    plan->callValue[i].Expected = (float)(exp(-RT) * sum / pathN);
    // Standard deviation
    double stdDev = sqrt((pathN * sum2 - sum * sum) / (pathN * (pathN - 1)));
    // Confidence width; in 95% of all cases theoretical value lies within these
    // borders
    plan->callValue[i].Confidence =
        (float)(exp(-RT) * 1.96 * stdDev / sqrt(pathN));
  }

  checkCudaErrors(cudaFree(plan->rngStates));
  checkCudaErrors(cudaFreeHost(plan->h_CallValue));
  checkCudaErrors(cudaFreeHost(plan->h_OptionData));
  checkCudaErrors(cudaFree(plan->d_CallValue));
  checkCudaErrors(cudaFree(plan->d_OptionData));
}

// Main computations
extern "C" void MonteCarloGPU(TOptionPlan *plan, cudaStream_t stream) {
  __TOptionValue *h_CallValue = plan->h_CallValue;

  if (plan->optionCount <= 0 || plan->optionCount > MAX_OPTIONS) {
    printf("MonteCarloGPU(): bad option count.\n");
    return;
  }

  __TOptionData *h_OptionData = (__TOptionData *)plan->h_OptionData;

  for (int i = 0; i < plan->optionCount; i++) {
    const double T = plan->optionData[i].T;
    const double R = plan->optionData[i].R;
    const double V = plan->optionData[i].V;
    const double MuByT = (R - 0.5 * V * V) * T;
    const double VBySqrtT = V * sqrt(T);
    h_OptionData[i].S = (real)plan->optionData[i].S;
    h_OptionData[i].X = (real)plan->optionData[i].X;
    h_OptionData[i].MuByT = (real)MuByT;
    h_OptionData[i].VBySqrtT = (real)VBySqrtT;
  }

  checkCudaErrors(cudaMemcpyAsync(plan->d_OptionData, h_OptionData,
                                  plan->optionCount * sizeof(__TOptionData),
                                  cudaMemcpyHostToDevice, stream));

  MonteCarloOneBlockPerOption<<<plan->gridSize, THREAD_N, 0, stream>>>(
      plan->rngStates, (__TOptionData *)(plan->d_OptionData),
      (__TOptionValue *)(plan->d_CallValue), plan->pathN, plan->optionCount);
  getLastCudaError("MonteCarloOneBlockPerOption() execution failed\n");

  checkCudaErrors(cudaMemcpyAsync(h_CallValue, plan->d_CallValue,
                                  plan->optionCount * sizeof(__TOptionValue),
                                  cudaMemcpyDeviceToHost, stream));

  // cudaDeviceSynchronize();
}
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Global types`
			`////////////////////////////////////////////////////////////////////////////////`
			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <cooperative_groups.h>`

			`namespace cg = cooperative_groups;`
			`#include <helper_cuda.h>`
			`#include <curand_kernel.h>`
			`#include "MonteCarlo_common.h"`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Helper reduction template`
			`// Please see the "reduction" CUDA Sample for more information`
			`////////////////////////////////////////////////////////////////////////////////`
			`#include "MonteCarlo_reduction.cuh"`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Internal GPU-side data structures`
			`////////////////////////////////////////////////////////////////////////////////`
			`#define MAX_OPTIONS (1024 * 1024)`

			`// Preprocessed input option data`
			`typedef struct {`
			`real S;`
			`real X;`
			`real MuByT;`
			`real VBySqrtT;`
			`} __TOptionData;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Overloaded shortcut payoff functions for different precision modes`
			`////////////////////////////////////////////////////////////////////////////////`
			`__device__ inline float endCallValue(float S, float X, float r, float MuByT,`
			`float VBySqrtT) {`
			`float callValue = S * __expf(MuByT + VBySqrtT * r) - X;`
			`return (callValue > 0.0F) ? callValue : 0.0F;`
			`}`

			`__device__ inline double endCallValue(double S, double X, double r,`
			`double MuByT, double VBySqrtT) {`
			`double callValue = S * exp(MuByT + VBySqrtT * r) - X;`
			`return (callValue > 0.0) ? callValue : 0.0;`
			`}`

			`#define THREAD_N 256`

			`////////////////////////////////////////////////////////////////////////////////`
			`// This kernel computes the integral over all paths using a single thread block`
			`// per option. It is fastest when the number of thread blocks times the work per`
			`// block is high enough to keep the GPU busy.`
			`////////////////////////////////////////////////////////////////////////////////`
			`static __global__ void MonteCarloOneBlockPerOption(`
			`curandState *__restrict rngStates,`
			`const __TOptionData *__restrict d_OptionData,`
			`__TOptionValue *__restrict d_CallValue, int pathN, int optionN) {`
			`// Handle to thread block group`
			`cg::thread_block cta = cg::this_thread_block();`
			`cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);`

			`const int SUM_N = THREAD_N;`
			`__shared__ real s_SumCall[SUM_N];`
			`__shared__ real s_Sum2Call[SUM_N];`

			`// determine global thread id`
			`int tid = threadIdx.x + blockIdx.x * blockDim.x;`

			`// Copy random number state to local memory for efficiency`
			`curandState localState = rngStates[tid];`
			`for (int optionIndex = blockIdx.x; optionIndex < optionN;`
			`optionIndex += gridDim.x) {`
			`const real S = d_OptionData[optionIndex].S;`
			`const real X = d_OptionData[optionIndex].X;`
			`const real MuByT = d_OptionData[optionIndex].MuByT;`
			`const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;`

			`// Cycle through the entire samples array:`
			`// derive end stock price for each path`
			`// accumulate partial integrals into intermediate shared memory buffer`
			`for (int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x) {`
			`__TOptionValue sumCall = {0, 0};`

			`#pragma unroll 8`
			`for (int i = iSum; i < pathN; i += SUM_N) {`
			`real r = curand_normal(&localState);`
			`real callValue = endCallValue(S, X, r, MuByT, VBySqrtT);`
			`sumCall.Expected += callValue;`
			`sumCall.Confidence += callValue * callValue;`
			`}`

			`s_SumCall[iSum] = sumCall.Expected;`
			`s_Sum2Call[iSum] = sumCall.Confidence;`
			`}`

			`// Reduce shared memory accumulators`
			`// and write final result to global memory`
			`cg::sync(cta);`
			`sumReduce<real, SUM_N, THREAD_N>(s_SumCall, s_Sum2Call, cta, tile32,`
			`&d_CallValue[optionIndex]);`
			`}`
			`}`

			`static __global__ void rngSetupStates(curandState *rngState, int device_id) {`
			`// determine global thread id`
			`int tid = threadIdx.x + blockIdx.x * blockDim.x;`
			`// Each threadblock gets different seed,`
			`// Threads within a threadblock get different sequence numbers`
			`curand_init(blockIdx.x + gridDim.x * device_id, threadIdx.x, 0,`
			`&rngState[tid]);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Host-side interface to GPU Monte Carlo`
			`////////////////////////////////////////////////////////////////////////////////`

			`extern "C" void initMonteCarloGPU(TOptionPlan *plan) {`
			`checkCudaErrors(cudaMalloc(&plan->d_OptionData,`
			`sizeof(__TOptionData) * (plan->optionCount)));`
			`checkCudaErrors(cudaMalloc(&plan->d_CallValue,`
			`sizeof(__TOptionValue) * (plan->optionCount)));`
			`checkCudaErrors(cudaMallocHost(&plan->h_OptionData,`
			`sizeof(__TOptionData) * (plan->optionCount)));`
			`// Allocate internal device memory`
			`checkCudaErrors(cudaMallocHost(&plan->h_CallValue,`
			`sizeof(__TOptionValue) * (plan->optionCount)));`
			`// Allocate states for pseudo random number generators`
			`checkCudaErrors(cudaMalloc((void **)&plan->rngStates,`
			`plan->gridSize * THREAD_N * sizeof(curandState)));`
			`checkCudaErrors(cudaMemset(plan->rngStates, 0,`
			`plan->gridSize * THREAD_N * sizeof(curandState)));`

			`// place each device pathN random numbers apart on the random number sequence`
			`rngSetupStates<<<plan->gridSize, THREAD_N>>>(plan->rngStates, plan->device);`
			`getLastCudaError("rngSetupStates kernel failed.\n");`
			`}`

			`// Compute statistics and deallocate internal device memory`
			`extern "C" void closeMonteCarloGPU(TOptionPlan *plan) {`
			`for (int i = 0; i < plan->optionCount; i++) {`
			`const double RT = plan->optionData[i].R * plan->optionData[i].T;`
			`const double sum = plan->h_CallValue[i].Expected;`
			`const double sum2 = plan->h_CallValue[i].Confidence;`
			`const double pathN = plan->pathN;`
			`// Derive average from the total sum and discount by riskfree rate`
			`plan->callValue[i].Expected = (float)(exp(-RT) * sum / pathN);`
			`// Standard deviation`
			`double stdDev = sqrt((pathN * sum2 - sum * sum) / (pathN * (pathN - 1)));`
			`// Confidence width; in 95% of all cases theoretical value lies within these`
			`// borders`
			`plan->callValue[i].Confidence =`
			`(float)(exp(-RT) * 1.96 * stdDev / sqrt(pathN));`
			`}`

			`checkCudaErrors(cudaFree(plan->rngStates));`
			`checkCudaErrors(cudaFreeHost(plan->h_CallValue));`
			`checkCudaErrors(cudaFreeHost(plan->h_OptionData));`
			`checkCudaErrors(cudaFree(plan->d_CallValue));`
			`checkCudaErrors(cudaFree(plan->d_OptionData));`
			`}`

			`// Main computations`
			`extern "C" void MonteCarloGPU(TOptionPlan *plan, cudaStream_t stream) {`
			`__TOptionValue *h_CallValue = plan->h_CallValue;`

			`if (plan->optionCount <= 0 \|\| plan->optionCount > MAX_OPTIONS) {`
			`printf("MonteCarloGPU(): bad option count.\n");`
			`return;`
			`}`

			`__TOptionData h_OptionData = (__TOptionData )plan->h_OptionData;`

			`for (int i = 0; i < plan->optionCount; i++) {`
			`const double T = plan->optionData[i].T;`
			`const double R = plan->optionData[i].R;`
			`const double V = plan->optionData[i].V;`
			`const double MuByT = (R - 0.5 * V * V) * T;`
			`const double VBySqrtT = V * sqrt(T);`
			`h_OptionData[i].S = (real)plan->optionData[i].S;`
			`h_OptionData[i].X = (real)plan->optionData[i].X;`
			`h_OptionData[i].MuByT = (real)MuByT;`
			`h_OptionData[i].VBySqrtT = (real)VBySqrtT;`
			`}`

			`checkCudaErrors(cudaMemcpyAsync(plan->d_OptionData, h_OptionData,`
			`plan->optionCount * sizeof(__TOptionData),`
			`cudaMemcpyHostToDevice, stream));`

			`MonteCarloOneBlockPerOption<<<plan->gridSize, THREAD_N, 0, stream>>>(`
			`plan->rngStates, (__TOptionData *)(plan->d_OptionData),`
			`(__TOptionValue *)(plan->d_CallValue), plan->pathN, plan->optionCount);`
			`getLastCudaError("MonteCarloOneBlockPerOption() execution failed\n");`

			`checkCudaErrors(cudaMemcpyAsync(h_CallValue, plan->d_CallValue,`
			`plan->optionCount * sizeof(__TOptionValue),`
			`cudaMemcpyDeviceToHost, stream));`

			`// cudaDeviceSynchronize();`
			`}`