cuda-samples/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_kernel.cu
2022-01-13 11:35:24 +05:30

227 lines
9.7 KiB
Plaintext

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
// Global types
////////////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <curand_kernel.h>
#include "MonteCarlo_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper reduction template
// Please see the "reduction" CUDA Sample for more information
////////////////////////////////////////////////////////////////////////////////
#include "MonteCarlo_reduction.cuh"
////////////////////////////////////////////////////////////////////////////////
// Internal GPU-side data structures
////////////////////////////////////////////////////////////////////////////////
#define MAX_OPTIONS (1024 * 1024)
// Preprocessed input option data
typedef struct {
real S;
real X;
real MuByT;
real VBySqrtT;
} __TOptionData;
////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut payoff functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
__device__ inline float endCallValue(float S, float X, float r, float MuByT,
float VBySqrtT) {
float callValue = S * __expf(MuByT + VBySqrtT * r) - X;
return (callValue > 0.0F) ? callValue : 0.0F;
}
__device__ inline double endCallValue(double S, double X, double r,
double MuByT, double VBySqrtT) {
double callValue = S * exp(MuByT + VBySqrtT * r) - X;
return (callValue > 0.0) ? callValue : 0.0;
}
#define THREAD_N 256
////////////////////////////////////////////////////////////////////////////////
// This kernel computes the integral over all paths using a single thread block
// per option. It is fastest when the number of thread blocks times the work per
// block is high enough to keep the GPU busy.
////////////////////////////////////////////////////////////////////////////////
static __global__ void MonteCarloOneBlockPerOption(
curandState *__restrict rngStates,
const __TOptionData *__restrict d_OptionData,
__TOptionValue *__restrict d_CallValue, int pathN, int optionN) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
const int SUM_N = THREAD_N;
__shared__ real s_SumCall[SUM_N];
__shared__ real s_Sum2Call[SUM_N];
// determine global thread id
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Copy random number state to local memory for efficiency
curandState localState = rngStates[tid];
for (int optionIndex = blockIdx.x; optionIndex < optionN;
optionIndex += gridDim.x) {
const real S = d_OptionData[optionIndex].S;
const real X = d_OptionData[optionIndex].X;
const real MuByT = d_OptionData[optionIndex].MuByT;
const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;
// Cycle through the entire samples array:
// derive end stock price for each path
// accumulate partial integrals into intermediate shared memory buffer
for (int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x) {
__TOptionValue sumCall = {0, 0};
#pragma unroll 8
for (int i = iSum; i < pathN; i += SUM_N) {
real r = curand_normal(&localState);
real callValue = endCallValue(S, X, r, MuByT, VBySqrtT);
sumCall.Expected += callValue;
sumCall.Confidence += callValue * callValue;
}
s_SumCall[iSum] = sumCall.Expected;
s_Sum2Call[iSum] = sumCall.Confidence;
}
// Reduce shared memory accumulators
// and write final result to global memory
cg::sync(cta);
sumReduce<real, SUM_N, THREAD_N>(s_SumCall, s_Sum2Call, cta, tile32,
&d_CallValue[optionIndex]);
}
}
static __global__ void rngSetupStates(curandState *rngState, int device_id) {
// determine global thread id
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Each threadblock gets different seed,
// Threads within a threadblock get different sequence numbers
curand_init(blockIdx.x + gridDim.x * device_id, threadIdx.x, 0,
&rngState[tid]);
}
////////////////////////////////////////////////////////////////////////////////
// Host-side interface to GPU Monte Carlo
////////////////////////////////////////////////////////////////////////////////
extern "C" void initMonteCarloGPU(TOptionPlan *plan) {
checkCudaErrors(cudaMalloc(&plan->d_OptionData,
sizeof(__TOptionData) * (plan->optionCount)));
checkCudaErrors(cudaMalloc(&plan->d_CallValue,
sizeof(__TOptionValue) * (plan->optionCount)));
checkCudaErrors(cudaMallocHost(&plan->h_OptionData,
sizeof(__TOptionData) * (plan->optionCount)));
// Allocate internal device memory
checkCudaErrors(cudaMallocHost(&plan->h_CallValue,
sizeof(__TOptionValue) * (plan->optionCount)));
// Allocate states for pseudo random number generators
checkCudaErrors(cudaMalloc((void **)&plan->rngStates,
plan->gridSize * THREAD_N * sizeof(curandState)));
checkCudaErrors(cudaMemset(plan->rngStates, 0,
plan->gridSize * THREAD_N * sizeof(curandState)));
// place each device pathN random numbers apart on the random number sequence
rngSetupStates<<<plan->gridSize, THREAD_N>>>(plan->rngStates, plan->device);
getLastCudaError("rngSetupStates kernel failed.\n");
}
// Compute statistics and deallocate internal device memory
extern "C" void closeMonteCarloGPU(TOptionPlan *plan) {
for (int i = 0; i < plan->optionCount; i++) {
const double RT = plan->optionData[i].R * plan->optionData[i].T;
const double sum = plan->h_CallValue[i].Expected;
const double sum2 = plan->h_CallValue[i].Confidence;
const double pathN = plan->pathN;
// Derive average from the total sum and discount by riskfree rate
plan->callValue[i].Expected = (float)(exp(-RT) * sum / pathN);
// Standard deviation
double stdDev = sqrt((pathN * sum2 - sum * sum) / (pathN * (pathN - 1)));
// Confidence width; in 95% of all cases theoretical value lies within these
// borders
plan->callValue[i].Confidence =
(float)(exp(-RT) * 1.96 * stdDev / sqrt(pathN));
}
checkCudaErrors(cudaFree(plan->rngStates));
checkCudaErrors(cudaFreeHost(plan->h_CallValue));
checkCudaErrors(cudaFreeHost(plan->h_OptionData));
checkCudaErrors(cudaFree(plan->d_CallValue));
checkCudaErrors(cudaFree(plan->d_OptionData));
}
// Main computations
extern "C" void MonteCarloGPU(TOptionPlan *plan, cudaStream_t stream) {
__TOptionValue *h_CallValue = plan->h_CallValue;
if (plan->optionCount <= 0 || plan->optionCount > MAX_OPTIONS) {
printf("MonteCarloGPU(): bad option count.\n");
return;
}
__TOptionData *h_OptionData = (__TOptionData *)plan->h_OptionData;
for (int i = 0; i < plan->optionCount; i++) {
const double T = plan->optionData[i].T;
const double R = plan->optionData[i].R;
const double V = plan->optionData[i].V;
const double MuByT = (R - 0.5 * V * V) * T;
const double VBySqrtT = V * sqrt(T);
h_OptionData[i].S = (real)plan->optionData[i].S;
h_OptionData[i].X = (real)plan->optionData[i].X;
h_OptionData[i].MuByT = (real)MuByT;
h_OptionData[i].VBySqrtT = (real)VBySqrtT;
}
checkCudaErrors(cudaMemcpyAsync(plan->d_OptionData, h_OptionData,
plan->optionCount * sizeof(__TOptionData),
cudaMemcpyHostToDevice, stream));
MonteCarloOneBlockPerOption<<<plan->gridSize, THREAD_N, 0, stream>>>(
plan->rngStates, (__TOptionData *)(plan->d_OptionData),
(__TOptionValue *)(plan->d_CallValue), plan->pathN, plan->optionCount);
getLastCudaError("MonteCarloOneBlockPerOption() execution failed\n");
checkCudaErrors(cudaMemcpyAsync(h_CallValue, plan->d_CallValue,
plan->optionCount * sizeof(__TOptionValue),
cudaMemcpyDeviceToHost, stream));
// cudaDeviceSynchronize();
}