/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "../inc/pricingengine.h"

#include <stdexcept>
#include <string>
#include <vector>
#include <numeric>
#include <typeinfo>
#include <cuda_runtime.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;
#include <curand_kernel.h>

#include "../inc/asianoption.h"
#include "../inc/cudasharedmem.h"

using std::string;
using std::vector;

// RNG init kernel
__global__ void initRNG(curandState *const rngStates,
                        const unsigned int seed) {
  // Determine thread ID
  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialise the RNG
  curand_init(seed, tid, 0, &rngStates[tid]);
}

__device__ inline float getPathStep(float &drift, float &diffusion,
                                    curandState &state) {
  return expf(drift + diffusion * curand_normal(&state));
}

__device__ inline double getPathStep(double &drift, double &diffusion,
                                     curandState &state) {
  return exp(drift + diffusion * curand_normal_double(&state));
}

// Path generation kernel
template <typename Real>
__global__ void generatePaths(Real *const paths, curandState *const rngStates,
                              const AsianOption<Real> *const option,
                              const unsigned int numSims,
                              const unsigned int numTimesteps) {
  // Determine thread ID
  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  // Compute parameters
  Real drift = (option->r - static_cast<Real>(0.5) * option->sigma *
                               option->sigma) *
               option->dt;
  Real diffusion = option->sigma * sqrt(option->dt);

  // Initialise the RNG
  curandState localState = rngStates[tid];

  for (unsigned int i = tid; i < numSims; i += step) {
    // Shift the output pointer
    Real *output = paths + i;

    // Simulate the path
    Real s = static_cast<Real>(1);

    for (unsigned int t = 0; t < numTimesteps; t++, output += numSims) {
      s *= getPathStep(drift, diffusion, localState);
      *output = s;
    }
  }
}

template <typename Real>
__device__ Real reduce_sum(Real in, cg::thread_block cta) {
  SharedMemory<Real> sdata;

  // Perform first level of reduction:
  // - Write to shared memory
  unsigned int ltid = threadIdx.x;

  sdata[ltid] = in;
  cg::sync(cta);

  // Do reduction in shared mem
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (ltid < s) {
      sdata[ltid] += sdata[ltid + s];
    }

    cg::sync(cta);
  }

  return sdata[0];
}
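// Background on the two device routines above. generatePaths() discretises
// geometric Brownian motion in log space: each step draws a standard normal
// Z and advances the spot-normalised price by
//
//   S(t + dt) = S(t) * exp((r - sigma^2 / 2) * dt + sigma * sqrt(dt) * Z)
//
// which is why the per-step work in getPathStep() reduces to one
// curand_normal() draw and one exp(). reduce_sum() is a tree reduction in
// shared memory and relies on blockDim.x being a power of two: with
// blockDim.x == 8, for instance, the stride s takes the values 4, 2, 1,
// pairing lane k with lane k + s at each level until the block total sits
// in sdata[0].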
// Valuation kernel
template <typename Real>
__global__ void computeValue(Real *const values, const Real *const paths,
                             const AsianOption<Real> *const option,
                             const unsigned int numSims,
                             const unsigned int numTimesteps) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();

  // Determine thread ID
  unsigned int bid = blockIdx.x;
  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  Real sumPayoffs = static_cast<Real>(0);

  for (unsigned int i = tid; i < numSims; i += step) {
    // Shift the input pointer
    const Real *path = paths + i;

    // Compute the arithmetic average
    Real avg = static_cast<Real>(0);

    for (unsigned int t = 0; t < numTimesteps; t++, path += numSims) {
      avg += *path;
    }

    avg = avg * option->spot / numTimesteps;

    // Compute the payoff
    Real payoff = avg - option->strike;

    if (option->type == AsianOption<Real>::Put) {
      payoff = -payoff;
    }

    payoff = max(static_cast<Real>(0), payoff);

    // Accumulate payoff locally
    sumPayoffs += payoff;
  }

  // Reduce within the block
  sumPayoffs = reduce_sum<Real>(sumPayoffs, cta);

  // Store the result
  if (threadIdx.x == 0) {
    values[bid] = sumPayoffs;
  }
}

template <typename Real>
PricingEngine<Real>::PricingEngine(unsigned int numSims, unsigned int device,
                                   unsigned int threadBlockSize,
                                   unsigned int seed)
    : m_numSims(numSims),
      m_device(device),
      m_threadBlockSize(threadBlockSize),
      m_seed(seed) {}

template <typename Real>
void PricingEngine<Real>::operator()(AsianOption<Real> &option) {
  cudaError_t cudaResult = cudaSuccess;
  struct cudaDeviceProp deviceProperties;
  struct cudaFuncAttributes funcAttributes;

  // Get device properties
  cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);

  if (cudaResult != cudaSuccess) {
    string msg("Could not get device properties: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  // Check precision is valid
  unsigned int deviceVersion =
      deviceProperties.major * 10 + deviceProperties.minor;

  if (typeid(Real) == typeid(double) && deviceVersion < 13) {
    throw std::runtime_error("Device does not have double precision support");
  }

  // Attach to GPU
  cudaResult = cudaSetDevice(m_device);

  if (cudaResult != cudaSuccess) {
    string msg("Could not set CUDA device: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  // Determine how to divide the work between cores
  dim3 block;
  dim3 grid;
  block.x = m_threadBlockSize;
  grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;

  // Aim to launch around ten or more times as many blocks as there
  // are multiprocessors on the target device.
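  // For example (illustrative numbers, not taken from the sample): with
  // m_numSims = 1,000,000 and a 128-thread block, the initial grid.x is
  // 7813; on a hypothetical 20-SM device the loop below halves grid.x while
  // it exceeds 2 * 10 * 20 = 400, leaving 244 blocks, i.e. roughly twelve
  // blocks per multiprocessor. The grid-stride loops in the kernels then
  // cover the remaining simulations.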
  unsigned int blocksPerSM = 10;
  unsigned int numSMs = deviceProperties.multiProcessorCount;

  while (grid.x > 2 * blocksPerSM * numSMs) {
    grid.x >>= 1;
  }

  // Get initRNG function properties and check the maximum block size
  cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG);

  if (cudaResult != cudaSuccess) {
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) {
    throw std::runtime_error(
        "Block X dimension is too large for initRNG kernel");
  }

  // Get generatePaths function properties and check the maximum block size
  cudaResult = cudaFuncGetAttributes(&funcAttributes, generatePaths<Real>);

  if (cudaResult != cudaSuccess) {
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) {
    throw std::runtime_error(
        "Block X dimension is too large for generatePaths kernel");
  }

  // Get computeValue function properties and check the maximum block size
  cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);

  if (cudaResult != cudaSuccess) {
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) {
    throw std::runtime_error(
        "Block X dimension is too large for computeValue kernel");
  }

  // Setup problem on GPU
  AsianOption<Real> *d_option = 0;
  cudaResult = cudaMalloc((void **)&d_option, sizeof(AsianOption<Real>));

  if (cudaResult != cudaSuccess) {
    string msg("Could not allocate memory on device for option data: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  cudaResult = cudaMemcpy(d_option, &option, sizeof(AsianOption<Real>),
                          cudaMemcpyHostToDevice);

  if (cudaResult != cudaSuccess) {
    string msg("Could not copy data to device: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  // Allocate memory for paths (stored timestep-major: path i at step t lives
  // at index t * numSims + i, so consecutive threads touch consecutive
  // addresses and accesses coalesce)
  Real *d_paths = 0;
  int numTimesteps = static_cast<int>(option.tenor / option.dt);
  cudaResult =
      cudaMalloc((void **)&d_paths, m_numSims * numTimesteps * sizeof(Real));

  if (cudaResult != cudaSuccess) {
    string msg("Could not allocate memory on device for paths: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  // Allocate memory for RNG states
  curandState *d_rngStates = 0;
  cudaResult = cudaMalloc((void **)&d_rngStates,
                          grid.x * block.x * sizeof(curandState));

  if (cudaResult != cudaSuccess) {
    string msg("Could not allocate memory on device for RNG state: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  // Allocate memory for result
  Real *d_values = 0;
  cudaResult = cudaMalloc((void **)&d_values, grid.x * sizeof(Real));

  if (cudaResult != cudaSuccess) {
    string msg("Could not allocate memory on device for partial results: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }

  // Initialise RNG
  initRNG<<<grid, block>>>(d_rngStates, m_seed);

  // Generate paths
  generatePaths<Real><<<grid, block>>>(d_paths, d_rngStates, d_option,
                                       m_numSims, numTimesteps);

  // Compute value (one block.x-element dynamic shared memory buffer is
  // needed for the reduction in reduce_sum)
  computeValue<Real><<<grid, block, block.x * sizeof(Real)>>>(
      d_values, d_paths, d_option, m_numSims, numTimesteps);

  // Copy partial results back
  vector<Real> values(grid.x);
  cudaResult = cudaMemcpy(&values[0], d_values, grid.x * sizeof(Real),
                          cudaMemcpyDeviceToHost);

  if (cudaResult != cudaSuccess) {
    string msg("Could not copy partial results to host: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
  }
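  // Each entry of values[] now holds one block's sum of payoffs, so the
  // discounted Monte Carlo estimate assembled below is
  //
  //   value = exp(-r * T) * (1 / numSims) * sum_i max(avg_i - K, 0)
  //
  // (with the payoff sign flipped for a put), where avg_i is the arithmetic
  // average along path i and K is the strike.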
  // Complete sum-reduction on host
  option.value =
      std::accumulate(values.begin(), values.end(), static_cast<Real>(0));

  // Compute the mean
  option.value /= m_numSims;

  // Discount to present value
  option.value *= exp(-option.r * option.tenor);

  // Cleanup
  if (d_option) {
    cudaFree(d_option);
    d_option = 0;
  }

  if (d_paths) {
    cudaFree(d_paths);
    d_paths = 0;
  }

  if (d_rngStates) {
    cudaFree(d_rngStates);
    d_rngStates = 0;
  }

  if (d_values) {
    cudaFree(d_values);
    d_values = 0;
  }
}

// Explicit template instantiation
template class PricingEngine<float>;
template class PricingEngine<double>;
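// Usage sketch. The parameter values below are illustrative only, and the
// Call enumerator is assumed to exist alongside the Put enumerator referenced
// above; see ../inc/asianoption.h for the actual AsianOption definition.
//
//   AsianOption<float> option;
//   option.spot   = 40.0f;       // current underlying price
//   option.strike = 35.0f;       // strike on the arithmetic average
//   option.r      = 0.03f;       // risk-free rate
//   option.sigma  = 0.20f;       // volatility
//   option.tenor  = 1.0f;        // one year to expiry
//   option.dt     = 1.0f / 261;  // daily monitoring
//   option.type   = AsianOption<float>::Call;
//
//   // 100,000 simulations on device 0, 128-thread blocks, RNG seed 1234.
//   PricingEngine<float> engine(100000, 0, 128, 1234);
//   engine(option);
//   // option.value now holds the discounted Monte Carlo price estimate.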