mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-28 16:39:15 +08:00
488 lines
15 KiB
C++
488 lines
15 KiB
C++
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This sample evaluates fair call price for a
|
|
* given set of European options using Monte Carlo approach.
|
|
* See supplied whitepaper for more explanations.
|
|
*/
|
|
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <math.h>
|
|
#include <cuda_runtime.h>
|
|
|
|
// includes, project
|
|
#include <helper_functions.h> // Helper functions (utilities, parsing, timing)
|
|
#include <helper_cuda.h> // helper functions (cuda error checking and initialization)
|
|
#include <multithreading.h>
|
|
|
|
#include "MonteCarlo_common.h"
|
|
|
|
int *pArgc = NULL;
|
|
char **pArgv = NULL;
|
|
|
|
#ifdef WIN32
|
|
#define strcasecmp _strcmpi
|
|
#endif
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Common functions
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
float randFloat(float low, float high) {
|
|
float t = (float)rand() / (float)RAND_MAX;
|
|
return (1.0f - t) * low + t * high;
|
|
}
|
|
|
|
/// Utility function to tweak problem size for small GPUs
|
|
int adjustProblemSize(int GPU_N, int default_nOptions) {
|
|
int nOptions = default_nOptions;
|
|
|
|
// select problem size
|
|
for (int i = 0; i < GPU_N; i++) {
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
|
|
int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
|
|
deviceProp.multiProcessorCount;
|
|
|
|
if (cudaCores <= 32) {
|
|
nOptions = (nOptions < cudaCores / 2 ? nOptions : cudaCores / 2);
|
|
}
|
|
}
|
|
|
|
return nOptions;
|
|
}
|
|
|
|
int adjustGridSize(int GPUIndex, int defaultGridSize) {
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, GPUIndex));
|
|
int maxGridSize = deviceProp.multiProcessorCount * 40;
|
|
return ((defaultGridSize > maxGridSize) ? maxGridSize : defaultGridSize);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// CPU reference functions
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
extern "C" void MonteCarloCPU(TOptionValue &callValue, TOptionData optionData,
|
|
float *h_Random, int pathN);
|
|
|
|
// Black-Scholes formula for call options
|
|
extern "C" void BlackScholesCall(float &CallResult, TOptionData optionData);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// GPU-driving host thread
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Timer
|
|
StopWatchInterface **hTimer = NULL;
|
|
|
|
static CUT_THREADPROC solverThread(TOptionPlan *plan) {
|
|
// Init GPU
|
|
checkCudaErrors(cudaSetDevice(plan->device));
|
|
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));
|
|
|
|
// Start the timer
|
|
sdkStartTimer(&hTimer[plan->device]);
|
|
|
|
// Allocate intermediate memory for MC integrator and initialize
|
|
// RNG states
|
|
initMonteCarloGPU(plan);
|
|
|
|
// Main computation
|
|
MonteCarloGPU(plan);
|
|
|
|
checkCudaErrors(cudaDeviceSynchronize());
|
|
|
|
// Stop the timer
|
|
sdkStopTimer(&hTimer[plan->device]);
|
|
|
|
// Shut down this GPU
|
|
closeMonteCarloGPU(plan);
|
|
|
|
cudaStreamSynchronize(0);
|
|
|
|
printf("solverThread() finished - GPU Device %d: %s\n", plan->device,
|
|
deviceProp.name);
|
|
|
|
CUT_THREADEND;
|
|
}
|
|
|
|
static void multiSolver(TOptionPlan *plan, int nPlans) {
|
|
// allocate and initialize an array of stream handles
|
|
cudaStream_t *streams = (cudaStream_t *)malloc(nPlans * sizeof(cudaStream_t));
|
|
cudaEvent_t *events = (cudaEvent_t *)malloc(nPlans * sizeof(cudaEvent_t));
|
|
|
|
for (int i = 0; i < nPlans; i++) {
|
|
checkCudaErrors(cudaSetDevice(plan[i].device));
|
|
checkCudaErrors(cudaStreamCreate(&(streams[i])));
|
|
checkCudaErrors(cudaEventCreate(&(events[i])));
|
|
}
|
|
|
|
// Init Each GPU
|
|
// In CUDA 4.0 we can call cudaSetDevice multiple times to target each device
|
|
// Set the device desired, then perform initializations on that device
|
|
|
|
for (int i = 0; i < nPlans; i++) {
|
|
// set the target device to perform initialization on
|
|
checkCudaErrors(cudaSetDevice(plan[i].device));
|
|
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan[i].device));
|
|
|
|
// Allocate intermediate memory for MC integrator
|
|
// and initialize RNG state
|
|
initMonteCarloGPU(&plan[i]);
|
|
}
|
|
|
|
for (int i = 0; i < nPlans; i++) {
|
|
checkCudaErrors(cudaSetDevice(plan[i].device));
|
|
checkCudaErrors(cudaDeviceSynchronize());
|
|
}
|
|
|
|
// Start the timer
|
|
sdkResetTimer(&hTimer[0]);
|
|
sdkStartTimer(&hTimer[0]);
|
|
|
|
for (int i = 0; i < nPlans; i++) {
|
|
checkCudaErrors(cudaSetDevice(plan[i].device));
|
|
|
|
// Main computations
|
|
MonteCarloGPU(&plan[i], streams[i]);
|
|
|
|
checkCudaErrors(cudaEventRecord(events[i], streams[i]));
|
|
}
|
|
|
|
for (int i = 0; i < nPlans; i++) {
|
|
checkCudaErrors(cudaSetDevice(plan[i].device));
|
|
cudaEventSynchronize(events[i]);
|
|
}
|
|
|
|
// Stop the timer
|
|
sdkStopTimer(&hTimer[0]);
|
|
|
|
for (int i = 0; i < nPlans; i++) {
|
|
checkCudaErrors(cudaSetDevice(plan[i].device));
|
|
closeMonteCarloGPU(&plan[i]);
|
|
checkCudaErrors(cudaStreamDestroy(streams[i]));
|
|
checkCudaErrors(cudaEventDestroy(events[i]));
|
|
}
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// Main program
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
#define DO_CPU
|
|
#undef DO_CPU
|
|
|
|
#define PRINT_RESULTS
|
|
#undef PRINT_RESULTS
|
|
|
|
void usage() {
|
|
printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n");
|
|
printf("Method=threaded: 1 CPU thread for each GPU [default]\n");
|
|
printf(
|
|
" streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or "
|
|
"newer)\n");
|
|
printf("Scaling=strong : constant problem size\n");
|
|
printf(
|
|
" weak : problem size scales with number of available GPUs "
|
|
"[default]\n");
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
char *multiMethodChoice = NULL;
|
|
char *scalingChoice = NULL;
|
|
bool use_threads = true;
|
|
bool bqatest = false;
|
|
bool strongScaling = false;
|
|
|
|
pArgc = &argc;
|
|
pArgv = argv;
|
|
|
|
printf("%s Starting...\n\n", argv[0]);
|
|
|
|
if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) {
|
|
bqatest = true;
|
|
}
|
|
|
|
getCmdLineArgumentString(argc, (const char **)argv, "method",
|
|
&multiMethodChoice);
|
|
getCmdLineArgumentString(argc, (const char **)argv, "scaling",
|
|
&scalingChoice);
|
|
|
|
if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
|
|
checkCmdLineFlag(argc, (const char **)argv, "help")) {
|
|
usage();
|
|
exit(EXIT_SUCCESS);
|
|
}
|
|
|
|
if (multiMethodChoice == NULL) {
|
|
use_threads = false;
|
|
} else {
|
|
if (!strcasecmp(multiMethodChoice, "threaded")) {
|
|
use_threads = true;
|
|
} else {
|
|
use_threads = false;
|
|
}
|
|
}
|
|
|
|
if (use_threads == false) {
|
|
printf("Using single CPU thread for multiple GPUs\n");
|
|
}
|
|
|
|
if (scalingChoice == NULL) {
|
|
strongScaling = false;
|
|
} else {
|
|
if (!strcasecmp(scalingChoice, "strong")) {
|
|
strongScaling = true;
|
|
} else {
|
|
strongScaling = false;
|
|
}
|
|
}
|
|
|
|
// GPU number present in the system
|
|
int GPU_N;
|
|
checkCudaErrors(cudaGetDeviceCount(&GPU_N));
|
|
int nOptions = 8 * 1024;
|
|
|
|
nOptions = adjustProblemSize(GPU_N, nOptions);
|
|
|
|
// select problem size
|
|
int scale = (strongScaling) ? 1 : GPU_N;
|
|
int OPT_N = nOptions * scale;
|
|
int PATH_N = 262144;
|
|
|
|
// initialize the timers
|
|
hTimer = new StopWatchInterface *[GPU_N];
|
|
|
|
for (int i = 0; i < GPU_N; i++) {
|
|
sdkCreateTimer(&hTimer[i]);
|
|
sdkResetTimer(&hTimer[i]);
|
|
}
|
|
|
|
// Input data array
|
|
TOptionData *optionData = new TOptionData[OPT_N];
|
|
// Final GPU MC results
|
|
TOptionValue *callValueGPU = new TOptionValue[OPT_N];
|
|
//"Theoretical" call values by Black-Scholes formula
|
|
float *callValueBS = new float[OPT_N];
|
|
// Solver config
|
|
TOptionPlan *optionSolver = new TOptionPlan[GPU_N];
|
|
// OS thread ID
|
|
CUTThread *threadID = new CUTThread[GPU_N];
|
|
|
|
int gpuBase, gpuIndex;
|
|
int i;
|
|
|
|
float time;
|
|
|
|
double delta, ref, sumDelta, sumRef, sumReserve;
|
|
|
|
printf("MonteCarloMultiGPU\n");
|
|
printf("==================\n");
|
|
printf("Parallelization method = %s\n",
|
|
use_threads ? "threaded" : "streamed");
|
|
printf("Problem scaling = %s\n", strongScaling ? "strong" : "weak");
|
|
printf("Number of GPUs = %d\n", GPU_N);
|
|
printf("Total number of options = %d\n", OPT_N);
|
|
printf("Number of paths = %d\n", PATH_N);
|
|
|
|
printf("main(): generating input data...\n");
|
|
srand(123);
|
|
|
|
for (i = 0; i < OPT_N; i++) {
|
|
optionData[i].S = randFloat(5.0f, 50.0f);
|
|
optionData[i].X = randFloat(10.0f, 25.0f);
|
|
optionData[i].T = randFloat(1.0f, 5.0f);
|
|
optionData[i].R = 0.06f;
|
|
optionData[i].V = 0.10f;
|
|
callValueGPU[i].Expected = -1.0f;
|
|
callValueGPU[i].Confidence = -1.0f;
|
|
}
|
|
|
|
printf("main(): starting %i host threads...\n", GPU_N);
|
|
|
|
// Get option count for each GPU
|
|
for (i = 0; i < GPU_N; i++) {
|
|
optionSolver[i].optionCount = OPT_N / GPU_N;
|
|
}
|
|
|
|
// Take into account cases with "odd" option counts
|
|
for (i = 0; i < (OPT_N % GPU_N); i++) {
|
|
optionSolver[i].optionCount++;
|
|
}
|
|
|
|
// Assign GPU option ranges
|
|
gpuBase = 0;
|
|
|
|
for (i = 0; i < GPU_N; i++) {
|
|
optionSolver[i].device = i;
|
|
optionSolver[i].optionData = optionData + gpuBase;
|
|
optionSolver[i].callValue = callValueGPU + gpuBase;
|
|
optionSolver[i].pathN = PATH_N;
|
|
optionSolver[i].gridSize =
|
|
adjustGridSize(optionSolver[i].device, optionSolver[i].optionCount);
|
|
gpuBase += optionSolver[i].optionCount;
|
|
}
|
|
|
|
if (use_threads || bqatest) {
|
|
// Start CPU thread for each GPU
|
|
for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) {
|
|
threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread,
|
|
&optionSolver[gpuIndex]);
|
|
}
|
|
|
|
printf("main(): waiting for GPU results...\n");
|
|
cutWaitForThreads(threadID, GPU_N);
|
|
|
|
printf("main(): GPU statistics, threaded\n");
|
|
|
|
for (i = 0; i < GPU_N; i++) {
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(
|
|
cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
|
|
printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
|
|
printf("Options : %i\n", optionSolver[i].optionCount);
|
|
printf("Simulation paths: %i\n", optionSolver[i].pathN);
|
|
time = sdkGetTimerValue(&hTimer[i]);
|
|
printf("Total time (ms.): %f\n", time);
|
|
printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
|
|
}
|
|
|
|
printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
|
|
sumDelta = 0;
|
|
sumRef = 0;
|
|
sumReserve = 0;
|
|
|
|
for (i = 0; i < OPT_N; i++) {
|
|
BlackScholesCall(callValueBS[i], optionData[i]);
|
|
delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
|
|
ref = callValueBS[i];
|
|
sumDelta += delta;
|
|
sumRef += fabs(ref);
|
|
|
|
if (delta > 1e-6) {
|
|
sumReserve += callValueGPU[i].Confidence / delta;
|
|
}
|
|
|
|
#ifdef PRINT_RESULTS
|
|
printf("BS: %f; delta: %E\n", callValueBS[i], delta);
|
|
#endif
|
|
}
|
|
|
|
sumReserve /= OPT_N;
|
|
}
|
|
|
|
if (!use_threads || bqatest) {
|
|
multiSolver(optionSolver, GPU_N);
|
|
|
|
printf("main(): GPU statistics, streamed\n");
|
|
|
|
for (i = 0; i < GPU_N; i++) {
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(
|
|
cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
|
|
printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
|
|
printf("Options : %i\n", optionSolver[i].optionCount);
|
|
printf("Simulation paths: %i\n", optionSolver[i].pathN);
|
|
}
|
|
|
|
time = sdkGetTimerValue(&hTimer[0]);
|
|
printf("\nTotal time (ms.): %f\n", time);
|
|
printf("\tNote: This is elapsed time for all to compute.\n");
|
|
printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
|
|
|
|
printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
|
|
sumDelta = 0;
|
|
sumRef = 0;
|
|
sumReserve = 0;
|
|
|
|
for (i = 0; i < OPT_N; i++) {
|
|
BlackScholesCall(callValueBS[i], optionData[i]);
|
|
delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
|
|
ref = callValueBS[i];
|
|
sumDelta += delta;
|
|
sumRef += fabs(ref);
|
|
|
|
if (delta > 1e-6) {
|
|
sumReserve += callValueGPU[i].Confidence / delta;
|
|
}
|
|
|
|
#ifdef PRINT_RESULTS
|
|
printf("BS: %f; delta: %E\n", callValueBS[i], delta);
|
|
#endif
|
|
}
|
|
|
|
sumReserve /= OPT_N;
|
|
}
|
|
|
|
#ifdef DO_CPU
|
|
printf("main(): running CPU MonteCarlo...\n");
|
|
TOptionValue callValueCPU;
|
|
sumDelta = 0;
|
|
sumRef = 0;
|
|
|
|
for (i = 0; i < OPT_N; i++) {
|
|
MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N);
|
|
delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
|
|
ref = callValueCPU.Expected;
|
|
sumDelta += delta;
|
|
sumRef += fabs(ref);
|
|
printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected);
|
|
printf("Conf: %f | %f\n", callValueCPU.Confidence,
|
|
callValueGPU[i].Confidence);
|
|
}
|
|
|
|
printf("L1 norm: %E\n", sumDelta / sumRef);
|
|
#endif
|
|
|
|
printf("Shutting down...\n");
|
|
|
|
for (int i = 0; i < GPU_N; i++) {
|
|
sdkStartTimer(&hTimer[i]);
|
|
checkCudaErrors(cudaSetDevice(i));
|
|
}
|
|
|
|
delete[] optionSolver;
|
|
delete[] callValueBS;
|
|
delete[] callValueGPU;
|
|
delete[] optionData;
|
|
delete[] threadID;
|
|
delete[] hTimer;
|
|
|
|
printf("Test Summary...\n");
|
|
printf("L1 norm : %E\n", sumDelta / sumRef);
|
|
printf("Average reserve: %f\n", sumReserve);
|
|
printf(
|
|
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
|
"Results may vary when GPU Boost is enabled.\n\n");
|
|
printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");
|
|
exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);
|
|
}
|