cuda-samples/Samples/MonteCarloMultiGPU/MonteCarloMultiGPU.cpp

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This sample evaluates fair call price for a
 * given set of European options using Monte Carlo approach.
 * See supplied whitepaper for more explanations.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>

// includes, project
#include <helper_functions.h>  // Helper functions (utilities, parsing, timing)
#include <helper_cuda.h>  // helper functions (cuda error checking and initialization)
#include <multithreading.h>

#include "MonteCarlo_common.h"

// Global views of main()'s command-line arguments; main() assigns both
// before doing anything else.
int *pArgc = NULL;
char **pArgv = NULL;

// When WIN32 is defined, route strcasecmp() (used by main() to compare the
// --method / --scaling values) to _strcmpi.
#ifdef WIN32
#define strcasecmp _strcmpi
#endif

////////////////////////////////////////////////////////////////////////////////
// Common functions
////////////////////////////////////////////////////////////////////////////////
// Returns a pseudo-random value obtained by linearly blending `low` and
// `high`. The blend factor is rand() / RAND_MAX, so rand() == 0 yields `low`
// and rand() == RAND_MAX yields `high`.
float randFloat(float low, float high) {
  const float fraction =
      static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
  const float lowWeight = 1.0f - fraction;
  return lowWeight * low + fraction * high;
}

/// Utility function to tweak problem size for small GPUs
int adjustProblemSize(int GPU_N, int default_nOptions) {
  int nOptions = default_nOptions;

  // select problem size
  for (int i = 0; i < GPU_N; i++) {
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
    int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
                    deviceProp.multiProcessorCount;

    if (cudaCores <= 32) {
      nOptions = (nOptions < cudaCores / 2 ? nOptions : cudaCores / 2);
    }
  }

  return nOptions;
}

int adjustGridSize(int GPUIndex, int defaultGridSize) {
  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, GPUIndex));
  int maxGridSize = deviceProp.multiProcessorCount * 40;
  return ((defaultGridSize > maxGridSize) ? maxGridSize : defaultGridSize);
}

///////////////////////////////////////////////////////////////////////////////
// CPU reference functions
///////////////////////////////////////////////////////////////////////////////
// CPU Monte Carlo reference; declared here and defined outside this file.
// Within this file it is only called from the DO_CPU section of main(),
// which passes NULL for h_Random.
extern "C" void MonteCarloCPU(TOptionValue &callValue, TOptionData optionData,
                              float *h_Random, int pathN);

// Black-Scholes formula for call options; defined outside this file. main()
// uses the value written to CallResult as the reference when checking the
// GPU Monte Carlo results.
extern "C" void BlackScholesCall(float &CallResult, TOptionData optionData);

////////////////////////////////////////////////////////////////////////////////
// GPU-driving host thread
////////////////////////////////////////////////////////////////////////////////
// Timer array, allocated by main() with one entry per GPU.
// solverThread() times its work with hTimer[plan->device]; multiSolver()
// uses hTimer[0] for the elapsed time across all devices.
StopWatchInterface **hTimer = NULL;

// Host-thread entry point that drives a single GPU (threaded mode).
//
// Binds the calling thread to plan->device, then times the sequence
// initMonteCarloGPU() -> MonteCarloGPU() -> cudaDeviceSynchronize() with
// hTimer[plan->device]. After the timer is stopped the per-GPU resources are
// released via closeMonteCarloGPU().
//
// plan - solver configuration for the GPU this thread owns
static CUT_THREADPROC solverThread(TOptionPlan *plan) {
  // Init GPU
  checkCudaErrors(cudaSetDevice(plan->device));

  // Only the device name is used, for the completion message below.
  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));

  // Start the timer
  sdkStartTimer(&hTimer[plan->device]);

  // Allocate intermediate memory for MC integrator and initialize
  // RNG states
  initMonteCarloGPU(plan);

  // Main computation
  MonteCarloGPU(plan);

  // Wait for the device to finish so the timer covers the whole computation.
  checkCudaErrors(cudaDeviceSynchronize());

  // Stop the timer
  sdkStopTimer(&hTimer[plan->device]);

  // Shut down this GPU
  closeMonteCarloGPU(plan);

  // Checked like every other CUDA call in this function, so a failure
  // reported here is not silently dropped.
  checkCudaErrors(cudaStreamSynchronize(0));

  printf("solverThread() finished - GPU Device %d: %s\n", plan->device,
         deviceProp.name);

  CUT_THREADEND;
}

// Drives all GPUs from the calling host thread (streamed mode).
//
// Creates one stream and one event per plan on that plan's device,
// initializes every GPU, then times (with hTimer[0]) the launch of
// MonteCarloGPU() on each device's stream up to the point where every
// per-device event has completed. Finally releases the per-GPU state, the
// streams/events, and the handle arrays.
//
// plan   - array of nPlans solver configurations, one per GPU
// nPlans - number of entries in plan; nothing is done when it is <= 0
static void multiSolver(TOptionPlan *plan, int nPlans) {
  if (nPlans <= 0) {
    return;
  }

  // allocate and initialize an array of stream handles
  cudaStream_t *streams = (cudaStream_t *)malloc(nPlans * sizeof(cudaStream_t));
  cudaEvent_t *events = (cudaEvent_t *)malloc(nPlans * sizeof(cudaEvent_t));

  if (streams == NULL || events == NULL) {
    fprintf(stderr,
            "multiSolver(): failed to allocate stream/event handle arrays\n");
    free(streams);
    free(events);
    exit(EXIT_FAILURE);
  }

  for (int i = 0; i < nPlans; i++) {
    checkCudaErrors(cudaSetDevice(plan[i].device));
    checkCudaErrors(cudaStreamCreate(&(streams[i])));
    checkCudaErrors(cudaEventCreate(&(events[i])));
  }

  // Init Each GPU
  // In CUDA 4.0 we can call cudaSetDevice multiple times to target each device
  // Set the device desired, then perform initializations on that device

  for (int i = 0; i < nPlans; i++) {
    // set the target device to perform initialization on
    checkCudaErrors(cudaSetDevice(plan[i].device));

    // Allocate intermediate memory for MC integrator
    // and initialize RNG state
    initMonteCarloGPU(&plan[i]);
  }

  // Make sure initialization has finished everywhere before timing starts.
  for (int i = 0; i < nPlans; i++) {
    checkCudaErrors(cudaSetDevice(plan[i].device));
    checkCudaErrors(cudaDeviceSynchronize());
  }

  // Start the timer
  sdkResetTimer(&hTimer[0]);
  sdkStartTimer(&hTimer[0]);

  for (int i = 0; i < nPlans; i++) {
    checkCudaErrors(cudaSetDevice(plan[i].device));

    // Main computations
    MonteCarloGPU(&plan[i], streams[i]);

    // The event marks the end of the work queued on this device's stream.
    checkCudaErrors(cudaEventRecord(events[i], streams[i]));
  }

  for (int i = 0; i < nPlans; i++) {
    checkCudaErrors(cudaSetDevice(plan[i].device));
    checkCudaErrors(cudaEventSynchronize(events[i]));
  }

  // Stop the timer
  sdkStopTimer(&hTimer[0]);

  for (int i = 0; i < nPlans; i++) {
    checkCudaErrors(cudaSetDevice(plan[i].device));
    closeMonteCarloGPU(&plan[i]);
    checkCudaErrors(cudaStreamDestroy(streams[i]));
    checkCudaErrors(cudaEventDestroy(events[i]));
  }

  // The handle arrays are owned by this function.
  free(events);
  free(streams);
}

///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
// Compile-time switch for the CPU Monte Carlo comparison in main(). It is
// defined and immediately undefined, so the section is compiled out; remove
// the #undef line to enable it.
#define DO_CPU
#undef DO_CPU

// Compile-time switch for the per-option "BS: ...; delta: ..." output in
// main(). Disabled the same way; remove the #undef line to enable it.
#define PRINT_RESULTS
#undef PRINT_RESULTS

// Prints the command-line help text to stdout.
//
// The "[default]" markers describe what main() does when an option is
// omitted: without --method it takes the streamed (single host thread) path,
// and without --scaling it uses weak scaling.
void usage() {
  printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n");
  printf("Method=threaded: 1 CPU thread for each GPU\n");
  printf(
      "       streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or "
      "newer) [default]\n");
  printf("Scaling=strong : constant problem size\n");
  printf(
      "        weak   : problem size scales with number of available GPUs "
      "[default]\n");
}

int main(int argc, char **argv) {
  char *multiMethodChoice = NULL;
  char *scalingChoice = NULL;
  bool use_threads = true;
  bool bqatest = false;
  bool strongScaling = false;

  pArgc = &argc;
  pArgv = argv;

  printf("%s Starting...\n\n", argv[0]);

  if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) {
    bqatest = true;
  }

  getCmdLineArgumentString(argc, (const char **)argv, "method",
                           &multiMethodChoice);
  getCmdLineArgumentString(argc, (const char **)argv, "scaling",
                           &scalingChoice);

  if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
      checkCmdLineFlag(argc, (const char **)argv, "help")) {
    usage();
    exit(EXIT_SUCCESS);
  }

  if (multiMethodChoice == NULL) {
    use_threads = false;
  } else {
    if (!strcasecmp(multiMethodChoice, "threaded")) {
      use_threads = true;
    } else {
      use_threads = false;
    }
  }

  if (use_threads == false) {
    printf("Using single CPU thread for multiple GPUs\n");
  }

  if (scalingChoice == NULL) {
    strongScaling = false;
  } else {
    if (!strcasecmp(scalingChoice, "strong")) {
      strongScaling = true;
    } else {
      strongScaling = false;
    }
  }

  // GPU number present in the system
  int GPU_N;
  checkCudaErrors(cudaGetDeviceCount(&GPU_N));
  int nOptions = 8 * 1024;

  nOptions = adjustProblemSize(GPU_N, nOptions);

  // select problem size
  int scale = (strongScaling) ? 1 : GPU_N;
  int OPT_N = nOptions * scale;
  int PATH_N = 262144;

  // initialize the timers
  hTimer = new StopWatchInterface *[GPU_N];

  for (int i = 0; i < GPU_N; i++) {
    sdkCreateTimer(&hTimer[i]);
    sdkResetTimer(&hTimer[i]);
  }

  // Input data array
  TOptionData *optionData = new TOptionData[OPT_N];
  // Final GPU MC results
  TOptionValue *callValueGPU = new TOptionValue[OPT_N];
  //"Theoretical" call values by Black-Scholes formula
  float *callValueBS = new float[OPT_N];
  // Solver config
  TOptionPlan *optionSolver = new TOptionPlan[GPU_N];
  // OS thread ID
  CUTThread *threadID = new CUTThread[GPU_N];

  int gpuBase, gpuIndex;
  int i;

  float time;

  double delta, ref, sumDelta, sumRef, sumReserve;

  printf("MonteCarloMultiGPU\n");
  printf("==================\n");
  printf("Parallelization method  = %s\n",
         use_threads ? "threaded" : "streamed");
  printf("Problem scaling         = %s\n", strongScaling ? "strong" : "weak");
  printf("Number of GPUs          = %d\n", GPU_N);
  printf("Total number of options = %d\n", OPT_N);
  printf("Number of paths         = %d\n", PATH_N);

  printf("main(): generating input data...\n");
  srand(123);

  for (i = 0; i < OPT_N; i++) {
    optionData[i].S = randFloat(5.0f, 50.0f);
    optionData[i].X = randFloat(10.0f, 25.0f);
    optionData[i].T = randFloat(1.0f, 5.0f);
    optionData[i].R = 0.06f;
    optionData[i].V = 0.10f;
    callValueGPU[i].Expected = -1.0f;
    callValueGPU[i].Confidence = -1.0f;
  }

  printf("main(): starting %i host threads...\n", GPU_N);

  // Get option count for each GPU
  for (i = 0; i < GPU_N; i++) {
    optionSolver[i].optionCount = OPT_N / GPU_N;
  }

  // Take into account cases with "odd" option counts
  for (i = 0; i < (OPT_N % GPU_N); i++) {
    optionSolver[i].optionCount++;
  }

  // Assign GPU option ranges
  gpuBase = 0;

  for (i = 0; i < GPU_N; i++) {
    optionSolver[i].device = i;
    optionSolver[i].optionData = optionData + gpuBase;
    optionSolver[i].callValue = callValueGPU + gpuBase;
    optionSolver[i].pathN = PATH_N;
    optionSolver[i].gridSize =
        adjustGridSize(optionSolver[i].device, optionSolver[i].optionCount);
    gpuBase += optionSolver[i].optionCount;
  }

  if (use_threads || bqatest) {
    // Start CPU thread for each GPU
    for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) {
      threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread,
                                          &optionSolver[gpuIndex]);
    }

    printf("main(): waiting for GPU results...\n");
    cutWaitForThreads(threadID, GPU_N);

    printf("main(): GPU statistics, threaded\n");

    for (i = 0; i < GPU_N; i++) {
      cudaDeviceProp deviceProp;
      checkCudaErrors(
          cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
      printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
      printf("Options         : %i\n", optionSolver[i].optionCount);
      printf("Simulation paths: %i\n", optionSolver[i].pathN);
      time = sdkGetTimerValue(&hTimer[i]);
      printf("Total time (ms.): %f\n", time);
      printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
    }

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
    sumDelta = 0;
    sumRef = 0;
    sumReserve = 0;

    for (i = 0; i < OPT_N; i++) {
      BlackScholesCall(callValueBS[i], optionData[i]);
      delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
      ref = callValueBS[i];
      sumDelta += delta;
      sumRef += fabs(ref);

      if (delta > 1e-6) {
        sumReserve += callValueGPU[i].Confidence / delta;
      }

#ifdef PRINT_RESULTS
      printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
    }

    sumReserve /= OPT_N;
  }

  if (!use_threads || bqatest) {
    multiSolver(optionSolver, GPU_N);

    printf("main(): GPU statistics, streamed\n");

    for (i = 0; i < GPU_N; i++) {
      cudaDeviceProp deviceProp;
      checkCudaErrors(
          cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
      printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
      printf("Options         : %i\n", optionSolver[i].optionCount);
      printf("Simulation paths: %i\n", optionSolver[i].pathN);
    }

    time = sdkGetTimerValue(&hTimer[0]);
    printf("\nTotal time (ms.): %f\n", time);
    printf("\tNote: This is elapsed time for all to compute.\n");
    printf("Options per sec.: %f\n", OPT_N / (time * 0.001));

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
    sumDelta = 0;
    sumRef = 0;
    sumReserve = 0;

    for (i = 0; i < OPT_N; i++) {
      BlackScholesCall(callValueBS[i], optionData[i]);
      delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
      ref = callValueBS[i];
      sumDelta += delta;
      sumRef += fabs(ref);

      if (delta > 1e-6) {
        sumReserve += callValueGPU[i].Confidence / delta;
      }

#ifdef PRINT_RESULTS
      printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
    }

    sumReserve /= OPT_N;
  }

#ifdef DO_CPU
  printf("main(): running CPU MonteCarlo...\n");
  TOptionValue callValueCPU;
  sumDelta = 0;
  sumRef = 0;

  for (i = 0; i < OPT_N; i++) {
    MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N);
    delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
    ref = callValueCPU.Expected;
    sumDelta += delta;
    sumRef += fabs(ref);
    printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected);
    printf("Conf: %f | %f\n", callValueCPU.Confidence,
           callValueGPU[i].Confidence);
  }

  printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

  printf("Shutting down...\n");

  for (int i = 0; i < GPU_N; i++) {
    sdkStartTimer(&hTimer[i]);
    checkCudaErrors(cudaSetDevice(i));
  }

  delete[] optionSolver;
  delete[] callValueBS;
  delete[] callValueGPU;
  delete[] optionData;
  delete[] threadID;
  delete[] hTimer;

  printf("Test Summary...\n");
  printf("L1 norm        : %E\n", sumDelta / sumRef);
  printf("Average reserve: %f\n", sumReserve);
  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n\n");
  printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");
  exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`* This sample evaluates fair call price for a`
			`* given set of European options using Monte Carlo approach.`
			`* See supplied whitepaper for more explanations.`
			`*/`

			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <string.h>`
			`#include <math.h>`
			`#include <cuda_runtime.h>`

			`// includes, project`
			`#include <helper_functions.h> // Helper functions (utilities, parsing, timing)`
			`#include <helper_cuda.h> // helper functions (cuda error checking and initialization)`
			`#include <multithreading.h>`

			`#include "MonteCarlo_common.h"`

			`int *pArgc = NULL;`
			`char **pArgv = NULL;`

			`#ifdef WIN32`
			`#define strcasecmp _strcmpi`
			`#endif`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Common functions`
			`////////////////////////////////////////////////////////////////////////////////`
			`float randFloat(float low, float high) {`
			`float t = (float)rand() / (float)RAND_MAX;`
			`return (1.0f - t) * low + t * high;`
			`}`

			`/// Utility function to tweak problem size for small GPUs`
			`int adjustProblemSize(int GPU_N, int default_nOptions) {`
			`int nOptions = default_nOptions;`

			`// select problem size`
			`for (int i = 0; i < GPU_N; i++) {`
			`cudaDeviceProp deviceProp;`
			`checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));`
			`int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *`
			`deviceProp.multiProcessorCount;`

			`if (cudaCores <= 32) {`
			`nOptions = (nOptions < cudaCores / 2 ? nOptions : cudaCores / 2);`
			`}`
			`}`

			`return nOptions;`
			`}`

			`int adjustGridSize(int GPUIndex, int defaultGridSize) {`
			`cudaDeviceProp deviceProp;`
			`checkCudaErrors(cudaGetDeviceProperties(&deviceProp, GPUIndex));`
			`int maxGridSize = deviceProp.multiProcessorCount * 40;`
			`return ((defaultGridSize > maxGridSize) ? maxGridSize : defaultGridSize);`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`// CPU reference functions`
			`///////////////////////////////////////////////////////////////////////////////`
			`extern "C" void MonteCarloCPU(TOptionValue &callValue, TOptionData optionData,`
			`float *h_Random, int pathN);`

			`// Black-Scholes formula for call options`
			`extern "C" void BlackScholesCall(float &CallResult, TOptionData optionData);`

			`////////////////////////////////////////////////////////////////////////////////`
			`// GPU-driving host thread`
			`////////////////////////////////////////////////////////////////////////////////`
			`// Timer`
			`StopWatchInterface **hTimer = NULL;`

			`static CUT_THREADPROC solverThread(TOptionPlan *plan) {`
			`// Init GPU`
			`checkCudaErrors(cudaSetDevice(plan->device));`

			`cudaDeviceProp deviceProp;`
			`checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));`

			`// Start the timer`
			`sdkStartTimer(&hTimer[plan->device]);`

			`// Allocate intermediate memory for MC integrator and initialize`
			`// RNG states`
			`initMonteCarloGPU(plan);`

			`// Main computation`
			`MonteCarloGPU(plan);`

			`checkCudaErrors(cudaDeviceSynchronize());`

			`// Stop the timer`
			`sdkStopTimer(&hTimer[plan->device]);`

			`// Shut down this GPU`
			`closeMonteCarloGPU(plan);`

			`cudaStreamSynchronize(0);`

			`printf("solverThread() finished - GPU Device %d: %s\n", plan->device,`
			`deviceProp.name);`

			`CUT_THREADEND;`
			`}`

			`static void multiSolver(TOptionPlan *plan, int nPlans) {`
			`// allocate and initialize an array of stream handles`
			`cudaStream_t streams = (cudaStream_t )malloc(nPlans * sizeof(cudaStream_t));`
			`cudaEvent_t events = (cudaEvent_t )malloc(nPlans * sizeof(cudaEvent_t));`

			`for (int i = 0; i < nPlans; i++) {`
			`checkCudaErrors(cudaSetDevice(plan[i].device));`
			`checkCudaErrors(cudaStreamCreate(&(streams[i])));`
			`checkCudaErrors(cudaEventCreate(&(events[i])));`
			`}`

			`// Init Each GPU`
			`// In CUDA 4.0 we can call cudaSetDevice multiple times to target each device`
			`// Set the device desired, then perform initializations on that device`

			`for (int i = 0; i < nPlans; i++) {`
			`// set the target device to perform initialization on`
			`checkCudaErrors(cudaSetDevice(plan[i].device));`

			`cudaDeviceProp deviceProp;`
			`checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan[i].device));`

			`// Allocate intermediate memory for MC integrator`
			`// and initialize RNG state`
			`initMonteCarloGPU(&plan[i]);`
			`}`

			`for (int i = 0; i < nPlans; i++) {`
			`checkCudaErrors(cudaSetDevice(plan[i].device));`
			`checkCudaErrors(cudaDeviceSynchronize());`
			`}`

			`// Start the timer`
			`sdkResetTimer(&hTimer[0]);`
			`sdkStartTimer(&hTimer[0]);`

			`for (int i = 0; i < nPlans; i++) {`
			`checkCudaErrors(cudaSetDevice(plan[i].device));`

			`// Main computations`
			`MonteCarloGPU(&plan[i], streams[i]);`

			`checkCudaErrors(cudaEventRecord(events[i], streams[i]));`
			`}`

			`for (int i = 0; i < nPlans; i++) {`
			`checkCudaErrors(cudaSetDevice(plan[i].device));`
			`cudaEventSynchronize(events[i]);`
			`}`

			`// Stop the timer`
			`sdkStopTimer(&hTimer[0]);`

			`for (int i = 0; i < nPlans; i++) {`
			`checkCudaErrors(cudaSetDevice(plan[i].device));`
			`closeMonteCarloGPU(&plan[i]);`
			`checkCudaErrors(cudaStreamDestroy(streams[i]));`
			`checkCudaErrors(cudaEventDestroy(events[i]));`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`// Main program`
			`///////////////////////////////////////////////////////////////////////////////`
			`#define DO_CPU`
			`#undef DO_CPU`

			`#define PRINT_RESULTS`
			`#undef PRINT_RESULTS`

			`void usage() {`
			`printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n");`
			`printf("Method=threaded: 1 CPU thread for each GPU [default]\n");`
			`printf(`
			`" streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or "`
			`"newer)\n");`
			`printf("Scaling=strong : constant problem size\n");`
			`printf(`
			`" weak : problem size scales with number of available GPUs "`
			`"[default]\n");`
			`}`

			`int main(int argc, char **argv) {`
			`char *multiMethodChoice = NULL;`
			`char *scalingChoice = NULL;`
			`bool use_threads = true;`
			`bool bqatest = false;`
			`bool strongScaling = false;`

			`pArgc = &argc;`
			`pArgv = argv;`

			`printf("%s Starting...\n\n", argv[0]);`

			`if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) {`
			`bqatest = true;`
			`}`

			`getCmdLineArgumentString(argc, (const char **)argv, "method",`
			`&multiMethodChoice);`
			`getCmdLineArgumentString(argc, (const char **)argv, "scaling",`
			`&scalingChoice);`

			`if (checkCmdLineFlag(argc, (const char **)argv, "h") \|\|`
			`checkCmdLineFlag(argc, (const char **)argv, "help")) {`
			`usage();`
			`exit(EXIT_SUCCESS);`
			`}`

			`if (multiMethodChoice == NULL) {`
			`use_threads = false;`
			`} else {`
			`if (!strcasecmp(multiMethodChoice, "threaded")) {`
			`use_threads = true;`
			`} else {`
			`use_threads = false;`
			`}`
			`}`

			`if (use_threads == false) {`
			`printf("Using single CPU thread for multiple GPUs\n");`
			`}`

			`if (scalingChoice == NULL) {`
			`strongScaling = false;`
			`} else {`
			`if (!strcasecmp(scalingChoice, "strong")) {`
			`strongScaling = true;`
			`} else {`
			`strongScaling = false;`
			`}`
			`}`

			`// GPU number present in the system`
			`int GPU_N;`
			`checkCudaErrors(cudaGetDeviceCount(&GPU_N));`
			`int nOptions = 8 * 1024;`

			`nOptions = adjustProblemSize(GPU_N, nOptions);`

			`// select problem size`
			`int scale = (strongScaling) ? 1 : GPU_N;`
			`int OPT_N = nOptions * scale;`
			`int PATH_N = 262144;`

			`// initialize the timers`
			`hTimer = new StopWatchInterface *[GPU_N];`

			`for (int i = 0; i < GPU_N; i++) {`
			`sdkCreateTimer(&hTimer[i]);`
			`sdkResetTimer(&hTimer[i]);`
			`}`

			`// Input data array`
			`TOptionData *optionData = new TOptionData[OPT_N];`
			`// Final GPU MC results`
			`TOptionValue *callValueGPU = new TOptionValue[OPT_N];`
			`//"Theoretical" call values by Black-Scholes formula`
			`float *callValueBS = new float[OPT_N];`
			`// Solver config`
			`TOptionPlan *optionSolver = new TOptionPlan[GPU_N];`
			`// OS thread ID`
			`CUTThread *threadID = new CUTThread[GPU_N];`

			`int gpuBase, gpuIndex;`
			`int i;`

			`float time;`

			`double delta, ref, sumDelta, sumRef, sumReserve;`

			`printf("MonteCarloMultiGPU\n");`
			`printf("==================\n");`
			`printf("Parallelization method = %s\n",`
			`use_threads ? "threaded" : "streamed");`
			`printf("Problem scaling = %s\n", strongScaling ? "strong" : "weak");`
			`printf("Number of GPUs = %d\n", GPU_N);`
			`printf("Total number of options = %d\n", OPT_N);`
			`printf("Number of paths = %d\n", PATH_N);`

			`printf("main(): generating input data...\n");`
			`srand(123);`

			`for (i = 0; i < OPT_N; i++) {`
			`optionData[i].S = randFloat(5.0f, 50.0f);`
			`optionData[i].X = randFloat(10.0f, 25.0f);`
			`optionData[i].T = randFloat(1.0f, 5.0f);`
			`optionData[i].R = 0.06f;`
			`optionData[i].V = 0.10f;`
			`callValueGPU[i].Expected = -1.0f;`
			`callValueGPU[i].Confidence = -1.0f;`
			`}`

			`printf("main(): starting %i host threads...\n", GPU_N);`

			`// Get option count for each GPU`
			`for (i = 0; i < GPU_N; i++) {`
			`optionSolver[i].optionCount = OPT_N / GPU_N;`
			`}`

			`// Take into account cases with "odd" option counts`
			`for (i = 0; i < (OPT_N % GPU_N); i++) {`
			`optionSolver[i].optionCount++;`
			`}`

			`// Assign GPU option ranges`
			`gpuBase = 0;`

			`for (i = 0; i < GPU_N; i++) {`
			`optionSolver[i].device = i;`
			`optionSolver[i].optionData = optionData + gpuBase;`
			`optionSolver[i].callValue = callValueGPU + gpuBase;`
			`optionSolver[i].pathN = PATH_N;`
			`optionSolver[i].gridSize =`
			`adjustGridSize(optionSolver[i].device, optionSolver[i].optionCount);`
			`gpuBase += optionSolver[i].optionCount;`
			`}`

			`if (use_threads \|\| bqatest) {`
			`// Start CPU thread for each GPU`
			`for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) {`
			`threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread,`
			`&optionSolver[gpuIndex]);`
			`}`

			`printf("main(): waiting for GPU results...\n");`
			`cutWaitForThreads(threadID, GPU_N);`

			`printf("main(): GPU statistics, threaded\n");`

			`for (i = 0; i < GPU_N; i++) {`
			`cudaDeviceProp deviceProp;`
			`checkCudaErrors(`
			`cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));`
			`printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);`
			`printf("Options : %i\n", optionSolver[i].optionCount);`
			`printf("Simulation paths: %i\n", optionSolver[i].pathN);`
			`time = sdkGetTimerValue(&hTimer[i]);`
			`printf("Total time (ms.): %f\n", time);`
			`printf("Options per sec.: %f\n", OPT_N / (time * 0.001));`
			`}`

			`printf("main(): comparing Monte Carlo and Black-Scholes results...\n");`
			`sumDelta = 0;`
			`sumRef = 0;`
			`sumReserve = 0;`

			`for (i = 0; i < OPT_N; i++) {`
			`BlackScholesCall(callValueBS[i], optionData[i]);`
			`delta = fabs(callValueBS[i] - callValueGPU[i].Expected);`
			`ref = callValueBS[i];`
			`sumDelta += delta;`
			`sumRef += fabs(ref);`

			`if (delta > 1e-6) {`
			`sumReserve += callValueGPU[i].Confidence / delta;`
			`}`

			`#ifdef PRINT_RESULTS`
			`printf("BS: %f; delta: %E\n", callValueBS[i], delta);`
			`#endif`
			`}`

			`sumReserve /= OPT_N;`
			`}`

			`if (!use_threads \|\| bqatest) {`
			`multiSolver(optionSolver, GPU_N);`

			`printf("main(): GPU statistics, streamed\n");`

			`for (i = 0; i < GPU_N; i++) {`
			`cudaDeviceProp deviceProp;`
			`checkCudaErrors(`
			`cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));`
			`printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);`
			`printf("Options : %i\n", optionSolver[i].optionCount);`
			`printf("Simulation paths: %i\n", optionSolver[i].pathN);`
			`}`

			`time = sdkGetTimerValue(&hTimer[0]);`
			`printf("\nTotal time (ms.): %f\n", time);`
			`printf("\tNote: This is elapsed time for all to compute.\n");`
			`printf("Options per sec.: %f\n", OPT_N / (time * 0.001));`

			`printf("main(): comparing Monte Carlo and Black-Scholes results...\n");`
			`sumDelta = 0;`
			`sumRef = 0;`
			`sumReserve = 0;`

			`for (i = 0; i < OPT_N; i++) {`
			`BlackScholesCall(callValueBS[i], optionData[i]);`
			`delta = fabs(callValueBS[i] - callValueGPU[i].Expected);`
			`ref = callValueBS[i];`
			`sumDelta += delta;`
			`sumRef += fabs(ref);`

			`if (delta > 1e-6) {`
			`sumReserve += callValueGPU[i].Confidence / delta;`
			`}`

			`#ifdef PRINT_RESULTS`
			`printf("BS: %f; delta: %E\n", callValueBS[i], delta);`
			`#endif`
			`}`

			`sumReserve /= OPT_N;`
			`}`

			`#ifdef DO_CPU`
			`printf("main(): running CPU MonteCarlo...\n");`
			`TOptionValue callValueCPU;`
			`sumDelta = 0;`
			`sumRef = 0;`

			`for (i = 0; i < OPT_N; i++) {`
			`MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N);`
			`delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected);`
			`ref = callValueCPU.Expected;`
			`sumDelta += delta;`
			`sumRef += fabs(ref);`
			`printf("Exp : %f \| %f\t", callValueCPU.Expected, callValueGPU[i].Expected);`
			`printf("Conf: %f \| %f\n", callValueCPU.Confidence,`
			`callValueGPU[i].Confidence);`
			`}`

			`printf("L1 norm: %E\n", sumDelta / sumRef);`
			`#endif`

			`printf("Shutting down...\n");`

			`for (int i = 0; i < GPU_N; i++) {`
			`sdkStartTimer(&hTimer[i]);`
			`checkCudaErrors(cudaSetDevice(i));`
			`}`

			`delete[] optionSolver;`
			`delete[] callValueBS;`
			`delete[] callValueGPU;`
			`delete[] optionData;`
			`delete[] threadID;`
			`delete[] hTimer;`

			`printf("Test Summary...\n");`
			`printf("L1 norm : %E\n", sumDelta / sumRef);`
			`printf("Average reserve: %f\n", sumReserve);`
			`printf(`
			`"\nNOTE: The CUDA Samples are not meant for performance measurements. "`
			`"Results may vary when GPU Boost is enabled.\n\n");`
			`printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");`
			`exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);`
			`}`