cuda-samples/Samples/5_Domain_Specific/SobolQRNG/sobol.cpp
2022-01-13 11:35:24 +05:30

306 lines
10 KiB
C++

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Portions Copyright (c) 2009 Mike Giles, Oxford University. All rights
* reserved.
* Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe. All rights
* reserved.
*
* Sobol Quasi-random Number Generator example
*
* Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
* http://people.maths.ox.ac.uk/~gilesm/
*
* and C code developed by Stephen Joe, University of Waikato, New Zealand
* and Frances Kuo, University of New South Wales, Australia
* http://web.maths.unsw.edu.au/~fkuo/sobol/
*
* For theoretical background see:
*
* P. Bratley and B.L. Fox.
* Implementing Sobol's quasirandom sequence generator
* http://portal.acm.org/citation.cfm?id=42288
* ACM Trans. on Math. Software, 14(1):88-100, 1988
*
* S. Joe and F. Kuo.
* Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
* http://portal.acm.org/citation.cfm?id=641879
* ACM Trans. on Math. Software, 29(1):49-57, 2003
*/
#include <iostream>
#include <cuda_runtime.h> // CUDA Runtime Functions
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper functions
#include <stdexcept>
#include <math.h>
#include "sobol.h"
#include "sobol_gold.h"
#include "sobol_gpu.h"
#define L1ERROR_TOLERANCE (1e-6)
const char *sSDKsample = "Sobol Quasi-Random Number Generator";
void printHelp(int argc, char *argv[]) {
if (argc > 0) {
std::cout << "\nUsage: " << argv[0] << " <options>\n\n";
} else {
std::cout << "\nUsage: <program name> <options>\n\n";
}
std::cout << "\t--vectors=M specify number of vectors (required)\n";
std::cout << "\t The generator will output M vectors\n\n";
std::cout << "\t--dimensions=N specify number of dimensions (required)\n";
std::cout << "\t Each vector will consist of N components\n\n";
std::cout << std::endl;
}
int main(int argc, char *argv[]) {
bool ok = true;
// We will generate n_vectors vectors of n_dimensions numbers
int n_vectors = 100000;
int n_dimensions = 100;
printf("%s Starting...\n\n", sSDKsample);
// Print help if requested
if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
printHelp(argc, argv);
return 0;
}
if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) {
// For QA testing set a default number of vectors and dimensions
n_vectors = 100000;
n_dimensions = 100;
} else {
// Parse the command line to determine the required number of vectors
if (checkCmdLineFlag(argc, (const char **)argv, "vectors")) {
n_vectors = getCmdLineArgumentInt(argc, (const char **)argv, "vectors");
if (n_vectors < 1) {
std::cerr << "Illegal argument: number of vectors must be positive "
"(--vectors=N)"
<< std::endl;
ok = false;
}
}
std::cout << "> number of vectors = " << n_vectors << std::endl;
// Parse the command line to determine the number of dimensions in each
// vector
if (checkCmdLineFlag(argc, (const char **)argv, "dimensions")) {
n_dimensions =
getCmdLineArgumentInt(argc, (const char **)argv, "dimensions");
if (n_dimensions < 1) {
std::cerr << "Illegal argument: number of dimensions must be positive "
"(--dimensions=N)"
<< std::endl;
ok = false;
}
}
std::cout << "> number of dimensions = " << n_dimensions << std::endl;
}
// If any of the command line checks failed, exit
if (!ok) {
return -1;
}
// Use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
findCudaDevice(argc, (const char **)argv);
// Create a timer to measure performance
StopWatchInterface *hTimer = NULL;
double time;
sdkCreateTimer(&hTimer);
// Allocate memory for the arrays
std::cout << "Allocating CPU memory..." << std::endl;
unsigned int *h_directions = 0;
float *h_outputCPU = 0;
float *h_outputGPU = 0;
try {
h_directions = new unsigned int[n_dimensions * n_directions];
h_outputCPU = new float[n_vectors * n_dimensions];
h_outputGPU = new float[n_vectors * n_dimensions];
} catch (std::exception e) {
std::cerr << "Caught exception: " << e.what() << std::endl;
std::cerr << "Unable to allocate CPU memory (try running with fewer "
"vectors/dimensions)"
<< std::endl;
exit(EXIT_FAILURE);
}
std::cout << "Allocating GPU memory..." << std::endl;
unsigned int *d_directions;
float *d_output;
try {
cudaError_t cudaResult;
cudaResult = cudaMalloc((void **)&d_directions,
n_dimensions * n_directions * sizeof(unsigned int));
if (cudaResult != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(cudaResult));
}
cudaResult = cudaMalloc((void **)&d_output,
n_vectors * n_dimensions * sizeof(float));
if (cudaResult != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(cudaResult));
}
} catch (std::runtime_error e) {
std::cerr << "Caught exception: " << e.what() << std::endl;
std::cerr << "Unable to allocate GPU memory (try running with fewer "
"vectors/dimensions)"
<< std::endl;
exit(EXIT_FAILURE);
}
// Initialize the direction numbers (done on the host)
std::cout << "Initializing direction numbers..." << std::endl;
initSobolDirectionVectors(n_dimensions, h_directions);
// Copy the direction numbers to the device
std::cout << "Copying direction numbers to device..." << std::endl;
checkCudaErrors(cudaMemcpy(d_directions, h_directions,
n_dimensions * n_directions * sizeof(unsigned int),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaDeviceSynchronize());
// Execute the QRNG on the device
std::cout << "Executing QRNG on GPU..." << std::endl;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
sobolGPU(n_vectors, n_dimensions, d_directions, d_output);
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
time = sdkGetTimerValue(&hTimer);
if (time < 1e-6) {
std::cout << "Gsamples/s: problem size too small to measure, try "
"increasing number of vectors or dimensions"
<< std::endl;
} else {
std::cout << "Gsamples/s: "
<< (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3)
<< std::endl;
}
std::cout << "Reading results from GPU..." << std::endl;
checkCudaErrors(cudaMemcpy(h_outputGPU, d_output,
n_vectors * n_dimensions * sizeof(float),
cudaMemcpyDeviceToHost));
std::cout << std::endl;
// Execute the QRNG on the host
std::cout << "Executing QRNG on CPU..." << std::endl;
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
sobolCPU(n_vectors, n_dimensions, h_directions, h_outputCPU);
sdkStopTimer(&hTimer);
time = sdkGetTimerValue(&hTimer);
if (time < 1e-6) {
std::cout << "Gsamples/s: problem size too small to measure, try "
"increasing number of vectors or dimensions"
<< std::endl;
} else {
std::cout << "Gsamples/s: "
<< (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3)
<< std::endl;
}
// Check the results
std::cout << "Checking results..." << std::endl;
float l1norm_diff = 0.0F;
float l1norm_ref = 0.0F;
float l1error;
// Special case if n_vectors is 1, when the vector should be exactly 0
if (n_vectors == 1) {
for (int d = 0, v = 0; d < n_dimensions; d++) {
float ref = h_outputCPU[d * n_vectors + v];
l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref);
l1norm_ref += fabs(ref);
}
// Output the L1-Error
l1error = l1norm_diff;
if (l1norm_ref != 0) {
std::cerr << "Error: L1-Norm of the reference is not zero (for single "
"vector), golden generator appears broken\n";
} else {
std::cout << "L1-Error: " << l1error << std::endl;
}
} else {
for (int d = 0; d < n_dimensions; d++) {
for (int v = 0; v < n_vectors; v++) {
float ref = h_outputCPU[d * n_vectors + v];
l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref);
l1norm_ref += fabs(ref);
}
}
// Output the L1-Error
l1error = l1norm_diff / l1norm_ref;
if (l1norm_ref == 0) {
std::cerr << "Error: L1-Norm of the reference is zero, golden generator "
"appears broken\n";
} else {
std::cout << "L1-Error: " << l1error << std::endl;
}
}
// Cleanup and terminate
std::cout << "Shutting down..." << std::endl;
sdkDeleteTimer(&hTimer);
delete h_directions;
delete h_outputCPU;
delete h_outputGPU;
checkCudaErrors(cudaFree(d_directions));
checkCudaErrors(cudaFree(d_output));
// Check pass/fail using L1 error
exit(l1error < L1ERROR_TOLERANCE ? EXIT_SUCCESS : EXIT_FAILURE);
}