mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-28 13:59:19 +08:00
306 lines
10 KiB
C++
306 lines
10 KiB
C++
|
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
* * Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* * Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||
|
* contributors may be used to endorse or promote products derived
|
||
|
* from this software without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* Portions Copyright (c) 2009 Mike Giles, Oxford University. All rights
|
||
|
* reserved.
|
||
|
* Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe. All rights
|
||
|
* reserved.
|
||
|
*
|
||
|
* Sobol Quasi-random Number Generator example
|
||
|
*
|
||
|
* Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
|
||
|
* http://people.maths.ox.ac.uk/~gilesm/
|
||
|
*
|
||
|
* and C code developed by Stephen Joe, University of Waikato, New Zealand
|
||
|
* and Frances Kuo, University of New South Wales, Australia
|
||
|
* http://web.maths.unsw.edu.au/~fkuo/sobol/
|
||
|
*
|
||
|
* For theoretical background see:
|
||
|
*
|
||
|
* P. Bratley and B.L. Fox.
|
||
|
* Implementing Sobol's quasirandom sequence generator
|
||
|
* http://portal.acm.org/citation.cfm?id=42288
|
||
|
* ACM Trans. on Math. Software, 14(1):88-100, 1988
|
||
|
*
|
||
|
* S. Joe and F. Kuo.
|
||
|
* Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
|
||
|
* http://portal.acm.org/citation.cfm?id=641879
|
||
|
* ACM Trans. on Math. Software, 29(1):49-57, 2003
|
||
|
*/
|
||
|
|
||
|
#include <iostream>
|
||
|
|
||
|
#include <cuda_runtime.h> // CUDA Runtime Functions
|
||
|
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
|
||
|
#include <helper_functions.h> // helper functions
|
||
|
|
||
|
#include <stdexcept>
|
||
|
#include <math.h>
|
||
|
|
||
|
#include "sobol.h"
|
||
|
#include "sobol_gold.h"
|
||
|
#include "sobol_gpu.h"
|
||
|
|
||
|
// Pass/fail threshold for the GPU-vs-CPU comparison: main() exits with
// EXIT_SUCCESS only when the computed L1 error is strictly below this value.
#define L1ERROR_TOLERANCE (1e-6)
|
||
|
|
||
|
// Human-readable sample name; main() prints it in the "Starting..." banner.
const char *sSDKsample = "Sobol Quasi-Random Number Generator";
|
||
|
|
||
|
// Write the command-line usage summary to standard output.
//
// argc/argv are the program arguments as received by main(); argv[0] is used
// as the program name when argc is positive, otherwise a generic placeholder
// is printed instead.
void printHelp(int argc, char *argv[]) {
  const bool haveProgramName = (argc > 0);

  if (!haveProgramName) {
    std::cout << "\nUsage: <program name> <options>\n\n";
  } else {
    std::cout << "\nUsage: " << argv[0] << " <options>\n\n";
  }

  // Option descriptions followed by a blank line; std::endl also flushes.
  std::cout << "\t--vectors=M specify number of vectors (required)\n"
            << "\t The generator will output M vectors\n\n"
            << "\t--dimensions=N specify number of dimensions (required)\n"
            << "\t Each vector will consist of N components\n\n"
            << std::endl;
}
|
||
|
|
||
|
int main(int argc, char *argv[]) {
|
||
|
bool ok = true;
|
||
|
|
||
|
// We will generate n_vectors vectors of n_dimensions numbers
|
||
|
int n_vectors = 100000;
|
||
|
int n_dimensions = 100;
|
||
|
|
||
|
printf("%s Starting...\n\n", sSDKsample);
|
||
|
|
||
|
// Print help if requested
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
|
||
|
printHelp(argc, argv);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) {
|
||
|
// For QA testing set a default number of vectors and dimensions
|
||
|
n_vectors = 100000;
|
||
|
n_dimensions = 100;
|
||
|
} else {
|
||
|
// Parse the command line to determine the required number of vectors
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "vectors")) {
|
||
|
n_vectors = getCmdLineArgumentInt(argc, (const char **)argv, "vectors");
|
||
|
|
||
|
if (n_vectors < 1) {
|
||
|
std::cerr << "Illegal argument: number of vectors must be positive "
|
||
|
"(--vectors=N)"
|
||
|
<< std::endl;
|
||
|
ok = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
std::cout << "> number of vectors = " << n_vectors << std::endl;
|
||
|
|
||
|
// Parse the command line to determine the number of dimensions in each
|
||
|
// vector
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "dimensions")) {
|
||
|
n_dimensions =
|
||
|
getCmdLineArgumentInt(argc, (const char **)argv, "dimensions");
|
||
|
|
||
|
if (n_dimensions < 1) {
|
||
|
std::cerr << "Illegal argument: number of dimensions must be positive "
|
||
|
"(--dimensions=N)"
|
||
|
<< std::endl;
|
||
|
ok = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
std::cout << "> number of dimensions = " << n_dimensions << std::endl;
|
||
|
}
|
||
|
|
||
|
// If any of the command line checks failed, exit
|
||
|
if (!ok) {
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
// Use command-line specified CUDA device, otherwise use device with highest
|
||
|
// Gflops/s
|
||
|
findCudaDevice(argc, (const char **)argv);
|
||
|
|
||
|
// Create a timer to measure performance
|
||
|
StopWatchInterface *hTimer = NULL;
|
||
|
double time;
|
||
|
sdkCreateTimer(&hTimer);
|
||
|
|
||
|
// Allocate memory for the arrays
|
||
|
std::cout << "Allocating CPU memory..." << std::endl;
|
||
|
unsigned int *h_directions = 0;
|
||
|
float *h_outputCPU = 0;
|
||
|
float *h_outputGPU = 0;
|
||
|
|
||
|
try {
|
||
|
h_directions = new unsigned int[n_dimensions * n_directions];
|
||
|
h_outputCPU = new float[n_vectors * n_dimensions];
|
||
|
h_outputGPU = new float[n_vectors * n_dimensions];
|
||
|
} catch (std::exception e) {
|
||
|
std::cerr << "Caught exception: " << e.what() << std::endl;
|
||
|
std::cerr << "Unable to allocate CPU memory (try running with fewer "
|
||
|
"vectors/dimensions)"
|
||
|
<< std::endl;
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
std::cout << "Allocating GPU memory..." << std::endl;
|
||
|
unsigned int *d_directions;
|
||
|
float *d_output;
|
||
|
|
||
|
try {
|
||
|
cudaError_t cudaResult;
|
||
|
cudaResult = cudaMalloc((void **)&d_directions,
|
||
|
n_dimensions * n_directions * sizeof(unsigned int));
|
||
|
|
||
|
if (cudaResult != cudaSuccess) {
|
||
|
throw std::runtime_error(cudaGetErrorString(cudaResult));
|
||
|
}
|
||
|
|
||
|
cudaResult = cudaMalloc((void **)&d_output,
|
||
|
n_vectors * n_dimensions * sizeof(float));
|
||
|
|
||
|
if (cudaResult != cudaSuccess) {
|
||
|
throw std::runtime_error(cudaGetErrorString(cudaResult));
|
||
|
}
|
||
|
} catch (std::runtime_error e) {
|
||
|
std::cerr << "Caught exception: " << e.what() << std::endl;
|
||
|
std::cerr << "Unable to allocate GPU memory (try running with fewer "
|
||
|
"vectors/dimensions)"
|
||
|
<< std::endl;
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
// Initialize the direction numbers (done on the host)
|
||
|
std::cout << "Initializing direction numbers..." << std::endl;
|
||
|
initSobolDirectionVectors(n_dimensions, h_directions);
|
||
|
|
||
|
// Copy the direction numbers to the device
|
||
|
std::cout << "Copying direction numbers to device..." << std::endl;
|
||
|
checkCudaErrors(cudaMemcpy(d_directions, h_directions,
|
||
|
n_dimensions * n_directions * sizeof(unsigned int),
|
||
|
cudaMemcpyHostToDevice));
|
||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||
|
|
||
|
// Execute the QRNG on the device
|
||
|
std::cout << "Executing QRNG on GPU..." << std::endl;
|
||
|
sdkResetTimer(&hTimer);
|
||
|
sdkStartTimer(&hTimer);
|
||
|
sobolGPU(n_vectors, n_dimensions, d_directions, d_output);
|
||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||
|
sdkStopTimer(&hTimer);
|
||
|
time = sdkGetTimerValue(&hTimer);
|
||
|
|
||
|
if (time < 1e-6) {
|
||
|
std::cout << "Gsamples/s: problem size too small to measure, try "
|
||
|
"increasing number of vectors or dimensions"
|
||
|
<< std::endl;
|
||
|
} else {
|
||
|
std::cout << "Gsamples/s: "
|
||
|
<< (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3)
|
||
|
<< std::endl;
|
||
|
}
|
||
|
|
||
|
std::cout << "Reading results from GPU..." << std::endl;
|
||
|
checkCudaErrors(cudaMemcpy(h_outputGPU, d_output,
|
||
|
n_vectors * n_dimensions * sizeof(float),
|
||
|
cudaMemcpyDeviceToHost));
|
||
|
|
||
|
std::cout << std::endl;
|
||
|
// Execute the QRNG on the host
|
||
|
std::cout << "Executing QRNG on CPU..." << std::endl;
|
||
|
sdkResetTimer(&hTimer);
|
||
|
sdkStartTimer(&hTimer);
|
||
|
sobolCPU(n_vectors, n_dimensions, h_directions, h_outputCPU);
|
||
|
sdkStopTimer(&hTimer);
|
||
|
time = sdkGetTimerValue(&hTimer);
|
||
|
|
||
|
if (time < 1e-6) {
|
||
|
std::cout << "Gsamples/s: problem size too small to measure, try "
|
||
|
"increasing number of vectors or dimensions"
|
||
|
<< std::endl;
|
||
|
} else {
|
||
|
std::cout << "Gsamples/s: "
|
||
|
<< (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3)
|
||
|
<< std::endl;
|
||
|
}
|
||
|
|
||
|
// Check the results
|
||
|
std::cout << "Checking results..." << std::endl;
|
||
|
float l1norm_diff = 0.0F;
|
||
|
float l1norm_ref = 0.0F;
|
||
|
float l1error;
|
||
|
|
||
|
// Special case if n_vectors is 1, when the vector should be exactly 0
|
||
|
if (n_vectors == 1) {
|
||
|
for (int d = 0, v = 0; d < n_dimensions; d++) {
|
||
|
float ref = h_outputCPU[d * n_vectors + v];
|
||
|
l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref);
|
||
|
l1norm_ref += fabs(ref);
|
||
|
}
|
||
|
|
||
|
// Output the L1-Error
|
||
|
l1error = l1norm_diff;
|
||
|
|
||
|
if (l1norm_ref != 0) {
|
||
|
std::cerr << "Error: L1-Norm of the reference is not zero (for single "
|
||
|
"vector), golden generator appears broken\n";
|
||
|
} else {
|
||
|
std::cout << "L1-Error: " << l1error << std::endl;
|
||
|
}
|
||
|
} else {
|
||
|
for (int d = 0; d < n_dimensions; d++) {
|
||
|
for (int v = 0; v < n_vectors; v++) {
|
||
|
float ref = h_outputCPU[d * n_vectors + v];
|
||
|
l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref);
|
||
|
l1norm_ref += fabs(ref);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Output the L1-Error
|
||
|
l1error = l1norm_diff / l1norm_ref;
|
||
|
|
||
|
if (l1norm_ref == 0) {
|
||
|
std::cerr << "Error: L1-Norm of the reference is zero, golden generator "
|
||
|
"appears broken\n";
|
||
|
} else {
|
||
|
std::cout << "L1-Error: " << l1error << std::endl;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Cleanup and terminate
|
||
|
std::cout << "Shutting down..." << std::endl;
|
||
|
sdkDeleteTimer(&hTimer);
|
||
|
delete h_directions;
|
||
|
delete h_outputCPU;
|
||
|
delete h_outputGPU;
|
||
|
checkCudaErrors(cudaFree(d_directions));
|
||
|
checkCudaErrors(cudaFree(d_output));
|
||
|
|
||
|
// Check pass/fail using L1 error
|
||
|
exit(l1error < L1ERROR_TOLERANCE ? EXIT_SUCCESS : EXIT_FAILURE);
|
||
|
}
|