cuda-samples/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.cu
2022-01-13 11:35:24 +05:30

208 lines
7.8 KiB
Plaintext

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Portions Copyright (c) 2009 Mike Giles, Oxford University. All rights
* reserved.
* Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe. All rights
* reserved.
*
* Sobol Quasi-random Number Generator example
*
* Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
* http://people.maths.ox.ac.uk/~gilesm/
*
* and C code developed by Stephen Joe, University of Waikato, New Zealand
* and Frances Kuo, University of New South Wales, Australia
* http://web.maths.unsw.edu.au/~fkuo/sobol/
*
* For theoretical background see:
*
* P. Bratley and B.L. Fox.
* Implementing Sobol's quasirandom sequence generator
* http://portal.acm.org/citation.cfm?id=42288
* ACM Trans. on Math. Software, 14(1):88-100, 1988
*
* S. Joe and F. Kuo.
* Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
* http://portal.acm.org/citation.cfm?id=641879
* ACM Trans. on Math. Software, 29(1):49-57, 2003
*
*/
#include "sobol.h"
#include "sobol_gpu.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#define k_2powneg32 2.3283064E-10F
__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions,
unsigned *d_directions, float *d_output) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ unsigned int v[n_directions];
// Offset into the correct dimension as specified by the
// block y coordinate
d_directions = d_directions + n_directions * blockIdx.y;
d_output = d_output + n_vectors * blockIdx.y;
// Copy the direction numbers for this dimension into shared
// memory - there are only 32 direction numbers so only the
// first 32 (n_directions) threads need participate.
if (threadIdx.x < n_directions) {
v[threadIdx.x] = d_directions[threadIdx.x];
}
cg::sync(cta);
// Set initial index (i.e. which vector this thread is
// computing first) and stride (i.e. step to the next vector
// for this thread)
int i0 = threadIdx.x + blockIdx.x * blockDim.x;
int stride = gridDim.x * blockDim.x;
// Get the gray code of the index
// c.f. Numerical Recipes in C, chapter 20
// http://www.nrbook.com/a/bookcpdf/c20-2.pdf
unsigned int g = i0 ^ (i0 >> 1);
// Initialisation for first point x[i0]
// In the Bratley and Fox paper this is equation (*), where
// we are computing the value for x[n] without knowing the
// value of x[n-1].
unsigned int X = 0;
unsigned int mask;
for (unsigned int k = 0; k < __ffs(stride) - 1; k++) {
// We want X ^= g_k * v[k], where g_k is one or zero.
// We do this by setting a mask with all bits equal to
// g_k. In reality we keep shifting g so that g_k is the
// LSB of g. This way we avoid multiplication.
mask = -(g & 1);
X ^= mask & v[k];
g = g >> 1;
}
if (i0 < n_vectors) {
d_output[i0] = (float)X * k_2powneg32;
}
// Now do rest of points, using the stride
// Here we want to generate x[i] from x[i-stride] where we
// don't have any of the x in between, therefore we have to
// revisit the equation (**), this is easiest with an example
// so assume stride is 16.
// From x[n] to x[n+16] there will be:
// 8 changes in the first bit
// 4 changes in the second bit
// 2 changes in the third bit
// 1 change in the fourth
// 1 change in one of the remaining bits
//
// What this means is that in the equation:
// x[n+1] = x[n] ^ v[p]
// x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
// ...
// We will apply xor with v[1] eight times, v[2] four times,
// v[3] twice, v[4] once and one other direction number once.
// Since two xors cancel out, we can skip even applications
// and just apply xor with v[4] (i.e. log2(16)) and with
// the current applicable direction number.
// Note that all these indices count from 1, so we need to
// subtract 1 from them all to account for C arrays counting
// from zero.
unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
unsigned int v_stridemask = stride - 1;
for (unsigned int i = i0 + stride; i < n_vectors; i += stride) {
// x[i] = x[i-stride] ^ v[b] ^ v[c]
// where b is log2(stride) minus 1 for C array indexing
// where c is the index of the rightmost zero bit in i,
// not including the bottom log2(stride) bits, minus 1
// for C array indexing
// In the Bratley and Fox paper this is equation (**)
X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
d_output[i] = (float)X * k_2powneg32;
}
}
extern "C" void sobolGPU(int n_vectors, int n_dimensions,
unsigned int *d_directions, float *d_output) {
const int threadsperblock = 64;
// Set up the execution configuration
dim3 dimGrid;
dim3 dimBlock;
int device;
cudaDeviceProp prop;
checkCudaErrors(cudaGetDevice(&device));
checkCudaErrors(cudaGetDeviceProperties(&prop, device));
// This implementation of the generator outputs all the draws for
// one dimension in a contiguous region of memory, followed by the
// next dimension and so on.
// Therefore all threads within a block will be processing different
// vectors from the same dimension. As a result we want the total
// number of blocks to be a multiple of the number of dimensions.
dimGrid.y = n_dimensions;
// If the number of dimensions is large then we will set the number
// of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
// but if the number of dimensions is small (e.g. less than four per
// multiprocessor) then we'll partition the vectors across blocks
// (as well as threads).
if (n_dimensions < (4 * prop.multiProcessorCount)) {
dimGrid.x = 4 * prop.multiProcessorCount;
} else {
dimGrid.x = 1;
}
// Cap the dimGrid.x if the number of vectors is small
if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock)) {
dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
}
// Round up to a power of two, required for the algorithm so that
// stride is a power of two.
unsigned int targetDimGridX = dimGrid.x;
for (dimGrid.x = 1; dimGrid.x < targetDimGridX; dimGrid.x *= 2)
;
// Fix the number of threads
dimBlock.x = threadsperblock;
// Execute GPU kernel
sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions,
d_output);
}