cuda-samples/Samples/nbody/bodysystemcuda.cu

287 lines
9.6 KiB
Plaintext
Raw Normal View History

2021-10-21 19:04:49 +08:00
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <helper_cuda.h>
#include <math.h>
#if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif
// CUDA standard includes
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include "bodysystem.h"
__constant__ float softeningSquared;
__constant__ double softeningSquared_fp64;
cudaError_t setSofteningSquared(float softeningSq) {
return cudaMemcpyToSymbol(softeningSquared, &softeningSq, sizeof(float), 0,
cudaMemcpyHostToDevice);
}
cudaError_t setSofteningSquared(double softeningSq) {
return cudaMemcpyToSymbol(softeningSquared_fp64, &softeningSq, sizeof(double),
0, cudaMemcpyHostToDevice);
}
template <class T>
struct SharedMemory {
__device__ inline operator T *() {
extern __shared__ int __smem[];
return (T *)__smem;
}
__device__ inline operator const T *() const {
extern __shared__ int __smem[];
return (T *)__smem;
}
};
template <typename T>
__device__ T rsqrt_T(T x) {
return rsqrt(x);
}
template <>
__device__ float rsqrt_T<float>(float x) {
return rsqrtf(x);
}
template <>
__device__ double rsqrt_T<double>(double x) {
return rsqrt(x);
}
// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i + blockDim.x * threadIdx.y]
// This macro is only used when multithreadBodies is true (below)
#define SX_SUM(i, j) sharedPos[i + blockDim.x * j]
template <typename T>
__device__ T getSofteningSquared() {
return softeningSquared;
}
template <>
__device__ double getSofteningSquared<double>() {
return softeningSquared_fp64;
}
template <typename T>
struct DeviceData {
T *dPos[2]; // mapped host pointers
T *dVel;
cudaEvent_t event;
unsigned int offset;
unsigned int numBodies;
};
template <typename T>
__device__ typename vec3<T>::Type bodyBodyInteraction(
typename vec3<T>::Type ai, typename vec4<T>::Type bi,
typename vec4<T>::Type bj) {
typename vec3<T>::Type r;
// r_ij [3 FLOPS]
r.x = bj.x - bi.x;
r.y = bj.y - bi.y;
r.z = bj.z - bi.z;
// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
T distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
distSqr += getSofteningSquared<T>();
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
T invDist = rsqrt_T(distSqr);
T invDistCube = invDist * invDist * invDist;
// s = m_j * invDistCube [1 FLOP]
T s = bj.w * invDistCube;
// a_i = a_i + s * r_ij [6 FLOPS]
ai.x += r.x * s;
ai.y += r.y * s;
ai.z += r.z * s;
return ai;
}
template <typename T>
__device__ typename vec3<T>::Type computeBodyAccel(
typename vec4<T>::Type bodyPos, typename vec4<T>::Type *positions,
int numTiles, cg::thread_block cta) {
typename vec4<T>::Type *sharedPos = SharedMemory<typename vec4<T>::Type>();
typename vec3<T>::Type acc = {0.0f, 0.0f, 0.0f};
for (int tile = 0; tile < numTiles; tile++) {
sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x];
cg::sync(cta);
// This is the "tile_calculation" from the GPUG3 article.
#pragma unroll 128
for (unsigned int counter = 0; counter < blockDim.x; counter++) {
acc = bodyBodyInteraction<T>(acc, bodyPos, sharedPos[counter]);
}
cg::sync(cta);
}
return acc;
}
template <typename T>
__global__ void integrateBodies(typename vec4<T>::Type *__restrict__ newPos,
typename vec4<T>::Type *__restrict__ oldPos,
typename vec4<T>::Type *vel,
unsigned int deviceOffset,
unsigned int deviceNumBodies, float deltaTime,
float damping, int numTiles) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= deviceNumBodies) {
return;
}
typename vec4<T>::Type position = oldPos[deviceOffset + index];
typename vec3<T>::Type accel =
computeBodyAccel<T>(position, oldPos, numTiles, cta);
// acceleration = force / mass;
// new velocity = old velocity + acceleration * deltaTime
// note we factor out the body's mass from the equation, here and in
// bodyBodyInteraction
// (because they cancel out). Thus here force == acceleration
typename vec4<T>::Type velocity = vel[deviceOffset + index];
velocity.x += accel.x * deltaTime;
velocity.y += accel.y * deltaTime;
velocity.z += accel.z * deltaTime;
velocity.x *= damping;
velocity.y *= damping;
velocity.z *= damping;
// new position = old position + velocity * deltaTime
position.x += velocity.x * deltaTime;
position.y += velocity.y * deltaTime;
position.z += velocity.z * deltaTime;
// store new position and velocity
newPos[deviceOffset + index] = position;
vel[deviceOffset + index] = velocity;
}
template <typename T>
void integrateNbodySystem(DeviceData<T> *deviceData,
cudaGraphicsResource **pgres,
unsigned int currentRead, float deltaTime,
float damping, unsigned int numBodies,
unsigned int numDevices, int blockSize,
bool bUsePBO) {
if (bUsePBO) {
checkCudaErrors(cudaGraphicsResourceSetMapFlags(
pgres[currentRead], cudaGraphicsMapFlagsReadOnly));
checkCudaErrors(cudaGraphicsResourceSetMapFlags(
pgres[1 - currentRead], cudaGraphicsMapFlagsWriteDiscard));
checkCudaErrors(cudaGraphicsMapResources(2, pgres, 0));
size_t bytes;
checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
(void **)&(deviceData[0].dPos[currentRead]), &bytes,
pgres[currentRead]));
checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
(void **)&(deviceData[0].dPos[1 - currentRead]), &bytes,
pgres[1 - currentRead]));
}
for (unsigned int dev = 0; dev != numDevices; dev++) {
if (numDevices > 1) {
cudaSetDevice(dev);
}
int numBlocks = (deviceData[dev].numBodies + blockSize - 1) / blockSize;
int numTiles = (numBodies + blockSize - 1) / blockSize;
int sharedMemSize = blockSize * 4 * sizeof(T); // 4 floats for pos
integrateBodies<T><<<numBlocks, blockSize, sharedMemSize>>>(
(typename vec4<T>::Type *)deviceData[dev].dPos[1 - currentRead],
(typename vec4<T>::Type *)deviceData[dev].dPos[currentRead],
(typename vec4<T>::Type *)deviceData[dev].dVel, deviceData[dev].offset,
deviceData[dev].numBodies, deltaTime, damping, numTiles);
if (numDevices > 1) {
checkCudaErrors(cudaEventRecord(deviceData[dev].event));
// MJH: Hack on older driver versions to force kernel launches to flush!
cudaStreamQuery(0);
}
// check if kernel invocation generated an error
getLastCudaError("Kernel execution failed");
}
if (numDevices > 1) {
for (unsigned int dev = 0; dev < numDevices; dev++) {
checkCudaErrors(cudaEventSynchronize(deviceData[dev].event));
}
}
if (bUsePBO) {
checkCudaErrors(cudaGraphicsUnmapResources(2, pgres, 0));
}
}
// Explicit specializations needed to generate code
template void integrateNbodySystem<float>(DeviceData<float> *deviceData,
cudaGraphicsResource **pgres,
unsigned int currentRead,
float deltaTime, float damping,
unsigned int numBodies,
unsigned int numDevices,
int blockSize, bool bUsePBO);
template void integrateNbodySystem<double>(DeviceData<double> *deviceData,
cudaGraphicsResource **pgres,
unsigned int currentRead,
float deltaTime, float damping,
unsigned int numBodies,
unsigned int numDevices,
int blockSize, bool bUsePBO);