/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <helper_cuda.h>
#include <math.h>

#if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif

// CUDA standard includes
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

#include "bodysystem.h"

__constant__ float softeningSquared;
__constant__ double softeningSquared_fp64;

cudaError_t setSofteningSquared(float softeningSq) {
  return cudaMemcpyToSymbol(softeningSquared, &softeningSq, sizeof(float), 0,
                            cudaMemcpyHostToDevice);
}

cudaError_t setSofteningSquared(double softeningSq) {
  return cudaMemcpyToSymbol(softeningSquared_fp64, &softeningSq, sizeof(double),
                            0, cudaMemcpyHostToDevice);
}

// Returns a typed pointer to the dynamically allocated shared memory of the
// launching block.
template <class T>
struct SharedMemory {
  __device__ inline operator T *() {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }

  __device__ inline operator const T *() const {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }
};

template <typename T>
__device__ T rsqrt_T(T x) {
  return rsqrt(x);
}

template <>
__device__ float rsqrt_T<float>(float x) {
  return rsqrtf(x);
}

template <>
__device__ double rsqrt_T<double>(double x) {
  return rsqrt(x);
}

// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i + blockDim.x * threadIdx.y]
// This macro is only used when multithreadBodies is true (below)
#define SX_SUM(i, j) sharedPos[i + blockDim.x * j]

template <typename T>
__device__ T getSofteningSquared() {
  return softeningSquared;
}

template <>
__device__ double getSofteningSquared<double>() {
  return softeningSquared_fp64;
}

template <typename T>
struct DeviceData {
  T *dPos[2];  // mapped host pointers
  T *dVel;
  cudaEvent_t event;
  unsigned int offset;
  unsigned int numBodies;
};

template <typename T>
__device__ typename vec3<T>::Type bodyBodyInteraction(
    typename vec3<T>::Type ai, typename vec4<T>::Type bi,
    typename vec4<T>::Type bj) {
  typename vec3<T>::Type r;

  // r_ij  [3 FLOPS]
  r.x = bj.x - bi.x;
  r.y = bj.y - bi.y;
  r.z = bj.z - bi.z;

  // distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPS]
  T distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
  distSqr += getSofteningSquared<T>();

  // invDistCube = 1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
  T invDist = rsqrt_T(distSqr);
  T invDistCube = invDist * invDist * invDist;

  // s = m_j * invDistCube [1 FLOP]
  T s = bj.w * invDistCube;

  // a_i = a_i + s * r_ij [6 FLOPS]
  ai.x += r.x * s;
  ai.y += r.y * s;
  ai.z += r.z * s;

  return ai;
}
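// Illustrative note: the interaction above accumulates the softened
// gravitational acceleration from GPU Gems 3, chapter 31,
//
//   a_i += m_j * r_ij / (|r_ij|^2 + EPS^2)^(3/2),
//
// with the gravitational constant folded into the masses/units. The disabled
// block below is a minimal scalar host-side sketch of the same update,
// assuming a plain float[4] (x, y, z, mass) layout; the function name and
// array layout are illustrative only.
#if 0
static void bodyBodyInteractionRef(float ai[3], const float bi[4],
                                   const float bj[4], float softeningSq) {
  float rx = bj[0] - bi[0], ry = bj[1] - bi[1], rz = bj[2] - bi[2];
  float distSqr = rx * rx + ry * ry + rz * rz + softeningSq;
  float invDist = 1.0f / sqrtf(distSqr);
  float s = bj[3] * invDist * invDist * invDist;  // m_j / dist^3
  // Accumulate the contribution of body j onto body i's acceleration.
  ai[0] += rx * s;
  ai[1] += ry * s;
  ai[2] += rz * s;
}
#endif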
template <typename T>
__device__ typename vec3<T>::Type computeBodyAccel(
    typename vec4<T>::Type bodyPos, typename vec4<T>::Type *positions,
    int numTiles, cg::thread_block cta) {
  typename vec4<T>::Type *sharedPos = SharedMemory<typename vec4<T>::Type>();

  typename vec3<T>::Type acc = {0.0f, 0.0f, 0.0f};

  for (int tile = 0; tile < numTiles; tile++) {
    sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x];

    cg::sync(cta);

    // This is the "tile_calculation" from the GPUG3 article.
#pragma unroll 128
    for (unsigned int counter = 0; counter < blockDim.x; counter++) {
      acc = bodyBodyInteraction<T>(acc, bodyPos, sharedPos[counter]);
    }

    cg::sync(cta);
  }

  return acc;
}

template <typename T>
__global__ void integrateBodies(typename vec4<T>::Type *__restrict__ newPos,
                                typename vec4<T>::Type *__restrict__ oldPos,
                                typename vec4<T>::Type *vel,
                                unsigned int deviceOffset,
                                unsigned int deviceNumBodies, float deltaTime,
                                float damping, int numTiles) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= deviceNumBodies) {
    return;
  }

  typename vec4<T>::Type position = oldPos[deviceOffset + index];

  typename vec3<T>::Type accel =
      computeBodyAccel<T>(position, oldPos, numTiles, cta);

  // acceleration = force / mass;
  // new velocity = old velocity + acceleration * deltaTime
  // note we factor out the body's mass from the equation, here and in
  // bodyBodyInteraction (because they cancel out). Thus here force ==
  // acceleration
  typename vec4<T>::Type velocity = vel[deviceOffset + index];

  velocity.x += accel.x * deltaTime;
  velocity.y += accel.y * deltaTime;
  velocity.z += accel.z * deltaTime;

  velocity.x *= damping;
  velocity.y *= damping;
  velocity.z *= damping;

  // new position = old position + velocity * deltaTime
  position.x += velocity.x * deltaTime;
  position.y += velocity.y * deltaTime;
  position.z += velocity.z * deltaTime;

  // store new position and velocity
  newPos[deviceOffset + index] = position;
  vel[deviceOffset + index] = velocity;
}
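// Illustrative launch sketch for integrateBodies (disabled): a minimal
// single-GPU configuration, assuming vec4<float>::Type is float4 (as defined
// in bodysystem.h), that d_newPos/d_oldPos/d_vel are device buffers prepared
// elsewhere, and that n is a multiple of blockSize (or the position buffers
// are padded) so the tile loads in computeBodyAccel stay in bounds.
#if 0
static void launchIntegrateFloat(float4 *d_newPos, float4 *d_oldPos,
                                 float4 *d_vel, unsigned int n, float dt,
                                 float damping, int blockSize) {
  int numBlocks = (n + blockSize - 1) / blockSize;
  int numTiles = (n + blockSize - 1) / blockSize;
  // One vec4 position per thread of the block in dynamic shared memory.
  size_t sharedMemSize = blockSize * sizeof(float4);

  integrateBodies<float><<<numBlocks, blockSize, sharedMemSize>>>(
      d_newPos, d_oldPos, d_vel, /*deviceOffset=*/0, /*deviceNumBodies=*/n, dt,
      damping, numTiles);
  getLastCudaError("integrateBodies launch failed");
}
#endif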
template <typename T>
void integrateNbodySystem(DeviceData<T> *deviceData,
                          cudaGraphicsResource **pgres,
                          unsigned int currentRead, float deltaTime,
                          float damping, unsigned int numBodies,
                          unsigned int numDevices, int blockSize,
                          bool bUsePBO) {
  if (bUsePBO) {
    checkCudaErrors(cudaGraphicsResourceSetMapFlags(
        pgres[currentRead], cudaGraphicsMapFlagsReadOnly));
    checkCudaErrors(cudaGraphicsResourceSetMapFlags(
        pgres[1 - currentRead], cudaGraphicsMapFlagsWriteDiscard));
    checkCudaErrors(cudaGraphicsMapResources(2, pgres, 0));
    size_t bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
        (void **)&(deviceData[0].dPos[currentRead]), &bytes,
        pgres[currentRead]));
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
        (void **)&(deviceData[0].dPos[1 - currentRead]), &bytes,
        pgres[1 - currentRead]));
  }

  for (unsigned int dev = 0; dev != numDevices; dev++) {
    if (numDevices > 1) {
      cudaSetDevice(dev);
    }

    int numBlocks = (deviceData[dev].numBodies + blockSize - 1) / blockSize;
    int numTiles = (numBodies + blockSize - 1) / blockSize;
    int sharedMemSize = blockSize * 4 * sizeof(T);  // 4 floats for pos

    integrateBodies<T><<<numBlocks, blockSize, sharedMemSize>>>(
        (typename vec4<T>::Type *)deviceData[dev].dPos[1 - currentRead],
        (typename vec4<T>::Type *)deviceData[dev].dPos[currentRead],
        (typename vec4<T>::Type *)deviceData[dev].dVel, deviceData[dev].offset,
        deviceData[dev].numBodies, deltaTime, damping, numTiles);

    if (numDevices > 1) {
      checkCudaErrors(cudaEventRecord(deviceData[dev].event));
      // MJH: Hack on older driver versions to force kernel launches to flush!
      cudaStreamQuery(0);
    }

    // check if kernel invocation generated an error
    getLastCudaError("Kernel execution failed");
  }

  if (numDevices > 1) {
    for (unsigned int dev = 0; dev < numDevices; dev++) {
      checkCudaErrors(cudaEventSynchronize(deviceData[dev].event));
    }
  }

  if (bUsePBO) {
    checkCudaErrors(cudaGraphicsUnmapResources(2, pgres, 0));
  }
}

// Explicit instantiations needed to generate code
template void integrateNbodySystem<float>(
    DeviceData<float> *deviceData, cudaGraphicsResource **pgres,
    unsigned int currentRead, float deltaTime, float damping,
    unsigned int numBodies, unsigned int numDevices, int blockSize,
    bool bUsePBO);

template void integrateNbodySystem<double>(
    DeviceData<double> *deviceData, cudaGraphicsResource **pgres,
    unsigned int currentRead, float deltaTime, float damping,
    unsigned int numBodies, unsigned int numDevices, int blockSize,
    bool bUsePBO);
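// Illustrative usage sketch for integrateNbodySystem (disabled): a minimal
// single-GPU, non-PBO step, assuming positions and velocities are stored as
// four floats per body (matching the vec4<T> casts above). The body-data
// initialization, buffer cleanup, and the softening value 0.001f are
// placeholders, not part of this file.
#if 0
static void stepOnce(unsigned int numBodies, int blockSize, float dt,
                     float damping) {
  DeviceData<float> dd;
  dd.offset = 0;
  dd.numBodies = numBodies;
  checkCudaErrors(
      cudaMalloc((void **)&dd.dPos[0], numBodies * 4 * sizeof(float)));
  checkCudaErrors(
      cudaMalloc((void **)&dd.dPos[1], numBodies * 4 * sizeof(float)));
  checkCudaErrors(cudaMalloc((void **)&dd.dVel, numBodies * 4 * sizeof(float)));
  checkCudaErrors(cudaEventCreate(&dd.event));
  checkCudaErrors(setSofteningSquared(0.001f));

  unsigned int currentRead = 0;
  // pgres is only dereferenced when bUsePBO is true, so it may be null here.
  integrateNbodySystem<float>(&dd, /*pgres=*/nullptr, currentRead, dt, damping,
                              numBodies, /*numDevices=*/1, blockSize,
                              /*bUsePBO=*/false);
  currentRead = 1 - currentRead;  // ping-pong the position buffers
}
#endif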