mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-28 14:19:15 +08:00
1304 lines
36 KiB
C++
1304 lines
36 KiB
C++
|
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
* * Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* * Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||
|
* contributors may be used to endorse or promote products derived
|
||
|
* from this software without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
#include <helper_gl.h>
|
||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||
|
#include <GL/wglew.h>
|
||
|
#endif
|
||
|
|
||
|
#if defined(__APPLE__) || defined(MACOSX)
|
||
|
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||
|
#include <GLUT/glut.h>
|
||
|
#else
|
||
|
#include <GL/freeglut.h>
|
||
|
#endif
|
||
|
|
||
|
#include <paramgl.h>
|
||
|
#include <cstdlib>
|
||
|
#include <cstdio>
|
||
|
#include <algorithm>
|
||
|
#include <assert.h>
|
||
|
#include <math.h>
|
||
|
|
||
|
#include <cuda_runtime.h>
|
||
|
#include <cuda_gl_interop.h>
|
||
|
#include <helper_cuda.h>
|
||
|
#include <helper_functions.h>
|
||
|
|
||
|
#include "bodysystemcuda.h"
|
||
|
#include "bodysystemcpu.h"
|
||
|
#include "render_particles.h"
|
||
|
#include "cuda_runtime.h"
|
||
|
|
||
|
// view params
|
||
|
int ox = 0, oy = 0;
|
||
|
int buttonState = 0;
|
||
|
float camera_trans[] = {0, -2, -150};
|
||
|
float camera_rot[] = {0, 0, 0};
|
||
|
float camera_trans_lag[] = {0, -2, -150};
|
||
|
float camera_rot_lag[] = {0, 0, 0};
|
||
|
const float inertia = 0.1f;
|
||
|
|
||
|
ParticleRenderer::DisplayMode displayMode =
|
||
|
ParticleRenderer::PARTICLE_SPRITES_COLOR;
|
||
|
|
||
|
bool benchmark = false;
|
||
|
bool compareToCPU = false;
|
||
|
bool QATest = false;
|
||
|
int blockSize = 256;
|
||
|
bool useHostMem = false;
|
||
|
bool useP2P = true; // this is always optimal to use P2P path when available
|
||
|
bool fp64 = false;
|
||
|
bool useCpu = false;
|
||
|
int numDevsRequested = 1;
|
||
|
bool displayEnabled = true;
|
||
|
bool bPause = false;
|
||
|
bool bFullscreen = false;
|
||
|
bool bDispInteractions = false;
|
||
|
bool bSupportDouble = false;
|
||
|
int flopsPerInteraction = 20;
|
||
|
|
||
|
char deviceName[100];
|
||
|
|
||
|
enum { M_VIEW = 0, M_MOVE };
|
||
|
|
||
|
int numBodies = 16384;
|
||
|
|
||
|
std::string tipsyFile = "";
|
||
|
|
||
|
int numIterations = 0; // run until exit
|
||
|
|
||
|
// Converts a measured elapsed time into throughput figures.
//
//   interactionsPerSecond [out] billions of body-body interactions per second
//   gflops                [out] interactionsPerSecond * flopsPerInteraction
//   milliseconds          [in]  elapsed time covering `iterations` steps
//   iterations            [in]  number of simulation steps that were timed
//
// One step is counted as numBodies * numBodies interactions. If the elapsed
// time is not positive no rate can be derived and both outputs are set to 0
// instead of inf/NaN.
void computePerfStats(double &interactionsPerSecond, double &gflops,
                      float milliseconds, int iterations) {
  // double precision uses intrinsic operation followed by refinement,
  // resulting in higher operation count per interaction.
  // (Note Astrophysicists use 38 flops per interaction no matter what,
  // based on "historical precedent", but they are using FLOP/s as a
  // measure of "science throughput". We are using it as a measure of
  // hardware throughput. They should really use interactions/s...)
  // The precision-dependent count lives in the global flopsPerInteraction.
  if (!(milliseconds > 0.0f)) {
    interactionsPerSecond = 0.0;
    gflops = 0.0;
    return;
  }

  // Square in double: a float product is already rounded once the body count
  // exceeds 4096 (2^24 is the largest exactly representable float integer
  // range), and the destination is a double anyway.
  interactionsPerSecond =
      static_cast<double>(numBodies) * static_cast<double>(numBodies);
  interactionsPerSecond *= 1e-9 * iterations * 1000 / milliseconds;
  gflops = interactionsPerSecond * static_cast<double>(flopsPerInteraction);
}
|
||
|
|
||
|
////////////////////////////////////////
// Demo Parameters
////////////////////////////////////////

// One preset of simulation and view settings. It is deliberately a plain
// aggregate: presets are written as brace-initialised rows, so the member
// order below is part of the contract.
struct NBodyParams {
  float m_timestep;       // step size handed to the body system update
  float m_clusterScale;   // used when generating the initial bodies
  float m_velocityScale;  // used when generating the initial bodies
  float m_softening;      // forwarded to setSoftening()
  float m_damping;        // forwarded to setDamping()
  float m_pointSize;      // sprite size used by the renderer
  float m_x;              // camera translation applied when the preset
  float m_y;              //   is selected
  float m_z;

  // Writes the preset to stdout as a brace-initialiser row, in member order.
  void print() {
    printf("{ %f, %f, %f, %f, %f, %f, %f, %f, %f },\n", this->m_timestep,
           this->m_clusterScale, this->m_velocityScale, this->m_softening,
           this->m_damping, this->m_pointSize, this->m_x, this->m_y,
           this->m_z);
  }
};
|
||
|
|
||
|
NBodyParams demoParams[] = {
|
||
|
{0.016f, 1.54f, 8.0f, 0.1f, 1.0f, 1.0f, 0, -2, -100},
|
||
|
{0.016f, 0.68f, 20.0f, 0.1f, 1.0f, 0.8f, 0, -2, -30},
|
||
|
{0.0006f, 0.16f, 1000.0f, 1.0f, 1.0f, 0.07f, 0, 0, -1.5f},
|
||
|
{0.0006f, 0.16f, 1000.0f, 1.0f, 1.0f, 0.07f, 0, 0, -1.5f},
|
||
|
{0.0019f, 0.32f, 276.0f, 1.0f, 1.0f, 0.07f, 0, 0, -5},
|
||
|
{0.0016f, 0.32f, 272.0f, 0.145f, 1.0f, 0.08f, 0, 0, -5},
|
||
|
{0.016000f, 6.040000f, 0.000000f, 1.000000f, 1.000000f, 0.760000f, 0, 0,
|
||
|
-50},
|
||
|
};
|
||
|
|
||
|
int numDemos = sizeof(demoParams) / sizeof(NBodyParams);
|
||
|
bool cycleDemo = true;
|
||
|
int activeDemo = 0;
|
||
|
float demoTime = 10000.0f; // ms
|
||
|
StopWatchInterface *demoTimer = NULL, *timer = NULL;
|
||
|
|
||
|
// run multiple iterations to compute an average sort time
|
||
|
|
||
|
NBodyParams activeParams = demoParams[activeDemo];
|
||
|
|
||
|
// The UI.
|
||
|
ParamListGL *paramlist; // parameter list
|
||
|
bool bShowSliders = true;
|
||
|
|
||
|
// fps
|
||
|
static int fpsCount = 0;
|
||
|
static int fpsLimit = 5;
|
||
|
cudaEvent_t startEvent, stopEvent;
|
||
|
cudaEvent_t hostMemSyncEvent;
|
||
|
|
||
|
template <typename T>
|
||
|
class NBodyDemo {
|
||
|
public:
|
||
|
static void Create() { m_singleton = new NBodyDemo; }
|
||
|
static void Destroy() { delete m_singleton; }
|
||
|
|
||
|
static void init(int numBodies, int numDevices, int blockSize, bool usePBO,
|
||
|
bool useHostMem, bool useP2P, bool useCpu, int devID) {
|
||
|
m_singleton->_init(numBodies, numDevices, blockSize, usePBO, useHostMem,
|
||
|
useP2P, useCpu, devID);
|
||
|
}
|
||
|
|
||
|
static void reset(int numBodies, NBodyConfig config) {
|
||
|
m_singleton->_reset(numBodies, config);
|
||
|
}
|
||
|
|
||
|
static void selectDemo(int index) { m_singleton->_selectDemo(index); }
|
||
|
|
||
|
static bool compareResults(int numBodies) {
|
||
|
return m_singleton->_compareResults(numBodies);
|
||
|
}
|
||
|
|
||
|
static void runBenchmark(int iterations) {
|
||
|
m_singleton->_runBenchmark(iterations);
|
||
|
}
|
||
|
|
||
|
static void updateParams() {
|
||
|
m_singleton->m_nbody->setSoftening(activeParams.m_softening);
|
||
|
m_singleton->m_nbody->setDamping(activeParams.m_damping);
|
||
|
}
|
||
|
|
||
|
static void updateSimulation() {
|
||
|
m_singleton->m_nbody->update(activeParams.m_timestep);
|
||
|
}
|
||
|
|
||
|
static void display() {
|
||
|
m_singleton->m_renderer->setSpriteSize(activeParams.m_pointSize);
|
||
|
|
||
|
if (useHostMem) {
|
||
|
// This event sync is required because we are rendering from the host
|
||
|
// memory that CUDA is
|
||
|
// writing. If we don't wait until CUDA is done updating it, we will
|
||
|
// render partially
|
||
|
// updated data, resulting in a jerky frame rate.
|
||
|
if (!useCpu) {
|
||
|
cudaEventSynchronize(hostMemSyncEvent);
|
||
|
}
|
||
|
|
||
|
m_singleton->m_renderer->setPositions(
|
||
|
m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION),
|
||
|
m_singleton->m_nbody->getNumBodies());
|
||
|
} else {
|
||
|
m_singleton->m_renderer->setPBO(
|
||
|
m_singleton->m_nbody->getCurrentReadBuffer(),
|
||
|
m_singleton->m_nbody->getNumBodies(), (sizeof(T) > 4));
|
||
|
}
|
||
|
|
||
|
// display particles
|
||
|
m_singleton->m_renderer->display(displayMode);
|
||
|
}
|
||
|
|
||
|
static void getArrays(T *pos, T *vel) {
|
||
|
T *_pos = m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION);
|
||
|
T *_vel = m_singleton->m_nbody->getArray(BODYSYSTEM_VELOCITY);
|
||
|
memcpy(pos, _pos, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T));
|
||
|
memcpy(vel, _vel, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T));
|
||
|
}
|
||
|
|
||
|
static void setArrays(const T *pos, const T *vel) {
|
||
|
if (pos != m_singleton->m_hPos) {
|
||
|
memcpy(m_singleton->m_hPos, pos, numBodies * 4 * sizeof(T));
|
||
|
}
|
||
|
|
||
|
if (vel != m_singleton->m_hVel) {
|
||
|
memcpy(m_singleton->m_hVel, vel, numBodies * 4 * sizeof(T));
|
||
|
}
|
||
|
|
||
|
m_singleton->m_nbody->setArray(BODYSYSTEM_POSITION, m_singleton->m_hPos);
|
||
|
m_singleton->m_nbody->setArray(BODYSYSTEM_VELOCITY, m_singleton->m_hVel);
|
||
|
|
||
|
if (!benchmark && !useCpu && !compareToCPU) {
|
||
|
m_singleton->_resetRenderer();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
static NBodyDemo *m_singleton;
|
||
|
|
||
|
BodySystem<T> *m_nbody;
|
||
|
BodySystemCUDA<T> *m_nbodyCuda;
|
||
|
BodySystemCPU<T> *m_nbodyCpu;
|
||
|
|
||
|
ParticleRenderer *m_renderer;
|
||
|
|
||
|
T *m_hPos;
|
||
|
T *m_hVel;
|
||
|
float *m_hColor;
|
||
|
|
||
|
private:
|
||
|
NBodyDemo()
|
||
|
: m_nbody(0),
|
||
|
m_nbodyCuda(0),
|
||
|
m_nbodyCpu(0),
|
||
|
m_renderer(0),
|
||
|
m_hPos(0),
|
||
|
m_hVel(0),
|
||
|
m_hColor(0) {}
|
||
|
|
||
|
~NBodyDemo() {
|
||
|
if (m_nbodyCpu) {
|
||
|
delete m_nbodyCpu;
|
||
|
}
|
||
|
|
||
|
if (m_nbodyCuda) {
|
||
|
delete m_nbodyCuda;
|
||
|
}
|
||
|
|
||
|
if (m_hPos) {
|
||
|
delete[] m_hPos;
|
||
|
}
|
||
|
|
||
|
if (m_hVel) {
|
||
|
delete[] m_hVel;
|
||
|
}
|
||
|
|
||
|
if (m_hColor) {
|
||
|
delete[] m_hColor;
|
||
|
}
|
||
|
|
||
|
sdkDeleteTimer(&demoTimer);
|
||
|
|
||
|
if (!benchmark && !compareToCPU) delete m_renderer;
|
||
|
}
|
||
|
|
||
|
void _init(int numBodies, int numDevices, int blockSize, bool bUsePBO,
|
||
|
bool useHostMem, bool useP2P, bool useCpu, int devID) {
|
||
|
if (useCpu) {
|
||
|
m_nbodyCpu = new BodySystemCPU<T>(numBodies);
|
||
|
m_nbody = m_nbodyCpu;
|
||
|
m_nbodyCuda = 0;
|
||
|
} else {
|
||
|
m_nbodyCuda = new BodySystemCUDA<T>(numBodies, numDevices, blockSize,
|
||
|
bUsePBO, useHostMem, useP2P, devID);
|
||
|
m_nbody = m_nbodyCuda;
|
||
|
m_nbodyCpu = 0;
|
||
|
}
|
||
|
|
||
|
// allocate host memory
|
||
|
m_hPos = new T[numBodies * 4];
|
||
|
m_hVel = new T[numBodies * 4];
|
||
|
m_hColor = new float[numBodies * 4];
|
||
|
|
||
|
m_nbody->setSoftening(activeParams.m_softening);
|
||
|
m_nbody->setDamping(activeParams.m_damping);
|
||
|
|
||
|
if (useCpu) {
|
||
|
sdkCreateTimer(&timer);
|
||
|
sdkStartTimer(&timer);
|
||
|
} else {
|
||
|
checkCudaErrors(cudaEventCreate(&startEvent));
|
||
|
checkCudaErrors(cudaEventCreate(&stopEvent));
|
||
|
checkCudaErrors(cudaEventCreate(&hostMemSyncEvent));
|
||
|
}
|
||
|
|
||
|
if (!benchmark && !compareToCPU) {
|
||
|
m_renderer = new ParticleRenderer;
|
||
|
_resetRenderer();
|
||
|
}
|
||
|
|
||
|
sdkCreateTimer(&demoTimer);
|
||
|
sdkStartTimer(&demoTimer);
|
||
|
}
|
||
|
|
||
|
void _reset(int numBodies, NBodyConfig config) {
|
||
|
if (tipsyFile == "") {
|
||
|
randomizeBodies(config, m_hPos, m_hVel, m_hColor,
|
||
|
activeParams.m_clusterScale, activeParams.m_velocityScale,
|
||
|
numBodies, true);
|
||
|
setArrays(m_hPos, m_hVel);
|
||
|
} else {
|
||
|
m_nbody->loadTipsyFile(tipsyFile);
|
||
|
::numBodies = m_nbody->getNumBodies();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void _resetRenderer() {
|
||
|
if (fp64) {
|
||
|
float color[4] = {0.4f, 0.8f, 0.1f, 1.0f};
|
||
|
m_renderer->setBaseColor(color);
|
||
|
} else {
|
||
|
float color[4] = {1.0f, 0.6f, 0.3f, 1.0f};
|
||
|
m_renderer->setBaseColor(color);
|
||
|
}
|
||
|
|
||
|
m_renderer->setColors(m_hColor, m_nbody->getNumBodies());
|
||
|
m_renderer->setSpriteSize(activeParams.m_pointSize);
|
||
|
}
|
||
|
|
||
|
void _selectDemo(int index) {
|
||
|
assert(index < numDemos);
|
||
|
|
||
|
activeParams = demoParams[index];
|
||
|
camera_trans[0] = camera_trans_lag[0] = activeParams.m_x;
|
||
|
camera_trans[1] = camera_trans_lag[1] = activeParams.m_y;
|
||
|
camera_trans[2] = camera_trans_lag[2] = activeParams.m_z;
|
||
|
reset(numBodies, NBODY_CONFIG_SHELL);
|
||
|
sdkResetTimer(&demoTimer);
|
||
|
}
|
||
|
|
||
|
bool _compareResults(int numBodies) {
|
||
|
assert(m_nbodyCuda);
|
||
|
|
||
|
bool passed = true;
|
||
|
|
||
|
m_nbody->update(0.001f);
|
||
|
|
||
|
{
|
||
|
m_nbodyCpu = new BodySystemCPU<T>(numBodies);
|
||
|
|
||
|
m_nbodyCpu->setArray(BODYSYSTEM_POSITION, m_hPos);
|
||
|
m_nbodyCpu->setArray(BODYSYSTEM_VELOCITY, m_hVel);
|
||
|
|
||
|
m_nbodyCpu->update(0.001f);
|
||
|
|
||
|
T *cudaPos = m_nbodyCuda->getArray(BODYSYSTEM_POSITION);
|
||
|
T *cpuPos = m_nbodyCpu->getArray(BODYSYSTEM_POSITION);
|
||
|
|
||
|
T tolerance = 0.0005f;
|
||
|
|
||
|
for (int i = 0; i < numBodies; i++) {
|
||
|
if (fabs(cpuPos[i] - cudaPos[i]) > tolerance) {
|
||
|
passed = false;
|
||
|
printf("Error: (host)%f != (device)%f\n", cpuPos[i], cudaPos[i]);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if (passed) {
|
||
|
printf(" OK\n");
|
||
|
}
|
||
|
return passed;
|
||
|
}
|
||
|
|
||
|
void _runBenchmark(int iterations) {
|
||
|
// once without timing to prime the device
|
||
|
if (!useCpu) {
|
||
|
m_nbody->update(activeParams.m_timestep);
|
||
|
}
|
||
|
|
||
|
if (useCpu) {
|
||
|
sdkCreateTimer(&timer);
|
||
|
sdkStartTimer(&timer);
|
||
|
} else {
|
||
|
checkCudaErrors(cudaEventRecord(startEvent, 0));
|
||
|
}
|
||
|
|
||
|
for (int i = 0; i < iterations; ++i) {
|
||
|
m_nbody->update(activeParams.m_timestep);
|
||
|
}
|
||
|
|
||
|
float milliseconds = 0;
|
||
|
|
||
|
if (useCpu) {
|
||
|
sdkStopTimer(&timer);
|
||
|
milliseconds = sdkGetTimerValue(&timer);
|
||
|
sdkStartTimer(&timer);
|
||
|
} else {
|
||
|
checkCudaErrors(cudaEventRecord(stopEvent, 0));
|
||
|
checkCudaErrors(cudaEventSynchronize(stopEvent));
|
||
|
checkCudaErrors(
|
||
|
cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
|
||
|
}
|
||
|
|
||
|
double interactionsPerSecond = 0;
|
||
|
double gflops = 0;
|
||
|
computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);
|
||
|
|
||
|
printf("%d bodies, total time for %d iterations: %.3f ms\n", numBodies,
|
||
|
iterations, milliseconds);
|
||
|
printf("= %.3f billion interactions per second\n", interactionsPerSecond);
|
||
|
printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops,
|
||
|
(sizeof(T) > 4) ? "double" : "single", flopsPerInteraction);
|
||
|
}
|
||
|
};
|
||
|
|
||
|
void finalize() {
|
||
|
if (!useCpu) {
|
||
|
checkCudaErrors(cudaEventDestroy(startEvent));
|
||
|
checkCudaErrors(cudaEventDestroy(stopEvent));
|
||
|
checkCudaErrors(cudaEventDestroy(hostMemSyncEvent));
|
||
|
}
|
||
|
|
||
|
NBodyDemo<float>::Destroy();
|
||
|
|
||
|
if (bSupportDouble) NBodyDemo<double>::Destroy();
|
||
|
}
|
||
|
|
||
|
template <>
|
||
|
NBodyDemo<double> *NBodyDemo<double>::m_singleton = 0;
|
||
|
template <>
|
||
|
NBodyDemo<float> *NBodyDemo<float>::m_singleton = 0;
|
||
|
|
||
|
template <typename T_new, typename T_old>
|
||
|
void switchDemoPrecision() {
|
||
|
cudaDeviceSynchronize();
|
||
|
|
||
|
fp64 = !fp64;
|
||
|
flopsPerInteraction = fp64 ? 30 : 20;
|
||
|
|
||
|
T_old *oldPos = new T_old[numBodies * 4];
|
||
|
T_old *oldVel = new T_old[numBodies * 4];
|
||
|
|
||
|
NBodyDemo<T_old>::getArrays(oldPos, oldVel);
|
||
|
|
||
|
// convert float to double
|
||
|
T_new *newPos = new T_new[numBodies * 4];
|
||
|
T_new *newVel = new T_new[numBodies * 4];
|
||
|
|
||
|
for (int i = 0; i < numBodies * 4; i++) {
|
||
|
newPos[i] = (T_new)oldPos[i];
|
||
|
newVel[i] = (T_new)oldVel[i];
|
||
|
}
|
||
|
|
||
|
NBodyDemo<T_new>::setArrays(newPos, newVel);
|
||
|
|
||
|
cudaDeviceSynchronize();
|
||
|
|
||
|
delete[] oldPos;
|
||
|
delete[] oldVel;
|
||
|
delete[] newPos;
|
||
|
delete[] newVel;
|
||
|
}
|
||
|
|
||
|
// check for OpenGL errors
|
||
|
inline void checkGLErrors(const char *s) {
|
||
|
GLenum error;
|
||
|
|
||
|
while ((error = glGetError()) != GL_NO_ERROR) {
|
||
|
fprintf(stderr, "%s: error - %s\n", s, (char *)gluErrorString(error));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void initGL(int *argc, char **argv) {
|
||
|
// First initialize OpenGL context, so we can properly set the GL for CUDA.
|
||
|
// This is necessary in order to achieve optimal performance with OpenGL/CUDA
|
||
|
// interop.
|
||
|
glutInit(argc, argv);
|
||
|
glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
|
||
|
glutInitWindowSize(720, 480);
|
||
|
glutCreateWindow("CUDA n-body system");
|
||
|
|
||
|
if (bFullscreen) {
|
||
|
glutFullScreen();
|
||
|
}
|
||
|
|
||
|
else if (!isGLVersionSupported(2, 0) ||
|
||
|
!areGLExtensionsSupported("GL_ARB_multitexture "
|
||
|
"GL_ARB_vertex_buffer_object")) {
|
||
|
fprintf(stderr, "Required OpenGL extensions missing.");
|
||
|
exit(EXIT_FAILURE);
|
||
|
} else {
|
||
|
#if defined(WIN32)
|
||
|
wglSwapIntervalEXT(0);
|
||
|
#elif defined(LINUX)
|
||
|
glxSwapIntervalSGI(0);
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
glEnable(GL_DEPTH_TEST);
|
||
|
glClearColor(0.0, 0.0, 0.0, 1.0);
|
||
|
|
||
|
checkGLErrors("initGL");
|
||
|
}
|
||
|
|
||
|
void initParameters() {
|
||
|
// create a new parameter list
|
||
|
paramlist = new ParamListGL("sliders");
|
||
|
paramlist->SetBarColorInner(0.8f, 0.8f, 0.0f);
|
||
|
|
||
|
// add some parameters to the list
|
||
|
|
||
|
// Point Size
|
||
|
paramlist->AddParam(new Param<float>("Point Size", activeParams.m_pointSize,
|
||
|
0.001f, 10.0f, 0.01f,
|
||
|
&activeParams.m_pointSize));
|
||
|
|
||
|
// Velocity Damping
|
||
|
paramlist->AddParam(new Param<float>("Velocity Damping",
|
||
|
activeParams.m_damping, 0.5f, 1.0f,
|
||
|
.0001f, &(activeParams.m_damping)));
|
||
|
// Softening Factor
|
||
|
paramlist->AddParam(new Param<float>("Softening Factor",
|
||
|
activeParams.m_softening, 0.001f, 1.0f,
|
||
|
.0001f, &(activeParams.m_softening)));
|
||
|
// Time step size
|
||
|
paramlist->AddParam(new Param<float>("Time Step", activeParams.m_timestep,
|
||
|
0.0f, 1.0f, .0001f,
|
||
|
&(activeParams.m_timestep)));
|
||
|
// Cluster scale (only affects starting configuration
|
||
|
paramlist->AddParam(new Param<float>("Cluster Scale",
|
||
|
activeParams.m_clusterScale, 0.0f, 10.0f,
|
||
|
0.01f, &(activeParams.m_clusterScale)));
|
||
|
|
||
|
// Velocity scale (only affects starting configuration)
|
||
|
paramlist->AddParam(
|
||
|
new Param<float>("Velocity Scale", activeParams.m_velocityScale, 0.0f,
|
||
|
1000.0f, 0.1f, &activeParams.m_velocityScale));
|
||
|
}
|
||
|
|
||
|
void selectDemo(int activeDemo) {
|
||
|
if (fp64) {
|
||
|
NBodyDemo<double>::selectDemo(activeDemo);
|
||
|
} else {
|
||
|
NBodyDemo<float>::selectDemo(activeDemo);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void updateSimulation() {
|
||
|
if (fp64) {
|
||
|
NBodyDemo<double>::updateSimulation();
|
||
|
} else {
|
||
|
NBodyDemo<float>::updateSimulation();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void displayNBodySystem() {
|
||
|
if (fp64) {
|
||
|
NBodyDemo<double>::display();
|
||
|
} else {
|
||
|
NBodyDemo<float>::display();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void display() {
|
||
|
static double gflops = 0;
|
||
|
static double ifps = 0;
|
||
|
static double interactionsPerSecond = 0;
|
||
|
|
||
|
// update the simulation
|
||
|
if (!bPause) {
|
||
|
if (cycleDemo && (sdkGetTimerValue(&demoTimer) > demoTime)) {
|
||
|
activeDemo = (activeDemo + 1) % numDemos;
|
||
|
selectDemo(activeDemo);
|
||
|
}
|
||
|
|
||
|
updateSimulation();
|
||
|
|
||
|
if (!useCpu) {
|
||
|
cudaEventRecord(hostMemSyncEvent,
|
||
|
0); // insert an event to wait on before rendering
|
||
|
}
|
||
|
}
|
||
|
|
||
|
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
|
||
|
|
||
|
if (displayEnabled) {
|
||
|
// view transform
|
||
|
{
|
||
|
glMatrixMode(GL_MODELVIEW);
|
||
|
glLoadIdentity();
|
||
|
|
||
|
for (int c = 0; c < 3; ++c) {
|
||
|
camera_trans_lag[c] +=
|
||
|
(camera_trans[c] - camera_trans_lag[c]) * inertia;
|
||
|
camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia;
|
||
|
}
|
||
|
|
||
|
glTranslatef(camera_trans_lag[0], camera_trans_lag[1],
|
||
|
camera_trans_lag[2]);
|
||
|
glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0);
|
||
|
glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0);
|
||
|
}
|
||
|
|
||
|
displayNBodySystem();
|
||
|
|
||
|
// display user interface
|
||
|
if (bShowSliders) {
|
||
|
glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
|
||
|
glEnable(GL_BLEND);
|
||
|
paramlist->Render(0, 0);
|
||
|
glDisable(GL_BLEND);
|
||
|
}
|
||
|
|
||
|
if (bFullscreen) {
|
||
|
beginWinCoords();
|
||
|
char msg0[256], msg1[256], msg2[256];
|
||
|
|
||
|
if (bDispInteractions) {
|
||
|
sprintf(msg1, "%0.2f billion interactions per second",
|
||
|
interactionsPerSecond);
|
||
|
} else {
|
||
|
sprintf(msg1, "%0.2f GFLOP/s", gflops);
|
||
|
}
|
||
|
|
||
|
sprintf(msg0, "%s", deviceName);
|
||
|
sprintf(msg2, "%0.2f FPS [%s | %d bodies]", ifps,
|
||
|
fp64 ? "double precision" : "single precision", numBodies);
|
||
|
|
||
|
glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
|
||
|
glEnable(GL_BLEND);
|
||
|
glColor3f(0.46f, 0.73f, 0.0f);
|
||
|
glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 122, msg0,
|
||
|
GLUT_BITMAP_TIMES_ROMAN_24);
|
||
|
glColor3f(1.0f, 1.0f, 1.0f);
|
||
|
glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 96, msg2,
|
||
|
GLUT_BITMAP_TIMES_ROMAN_24);
|
||
|
glColor3f(1.0f, 1.0f, 1.0f);
|
||
|
glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 70, msg1,
|
||
|
GLUT_BITMAP_TIMES_ROMAN_24);
|
||
|
glDisable(GL_BLEND);
|
||
|
|
||
|
endWinCoords();
|
||
|
}
|
||
|
|
||
|
glutSwapBuffers();
|
||
|
}
|
||
|
|
||
|
fpsCount++;
|
||
|
|
||
|
// this displays the frame rate updated every second (independent of frame
|
||
|
// rate)
|
||
|
if (fpsCount >= fpsLimit) {
|
||
|
char fps[256];
|
||
|
|
||
|
float milliseconds = 1;
|
||
|
|
||
|
// stop timer
|
||
|
if (useCpu) {
|
||
|
milliseconds = sdkGetTimerValue(&timer);
|
||
|
sdkResetTimer(&timer);
|
||
|
} else {
|
||
|
checkCudaErrors(cudaEventRecord(stopEvent, 0));
|
||
|
checkCudaErrors(cudaEventSynchronize(stopEvent));
|
||
|
checkCudaErrors(
|
||
|
cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
|
||
|
}
|
||
|
|
||
|
milliseconds /= (float)fpsCount;
|
||
|
computePerfStats(interactionsPerSecond, gflops, milliseconds, 1);
|
||
|
|
||
|
ifps = 1.f / (milliseconds / 1000.f);
|
||
|
sprintf(fps,
|
||
|
"CUDA N-Body (%d bodies): "
|
||
|
"%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s",
|
||
|
numBodies, ifps, interactionsPerSecond, gflops,
|
||
|
fp64 ? "double precision" : "single precision");
|
||
|
|
||
|
glutSetWindowTitle(fps);
|
||
|
fpsCount = 0;
|
||
|
fpsLimit = (ifps > 1.f) ? (int)ifps : 1;
|
||
|
|
||
|
if (bPause) {
|
||
|
fpsLimit = 0;
|
||
|
}
|
||
|
|
||
|
// restart timer
|
||
|
if (!useCpu) {
|
||
|
checkCudaErrors(cudaEventRecord(startEvent, 0));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
glutReportErrors();
|
||
|
}
|
||
|
|
||
|
// GLUT reshape callback: rebuilds the perspective projection for the new
// window size and resets the viewport.
//
//   w, h  new window size in pixels
//
// A window can be resized or minimised to zero height; the aspect ratio is
// then computed against a height of 1 so the projection never contains
// inf/NaN. The viewport still receives the size GLUT reported.
void reshape(int w, int h) {
  const int aspectHeight = (h > 0) ? h : 1;

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  gluPerspective(60.0, (float)w / (float)aspectHeight, 0.1, 1000.0);

  glMatrixMode(GL_MODELVIEW);
  glViewport(0, 0, w, h);
}
|
||
|
|
||
|
void updateParams() {
|
||
|
if (fp64) {
|
||
|
NBodyDemo<double>::updateParams();
|
||
|
} else {
|
||
|
NBodyDemo<float>::updateParams();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void mouse(int button, int state, int x, int y) {
|
||
|
if (bShowSliders) {
|
||
|
// call list mouse function
|
||
|
if (paramlist->Mouse(x, y, button, state)) {
|
||
|
updateParams();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int mods;
|
||
|
|
||
|
if (state == GLUT_DOWN) {
|
||
|
buttonState |= 1 << button;
|
||
|
} else if (state == GLUT_UP) {
|
||
|
buttonState = 0;
|
||
|
}
|
||
|
|
||
|
mods = glutGetModifiers();
|
||
|
|
||
|
if (mods & GLUT_ACTIVE_SHIFT) {
|
||
|
buttonState = 2;
|
||
|
} else if (mods & GLUT_ACTIVE_CTRL) {
|
||
|
buttonState = 3;
|
||
|
}
|
||
|
|
||
|
ox = x;
|
||
|
oy = y;
|
||
|
|
||
|
glutPostRedisplay();
|
||
|
}
|
||
|
|
||
|
void motion(int x, int y) {
|
||
|
if (bShowSliders) {
|
||
|
// call parameter list motion function
|
||
|
if (paramlist->Motion(x, y)) {
|
||
|
updateParams();
|
||
|
glutPostRedisplay();
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
float dx = (float)(x - ox);
|
||
|
float dy = (float)(y - oy);
|
||
|
|
||
|
if (buttonState == 3) {
|
||
|
// left+middle = zoom
|
||
|
camera_trans[2] += (dy / 100.0f) * 0.5f * fabs(camera_trans[2]);
|
||
|
} else if (buttonState & 2) {
|
||
|
// middle = translate
|
||
|
camera_trans[0] += dx / 100.0f;
|
||
|
camera_trans[1] -= dy / 100.0f;
|
||
|
} else if (buttonState & 1) {
|
||
|
// left = rotate
|
||
|
camera_rot[0] += dy / 5.0f;
|
||
|
camera_rot[1] += dx / 5.0f;
|
||
|
}
|
||
|
|
||
|
ox = x;
|
||
|
oy = y;
|
||
|
glutPostRedisplay();
|
||
|
}
|
||
|
|
||
|
// commented out to remove unused parameter warnings in Linux
|
||
|
void key(unsigned char key, int /*x*/, int /*y*/) {
|
||
|
switch (key) {
|
||
|
case ' ':
|
||
|
bPause = !bPause;
|
||
|
break;
|
||
|
|
||
|
case 27: // escape
|
||
|
case 'q':
|
||
|
case 'Q':
|
||
|
finalize();
|
||
|
exit(EXIT_SUCCESS);
|
||
|
break;
|
||
|
|
||
|
case 13: // return
|
||
|
if (bSupportDouble) {
|
||
|
if (fp64) {
|
||
|
switchDemoPrecision<float, double>();
|
||
|
} else {
|
||
|
switchDemoPrecision<double, float>();
|
||
|
}
|
||
|
|
||
|
printf("> %s precision floating point simulation\n",
|
||
|
fp64 ? "Double" : "Single");
|
||
|
}
|
||
|
|
||
|
break;
|
||
|
|
||
|
case '`':
|
||
|
bShowSliders = !bShowSliders;
|
||
|
break;
|
||
|
|
||
|
case 'g':
|
||
|
case 'G':
|
||
|
bDispInteractions = !bDispInteractions;
|
||
|
break;
|
||
|
|
||
|
case 'p':
|
||
|
case 'P':
|
||
|
displayMode = (ParticleRenderer::DisplayMode)(
|
||
|
(displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES);
|
||
|
break;
|
||
|
|
||
|
case 'c':
|
||
|
case 'C':
|
||
|
cycleDemo = !cycleDemo;
|
||
|
printf("Cycle Demo Parameters: %s\n", cycleDemo ? "ON" : "OFF");
|
||
|
break;
|
||
|
|
||
|
case '[':
|
||
|
activeDemo =
|
||
|
(activeDemo == 0) ? numDemos - 1 : (activeDemo - 1) % numDemos;
|
||
|
selectDemo(activeDemo);
|
||
|
break;
|
||
|
|
||
|
case ']':
|
||
|
activeDemo = (activeDemo + 1) % numDemos;
|
||
|
selectDemo(activeDemo);
|
||
|
break;
|
||
|
|
||
|
case 'd':
|
||
|
case 'D':
|
||
|
displayEnabled = !displayEnabled;
|
||
|
break;
|
||
|
|
||
|
case 'o':
|
||
|
case 'O':
|
||
|
activeParams.print();
|
||
|
break;
|
||
|
|
||
|
case '1':
|
||
|
if (fp64) {
|
||
|
NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL);
|
||
|
} else {
|
||
|
NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL);
|
||
|
}
|
||
|
|
||
|
break;
|
||
|
|
||
|
case '2':
|
||
|
if (fp64) {
|
||
|
NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_RANDOM);
|
||
|
} else {
|
||
|
NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_RANDOM);
|
||
|
}
|
||
|
|
||
|
break;
|
||
|
|
||
|
case '3':
|
||
|
if (fp64) {
|
||
|
NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_EXPAND);
|
||
|
} else {
|
||
|
NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_EXPAND);
|
||
|
}
|
||
|
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
glutPostRedisplay();
|
||
|
}
|
||
|
|
||
|
void special(int key, int x, int y) {
|
||
|
paramlist->Special(key, x, y);
|
||
|
glutPostRedisplay();
|
||
|
}
|
||
|
|
||
|
// GLUT idle callback: requests a redraw, which keeps display() (and with it
// the simulation) running continuously.
void idle(void) {
  glutPostRedisplay();
}
|
||
|
|
||
|
// Prints the list of supported command line options to stdout.
void showHelp() {
  // One entry per option; emitted verbatim, in this order.
  static const char *const kOptionLines[] = {
      "\t-fullscreen (run n-body simulation in fullscreen mode)\n",
      "\t-fp64 (use double precision floating point values for "
      "simulation)\n",
      "\t-hostmem (stores simulation data in host memory)\n",
      "\t-benchmark (run benchmark to measure performance) \n",
      "\t-numbodies=<N> (number of bodies (>= 1) to run in simulation) \n",
      "\t-device=<d> (where d=0,1,2.... for the CUDA device to use)\n",
      "\t-numdevices=<i> (where i=(number of CUDA devices > 0) to use for "
      "simulation)\n",
      "\t-compare (compares simulation results running once on the "
      "default GPU and once on the CPU)\n",
      "\t-cpu (run n-body simulation on the CPU)\n",
      "\t-tipsy=<file.bin> (load a tipsy model file for simulation)\n\n",
  };

  const int lineCount = sizeof(kOptionLines) / sizeof(kOptionLines[0]);

  for (int i = 0; i < lineCount; ++i) {
    printf("%s", kOptionLines[i]);
  }
}
|
||
|
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
// Program main
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
int main(int argc, char **argv) {
|
||
|
bool bTestResults = true;
|
||
|
|
||
|
#if defined(__linux__)
|
||
|
setenv("DISPLAY", ":0", 0);
|
||
|
#endif
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
|
||
|
printf("\n> Command line options\n");
|
||
|
showHelp();
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
printf(
|
||
|
"Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure "
|
||
|
"performance.\n");
|
||
|
showHelp();
|
||
|
|
||
|
printf(
|
||
|
"NOTE: The CUDA Samples are not meant for performance measurements. "
|
||
|
"Results may vary when GPU Boost is enabled.\n\n");
|
||
|
|
||
|
bFullscreen =
|
||
|
(checkCmdLineFlag(argc, (const char **)argv, "fullscreen") != 0);
|
||
|
|
||
|
if (bFullscreen) {
|
||
|
bShowSliders = false;
|
||
|
}
|
||
|
|
||
|
benchmark = (checkCmdLineFlag(argc, (const char **)argv, "benchmark") != 0);
|
||
|
|
||
|
compareToCPU =
|
||
|
((checkCmdLineFlag(argc, (const char **)argv, "compare") != 0) ||
|
||
|
(checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0));
|
||
|
|
||
|
QATest = (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0);
|
||
|
useHostMem = (checkCmdLineFlag(argc, (const char **)argv, "hostmem") != 0);
|
||
|
fp64 = (checkCmdLineFlag(argc, (const char **)argv, "fp64") != 0);
|
||
|
|
||
|
flopsPerInteraction = fp64 ? 30 : 20;
|
||
|
|
||
|
useCpu = (checkCmdLineFlag(argc, (const char **)argv, "cpu") != 0);
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) {
|
||
|
numDevsRequested =
|
||
|
getCmdLineArgumentInt(argc, (const char **)argv, "numdevices");
|
||
|
|
||
|
if (numDevsRequested < 1) {
|
||
|
printf(
|
||
|
"Error: \"number of CUDA devices\" specified %d is invalid. Value "
|
||
|
"should be >= 1\n",
|
||
|
numDevsRequested);
|
||
|
exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
|
||
|
} else {
|
||
|
printf("number of CUDA devices = %d\n", numDevsRequested);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int numDevsAvailable = 0;
|
||
|
bool customGPU = false;
|
||
|
cudaGetDeviceCount(&numDevsAvailable);
|
||
|
|
||
|
if (numDevsAvailable < numDevsRequested) {
|
||
|
printf("Error: only %d Devices available, %d requested. Exiting.\n",
|
||
|
numDevsAvailable, numDevsRequested);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
if (numDevsRequested > 1) {
|
||
|
// If user did not explicitly request host memory to be used, we default to
|
||
|
// P2P.
|
||
|
// We fallback to host memory, if any of GPUs does not support P2P.
|
||
|
bool allGPUsSupportP2P = true;
|
||
|
if (!useHostMem) {
|
||
|
// Enable P2P only in one direction, as every peer will access gpu0
|
||
|
for (int i = 1; i < numDevsRequested; ++i) {
|
||
|
int canAccessPeer;
|
||
|
checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeer, i, 0));
|
||
|
|
||
|
if (canAccessPeer != 1) {
|
||
|
allGPUsSupportP2P = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!allGPUsSupportP2P) {
|
||
|
useHostMem = true;
|
||
|
useP2P = false;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed");
|
||
|
printf("> Simulation data stored in %s memory\n",
|
||
|
useHostMem ? "system" : "video");
|
||
|
printf("> %s precision floating point simulation\n",
|
||
|
fp64 ? "Double" : "Single");
|
||
|
printf("> %d Devices used for simulation\n", numDevsRequested);
|
||
|
|
||
|
int devID;
|
||
|
cudaDeviceProp props;
|
||
|
|
||
|
if (useCpu) {
|
||
|
useHostMem = true;
|
||
|
compareToCPU = false;
|
||
|
bSupportDouble = true;
|
||
|
|
||
|
#ifdef OPENMP
|
||
|
printf("> Simulation with CPU using OpenMP\n");
|
||
|
#else
|
||
|
printf("> Simulation with CPU\n");
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
// Initialize GL and GLUT if necessary
|
||
|
if (!benchmark && !compareToCPU) {
|
||
|
initGL(&argc, argv);
|
||
|
initParameters();
|
||
|
}
|
||
|
|
||
|
if (!useCpu) {
|
||
|
// Now choose the CUDA Device
|
||
|
// Either without GL interop:
|
||
|
if (benchmark || compareToCPU || useHostMem) {
|
||
|
// Note if we are using host memory for the body system, we
|
||
|
// don't use CUDA-GL interop.
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
|
||
|
customGPU = true;
|
||
|
}
|
||
|
|
||
|
devID = findCudaDevice(argc, (const char **)argv);
|
||
|
} else // or with GL interop:
|
||
|
{
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
|
||
|
customGPU = true;
|
||
|
}
|
||
|
|
||
|
devID = findCudaDevice(argc, (const char **)argv);
|
||
|
}
|
||
|
|
||
|
checkCudaErrors(cudaGetDevice(&devID));
|
||
|
checkCudaErrors(cudaGetDeviceProperties(&props, devID));
|
||
|
|
||
|
bSupportDouble = true;
|
||
|
|
||
|
#if CUDART_VERSION < 4000
|
||
|
|
||
|
if (numDevsRequested > 1) {
|
||
|
printf("MultiGPU n-body requires CUDA 4.0 or later\n");
|
||
|
exit(EXIT_SUCCESS);
|
||
|
}
|
||
|
|
||
|
#endif
|
||
|
|
||
|
// Initialize devices
|
||
|
if (numDevsRequested > 1 && customGPU) {
|
||
|
printf("You can't use --numdevices and --device at the same time.\n");
|
||
|
exit(EXIT_SUCCESS);
|
||
|
}
|
||
|
|
||
|
if (customGPU || numDevsRequested == 1) {
|
||
|
cudaDeviceProp props;
|
||
|
checkCudaErrors(cudaGetDeviceProperties(&props, devID));
|
||
|
printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor,
|
||
|
props.name);
|
||
|
} else {
|
||
|
for (int i = 0; i < numDevsRequested; i++) {
|
||
|
cudaDeviceProp props;
|
||
|
checkCudaErrors(cudaGetDeviceProperties(&props, i));
|
||
|
|
||
|
printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor,
|
||
|
props.name);
|
||
|
|
||
|
if (useHostMem) {
|
||
|
#if CUDART_VERSION >= 2020
|
||
|
|
||
|
if (!props.canMapHostMemory) {
|
||
|
fprintf(stderr, "Device %d cannot map host memory!\n", devID);
|
||
|
exit(EXIT_SUCCESS);
|
||
|
}
|
||
|
|
||
|
if (numDevsRequested > 1) {
|
||
|
checkCudaErrors(cudaSetDevice(i));
|
||
|
}
|
||
|
|
||
|
checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
|
||
|
#else
|
||
|
fprintf(stderr,
|
||
|
"This CUDART version does not support "
|
||
|
"<cudaDeviceProp.canMapHostMemory> field\n");
|
||
|
exit(EXIT_SUCCESS);
|
||
|
#endif
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// CC 1.2 and earlier do not support double precision
|
||
|
if (props.major * 10 + props.minor <= 12) {
|
||
|
bSupportDouble = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// if(numDevsRequested > 1)
|
||
|
// checkCudaErrors(cudaSetDevice(devID));
|
||
|
|
||
|
if (fp64 && !bSupportDouble) {
|
||
|
fprintf(stderr,
|
||
|
"One or more of the requested devices does not support double "
|
||
|
"precision floating-point\n");
|
||
|
exit(EXIT_SUCCESS);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
numIterations = 0;
|
||
|
blockSize = 0;
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "i")) {
|
||
|
numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i");
|
||
|
}
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "blockSize")) {
|
||
|
blockSize = getCmdLineArgumentInt(argc, (const char **)argv, "blockSize");
|
||
|
}
|
||
|
|
||
|
if (blockSize == 0) // blockSize not set on command line
|
||
|
blockSize = 256;
|
||
|
|
||
|
// default number of bodies is #SMs * 4 * CTA size
|
||
|
if (useCpu)
|
||
|
#ifdef OPENMP
|
||
|
numBodies = 8192;
|
||
|
|
||
|
#else
|
||
|
numBodies = 4096;
|
||
|
#endif
|
||
|
else if (numDevsRequested == 1) {
|
||
|
numBodies = compareToCPU ? 4096 : blockSize * 4 * props.multiProcessorCount;
|
||
|
} else {
|
||
|
numBodies = 0;
|
||
|
|
||
|
for (int i = 0; i < numDevsRequested; i++) {
|
||
|
cudaDeviceProp props;
|
||
|
checkCudaErrors(cudaGetDeviceProperties(&props, i));
|
||
|
numBodies +=
|
||
|
blockSize * (props.major >= 2 ? 4 : 1) * props.multiProcessorCount;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "numbodies")) {
|
||
|
numBodies = getCmdLineArgumentInt(argc, (const char **)argv, "numbodies");
|
||
|
|
||
|
if (numBodies < 1) {
|
||
|
printf(
|
||
|
"Error: \"number of bodies\" specified %d is invalid. Value should "
|
||
|
"be >= 1\n",
|
||
|
numBodies);
|
||
|
exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
|
||
|
} else if (numBodies % blockSize) {
|
||
|
int newNumBodies = ((numBodies / blockSize) + 1) * blockSize;
|
||
|
printf(
|
||
|
"Warning: \"number of bodies\" specified %d is not a multiple of "
|
||
|
"%d.\n",
|
||
|
numBodies, blockSize);
|
||
|
printf("Rounding up to the nearest multiple: %d.\n", newNumBodies);
|
||
|
numBodies = newNumBodies;
|
||
|
} else {
|
||
|
printf("number of bodies = %d\n", numBodies);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
char *fname;
|
||
|
|
||
|
if (getCmdLineArgumentString(argc, (const char **)argv, "tipsy", &fname)) {
|
||
|
tipsyFile.assign(fname, strlen(fname));
|
||
|
cycleDemo = false;
|
||
|
bShowSliders = false;
|
||
|
}
|
||
|
|
||
|
if (numBodies <= 1024) {
|
||
|
activeParams.m_clusterScale = 1.52f;
|
||
|
activeParams.m_velocityScale = 2.f;
|
||
|
} else if (numBodies <= 2048) {
|
||
|
activeParams.m_clusterScale = 1.56f;
|
||
|
activeParams.m_velocityScale = 2.64f;
|
||
|
} else if (numBodies <= 4096) {
|
||
|
activeParams.m_clusterScale = 1.68f;
|
||
|
activeParams.m_velocityScale = 2.98f;
|
||
|
} else if (numBodies <= 8192) {
|
||
|
activeParams.m_clusterScale = 1.98f;
|
||
|
activeParams.m_velocityScale = 2.9f;
|
||
|
} else if (numBodies <= 16384) {
|
||
|
activeParams.m_clusterScale = 1.54f;
|
||
|
activeParams.m_velocityScale = 8.f;
|
||
|
} else if (numBodies <= 32768) {
|
||
|
activeParams.m_clusterScale = 1.44f;
|
||
|
activeParams.m_velocityScale = 11.f;
|
||
|
}
|
||
|
|
||
|
// Create the demo -- either double (fp64) or float (fp32, default)
|
||
|
// implementation
|
||
|
NBodyDemo<float>::Create();
|
||
|
|
||
|
NBodyDemo<float>::init(numBodies, numDevsRequested, blockSize,
|
||
|
!(benchmark || compareToCPU || useHostMem), useHostMem,
|
||
|
useP2P, useCpu, devID);
|
||
|
NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL);
|
||
|
|
||
|
if (bSupportDouble) {
|
||
|
NBodyDemo<double>::Create();
|
||
|
NBodyDemo<double>::init(numBodies, numDevsRequested, blockSize,
|
||
|
!(benchmark || compareToCPU || useHostMem),
|
||
|
useHostMem, useP2P, useCpu, devID);
|
||
|
NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL);
|
||
|
}
|
||
|
|
||
|
if (fp64) {
|
||
|
if (benchmark) {
|
||
|
if (numIterations <= 0) {
|
||
|
numIterations = 10;
|
||
|
} else if (numIterations > 10) {
|
||
|
printf("Advisory: setting a high number of iterations\n");
|
||
|
printf("in benchmark mode may cause failure on Windows\n");
|
||
|
printf("Vista and Win7. On these OSes, set iterations <= 10\n");
|
||
|
}
|
||
|
|
||
|
NBodyDemo<double>::runBenchmark(numIterations);
|
||
|
} else if (compareToCPU) {
|
||
|
bTestResults = NBodyDemo<double>::compareResults(numBodies);
|
||
|
} else {
|
||
|
glutDisplayFunc(display);
|
||
|
glutReshapeFunc(reshape);
|
||
|
glutMouseFunc(mouse);
|
||
|
glutMotionFunc(motion);
|
||
|
glutKeyboardFunc(key);
|
||
|
glutSpecialFunc(special);
|
||
|
glutIdleFunc(idle);
|
||
|
|
||
|
if (!useCpu) {
|
||
|
checkCudaErrors(cudaEventRecord(startEvent, 0));
|
||
|
}
|
||
|
|
||
|
glutMainLoop();
|
||
|
}
|
||
|
|
||
|
} else {
|
||
|
if (benchmark) {
|
||
|
if (numIterations <= 0) {
|
||
|
numIterations = 10;
|
||
|
}
|
||
|
|
||
|
NBodyDemo<float>::runBenchmark(numIterations);
|
||
|
} else if (compareToCPU) {
|
||
|
bTestResults = NBodyDemo<float>::compareResults(numBodies);
|
||
|
} else {
|
||
|
glutDisplayFunc(display);
|
||
|
glutReshapeFunc(reshape);
|
||
|
glutMouseFunc(mouse);
|
||
|
glutMotionFunc(motion);
|
||
|
glutKeyboardFunc(key);
|
||
|
glutSpecialFunc(special);
|
||
|
glutIdleFunc(idle);
|
||
|
|
||
|
if (!useCpu) {
|
||
|
checkCudaErrors(cudaEventRecord(startEvent, 0));
|
||
|
}
|
||
|
|
||
|
glutMainLoop();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
finalize();
|
||
|
exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
|
||
|
}
|