cuda-samples/Samples/nbody/bodysystemcuda_impl.h

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <helper_cuda.h>

#include <assert.h>
#include <math.h>
#include <memory.h>
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <algorithm>

#include <cuda_gl_interop.h>

// Forward declaration of the integration step; the definition is not part of
// this file. BodySystemCUDA<T>::update() calls it once per simulation step
// with the per-device bookkeeping array, the two graphics resources (consulted
// when bUsePBO is set -- presumably; confirm against the definition), the
// index of the position buffer to read from, the time step, the damping
// factor, the total body count, the device count and the block size.
template <typename T>
void integrateNbodySystem(DeviceData<T> *deviceData,
                          cudaGraphicsResource **pgres,
                          unsigned int currentRead, float deltaTime,
                          float damping, unsigned int numBodies,
                          unsigned int numDevices, int blockSize, bool bUsePBO);

// Overloads (single and double precision) that receive the squared softening
// value. Defined elsewhere; BodySystemCUDA<T>::setSoftening() calls the
// overload matching T once per device and checks the returned cudaError_t.
cudaError_t setSofteningSquared(float softeningSq);
cudaError_t setSofteningSquared(double softeningSq);

// Constructs a body system and immediately allocates its storage.
//
// The constructor records the configuration flags, clears every owned
// pointer, then runs _initialize(numBodies) and installs the default
// softening (0.00125) and damping (0.995) values.
//
//   numBodies  - number of bodies to simulate
//   numDevices - number of devices the bodies are split across
//   blockSize  - block size forwarded to the integration step
//   usePBO     - keep positions in OpenGL buffer objects
//   useSysMem  - keep all arrays in mapped host memory
//   useP2P     - let devices 1..numDevices-1 share device 0's allocations
//   deviceId   - device selected for the allocations in the non-sysmem path
template <typename T>
BodySystemCUDA<T>::BodySystemCUDA(unsigned int numBodies,
                                  unsigned int numDevices,
                                  unsigned int blockSize, bool usePBO,
                                  bool useSysMem, bool useP2P, int deviceId)
    : m_numBodies(numBodies),
      m_numDevices(numDevices),
      m_bInitialized(false),
      m_bUsePBO(usePBO),
      m_bUseSysMem(useSysMem),
      m_bUseP2P(useP2P),
      m_currentRead(0),
      m_currentWrite(1),
      m_blockSize(blockSize),
      m_devID(deviceId) {
  // Nothing is owned yet: start with every pointer cleared.
  m_hPos[1] = NULL;
  m_hPos[0] = NULL;
  m_hVel = NULL;
  m_deviceData = NULL;

  // Allocate storage, then apply the default simulation parameters.
  this->_initialize(numBodies);
  this->setSoftening(0.00125f);
  this->setDamping(0.995f);
}

// Releases everything acquired by _initialize() and resets the body count.
// _finalize() asserts that the system is currently initialized.
template <typename T>
BodySystemCUDA<T>::~BodySystemCUDA() {
  this->_finalize();
  this->m_numBodies = 0;
}

// Allocates all host/device storage for numBodies bodies and partitions the
// bodies across the m_numDevices devices.
//
// Steps:
//   1. Weight every device by its SM count (x4 when the compute capability
//      major version is >= 2) and hand each device a contiguous slice
//      [offset, offset + numBodies) of the body range, rounded up to a
//      multiple of 256 and clamped to what is still unassigned. The last
//      device additionally receives whatever is left over.
//   2. m_bUseSysMem: allocate mapped, portable pinned host buffers and give
//      every device its own device-pointer alias of them plus an event.
//   3. otherwise: allocate host staging arrays (m_hPos[0], m_hVel), select
//      device m_devID, and allocate positions either as two OpenGL buffers
//      registered with CUDA (m_bUsePBO) or via cudaMalloc; velocities always
//      come from cudaMalloc. With m_bUseP2P the remaining devices enable peer
//      access to device 0 and alias device 0's pointers. Without m_bUseP2P
//      only m_deviceData[0] receives pointers and an event in this path.
//
// Must only be called while the system is not initialized (asserted).
// Every array holds 4 elements of type T per body.
template <typename T>
void BodySystemCUDA<T>::_initialize(int numBodies) {
  assert(!m_bInitialized);

  m_numBodies = numBodies;

  unsigned int memSize = sizeof(T) * 4 * numBodies;

  m_deviceData = new DeviceData<T>[m_numDevices];

  // divide up the workload amongst Devices
  std::vector<float> weights(m_numDevices);
  float total = 0;

  for (unsigned int i = 0; i < m_numDevices; i++) {
    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, i));

    // Choose the weight based on the Compute Capability
    // We estimate that a CC2.0 SM is about 4.0x faster than a CC 1.x SM for
    // this application (since a 15-SM GF100 is about 2X faster than a 30-SM
    // GT200).
    const int numSms = props.multiProcessorCount;
    weights[i] = numSms * (props.major >= 2 ? 4.f : 1.f);
    total += weights[i];
  }

  unsigned int offset = 0;
  unsigned int remaining = m_numBodies;

  for (unsigned int i = 0; i < m_numDevices; i++) {
    unsigned int count = (int)((weights[i] / total) * m_numBodies);
    // Rounding up to numSms*256 leads to better GPU utilization _per_ GPU
    // but when using multiple devices, it will lead to the last GPUs not having
    // any work at all
    // which means worse overall performance
    // unsigned int round = numSms * 256;
    unsigned int round = 256;

    count = round * ((count + round - 1) / round);
    if (count > remaining) {
      count = remaining;
    }

    remaining -= count;
    m_deviceData[i].offset = offset;
    m_deviceData[i].numBodies = count;
    offset += count;

    // The last device picks up every body that is still unassigned. The
    // comparison is against m_numBodies itself so that a single leftover body
    // (e.g. 513 bodies on two equally weighted devices) is not dropped.
    if ((i == m_numDevices - 1) && (offset < m_numBodies)) {
      m_deviceData[i].numBodies += m_numBodies - offset;
    }
  }

  if (m_bUseSysMem) {
    // All three arrays live in mapped pinned host memory that is visible to
    // every device (portable).
    checkCudaErrors(cudaHostAlloc((void **)&m_hPos[0], memSize,
                                  cudaHostAllocMapped | cudaHostAllocPortable));
    checkCudaErrors(cudaHostAlloc((void **)&m_hPos[1], memSize,
                                  cudaHostAllocMapped | cudaHostAllocPortable));
    checkCudaErrors(cudaHostAlloc((void **)&m_hVel, memSize,
                                  cudaHostAllocMapped | cudaHostAllocPortable));

    memset(m_hPos[0], 0, memSize);
    memset(m_hPos[1], 0, memSize);
    memset(m_hVel, 0, memSize);

    for (unsigned int i = 0; i < m_numDevices; i++) {
      if (m_numDevices > 1) {
        checkCudaErrors(cudaSetDevice(i));
      }

      checkCudaErrors(cudaEventCreate(&m_deviceData[i].event));
      // Fetch this device's view of the shared host buffers.
      checkCudaErrors(cudaHostGetDevicePointer(
          (void **)&m_deviceData[i].dPos[0], (void *)m_hPos[0], 0));
      checkCudaErrors(cudaHostGetDevicePointer(
          (void **)&m_deviceData[i].dPos[1], (void *)m_hPos[1], 0));
      checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dVel,
                                               (void *)m_hVel, 0));
    }
  } else {
    // Host staging copies; only m_hPos[0] is allocated in this path.
    m_hPos[0] = new T[m_numBodies * 4];
    m_hVel = new T[m_numBodies * 4];

    memset(m_hPos[0], 0, memSize);
    memset(m_hVel, 0, memSize);

    checkCudaErrors(cudaSetDevice(m_devID));
    checkCudaErrors(cudaEventCreate(&m_deviceData[0].event));

    if (m_bUsePBO) {
      // create the position pixel buffer objects for rendering
      // we will actually compute directly from this memory in CUDA too
      glGenBuffers(2, (GLuint *)m_pbo);

      for (int i = 0; i < 2; ++i) {
        glBindBuffer(GL_ARRAY_BUFFER, m_pbo[i]);
        glBufferData(GL_ARRAY_BUFFER, memSize, m_hPos[0], GL_DYNAMIC_DRAW);

        // Read the size back to detect a failed allocation.
        int size = 0;
        glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);

        if ((unsigned)size != memSize) {
          fprintf(stderr, "WARNING: Pixel Buffer Object allocation failed!\n");
        }

        glBindBuffer(GL_ARRAY_BUFFER, 0);
        checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_pGRes[i], m_pbo[i],
                                                     cudaGraphicsMapFlagsNone));
      }
    } else {
      checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[0], memSize));
      checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[1], memSize));
    }

    checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dVel, memSize));

    // At this point we already know P2P is supported
    if (m_bUseP2P) {
      for (unsigned int i = 1; i < m_numDevices; i++) {
        cudaError_t error;

        // Enable access for gpu_i to memory owned by gpu0
        checkCudaErrors(cudaSetDevice(i));
        if ((error = cudaDeviceEnablePeerAccess(0, 0)) !=
            cudaErrorPeerAccessAlreadyEnabled) {
          checkCudaErrors(error);
        } else {
          // We might have already enabled P2P, so catch this and reset error
          // code...
          cudaGetLastError();
        }

        checkCudaErrors(cudaEventCreate(&m_deviceData[i].event));

        // Point all GPUs to the memory allocated on gpu0
        m_deviceData[i].dPos[0] = m_deviceData[0].dPos[0];
        m_deviceData[i].dPos[1] = m_deviceData[0].dPos[1];
        m_deviceData[i].dVel = m_deviceData[0].dVel;
      }
    }
  }

  m_bInitialized = true;
}

// Releases everything acquired by _initialize() and marks the system as
// uninitialized. Must only be called while initialized (asserted).
//
// Mirrors the three allocation modes of _initialize():
//   - m_bUseSysMem: frees the pinned host buffers and destroys one event per
//     device.
//   - otherwise: frees the host staging arrays and the velocity buffer, then
//     either unregisters/deletes the OpenGL buffers (m_bUsePBO) or frees the
//     cudaMalloc'ed position buffers. The events are destroyed in both
//     sub-cases because _initialize() creates them regardless of m_bUsePBO.
//
// All owned pointers are cleared afterwards so that no dangling pointer
// survives until the next _initialize().
template <typename T>
void BodySystemCUDA<T>::_finalize() {
  assert(m_bInitialized);

  if (m_bUseSysMem) {
    checkCudaErrors(cudaFreeHost(m_hPos[0]));
    checkCudaErrors(cudaFreeHost(m_hPos[1]));
    checkCudaErrors(cudaFreeHost(m_hVel));

    for (unsigned int i = 0; i < m_numDevices; i++) {
      // Not wrapped in checkCudaErrors, so a failure here does not abort the
      // teardown.
      cudaEventDestroy(m_deviceData[i].event);
    }
  } else {
    // m_hPos[1] is never allocated in this mode; delete[] on a null pointer
    // is a no-op.
    delete[] m_hPos[0];
    delete[] m_hPos[1];
    delete[] m_hVel;

    checkCudaErrors(cudaFree((void *)m_deviceData[0].dVel));

    if (m_bUsePBO) {
      checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[0]));
      checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[1]));
      glDeleteBuffers(2, (const GLuint *)m_pbo);
    } else {
      checkCudaErrors(cudaFree((void *)m_deviceData[0].dPos[0]));
      checkCudaErrors(cudaFree((void *)m_deviceData[0].dPos[1]));
    }

    // Device 0's event exists in both the PBO and the non-PBO case.
    checkCudaErrors(cudaEventDestroy(m_deviceData[0].event));

    // Events for the remaining devices are only created when P2P is in use.
    if (m_bUseP2P) {
      for (unsigned int i = 1; i < m_numDevices; i++) {
        checkCudaErrors(cudaEventDestroy(m_deviceData[i].event));
      }
    }
  }

  m_hPos[0] = m_hPos[1] = 0;
  m_hVel = 0;

  delete[] m_deviceData;
  m_deviceData = 0;

  m_bInitialized = false;
}

// Replaces the current system with the bodies stored in a Tipsy file.
//
// Any existing allocation is released, the file is parsed with
// read_tipsy_file(), storage is re-allocated for the reported body count and
// the parsed positions and velocities are uploaded through setArray().
//
// The upload is skipped when the file yields no bodies (taking the address of
// element 0 of an empty vector is undefined) or when either vector holds fewer
// entries than the reported body count (setArray() would read past its end).
// In both cases the system stays initialized with zeroed host buffers.
//
//   filename - path handed to read_tipsy_file()
template <typename T>
void BodySystemCUDA<T>::loadTipsyFile(const std::string &filename) {
  if (m_bInitialized) _finalize();

  std::vector<typename vec4<T>::Type> positions;
  std::vector<typename vec4<T>::Type> velocities;
  std::vector<int> ids;

  int nBodies = 0;
  int nFirst = 0, nSecond = 0, nThird = 0;

  read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst,
                  nSecond, nThird);

  // A negative count would turn into a huge unsigned allocation size.
  if (nBodies < 0) {
    nBodies = 0;
  }

  _initialize(nBodies);

  const size_t required = (size_t)nBodies;

  if (required == 0) {
    return;  // nothing to upload
  }

  if (positions.size() < required || velocities.size() < required) {
    fprintf(stderr,
            "WARNING: Tipsy file '%s' holds fewer bodies than it reports; "
            "initial state not uploaded!\n",
            filename.c_str());
    return;
  }

  // vec4<T>::Type is reinterpreted as a flat array of T, as in the original
  // code (presumably four T per entry -- confirm against vec4's definition).
  setArray(BODYSYSTEM_POSITION, (T *)&positions[0]);
  setArray(BODYSYSTEM_VELOCITY, (T *)&velocities[0]);
}

// Pushes the squared softening value to every device.
//
// The square is computed once on the host; setSofteningSquared() is then
// invoked for each device. The active device is only switched when more than
// one device is in use.
//
//   softening - softening value (not squared)
template <typename T>
void BodySystemCUDA<T>::setSoftening(T softening) {
  const T squared = softening * softening;
  const bool multiDevice = (m_numDevices > 1);

  for (unsigned int dev = 0; dev < m_numDevices; ++dev) {
    if (multiDevice) {
      checkCudaErrors(cudaSetDevice(dev));
    }

    checkCudaErrors(setSofteningSquared(squared));
  }
}

// Stores the damping factor that update() forwards to the integration step.
//
//   damping - new damping factor
template <typename T>
void BodySystemCUDA<T>::setDamping(T damping) {
  this->m_damping = damping;
}

// Advances the simulation by one step of length deltaTime.
//
// The time step and the damping factor are narrowed to float before being
// handed to integrateNbodySystem(). Afterwards the read and write buffer
// indices trade places. Requires an initialized system (asserted).
//
//   deltaTime - length of the step
template <typename T>
void BodySystemCUDA<T>::update(T deltaTime) {
  assert(m_bInitialized);

  const float step = (float)deltaTime;
  const float damp = (float)m_damping;

  integrateNbodySystem<T>(m_deviceData, m_pGRes, m_currentRead, step, damp,
                          m_numBodies, m_numDevices, m_blockSize, m_bUsePBO);

  // What was just written becomes the next step's input.
  std::swap(m_currentRead, m_currentWrite);
}

// Returns a host pointer to the requested array.
//
// BODYSYSTEM_VELOCITY selects the velocities; every other value selects the
// positions of the current read buffer. With m_bUseSysMem the host buffer is
// returned as is. Otherwise the data is first copied from the device into the
// host staging buffer; when positions live in an OpenGL buffer (m_bUsePBO),
// that buffer is mapped read-only for the duration of the copy.
//
// Requires an initialized system (asserted).
//
//   array - which array to fetch
//   returns host pointer holding m_numBodies * 4 elements of T
template <typename T>
T *BodySystemCUDA<T>::getArray(BodyArray array) {
  assert(m_bInitialized);

  T *hostPtr = 0;
  T *devPtr = 0;
  cudaGraphicsResource *resource = NULL;

  if (array == BODYSYSTEM_VELOCITY) {
    hostPtr = m_hVel;
    devPtr = m_deviceData[0].dVel;
  } else {
    // Positions: the host side only follows the read index in sysmem mode.
    const int hostIndex = m_bUseSysMem ? m_currentRead : 0;
    hostPtr = m_hPos[hostIndex];
    devPtr = m_deviceData[0].dPos[m_currentRead];

    if (m_bUsePBO) {
      resource = m_pGRes[m_currentRead];
    }
  }

  // Mapped host memory needs no copy.
  if (m_bUseSysMem) {
    return hostPtr;
  }

  const bool viaGraphicsResource = (resource != NULL);

  if (viaGraphicsResource) {
    // Map the OpenGL buffer and use its device address as the copy source.
    size_t mappedBytes;
    checkCudaErrors(cudaGraphicsResourceSetMapFlags(
        resource, cudaGraphicsMapFlagsReadOnly));
    checkCudaErrors(cudaGraphicsMapResources(1, &resource, 0));
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
        (void **)&devPtr, &mappedBytes, resource));
  }

  checkCudaErrors(cudaMemcpy(hostPtr, devPtr, m_numBodies * 4 * sizeof(T),
                             cudaMemcpyDeviceToHost));

  if (viaGraphicsResource) {
    checkCudaErrors(cudaGraphicsUnmapResources(1, &resource, 0));
  }

  return hostPtr;
}

// Uploads a full array of initial data and rewinds the double buffering.
//
// The read/write indices are reset to 0/1, then m_numBodies * 4 elements of T
// are copied from data into the selected array. BODYSYSTEM_VELOCITY selects
// the velocities; every other value selects the positions of read buffer 0.
// Destination by mode:
//   - positions with m_bUsePBO: the OpenGL buffer m_pbo[0] (glBufferSubData)
//   - m_bUseSysMem:             the mapped host buffer (memcpy)
//   - otherwise:                device 0's buffer (cudaMemcpy host->device)
//
// Requires an initialized system (asserted).
//
//   array - which array to overwrite
//   data  - host pointer to at least m_numBodies * 4 elements of T
template <typename T>
void BodySystemCUDA<T>::setArray(BodyArray array, const T *data) {
  assert(m_bInitialized);

  m_currentRead = 0;
  m_currentWrite = 1;

  // Size of one full array; computed in size_t so the product cannot wrap in
  // 32-bit arithmetic.
  const size_t numBytes = sizeof(T) * 4 * m_numBodies;

  switch (array) {
    default:
    case BODYSYSTEM_POSITION: {
      if (m_bUsePBO) {
        glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]);
        glBufferSubData(GL_ARRAY_BUFFER, 0, numBytes, data);

        // Read the buffer size back to check it matches what was uploaded.
        int size = 0;
        glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);

        if ((unsigned)size != numBytes) {
          fprintf(stderr, "WARNING: Pixel Buffer Object download failed!\n");
        }

        glBindBuffer(GL_ARRAY_BUFFER, 0);
      } else {
        if (m_bUseSysMem) {
          memcpy(m_hPos[m_currentRead], data, numBytes);
        } else {
          checkCudaErrors(cudaMemcpy(m_deviceData[0].dPos[m_currentRead], data,
                                     numBytes, cudaMemcpyHostToDevice));
        }
      }
    } break;

    case BODYSYSTEM_VELOCITY:
      if (m_bUseSysMem) {
        memcpy(m_hVel, data, numBytes);
      } else {
        checkCudaErrors(cudaMemcpy(m_deviceData[0].dVel, data, numBytes,
                                   cudaMemcpyHostToDevice));
      }

      break;
  }
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`#include <helper_cuda.h>`

			`#include <assert.h>`
			`#include <math.h>`
			`#include <memory.h>`
			`#include <cstdio>`
			`#include <cstdlib>`
			`#include <vector>`
			`#include <algorithm>`

			`#include <cuda_gl_interop.h>`

			`template <typename T>`
			`void integrateNbodySystem(DeviceData<T> *deviceData,`
			`cudaGraphicsResource **pgres,`
			`unsigned int currentRead, float deltaTime,`
			`float damping, unsigned int numBodies,`
			`unsigned int numDevices, int blockSize, bool bUsePBO);`

			`cudaError_t setSofteningSquared(float softeningSq);`
			`cudaError_t setSofteningSquared(double softeningSq);`

			`template <typename T>`
			`BodySystemCUDA<T>::BodySystemCUDA(unsigned int numBodies,`
			`unsigned int numDevices,`
			`unsigned int blockSize, bool usePBO,`
			`bool useSysMem, bool useP2P, int deviceId)`
			`: m_numBodies(numBodies),`
			`m_numDevices(numDevices),`
			`m_bInitialized(false),`
			`m_bUsePBO(usePBO),`
			`m_bUseSysMem(useSysMem),`
			`m_bUseP2P(useP2P),`
			`m_currentRead(0),`
			`m_currentWrite(1),`
			`m_blockSize(blockSize),`
			`m_devID(deviceId) {`
			`m_hPos[0] = m_hPos[1] = 0;`
			`m_hVel = 0;`

			`m_deviceData = 0;`

			`_initialize(numBodies);`
			`setSoftening(0.00125f);`
			`setDamping(0.995f);`
			`}`

			`template <typename T>`
			`BodySystemCUDA<T>::~BodySystemCUDA() {`
			`_finalize();`
			`m_numBodies = 0;`
			`}`

			`template <typename T>`
			`void BodySystemCUDA<T>::_initialize(int numBodies) {`
			`assert(!m_bInitialized);`

			`m_numBodies = numBodies;`

			`unsigned int memSize = sizeof(T) * 4 * numBodies;`

			`m_deviceData = new DeviceData<T>[m_numDevices];`

			`// divide up the workload amongst Devices`
			`float *weights = new float[m_numDevices];`
			`int *numSms = new int[m_numDevices];`
			`float total = 0;`

			`for (unsigned int i = 0; i < m_numDevices; i++) {`
			`cudaDeviceProp props;`
			`checkCudaErrors(cudaGetDeviceProperties(&props, i));`

			`// Choose the weight based on the Compute Capability`
			`// We estimate that a CC2.0 SM is about 4.0x faster than a CC 1.x SM for`
			`// this application (since a 15-SM GF100 is about 2X faster than a 30-SM`
			`// GT200).`
			`numSms[i] = props.multiProcessorCount;`
			`weights[i] = numSms[i] * (props.major >= 2 ? 4.f : 1.f);`
			`total += weights[i];`
			`}`

			`unsigned int offset = 0;`
			`unsigned int remaining = m_numBodies;`

			`for (unsigned int i = 0; i < m_numDevices; i++) {`
			`unsigned int count = (int)((weights[i] / total) * m_numBodies);`
			`// Rounding up to numSms[i]*256 leads to better GPU utilization _per_ GPU`
			`// but when using multiple devices, it will lead to the last GPUs not having`
			`// any work at all`
			`// which means worse overall performance`
			`// unsigned int round = numSms[i] * 256;`
			`unsigned int round = 256;`

			`count = round * ((count + round - 1) / round);`
			`if (count > remaining) {`
			`count = remaining;`
			`}`

			`remaining -= count;`
			`m_deviceData[i].offset = offset;`
			`m_deviceData[i].numBodies = count;`
			`offset += count;`

			`if ((i == m_numDevices - 1) && (offset < m_numBodies - 1)) {`
			`m_deviceData[i].numBodies += m_numBodies - offset;`
			`}`
			`}`

			`delete[] weights;`
			`delete[] numSms;`

			`if (m_bUseSysMem) {`
			`checkCudaErrors(cudaHostAlloc((void **)&m_hPos[0], memSize,`
			`cudaHostAllocMapped \| cudaHostAllocPortable));`
			`checkCudaErrors(cudaHostAlloc((void **)&m_hPos[1], memSize,`
			`cudaHostAllocMapped \| cudaHostAllocPortable));`
			`checkCudaErrors(cudaHostAlloc((void **)&m_hVel, memSize,`
			`cudaHostAllocMapped \| cudaHostAllocPortable));`

			`memset(m_hPos[0], 0, memSize);`
			`memset(m_hPos[1], 0, memSize);`
			`memset(m_hVel, 0, memSize);`

			`for (unsigned int i = 0; i < m_numDevices; i++) {`
			`if (m_numDevices > 1) {`
			`checkCudaErrors(cudaSetDevice(i));`
			`}`

			`checkCudaErrors(cudaEventCreate(&m_deviceData[i].event));`
			`checkCudaErrors(cudaHostGetDevicePointer(`
			`(void *)&m_deviceData[i].dPos[0], (void )m_hPos[0], 0));`
			`checkCudaErrors(cudaHostGetDevicePointer(`
			`(void *)&m_deviceData[i].dPos[1], (void )m_hPos[1], 0));`
			`checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dVel,`
			`(void *)m_hVel, 0));`
			`}`
			`} else {`
			`m_hPos[0] = new T[m_numBodies * 4];`
			`m_hVel = new T[m_numBodies * 4];`

			`memset(m_hPos[0], 0, memSize);`
			`memset(m_hVel, 0, memSize);`

			`checkCudaErrors(cudaSetDevice(m_devID));`
			`checkCudaErrors(cudaEventCreate(&m_deviceData[0].event));`

			`if (m_bUsePBO) {`
			`// create the position pixel buffer objects for rendering`
			`// we will actually compute directly from this memory in CUDA too`
			`glGenBuffers(2, (GLuint *)m_pbo);`

			`for (int i = 0; i < 2; ++i) {`
			`glBindBuffer(GL_ARRAY_BUFFER, m_pbo[i]);`
			`glBufferData(GL_ARRAY_BUFFER, memSize, m_hPos[0], GL_DYNAMIC_DRAW);`

			`int size = 0;`
			`glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);`

			`if ((unsigned)size != memSize) {`
			`fprintf(stderr, "WARNING: Pixel Buffer Object allocation failed!n");`
			`}`

			`glBindBuffer(GL_ARRAY_BUFFER, 0);`
			`checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_pGRes[i], m_pbo[i],`
			`cudaGraphicsMapFlagsNone));`
			`}`
			`} else {`
			`checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[0], memSize));`
			`checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[1], memSize));`
			`}`

			`checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dVel, memSize));`

			`// At this point we already know P2P is supported`
			`if (m_bUseP2P) {`
			`for (unsigned int i = 1; i < m_numDevices; i++) {`
			`int access = 0;`
			`cudaError_t error;`

			`// Enable access for gpu_i to memory owned by gpu0`
			`checkCudaErrors(cudaSetDevice(i));`
			`if ((error = cudaDeviceEnablePeerAccess(0, 0)) !=`
			`cudaErrorPeerAccessAlreadyEnabled) {`
			`checkCudaErrors(error);`
			`} else {`
			`// We might have already enabled P2P, so catch this and reset error`
			`// code...`
			`cudaGetLastError();`
			`}`

			`checkCudaErrors(cudaEventCreate(&m_deviceData[i].event));`

			`// Point all GPUs to the memory allocated on gpu0`
			`m_deviceData[i].dPos[0] = m_deviceData[0].dPos[0];`
			`m_deviceData[i].dPos[1] = m_deviceData[0].dPos[1];`
			`m_deviceData[i].dVel = m_deviceData[0].dVel;`
			`}`
			`}`
			`}`

			`m_bInitialized = true;`
			`}`

			`template <typename T>`
			`void BodySystemCUDA<T>::_finalize() {`
			`assert(m_bInitialized);`

			`if (m_bUseSysMem) {`
			`checkCudaErrors(cudaFreeHost(m_hPos[0]));`
			`checkCudaErrors(cudaFreeHost(m_hPos[1]));`
			`checkCudaErrors(cudaFreeHost(m_hVel));`

			`for (unsigned int i = 0; i < m_numDevices; i++) {`
			`cudaEventDestroy(m_deviceData[i].event);`
			`}`
			`} else {`
			`delete[] m_hPos[0];`
			`delete[] m_hPos[1];`
			`delete[] m_hVel;`

			`checkCudaErrors(cudaFree((void **)m_deviceData[0].dVel));`

			`if (m_bUsePBO) {`
			`checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[0]));`
			`checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[1]));`
			`glDeleteBuffers(2, (const GLuint *)m_pbo);`
			`} else {`
			`checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[0]));`
			`checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[1]));`

			`checkCudaErrors(cudaEventDestroy(m_deviceData[0].event));`

			`if (m_bUseP2P) {`
			`for (unsigned int i = 1; i < m_numDevices; i++) {`
			`checkCudaErrors(cudaEventDestroy(m_deviceData[i].event));`
			`}`
			`}`
			`}`
			`}`

			`delete[] m_deviceData;`

			`m_bInitialized = false;`
			`}`

			`template <typename T>`
			`void BodySystemCUDA<T>::loadTipsyFile(const std::string &filename) {`
			`if (m_bInitialized) _finalize();`

			`std::vector<typename vec4<T>::Type> positions;`
			`std::vector<typename vec4<T>::Type> velocities;`
			`std::vector<int> ids;`

			`int nBodies = 0;`
			`int nFirst = 0, nSecond = 0, nThird = 0;`

			`read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst,`
			`nSecond, nThird);`

			`_initialize(nBodies);`

			`setArray(BODYSYSTEM_POSITION, (T *)&positions[0]);`
			`setArray(BODYSYSTEM_VELOCITY, (T *)&velocities[0]);`
			`}`

			`template <typename T>`
			`void BodySystemCUDA<T>::setSoftening(T softening) {`
			`T softeningSq = softening * softening;`

			`for (unsigned int i = 0; i < m_numDevices; i++) {`
			`if (m_numDevices > 1) {`
			`checkCudaErrors(cudaSetDevice(i));`
			`}`

			`checkCudaErrors(setSofteningSquared(softeningSq));`
			`}`
			`}`

			`template <typename T>`
			`void BodySystemCUDA<T>::setDamping(T damping) {`
			`m_damping = damping;`
			`}`

			`template <typename T>`
			`void BodySystemCUDA<T>::update(T deltaTime) {`
			`assert(m_bInitialized);`

			`integrateNbodySystem<T>(m_deviceData, m_pGRes, m_currentRead,`
			`(float)deltaTime, (float)m_damping, m_numBodies,`
			`m_numDevices, m_blockSize, m_bUsePBO);`

			`std::swap(m_currentRead, m_currentWrite);`
			`}`

			`template <typename T>`
			`T *BodySystemCUDA<T>::getArray(BodyArray array) {`
			`assert(m_bInitialized);`

			`T *hdata = 0;`
			`T *ddata = 0;`

			`cudaGraphicsResource *pgres = NULL;`

			`int currentReadHost = m_bUseSysMem ? m_currentRead : 0;`

			`switch (array) {`
			`default:`
			`case BODYSYSTEM_POSITION:`
			`hdata = m_hPos[currentReadHost];`
			`ddata = m_deviceData[0].dPos[m_currentRead];`

			`if (m_bUsePBO) {`
			`pgres = m_pGRes[m_currentRead];`
			`}`

			`break;`

			`case BODYSYSTEM_VELOCITY:`
			`hdata = m_hVel;`
			`ddata = m_deviceData[0].dVel;`
			`break;`
			`}`

			`if (!m_bUseSysMem) {`
			`if (pgres) {`
			`checkCudaErrors(`
			`cudaGraphicsResourceSetMapFlags(pgres, cudaGraphicsMapFlagsReadOnly));`
			`checkCudaErrors(cudaGraphicsMapResources(1, &pgres, 0));`
			`size_t bytes;`
			`checkCudaErrors(`
			`cudaGraphicsResourceGetMappedPointer((void **)&ddata, &bytes, pgres));`
			`}`

			`checkCudaErrors(cudaMemcpy(hdata, ddata, m_numBodies * 4 * sizeof(T),`
			`cudaMemcpyDeviceToHost));`

			`if (pgres) {`
			`checkCudaErrors(cudaGraphicsUnmapResources(1, &pgres, 0));`
			`}`
			`}`

			`return hdata;`
			`}`

			`template <typename T>`
			`void BodySystemCUDA<T>::setArray(BodyArray array, const T *data) {`
			`assert(m_bInitialized);`

			`m_currentRead = 0;`
			`m_currentWrite = 1;`

			`switch (array) {`
			`default:`
			`case BODYSYSTEM_POSITION: {`
			`if (m_bUsePBO) {`
			`glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]);`
			`glBufferSubData(GL_ARRAY_BUFFER, 0, 4 * sizeof(T) * m_numBodies, data);`

			`int size = 0;`
			`glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);`

			`if ((unsigned)size != 4 * (sizeof(T) * m_numBodies)) {`
			`fprintf(stderr, "WARNING: Pixel Buffer Object download failed!n");`
			`}`

			`glBindBuffer(GL_ARRAY_BUFFER, 0);`
			`} else {`
			`if (m_bUseSysMem) {`
			`memcpy(m_hPos[m_currentRead], data, m_numBodies * 4 * sizeof(T));`
			`} else`
			`checkCudaErrors(cudaMemcpy(m_deviceData[0].dPos[m_currentRead], data,`
			`m_numBodies * 4 * sizeof(T),`
			`cudaMemcpyHostToDevice));`
			`}`
			`} break;`

			`case BODYSYSTEM_VELOCITY:`
			`if (m_bUseSysMem) {`
			`memcpy(m_hVel, data, m_numBodies * 4 * sizeof(T));`
			`} else`
			`checkCudaErrors(cudaMemcpy(m_deviceData[0].dVel, data,`
			`m_numBodies * 4 * sizeof(T),`
			`cudaMemcpyHostToDevice));`

			`break;`
			`}`
			`}`