cuda-samples/Samples/bindlessTexture/bindlessTexture_kernel.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
  This sample has two kernels, one doing the rendering every frame, and
  another one used to generate the mip map levels at startup.

  For rendering we use a "virtual" texturing approach, where one 2d texture
  stores pointers to the actual textures used. This can be achieved by the
  new cudaTextureObject introduced in CUDA 5.0 and requiring sm3+ hardware.

  The mipmap generation kernel uses cudaSurfaceObject and cudaTextureObject
  passed as kernel arguments to compute the higher mip map level based on
  the lower.
*/

#ifndef _BINDLESSTEXTURE_KERNEL_CU_
#define _BINDLESSTEXTURE_KERNEL_CU_

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <vector>

#include <helper_cuda.h>
#include <helper_math.h>

#include "bindlessTexture.h"

// set this to just see the mipmap chain of first image
//#define SHOW_MIPMAPS

// local references to resources

// Atlas image: its texels hold encoded texture object handles (written by
// randomizeAtlas) and its textureObject is what d_render samples first.
Image atlasImage;
// One entry per content image handed to initAtlasAndImages; each entry gets a
// mipmapped array and a texture object created for it there.
std::vector<Image> contentImages;
// Largest (mip level count - 1) seen over all content images, starting at
// 1.0f; renderAtlasImage uses it to bound the animated lod.
float highestLod = 1.0f;

// Fallback two-argument maximum, defined only when no previously included
// header already provides MAX. Both arguments are fully parenthesized so that
// expressions containing low-precedence operators (|, &, ?:, ...) expand
// correctly. Each argument may be evaluated twice, so do not pass expressions
// with side effects.
#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif

//////////////////////////////////////////////////////////////////////////

// Splits a texture object handle into two 32-bit words so it can be stored in
// a single uint2 texel: .x receives the low 32 bits, .y the next 32 bits.
__host__ __device__ __inline__ uint2 encodeTextureObject(
    cudaTextureObject_t obj) {
  const uint lowWord = (uint)(obj & 0xFFFFFFFF);
  const uint highWord = (uint)(obj >> 32);
  return make_uint2(lowWord, highWord);
}

// Inverse of encodeTextureObject: rebuilds the texture object handle from a
// uint2 whose .x carries the low 32 bits and .y the high 32 bits.
__host__ __device__ __inline__ cudaTextureObject_t decodeTextureObject(
    uint2 obj) {
  const cudaTextureObject_t lowWord = (cudaTextureObject_t)obj.x;
  const cudaTextureObject_t highWord = (cudaTextureObject_t)obj.y;
  return lowWord | (highWord << 32);
}

// Converts each 8-bit channel of a uchar4 to float, component by component.
// Values are only widened, not rescaled.
__device__ __inline__ float4 to_float4(uchar4 vec) {
  const float r = vec.x;
  const float g = vec.y;
  const float b = vec.z;
  const float a = vec.w;
  return make_float4(r, g, b, a);
}

// Converts each float channel to an 8-bit value with a plain cast (no
// rounding or clamping is applied here; callers are expected to pass values
// that already fit).
__device__ __inline__ uchar4 to_uchar4(float4 vec) {
  const uchar r = (uchar)vec.x;
  const uchar g = (uchar)vec.y;
  const uchar b = (uchar)vec.z;
  const uchar a = (uchar)vec.w;
  return make_uchar4(r, g, b, a);
}

//////////////////////////////////////////////////////////////////////////
// Rendering

// the atlas texture stores the 64 bit cudaTextureObjects
// we use it for "virtual" texturing

// Render kernel: one thread shades one output pixel through the atlas
// indirection.
//
//   d_output     - output image, written at index y * imageW + x
//   imageW/H     - output size in pixels; threads outside it do nothing
//   lod          - mip level passed straight to tex2DLod
//   atlasTexture - texture whose uint2 texels encode content texture handles
//
// Expects a 2D launch whose grid covers at least imageW x imageH threads.
__global__ void d_render(uchar4 *d_output, uint imageW, uint imageH, float lod,
                         cudaTextureObject_t atlasTexture) {
  const uint col = blockIdx.x * blockDim.x + threadIdx.x;
  const uint row = blockIdx.y * blockDim.y + threadIdx.y;

  // the grid may overhang the image; surplus threads have nothing to do
  if ((col >= imageW) || (row >= imageH)) {
    return;
  }

  const float u = col / (float)imageW;
  const float v = row / (float)imageH;

  // look up the atlas tile for this pixel and rebuild the texture object
  // handle that is stored in it
  const uint2 encodedHandle = tex2D<uint2>(atlasTexture, u, v);
  const cudaTextureObject_t contentTex = decodeTextureObject(encodedHandle);

  // Sample the content texture. The template argument selects the returned
  // type, and tex2DLod takes the mip level explicitly. Other fetch functions
  // such as tex2DGrad accept derivatives instead and let the hardware pick
  // the level (automatic mipmap/anisotropic filtering). Note that v is
  // flipped for the content lookup.
  //
  // In this sample the decoded handle is always valid; a sparse texturing
  // scheme would additionally have to handle a zero handle here.
  const float4 texel = tex2DLod<float4>(contentTex, u, 1 - v, lod);

  // scale the fetched color to 0..255 and store it
  const uint pixelIndex = row * imageW + col;
  d_output[pixelIndex] = to_uchar4(texel * 255.0f);
}

// Host entry point for drawing one frame: derives the mip level to display
// from the running `lod` value and launches d_render with the given launch
// configuration. Any launch error is reported through checkCudaErrors.
extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize,
                                 uchar4 *d_output, uint imageW, uint imageH,
                                 float lod) {
#ifdef SHOW_MIPMAPS
  // always show level 0 when inspecting the mipmap chain
  lod = 0.0f;
#else
  // pseudo-animate the lod: wrap the input into [0, 2 * highestLod) and fold
  // it into a triangle wave that rises to highestLod and falls back again
  const float period = highestLod * 2;
  const float wrapped = fmodf(lod, period);
  lod = highestLod - fabs(wrapped - highestLod);
#endif

  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, lod,
                                    atlasImage.textureObject);

  checkCudaErrors(cudaGetLastError());
}

//////////////////////////////////////////////////////////////////////////
// MipMap Generation

//  A key benefit of using the new surface objects is that we don't need any
//  global binding points anymore. We can directly pass them as function
//  arguments.

// Mipmap kernel: one thread produces one texel of the destination level by
// averaging four fetches from the source level.
//
//   mipOutput - surface object of the destination level (uchar4 texels)
//   mipInput  - texture object of the source level, sampled with normalized
//               coordinates
//   imageW/H  - size of the destination level in texels; threads outside it
//               do nothing
//
// Expects a 2D launch whose grid covers at least imageW x imageH threads.
// All arithmetic uses single-precision literals so no double-precision
// operations are generated in device code.
__global__ void d_mipmap(cudaSurfaceObject_t mipOutput,
                         cudaTextureObject_t mipInput, uint imageW,
                         uint imageH) {
  uint x = blockIdx.x * blockDim.x + threadIdx.x;
  uint y = blockIdx.y * blockDim.y + threadIdx.y;

  // the grid may overhang the destination level
  if ((x >= imageW) || (y >= imageH)) {
    return;
  }

  // size of one destination texel in normalized texture coordinates
  float px = 1.0f / float(imageW);
  float py = 1.0f / float(imageH);

  // take the average of 4 samples, fetched at the four corners of this
  // destination texel

  // we are using the normalized access to make sure non-power-of-two textures
  // behave well when downsized.
  float4 color = (tex2D<float4>(mipInput, (x + 0) * px, (y + 0) * py)) +
                 (tex2D<float4>(mipInput, (x + 1) * px, (y + 0) * py)) +
                 (tex2D<float4>(mipInput, (x + 1) * px, (y + 1) * py)) +
                 (tex2D<float4>(mipInput, (x + 0) * px, (y + 1) * py));

  color /= 4.0f;
  // rescale to the 0..255 range and cap at 255 before narrowing to 8 bits
  color *= 255.0f;
  color = fminf(color, make_float4(255.0f));

  // surf2Dwrite takes the x coordinate in bytes, hence the sizeof scaling
  surf2Dwrite(to_uchar4(color), mipOutput, x * sizeof(uchar4), y);
}

// Fills the higher mip levels of a mipmapped array by repeatedly downsampling
// the previous level with the d_mipmap kernel.
//
//   mipmapArray - mipmapped array whose level 0 is used as the first source
//   size        - extent of level 0; only width and height are read
//
// Each pass halves width and height (never going below 1) and stops once both
// are 1. For every pass a temporary texture object (read side) and surface
// object (write side) are created and destroyed again after the kernel ran.
// CUDA failures are reported through checkCudaErrors.
void generateMipMaps(cudaMipmappedArray_t mipmapArray, cudaExtent size) {
  size_t width = size.width;
  size_t height = size.height;

#ifdef SHOW_MIPMAPS
  // level 0 is the blit target used for visualizing the chain (see below)
  cudaArray_t levelFirst;
  checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFirst, mipmapArray, 0));
#endif

  // index of the source level of the current pass; the pass writes level + 1
  uint level = 0;

  while (width != 1 || height != 1) {
    // dimensions of the level produced by this pass
    width /= 2;
    width = MAX((size_t)1, width);
    height /= 2;
    height = MAX((size_t)1, height);

    cudaArray_t levelFrom;
    checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFrom, mipmapArray, level));
    cudaArray_t levelTo;
    checkCudaErrors(
        cudaGetMipmappedArrayLevel(&levelTo, mipmapArray, level + 1));

    // verify that the destination level has the size computed above and a
    // depth of 0. checkHost is declared outside this file; presumably it
    // reports/aborts when the condition is false -- confirm in the header.
    cudaExtent levelToSize;
    checkCudaErrors(cudaArrayGetInfo(NULL, &levelToSize, NULL, levelTo));
    checkHost(levelToSize.width == width);
    checkHost(levelToSize.height == height);
    checkHost(levelToSize.depth == 0);

    // generate texture object for reading
    cudaTextureObject_t texInput;
    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType = cudaResourceTypeArray;
    texRes.res.array.array = levelFrom;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    // normalized coordinates with linear filtering: d_mipmap addresses the
    // source level in [0, 1] units
    texDescr.normalizedCoords = 1;
    texDescr.filterMode = cudaFilterModeLinear;

    texDescr.addressMode[0] = cudaAddressModeClamp;
    texDescr.addressMode[1] = cudaAddressModeClamp;
    texDescr.addressMode[2] = cudaAddressModeClamp;

    // fetches return normalized floats instead of the stored element type
    texDescr.readMode = cudaReadModeNormalizedFloat;

    checkCudaErrors(
        cudaCreateTextureObject(&texInput, &texRes, &texDescr, NULL));

    // generate surface object for writing

    cudaSurfaceObject_t surfOutput;
    cudaResourceDesc surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType = cudaResourceTypeArray;
    surfRes.res.array.array = levelTo;

    checkCudaErrors(cudaCreateSurfaceObject(&surfOutput, &surfRes));

    // run mipmap kernel
    // 16x16 threads per block; the grid is rounded up so that it covers the
    // whole destination level
    dim3 blockSize(16, 16, 1);
    dim3 gridSize(((uint)width + blockSize.x - 1) / blockSize.x,
                  ((uint)height + blockSize.y - 1) / blockSize.y, 1);

    d_mipmap<<<gridSize, blockSize>>>(surfOutput, texInput, (uint)width,
                                      (uint)height);

    // wait for the kernel to finish before the temporary objects are
    // destroyed, and pick up any launch or execution error
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaDestroySurfaceObject(surfOutput));

    checkCudaErrors(cudaDestroyTextureObject(texInput));

#ifdef SHOW_MIPMAPS
    // we blit the current mipmap back into first level
    cudaMemcpy3DParms copyParams = {0};
    copyParams.dstArray = levelFirst;
    copyParams.srcArray = levelTo;
    copyParams.extent = make_cudaExtent(width, height, 1);
    copyParams.kind = cudaMemcpyDeviceToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));
#endif

    // the level just written becomes the source of the next pass
    level++;
  }
}

// Returns how many mip levels are needed for the given extent: the number of
// times the largest of width/height/depth can be halved (integer division)
// before it reaches zero. An all-zero extent yields 0.
uint getMipMapLevels(cudaExtent size) {
  // largest of the three dimensions
  size_t largest = size.width;
  if (size.height > largest) {
    largest = size.height;
  }
  if (size.depth > largest) {
    largest = size.depth;
  }

  uint count = 0;
  for (size_t remaining = largest; remaining != 0; remaining /= 2) {
    count++;
  }

  return count;
}

//////////////////////////////////////////////////////////////////////////
// Initialization

// Assigns a randomly chosen content texture handle to every tile of the atlas
// (or always the first image when SHOW_MIPMAPS is defined) and uploads the
// host-side atlas buffer to the atlas array.
//
// Requires that the atlas host buffer exists and at least one content image
// is loaded whenever the atlas has tiles to fill; otherwise the function
// reports the problem and terminates instead of dereferencing a null buffer
// or computing rand() % 0. rand() is used as-is; seeding, if any, is up to
// the caller.
extern "C" void randomizeAtlas() {
  uint2 *h_data = (uint2 *)atlasImage.h_data;

  // number of atlas tiles, computed once instead of on every loop iteration
  const size_t numTiles = atlasImage.size.width * atlasImage.size.height;

  if (numTiles > 0 && (h_data == NULL || contentImages.empty())) {
    fprintf(stderr,
            "randomizeAtlas: atlas host buffer or content images missing\n");
    exit(EXIT_FAILURE);
  }

  // assign random texture object handles to our atlas image tiles
  for (size_t i = 0; i < numTiles; i++) {
#ifdef SHOW_MIPMAPS
    h_data[i] = encodeTextureObject(contentImages[0].textureObject);
#else
    h_data[i] = encodeTextureObject(
        contentImages[rand() % contentImages.size()].textureObject);
#endif
  }

  // copy data to atlas array
  // (the host buffer is tightly packed: one row is width * sizeof(uint2))
  cudaMemcpy3DParms copyParams = {0};
  copyParams.srcPtr = make_cudaPitchedPtr(
      atlasImage.h_data, atlasImage.size.width * sizeof(uint2),
      atlasImage.size.width, atlasImage.size.height);
  copyParams.dstArray = atlasImage.dataArray;
  copyParams.extent = atlasImage.size;
  copyParams.extent.depth = 1;
  copyParams.kind = cudaMemcpyHostToDevice;
  checkCudaErrors(cudaMemcpy3D(&copyParams));
}

// Releases every host and device resource held by the content images and by
// the atlas image.
//
// Each handle is reset right after it has been released, the content image
// list is emptied and highestLod returns to its initial value, so calling
// this function again (or calling initAtlasAndImages afterwards) does not
// free or destroy anything twice.
extern "C" void deinitAtlasAndImages() {
  for (size_t i = 0; i < contentImages.size(); i++) {
    Image &image = contentImages[i];

    if (image.h_data) {
      free(image.h_data);
      image.h_data = NULL;
    }

    // the texture object is destroyed before the array it reads from
    if (image.textureObject) {
      checkCudaErrors(cudaDestroyTextureObject(image.textureObject));
      image.textureObject = 0;
    }

    if (image.mipmapArray) {
      checkCudaErrors(cudaFreeMipmappedArray(image.mipmapArray));
      image.mipmapArray = NULL;
    }
  }

  // drop the now-empty entries so no stale handles remain reachable
  contentImages.clear();
  highestLod = 1.0f;

  if (atlasImage.h_data) {
    free(atlasImage.h_data);
    atlasImage.h_data = NULL;
  }

  if (atlasImage.textureObject) {
    checkCudaErrors(cudaDestroyTextureObject(atlasImage.textureObject));
    atlasImage.textureObject = 0;
  }

  if (atlasImage.dataArray) {
    checkCudaErrors(cudaFreeArray(atlasImage.dataArray));
    atlasImage.dataArray = NULL;
  }
}

// Creates all GPU resources used for rendering.
//
//   images    - array of numImages source images; for each one only `size`
//               and `h_data` are read, and h_data is uploaded as tightly
//               packed uchar4 rows (row pitch = width * sizeof(uchar4))
//   numImages - number of entries in `images`
//   atlasSize - width/height of the atlas, in tiles
//
// For every source image a mipmapped array is allocated, level 0 is uploaded,
// the remaining levels are generated on the GPU and a bindless texture object
// is created. Afterwards the atlas array, its host staging buffer and its
// texture object are created and the atlas is filled via randomizeAtlas().
// CUDA failures are reported through checkCudaErrors; a failed host
// allocation is reported on stderr and terminates the program.
extern "C" void initAtlasAndImages(const Image *images, size_t numImages,
                                   cudaExtent atlasSize) {
  // create individual textures
  contentImages.resize(numImages);

  for (size_t i = 0; i < numImages; i++) {
    Image &image = contentImages[i];
    image.size = images[i].size;
    image.size.depth = 0;
    image.type = cudaResourceTypeMipmappedArray;

    // how many mipmaps we need
    uint levels = getMipMapLevels(image.size);
    // remember the deepest chain; renderAtlasImage animates up to this level
    highestLod = MAX(highestLod, (float)levels - 1);

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
    checkCudaErrors(cudaMallocMipmappedArray(&image.mipmapArray, &desc,
                                             image.size, levels));

    // upload level 0
    cudaArray_t level0;
    checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, image.mipmapArray, 0));

    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr(images[i].h_data, image.size.width * sizeof(uchar4),
                            image.size.width, image.size.height);
    copyParams.dstArray = level0;
    copyParams.extent = image.size;
    // image.size.depth was set to 0 above; the copy itself uses a depth of 1
    copyParams.extent.depth = 1;
    copyParams.kind = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));

    // compute rest of mipmaps based on level 0
    generateMipMaps(image.mipmapArray, image.size);

    // generate bindless texture object

    cudaResourceDesc resDescr;
    memset(&resDescr, 0, sizeof(cudaResourceDesc));

    resDescr.resType = cudaResourceTypeMipmappedArray;
    resDescr.res.mipmap.mipmap = image.mipmapArray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    // linear filtering both within a level and between levels
    texDescr.normalizedCoords = 1;
    texDescr.filterMode = cudaFilterModeLinear;
    texDescr.mipmapFilterMode = cudaFilterModeLinear;

    texDescr.addressMode[0] = cudaAddressModeClamp;
    texDescr.addressMode[1] = cudaAddressModeClamp;
    texDescr.addressMode[2] = cudaAddressModeClamp;

    texDescr.maxMipmapLevelClamp = float(levels - 1);

    texDescr.readMode = cudaReadModeNormalizedFloat;

    checkCudaErrors(cudaCreateTextureObject(&image.textureObject, &resDescr,
                                            &texDescr, NULL));
  }

  // create atlas array
  // each texel is a uint2 holding one encoded texture object handle
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uint2>();
  checkCudaErrors(cudaMallocArray(&atlasImage.dataArray, &channelDesc,
                                  atlasSize.width, atlasSize.height));

  // host staging buffer that randomizeAtlas fills and uploads
  const size_t atlasBytes = atlasSize.width * atlasSize.height * sizeof(uint2);
  atlasImage.h_data = malloc(atlasBytes);
  if (atlasImage.h_data == NULL && atlasBytes != 0) {
    fprintf(stderr,
            "initAtlasAndImages: failed to allocate host atlas buffer\n");
    exit(EXIT_FAILURE);
  }
  atlasImage.type = cudaResourceTypeArray;
  atlasImage.size = atlasSize;

  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = atlasImage.dataArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  // point sampling and element-type reads: the handles must come back
  // bit-exact, never filtered or converted
  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeClamp;
  texDescr.addressMode[1] = cudaAddressModeClamp;
  texDescr.addressMode[2] = cudaAddressModeClamp;
  texDescr.readMode = cudaReadModeElementType;

  checkCudaErrors(cudaCreateTextureObject(&atlasImage.textureObject, &texRes,
                                          &texDescr, NULL));

  randomizeAtlas();
}

#endif  // #ifndef _BINDLESSTEXTURE_KERNEL_CU_
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`This sample has two kernels, one doing the rendering every frame, and`
			`another one used to generate the mip map levels at startup.`

			`For rendering we use a "virtual" texturing approach, where one 2d texture`
			`stores pointers to the actual textures used. This can be achieved by the`
			`new cudaTextureObject introduced in CUDA 5.0 and requiring sm3+ hardware.`

			`The mipmap generation kernel uses cudaSurfaceObject and cudaTextureObject`
			`passed as kernel arguments to compute the higher mip map level based on`
			`the lower.`
			`*/`

			`#ifndef _BINDLESSTEXTURE_KERNEL_CU_`
			`#define _BINDLESSTEXTURE_KERNEL_CU_`

			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <string.h>`
			`#include <math.h>`

			`#include <vector>`

			`#include <helper_cuda.h>`
			`#include <helper_math.h>`

			`#include "bindlessTexture.h"`

			`// set this to just see the mipmap chain of first image`
			`//#define SHOW_MIPMAPS`

			`// local references to resources`

			`Image atlasImage;`
			`std::vector<Image> contentImages;`
			`float highestLod = 1.0f;`

			`#ifndef MAX`
			`#define MAX(a, b) ((a > b) ? a : b)`
			`#endif`

			`//////////////////////////////////////////////////////////////////////////`

			`__host__ __device__ __inline__ uint2 encodeTextureObject(`
			`cudaTextureObject_t obj) {`
			`return make_uint2((uint)(obj & 0xFFFFFFFF), (uint)(obj >> 32));`
			`}`

			`__host__ __device__ __inline__ cudaTextureObject_t decodeTextureObject(`
			`uint2 obj) {`
			`return (((cudaTextureObject_t)obj.x) \| ((cudaTextureObject_t)obj.y) << 32);`
			`}`

			`__device__ __inline__ float4 to_float4(uchar4 vec) {`
			`return make_float4(vec.x, vec.y, vec.z, vec.w);`
			`}`

			`__device__ __inline__ uchar4 to_uchar4(float4 vec) {`
			`return make_uchar4((uchar)vec.x, (uchar)vec.y, (uchar)vec.z, (uchar)vec.w);`
			`}`

			`//////////////////////////////////////////////////////////////////////////`
			`// Rendering`

			`// the atlas texture stores the 64 bit cudaTextureObjects`
			`// we use it for "virtual" texturing`

			`__global__ void d_render(uchar4 *d_output, uint imageW, uint imageH, float lod,`
			`cudaTextureObject_t atlasTexture) {`
			`uint x = blockIdx.x * blockDim.x + threadIdx.x;`
			`uint y = blockIdx.y * blockDim.y + threadIdx.y;`

			`float u = x / (float)imageW;`
			`float v = y / (float)imageH;`

			`if ((x < imageW) && (y < imageH)) {`
			`// read from 2D atlas texture and decode texture object`
			`uint2 texCoded = tex2D<uint2>(atlasTexture, u, v);`
			`cudaTextureObject_t tex = decodeTextureObject(texCoded);`

			`// read from cuda texture object, use template to specify what data will be`
			`// returned. tex2DLod allows us to pass the lod (mip map level) directly.`
			`// There is other functions with CUDA 5, e.g. tex2DGrad, that allow you`
			`// to pass derivatives to perform automatic mipmap/anisotropic filtering.`
			`float4 color = tex2DLod<float4>(tex, u, 1 - v, lod);`
			`// In our sample tex is always valid, but for something like your own`
			`// sparse texturing you would need to make sure to handle the zero case.`

			`// write output color`
			`uint i = y * imageW + x;`
			`d_output[i] = to_uchar4(color * 255.0);`
			`}`
			`}`

			`extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize,`
			`uchar4 *d_output, uint imageW, uint imageH,`
			`float lod) {`
			`// psuedo animate lod`
			`lod = fmodf(lod, highestLod * 2);`
			`lod = highestLod - fabs(lod - highestLod);`

			`#ifdef SHOW_MIPMAPS`
			`lod = 0.0f;`
			`#endif`

			`d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, lod,`
			`atlasImage.textureObject);`

			`checkCudaErrors(cudaGetLastError());`
			`}`

			`//////////////////////////////////////////////////////////////////////////`
			`// MipMap Generation`

			`// A key benefit of using the new surface objects is that we don't need any`
			`// global binding points anymore. We can directly pass them as function`
			`// arguments.`

			`__global__ void d_mipmap(cudaSurfaceObject_t mipOutput,`
			`cudaTextureObject_t mipInput, uint imageW,`
			`uint imageH) {`
			`uint x = blockIdx.x * blockDim.x + threadIdx.x;`
			`uint y = blockIdx.y * blockDim.y + threadIdx.y;`

			`float px = 1.0 / float(imageW);`
			`float py = 1.0 / float(imageH);`

			`if ((x < imageW) && (y < imageH)) {`
			`// take the average of 4 samples`

			`// we are using the normalized access to make sure non-power-of-two textures`
			`// behave well when downsized.`
			`float4 color = (tex2D<float4>(mipInput, (x + 0) * px, (y + 0) * py)) +`
			`(tex2D<float4>(mipInput, (x + 1) * px, (y + 0) * py)) +`
			`(tex2D<float4>(mipInput, (x + 1) * px, (y + 1) * py)) +`
			`(tex2D<float4>(mipInput, (x + 0) * px, (y + 1) * py));`

			`color /= 4.0;`
			`color *= 255.0;`
			`color = fminf(color, make_float4(255.0));`

			`surf2Dwrite(to_uchar4(color), mipOutput, x * sizeof(uchar4), y);`
			`}`
			`}`

			`void generateMipMaps(cudaMipmappedArray_t mipmapArray, cudaExtent size) {`
			`size_t width = size.width;`
			`size_t height = size.height;`

			`#ifdef SHOW_MIPMAPS`
			`cudaArray_t levelFirst;`
			`checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFirst, mipmapArray, 0));`
			`#endif`

			`uint level = 0;`

			`while (width != 1 \|\| height != 1) {`
			`width /= 2;`
			`width = MAX((size_t)1, width);`
			`height /= 2;`
			`height = MAX((size_t)1, height);`

			`cudaArray_t levelFrom;`
			`checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFrom, mipmapArray, level));`
			`cudaArray_t levelTo;`
			`checkCudaErrors(`
			`cudaGetMipmappedArrayLevel(&levelTo, mipmapArray, level + 1));`

			`cudaExtent levelToSize;`
			`checkCudaErrors(cudaArrayGetInfo(NULL, &levelToSize, NULL, levelTo));`
			`checkHost(levelToSize.width == width);`
			`checkHost(levelToSize.height == height);`
			`checkHost(levelToSize.depth == 0);`

			`// generate texture object for reading`
			`cudaTextureObject_t texInput;`
			`cudaResourceDesc texRes;`
			`memset(&texRes, 0, sizeof(cudaResourceDesc));`

			`texRes.resType = cudaResourceTypeArray;`
			`texRes.res.array.array = levelFrom;`

			`cudaTextureDesc texDescr;`
			`memset(&texDescr, 0, sizeof(cudaTextureDesc));`

			`texDescr.normalizedCoords = 1;`
			`texDescr.filterMode = cudaFilterModeLinear;`

			`texDescr.addressMode[0] = cudaAddressModeClamp;`
			`texDescr.addressMode[1] = cudaAddressModeClamp;`
			`texDescr.addressMode[2] = cudaAddressModeClamp;`

			`texDescr.readMode = cudaReadModeNormalizedFloat;`

			`checkCudaErrors(`
			`cudaCreateTextureObject(&texInput, &texRes, &texDescr, NULL));`

			`// generate surface object for writing`

			`cudaSurfaceObject_t surfOutput;`
			`cudaResourceDesc surfRes;`
			`memset(&surfRes, 0, sizeof(cudaResourceDesc));`
			`surfRes.resType = cudaResourceTypeArray;`
			`surfRes.res.array.array = levelTo;`

			`checkCudaErrors(cudaCreateSurfaceObject(&surfOutput, &surfRes));`

			`// run mipmap kernel`
			`dim3 blockSize(16, 16, 1);`
			`dim3 gridSize(((uint)width + blockSize.x - 1) / blockSize.x,`
			`((uint)height + blockSize.y - 1) / blockSize.y, 1);`

			`d_mipmap<<<gridSize, blockSize>>>(surfOutput, texInput, (uint)width,`
			`(uint)height);`

			`checkCudaErrors(cudaDeviceSynchronize());`
			`checkCudaErrors(cudaGetLastError());`

			`checkCudaErrors(cudaDestroySurfaceObject(surfOutput));`

			`checkCudaErrors(cudaDestroyTextureObject(texInput));`

			`#ifdef SHOW_MIPMAPS`
			`// we blit the current mipmap back into first level`
			`cudaMemcpy3DParms copyParams = {0};`
			`copyParams.dstArray = levelFirst;`
			`copyParams.srcArray = levelTo;`
			`copyParams.extent = make_cudaExtent(width, height, 1);`
			`copyParams.kind = cudaMemcpyDeviceToDevice;`
			`checkCudaErrors(cudaMemcpy3D(&copyParams));`
			`#endif`

			`level++;`
			`}`
			`}`

			`uint getMipMapLevels(cudaExtent size) {`
			`size_t sz = MAX(MAX(size.width, size.height), size.depth);`

			`uint levels = 0;`

			`while (sz) {`
			`sz /= 2;`
			`levels++;`
			`}`

			`return levels;`
			`}`

			`//////////////////////////////////////////////////////////////////////////`
			`// Initalization`

			`extern "C" void randomizeAtlas() {`
			`uint2 h_data = (uint2 )atlasImage.h_data;`

			`// assign random texture object handles to our atlas image tiles`
			`for (size_t i = 0; i < atlasImage.size.width * atlasImage.size.height; i++) {`
			`#ifdef SHOW_MIPMAPS`
			`h_data[i] = encodeTextureObject(contentImages[0].textureObject);`
			`#else`
			`h_data[i] = encodeTextureObject(`
			`contentImages[rand() % contentImages.size()].textureObject);`
			`#endif`
			`}`

			`// copy data to atlas array`
			`cudaMemcpy3DParms copyParams = {0};`
			`copyParams.srcPtr = make_cudaPitchedPtr(`
			`atlasImage.h_data, atlasImage.size.width * sizeof(uint2),`
			`atlasImage.size.width, atlasImage.size.height);`
			`copyParams.dstArray = atlasImage.dataArray;`
			`copyParams.extent = atlasImage.size;`
			`copyParams.extent.depth = 1;`
			`copyParams.kind = cudaMemcpyHostToDevice;`
			`checkCudaErrors(cudaMemcpy3D(&copyParams));`
			`};`

			`extern "C" void deinitAtlasAndImages() {`
			`for (size_t i = 0; i < contentImages.size(); i++) {`
			`Image &image = contentImages[i];`

			`if (image.h_data) {`
			`free(image.h_data);`
			`}`

			`if (image.textureObject) {`
			`checkCudaErrors(cudaDestroyTextureObject(image.textureObject));`
			`}`

			`if (image.mipmapArray) {`
			`checkCudaErrors(cudaFreeMipmappedArray(image.mipmapArray));`
			`}`
			`}`

			`if (atlasImage.h_data) {`
			`free(atlasImage.h_data);`
			`}`

			`if (atlasImage.textureObject) {`
			`checkCudaErrors(cudaDestroyTextureObject(atlasImage.textureObject));`
			`}`

			`if (atlasImage.dataArray) {`
			`checkCudaErrors(cudaFreeArray(atlasImage.dataArray));`
			`}`
			`}`

			`extern "C" void initAtlasAndImages(const Image *images, size_t numImages,`
			`cudaExtent atlasSize) {`
			`// create individual textures`
			`contentImages.resize(numImages);`

			`for (size_t i = 0; i < numImages; i++) {`
			`Image &image = contentImages[i];`
			`image.size = images[i].size;`
			`image.size.depth = 0;`
			`image.type = cudaResourceTypeMipmappedArray;`

			`// how many mipmaps we need`
			`uint levels = getMipMapLevels(image.size);`
			`highestLod = MAX(highestLod, (float)levels - 1);`

			`cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();`
			`checkCudaErrors(cudaMallocMipmappedArray(&image.mipmapArray, &desc,`
			`image.size, levels));`

			`// upload level 0`
			`cudaArray_t level0;`
			`checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, image.mipmapArray, 0));`

			`cudaMemcpy3DParms copyParams = {0};`
			`copyParams.srcPtr =`
			`make_cudaPitchedPtr(images[i].h_data, image.size.width * sizeof(uchar4),`
			`image.size.width, image.size.height);`
			`copyParams.dstArray = level0;`
			`copyParams.extent = image.size;`
			`copyParams.extent.depth = 1;`
			`copyParams.kind = cudaMemcpyHostToDevice;`
			`checkCudaErrors(cudaMemcpy3D(&copyParams));`

			`// compute rest of mipmaps based on level 0`
			`generateMipMaps(image.mipmapArray, image.size);`

			`// generate bindless texture object`

			`cudaResourceDesc resDescr;`
			`memset(&resDescr, 0, sizeof(cudaResourceDesc));`

			`resDescr.resType = cudaResourceTypeMipmappedArray;`
			`resDescr.res.mipmap.mipmap = image.mipmapArray;`

			`cudaTextureDesc texDescr;`
			`memset(&texDescr, 0, sizeof(cudaTextureDesc));`

			`texDescr.normalizedCoords = 1;`
			`texDescr.filterMode = cudaFilterModeLinear;`
			`texDescr.mipmapFilterMode = cudaFilterModeLinear;`

			`texDescr.addressMode[0] = cudaAddressModeClamp;`
			`texDescr.addressMode[1] = cudaAddressModeClamp;`
			`texDescr.addressMode[2] = cudaAddressModeClamp;`

			`texDescr.maxMipmapLevelClamp = float(levels - 1);`

			`texDescr.readMode = cudaReadModeNormalizedFloat;`

			`checkCudaErrors(cudaCreateTextureObject(&image.textureObject, &resDescr,`
			`&texDescr, NULL));`
			`}`

			`// create atlas array`
			`cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uint2>();`
			`checkCudaErrors(cudaMallocArray(&atlasImage.dataArray, &channelDesc,`
			`atlasSize.width, atlasSize.height));`
			`atlasImage.h_data =`
			`malloc(atlasSize.width * atlasSize.height * sizeof(uint2));`
			`atlasImage.type = cudaResourceTypeArray;`
			`atlasImage.size = atlasSize;`

			`cudaResourceDesc texRes;`
			`memset(&texRes, 0, sizeof(cudaResourceDesc));`

			`texRes.resType = cudaResourceTypeArray;`
			`texRes.res.array.array = atlasImage.dataArray;`

			`cudaTextureDesc texDescr;`
			`memset(&texDescr, 0, sizeof(cudaTextureDesc));`

			`texDescr.normalizedCoords = true;`
			`texDescr.filterMode = cudaFilterModePoint;`
			`texDescr.addressMode[0] = cudaAddressModeClamp;`
			`texDescr.addressMode[1] = cudaAddressModeClamp;`
			`texDescr.addressMode[1] = cudaAddressModeClamp;`
			`texDescr.readMode = cudaReadModeElementType;`

			`checkCudaErrors(cudaCreateTextureObject(&atlasImage.textureObject, &texRes,`
			`&texDescr, NULL));`

			`randomizeAtlas();`
			`}`

			`#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_`