cuda-samples/Samples/bindlessTexture/bindlessTexture_kernel.cu

430 lines
14 KiB
Plaintext
Raw Normal View History

2021-10-21 19:04:49 +08:00
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
This sample has two kernels, one doing the rendering every frame, and
another one used to generate the mip map levels at startup.
For rendering we use a "virtual" texturing approach, where one 2d texture
stores pointers to the actual textures used. This can be achieved by the
new cudaTextureObject introduced in CUDA 5.0 and requiring sm3+ hardware.
The mipmap generation kernel uses cudaSurfaceObject and cudaTextureObject
passed as kernel arguments to compute the higher mip map level based on
the lower.
*/
#ifndef _BINDLESSTEXTURE_KERNEL_CU_
#define _BINDLESSTEXTURE_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <helper_cuda.h>
#include <helper_math.h>
#include "bindlessTexture.h"
// set this to just see the mipmap chain of first image
//#define SHOW_MIPMAPS
// local references to resources
// Atlas image: its host buffer holds one encoded texture handle (uint2) per
// tile, and its texture object is the one handed to the render kernel.
Image atlasImage;
// One entry per content image; initAtlasAndImages() gives each entry a
// mipmapped array and a texture object created on top of it.
std::vector<Image> contentImages;
// Largest (mip level count - 1) seen over all content images, never below
// 1.0f. renderAtlasImage() uses it as the upper bound when animating the lod.
float highestLod = 1.0f;
// Fallback MAX(a, b): yields the larger of the two operands. Only defined when
// no earlier header already provides MAX.
// Every use of an operand is parenthesized so that arguments containing
// lower-precedence operators (e.g. `x & mask`, `c ? p : q`) compare correctly.
// NOTE: being a macro, the selected operand is evaluated twice; do not pass
// expressions with side effects.
#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
//////////////////////////////////////////////////////////////////////////
// Packs a texture object handle into a uint2 so it can be stored as a texel:
// .x receives the low 32 bits of the handle, .y the bits above them.
__host__ __device__ __inline__ uint2 encodeTextureObject(
    cudaTextureObject_t obj) {
  const unsigned int lowWord = static_cast<unsigned int>(obj & 0xFFFFFFFF);
  const unsigned int highWord = static_cast<unsigned int>(obj >> 32);
  return make_uint2(lowWord, highWord);
}
// Inverse of encodeTextureObject(): rebuilds the texture object handle from a
// uint2 whose .x carries the low 32 bits and .y the high 32 bits.
__host__ __device__ __inline__ cudaTextureObject_t decodeTextureObject(
    uint2 obj) {
  const cudaTextureObject_t lowWord = obj.x;
  const cudaTextureObject_t highWord = obj.y;
  return lowWord | (highWord << 32);
}
// Widens each component of a uchar4 to float; values are copied as-is, no
// scaling is applied.
__device__ __inline__ float4 to_float4(uchar4 vec) {
  float4 widened;
  widened.x = vec.x;
  widened.y = vec.y;
  widened.z = vec.z;
  widened.w = vec.w;
  return widened;
}
// Narrows each component of a float4 to uchar with a plain cast (the
// fractional part is dropped). No clamping happens here, so callers are
// expected to pass values that already fit the uchar range.
__device__ __inline__ uchar4 to_uchar4(float4 vec) {
  const uchar cx = static_cast<uchar>(vec.x);
  const uchar cy = static_cast<uchar>(vec.y);
  const uchar cz = static_cast<uchar>(vec.z);
  const uchar cw = static_cast<uchar>(vec.w);
  return make_uchar4(cx, cy, cz, cw);
}
//////////////////////////////////////////////////////////////////////////
// Rendering
// the atlas texture stores the 64 bit cudaTextureObjects
// we use it for "virtual" texturing
// Render kernel: each thread shades the output pixel (x, y) derived from its
// 2D block/thread index; threads outside imageW x imageH do nothing.
//   d_output      output buffer, written at index y * imageW + x
//   imageW/H      output dimensions in pixels
//   lod           mip level passed straight to tex2DLod
//   atlasTexture  texture whose texels are encoded texture object handles
__global__ void d_render(uchar4 *d_output, uint imageW, uint imageH, float lod,
                         cudaTextureObject_t atlasTexture) {
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads that fall outside the image have nothing to write.
  if ((x >= imageW) || (y >= imageH)) {
    return;
  }

  // Normalized coordinates of this pixel.
  const float u = x / (float)imageW;
  const float v = y / (float)imageH;

  // Look up the atlas tile for this pixel and turn the stored uint2 back into
  // a texture object handle.
  // In this sample the handle is always valid; a sparse-texturing scheme
  // would have to handle a zero handle here.
  const cudaTextureObject_t tex =
      decodeTextureObject(tex2D<uint2>(atlasTexture, u, v));

  // Sample the referenced texture. The template argument selects the returned
  // data type, and tex2DLod lets us pass the lod (mip map level) directly.
  // There are other functions, e.g. tex2DGrad, that take derivatives instead
  // to perform automatic mipmap/anisotropic filtering.
  // Note the flipped vertical coordinate (1 - v).
  const float4 color = tex2DLod<float4>(tex, u, 1 - v, lod);

  // Scale to the 0..255 range and store.
  d_output[y * imageW + x] = to_uchar4(color * 255.0f);
}
// Host entry point for one frame: derives the mip level to show from `lod`
// and launches d_render with the atlas texture.
//   gridSize/blockSize  launch configuration forwarded to d_render
//   d_output            output buffer forwarded to d_render
//   imageW/imageH       output dimensions in pixels
//   lod                 free-running animation value
extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize,
                                 uchar4 *d_output, uint imageW, uint imageH,
                                 float lod) {
  // pseudo animate lod: wrap the input into a period of 2 * highestLod, then
  // mirror it around highestLod so that, for non-negative input, the level
  // ramps up to highestLod and back down again.
  const float period = highestLod * 2;
  const float wrapped = fmodf(lod, period);
  lod = highestLod - fabs(wrapped - highestLod);
#ifdef SHOW_MIPMAPS
  // when inspecting the mip chain always render level 0
  lod = 0.0f;
#endif

  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, lod,
                                    atlasImage.textureObject);
  // surface launch-configuration errors right away
  checkCudaErrors(cudaGetLastError());
}
//////////////////////////////////////////////////////////////////////////
// MipMap Generation
// A key benefit of using the new surface objects is that we don't need any
// global binding points anymore. We can directly pass them as function
// arguments.
// Mipmap kernel: each thread computes texel (x, y) of the destination level
// from its 2D block/thread index; threads outside imageW x imageH do nothing.
//   mipOutput  surface object of the destination level (uchar4 texels)
//   mipInput   texture object of the source level; the host code in this file
//              creates it with normalized coordinates, linear filtering and
//              normalized-float reads
//   imageW/H   dimensions of the destination level in texels
// All arithmetic uses float literals so no double-precision operations are
// emitted in device code.
__global__ void d_mipmap(cudaSurfaceObject_t mipOutput,
                         cudaTextureObject_t mipInput, uint imageW,
                         uint imageH) {
  uint x = blockIdx.x * blockDim.x + threadIdx.x;
  uint y = blockIdx.y * blockDim.y + threadIdx.y;

  // size of one destination texel in normalized coordinates
  float px = 1.0f / float(imageW);
  float py = 1.0f / float(imageH);

  if ((x < imageW) && (y < imageH)) {
    // take the average of 4 samples
    // we are using the normalized access to make sure non-power-of-two
    // textures behave well when downsized.
    float4 color = (tex2D<float4>(mipInput, (x + 0) * px, (y + 0) * py)) +
                   (tex2D<float4>(mipInput, (x + 1) * px, (y + 0) * py)) +
                   (tex2D<float4>(mipInput, (x + 1) * px, (y + 1) * py)) +
                   (tex2D<float4>(mipInput, (x + 0) * px, (y + 1) * py));
    color /= 4.0f;

    // back to the 0..255 range, clamped from above before the uchar cast
    color *= 255.0f;
    color = fminf(color, make_float4(255.0f));

    // surface writes take the x coordinate in bytes
    surf2Dwrite(to_uchar4(color), mipOutput, x * sizeof(uchar4), y);
  }
}
// Fills every mip level above level 0 of `mipmapArray`, each one computed from
// the level directly below it by the d_mipmap kernel.
//   mipmapArray  mipmapped array whose level 0 already holds the image
//   size         extent of level 0; only width and height are used
// Width and height are halved per level (never below 1) until both reach 1.
// With SHOW_MIPMAPS defined, each generated level is additionally copied back
// into level 0.
void generateMipMaps(cudaMipmappedArray_t mipmapArray, cudaExtent size) {
  size_t width = size.width;
  size_t height = size.height;
#ifdef SHOW_MIPMAPS
  cudaArray_t levelFirst;
  checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFirst, mipmapArray, 0));
#endif

  for (uint level = 0; width != 1 || height != 1; ++level) {
    // dimensions of the level we are about to produce
    width = MAX((size_t)1, width / 2);
    height = MAX((size_t)1, height / 2);

    // source (level) and destination (level + 1) arrays
    cudaArray_t levelFrom;
    cudaArray_t levelTo;
    checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFrom, mipmapArray, level));
    checkCudaErrors(
        cudaGetMipmappedArrayLevel(&levelTo, mipmapArray, level + 1));

    // the runtime's idea of the destination size must match ours
    cudaExtent levelToSize;
    checkCudaErrors(cudaArrayGetInfo(NULL, &levelToSize, NULL, levelTo));
    checkHost(levelToSize.width == width);
    checkHost(levelToSize.height == height);
    checkHost(levelToSize.depth == 0);

    // texture object for reading the source level: normalized coordinates,
    // linear filtering, clamped addressing, values returned as normalized
    // floats
    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(texRes));
    texRes.resType = cudaResourceTypeArray;
    texRes.res.array.array = levelFrom;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(texDescr));
    texDescr.normalizedCoords = 1;
    texDescr.filterMode = cudaFilterModeLinear;
    for (int axis = 0; axis < 3; ++axis) {
      texDescr.addressMode[axis] = cudaAddressModeClamp;
    }
    texDescr.readMode = cudaReadModeNormalizedFloat;

    cudaTextureObject_t texInput;
    checkCudaErrors(
        cudaCreateTextureObject(&texInput, &texRes, &texDescr, NULL));

    // surface object for writing the destination level
    cudaResourceDesc surfRes;
    memset(&surfRes, 0, sizeof(surfRes));
    surfRes.resType = cudaResourceTypeArray;
    surfRes.res.array.array = levelTo;

    cudaSurfaceObject_t surfOutput;
    checkCudaErrors(cudaCreateSurfaceObject(&surfOutput, &surfRes));

    // run the mipmap kernel over the destination level, 16x16 threads per
    // block, grid rounded up so the whole level is covered
    const dim3 blockSize(16, 16, 1);
    const uint outW = (uint)width;
    const uint outH = (uint)height;
    const dim3 gridSize((outW + blockSize.x - 1) / blockSize.x,
                        (outH + blockSize.y - 1) / blockSize.y, 1);
    d_mipmap<<<gridSize, blockSize>>>(surfOutput, texInput, outW, outH);

    // wait for the kernel before the objects it uses are destroyed
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDestroySurfaceObject(surfOutput));
    checkCudaErrors(cudaDestroyTextureObject(texInput));
#ifdef SHOW_MIPMAPS
    // we blit the current mipmap back into first level
    cudaMemcpy3DParms copyParams = {0};
    copyParams.dstArray = levelFirst;
    copyParams.srcArray = levelTo;
    copyParams.extent = make_cudaExtent(width, height, 1);
    copyParams.kind = cudaMemcpyDeviceToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));
#endif
  }
}
// Returns how many times the largest of size.width / size.height / size.depth
// can be halved (integer division) before reaching zero, i.e. the number of
// mip levels for that extent. An all-zero extent yields 0.
uint getMipMapLevels(cudaExtent size) {
  // largest of the three dimensions
  size_t largest = size.width;
  if (size.height > largest) {
    largest = size.height;
  }
  if (size.depth > largest) {
    largest = size.depth;
  }

  uint levels = 0;
  for (; largest != 0; largest /= 2) {
    ++levels;
  }
  return levels;
}
//////////////////////////////////////////////////////////////////////////
// Initialization
// Re-fills the atlas with texture object handles picked via rand() from
// contentImages (always contentImages[0] when SHOW_MIPMAPS is defined) and
// uploads the result to the atlas array.
// If the atlas host buffer has not been allocated or there are no content
// images, a message is printed to stderr and the atlas is left untouched;
// otherwise the selection below would index an empty vector / divide by zero.
extern "C" void randomizeAtlas() {
  uint2 *h_data = (uint2 *)atlasImage.h_data;

  if (h_data == NULL || contentImages.empty()) {
    fprintf(stderr,
            "randomizeAtlas: atlas buffer or content images missing, "
            "nothing to do\n");
    return;
  }

  // assign random texture object handles to our atlas image tiles
  const size_t numTiles = atlasImage.size.width * atlasImage.size.height;
  for (size_t i = 0; i < numTiles; i++) {
#ifdef SHOW_MIPMAPS
    h_data[i] = encodeTextureObject(contentImages[0].textureObject);
#else
    h_data[i] = encodeTextureObject(
        contentImages[rand() % contentImages.size()].textureObject);
#endif
  }

  // copy data to atlas array; the host buffer is tightly packed, so the row
  // pitch is width * sizeof(uint2)
  cudaMemcpy3DParms copyParams = {0};
  copyParams.srcPtr = make_cudaPitchedPtr(
      atlasImage.h_data, atlasImage.size.width * sizeof(uint2),
      atlasImage.size.width, atlasImage.size.height);
  copyParams.dstArray = atlasImage.dataArray;
  copyParams.extent = atlasImage.size;
  copyParams.extent.depth = 1;
  copyParams.kind = cudaMemcpyHostToDevice;
  checkCudaErrors(cudaMemcpy3D(&copyParams));
}
// Releases everything owned by contentImages and atlasImage: host buffers,
// texture objects, mipmapped arrays and the atlas array.
// Every pointer/handle is reset after it has been released and contentImages
// is emptied, so calling this function again (or re-initializing afterwards)
// does not free or destroy the same resource twice.
extern "C" void deinitAtlasAndImages() {
  for (size_t i = 0; i < contentImages.size(); i++) {
    Image &image = contentImages[i];
    if (image.h_data) {
      free(image.h_data);
      image.h_data = NULL;
    }
    if (image.textureObject) {
      checkCudaErrors(cudaDestroyTextureObject(image.textureObject));
      image.textureObject = 0;
    }
    if (image.mipmapArray) {
      checkCudaErrors(cudaFreeMipmappedArray(image.mipmapArray));
      image.mipmapArray = NULL;
    }
  }
  // drop the now-empty entries so stale images cannot be picked up later
  contentImages.clear();

  if (atlasImage.h_data) {
    free(atlasImage.h_data);
    atlasImage.h_data = NULL;
  }
  if (atlasImage.textureObject) {
    checkCudaErrors(cudaDestroyTextureObject(atlasImage.textureObject));
    atlasImage.textureObject = 0;
  }
  if (atlasImage.dataArray) {
    checkCudaErrors(cudaFreeArray(atlasImage.dataArray));
    atlasImage.dataArray = NULL;
  }
}
// Builds all GPU resources used for rendering.
//   images     array of `numImages` source images; size and h_data are read
//   numImages  number of entries in `images`
//   atlasSize  extent of the atlas; width and height are used
// For each source image a mipmapped uchar4 array is allocated, level 0 is
// uploaded from images[i].h_data, the remaining levels are generated, and a
// texture object is created on top of it. Afterwards the atlas array, its
// host buffer and its texture object are created and the atlas is filled via
// randomizeAtlas().
extern "C" void initAtlasAndImages(const Image *images, size_t numImages,
                                   cudaExtent atlasSize) {
  // create individual textures
  contentImages.resize(numImages);
  for (size_t i = 0; i < numImages; i++) {
    Image &image = contentImages[i];
    image.size = images[i].size;
    image.size.depth = 0;
    image.type = cudaResourceTypeMipmappedArray;

    // how many mipmaps we need
    uint levels = getMipMapLevels(image.size);
    highestLod = MAX(highestLod, (float)levels - 1);

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
    // The upper levels are written through surface objects in
    // generateMipMaps(), so the array is allocated with surface load/store
    // support, as the CUDA documentation describes for arrays accessed via
    // surfaces.
    checkCudaErrors(cudaMallocMipmappedArray(&image.mipmapArray, &desc,
                                             image.size, levels,
                                             cudaArraySurfaceLoadStore));

    // upload level 0; the source rows are tightly packed uchar4 texels
    cudaArray_t level0;
    checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, image.mipmapArray, 0));
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr(images[i].h_data, image.size.width * sizeof(uchar4),
                            image.size.width, image.size.height);
    copyParams.dstArray = level0;
    copyParams.extent = image.size;
    copyParams.extent.depth = 1;
    copyParams.kind = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));

    // compute rest of mipmaps based on level 0
    generateMipMaps(image.mipmapArray, image.size);

    // generate bindless texture object: normalized coordinates, linear
    // filtering within and between levels, clamped addressing
    cudaResourceDesc resDescr;
    memset(&resDescr, 0, sizeof(cudaResourceDesc));
    resDescr.resType = cudaResourceTypeMipmappedArray;
    resDescr.res.mipmap.mipmap = image.mipmapArray;
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));
    texDescr.normalizedCoords = 1;
    texDescr.filterMode = cudaFilterModeLinear;
    texDescr.mipmapFilterMode = cudaFilterModeLinear;
    texDescr.addressMode[0] = cudaAddressModeClamp;
    texDescr.addressMode[1] = cudaAddressModeClamp;
    texDescr.addressMode[2] = cudaAddressModeClamp;
    texDescr.maxMipmapLevelClamp = float(levels - 1);
    texDescr.readMode = cudaReadModeNormalizedFloat;
    checkCudaErrors(cudaCreateTextureObject(&image.textureObject, &resDescr,
                                            &texDescr, NULL));
  }

  // create atlas array: one uint2 (encoded texture object) per tile
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uint2>();
  checkCudaErrors(cudaMallocArray(&atlasImage.dataArray, &channelDesc,
                                  atlasSize.width, atlasSize.height));
  atlasImage.h_data =
      malloc(atlasSize.width * atlasSize.height * sizeof(uint2));
  // randomizeAtlas() writes through this buffer, so fail here if the host
  // allocation did not succeed
  checkHost(atlasImage.h_data != NULL);
  atlasImage.type = cudaResourceTypeArray;
  atlasImage.size = atlasSize;

  // atlas texture object: point sampling and element-type reads so the stored
  // handles come back bit-exact
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = atlasImage.dataArray;
  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeClamp;
  texDescr.addressMode[1] = cudaAddressModeClamp;
  texDescr.addressMode[2] = cudaAddressModeClamp;
  texDescr.readMode = cudaReadModeElementType;
  checkCudaErrors(cudaCreateTextureObject(&atlasImage.textureObject, &texRes,
                                          &texDescr, NULL));

  randomizeAtlas();
}
#endif  // #ifndef _BINDLESSTEXTURE_KERNEL_CU_