cuda-samples/Samples/5_Domain_Specific/postProcessGL/postProcessGL.cu
2022-01-13 11:35:24 +05:30

257 lines
7.6 KiB
Plaintext

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Utilities and system includes
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
cudaTextureObject_t inTexObject;
// clamp x to range [a, b]
__device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }
// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b) {
r = clamp(r, 0.0f, 255.0f);
g = clamp(g, 0.0f, 255.0f);
b = clamp(b, 0.0f, 255.0f);
return (int(b) << 16) | (int(g) << 8) | int(r);
}
// get pixel from 2D image, with clamping to border
__device__ uchar4 getPixel(int x, int y, cudaTextureObject_t inTex) {
#ifndef USE_TEXTURE_RGBA8UI
float4 res = tex2D<float4>(inTex, x, y);
uchar4 ucres = make_uchar4(res.x * 255.0f, res.y * 255.0f, res.z * 255.0f,
res.w * 255.0f);
#else
uchar4 ucres = tex2D<uchar4>(inTex, x, y);
#endif
return ucres;
}
// macros to make indexing shared memory easier
#define SMEM(X, Y) sdata[(Y)*tilew + (X)]
/*
2D convolution using shared memory
- operates on 8-bit RGB data stored in 32-bit int
- assumes kernel radius is less than or equal to block size
- not optimized for performance
_____________
| : : |
|_ _:_____:_ _|
| | | |
| | | |
|_ _|_____|_ _|
r | : : |
|___:_____:___|
r bw r
<----tilew---->
*/
__global__ void cudaProcess(unsigned int *g_odata, int imgw, int imgh,
int tilew, int r, float threshold, float highlight,
cudaTextureObject_t inTex) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
extern __shared__ uchar4 sdata[];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bw = blockDim.x;
int bh = blockDim.y;
int x = blockIdx.x * bw + tx;
int y = blockIdx.y * bh + ty;
#if 0
uchar4 c4 = getPixel(x, y);
g_odata[y*imgw+x] = rgbToInt(c4.z, c4.y, c4.x);
#else
// copy tile to shared memory
// center region
SMEM(r + tx, r + ty) = getPixel(x, y, inTex);
// borders
if (threadIdx.x < r) {
// left
SMEM(tx, r + ty) = getPixel(x - r, y, inTex);
// right
SMEM(r + bw + tx, r + ty) = getPixel(x + bw, y, inTex);
}
if (threadIdx.y < r) {
// top
SMEM(r + tx, ty) = getPixel(x, y - r, inTex);
// bottom
SMEM(r + tx, r + bh + ty) = getPixel(x, y + bh, inTex);
}
// load corners
if ((threadIdx.x < r) && (threadIdx.y < r)) {
// tl
SMEM(tx, ty) = getPixel(x - r, y - r, inTex);
// bl
SMEM(tx, r + bh + ty) = getPixel(x - r, y + bh, inTex);
// tr
SMEM(r + bw + tx, ty) = getPixel(x + bh, y - r, inTex);
// br
SMEM(r + bw + tx, r + bh + ty) = getPixel(x + bw, y + bh, inTex);
}
// wait for loads to complete
cg::sync(cta);
// perform convolution
float rsum = 0.0f;
float gsum = 0.0f;
float bsum = 0.0f;
float samples = 0.0f;
for (int dy = -r; dy <= r; dy++) {
for (int dx = -r; dx <= r; dx++) {
#if 0
// try this to see the benefit of using shared memory
uchar4 pixel = getPixel(x+dx, y+dy);
#else
uchar4 pixel = SMEM(r + tx + dx, r + ty + dy);
#endif
// only sum pixels within disc-shaped kernel
float l = dx * dx + dy * dy;
if (l <= r * r) {
float r = float(pixel.x);
float g = float(pixel.y);
float b = float(pixel.z);
#if 1
// brighten highlights
float lum = (r + g + b) / (255 * 3);
if (lum > threshold) {
r *= highlight;
g *= highlight;
b *= highlight;
}
#endif
rsum += r;
gsum += g;
bsum += b;
samples += 1.0f;
}
}
}
rsum /= samples;
gsum /= samples;
bsum /= samples;
// ABGR
g_odata[y * imgw + x] = rgbToInt(rsum, gsum, bsum);
// g_odata[y*imgw+x] = rgbToInt(x,y,0);
#endif
}
extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
cudaArray *g_data_array,
unsigned int *g_odata, int imgw, int imgh,
int tilew, int radius, float threshold,
float highlight) {
struct cudaChannelFormatDesc desc;
checkCudaErrors(cudaGetChannelDesc(&desc, g_data_array));
cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = g_data_array;
cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModePoint;
texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(
cudaCreateTextureObject(&inTexObject, &texRes, &texDescr, NULL));
#if 0
printf("CUDA Array channel descriptor, bits per component:\n");
printf("X %d Y %d Z %d W %d, kind %d\n",
desc.x,desc.y,desc.z,desc.w,desc.f);
printf("Possible values for channel format kind: i %d, u%d, f%d:\n",
cudaChannelFormatKindSigned, cudaChannelFormatKindUnsigned,
cudaChannelFormatKindFloat);
#endif
// printf("\n");
#ifdef GPU_PROFILING
StopWatchInterface *timer = 0;
sdkCreateTimer(&timer);
int nIter = 30;
for (int i = -1; i < nIter; ++i) {
if (i == 0) {
sdkStartTimer(&timer);
}
#endif
cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw, imgh,
block.x + (2 * radius), radius, 0.8f,
4.0f, inTexObject);
#ifdef GPU_PROFILING
}
cudaDeviceSynchronize();
sdkStopTimer(&timer);
double dSeconds = sdkGetTimerValue(&timer) / ((double)nIter * 1000.0);
double dNumTexels = (double)imgw * (double)imgh;
double mtexps = 1.0e-6 * dNumTexels / dSeconds;
if (radius == 4) {
printf("\n");
printf(
"postprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = "
"%.0f Texels, NumDevsUsed = %d, Workgroup = %u\n",
mtexps, dSeconds, dNumTexels, 1, block.x * block.y);
}
#endif
}