/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Utilities and system includes #include namespace cg = cooperative_groups; #include cudaTextureObject_t inTexObject; // clamp x to range [a, b] __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); } __device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); } // convert floating point rgb color to 8-bit integer __device__ int rgbToInt(float r, float g, float b) { r = clamp(r, 0.0f, 255.0f); g = clamp(g, 0.0f, 255.0f); b = clamp(b, 0.0f, 255.0f); return (int(b) << 16) | (int(g) << 8) | int(r); } // get pixel from 2D image, with clamping to border __device__ uchar4 getPixel(int x, int y, cudaTextureObject_t inTex) { #ifndef USE_TEXTURE_RGBA8UI float4 res = tex2D(inTex, x, y); uchar4 ucres = make_uchar4(res.x * 255.0f, res.y * 255.0f, res.z * 255.0f, res.w * 255.0f); #else uchar4 ucres = tex2D(inTex, x, y); #endif return ucres; } // macros to make indexing shared memory easier #define SMEM(X, Y) sdata[(Y)*tilew + (X)] /* 2D convolution using shared memory - operates on 8-bit RGB data stored in 32-bit int - assumes kernel radius is less than or equal to block size - not optimized for performance _____________ | : : | |_ _:_____:_ _| | | | | | | | | |_ _|_____|_ _| r | : : | |___:_____:___| r bw r <----tilew----> */ __global__ void cudaProcess(unsigned int *g_odata, int imgw, int imgh, int tilew, int r, float threshold, float highlight, cudaTextureObject_t inTex) { // Handle to thread block group cg::thread_block cta = cg::this_thread_block(); extern __shared__ uchar4 sdata[]; int tx = threadIdx.x; int ty = threadIdx.y; int bw = blockDim.x; int bh = blockDim.y; int x = blockIdx.x * bw + tx; int y = blockIdx.y * bh + ty; #if 0 uchar4 c4 = getPixel(x, y); g_odata[y*imgw+x] = rgbToInt(c4.z, c4.y, c4.x); #else // copy tile to shared memory // center region SMEM(r + tx, r + ty) = getPixel(x, y, inTex); // borders if (threadIdx.x < r) { // left SMEM(tx, r + ty) = getPixel(x - r, y, inTex); // right SMEM(r + bw + tx, r + ty) = getPixel(x + bw, y, inTex); } if (threadIdx.y < r) { // top SMEM(r + tx, ty) = getPixel(x, y - r, inTex); // bottom SMEM(r + tx, r + bh + ty) = getPixel(x, y + bh, inTex); } // load corners if ((threadIdx.x < r) && (threadIdx.y < r)) { // tl SMEM(tx, ty) = getPixel(x - r, y - r, inTex); // bl SMEM(tx, r + bh + ty) = getPixel(x - r, y + bh, inTex); // tr SMEM(r + bw + tx, ty) = getPixel(x + bh, y - r, inTex); // br SMEM(r + bw + tx, r + bh + ty) = getPixel(x + bw, y + bh, inTex); } // wait for loads to complete cg::sync(cta); // perform convolution float rsum = 0.0f; float gsum = 0.0f; float bsum = 0.0f; float samples = 0.0f; for (int dy = -r; dy <= r; dy++) { for (int dx = -r; dx <= r; dx++) { #if 0 // try this to see the benefit of using shared memory uchar4 pixel = getPixel(x+dx, y+dy); #else uchar4 pixel = SMEM(r + tx + dx, r + ty + dy); #endif // only sum pixels within disc-shaped kernel float l = dx * dx + dy * dy; if (l <= r * r) { float r = float(pixel.x); float g = float(pixel.y); float b = float(pixel.z); #if 1 // brighten highlights float lum = (r + g + b) / (255 * 3); if (lum > threshold) { r *= highlight; g *= highlight; b *= highlight; } #endif rsum += r; gsum += g; bsum += b; samples += 1.0f; } } } rsum /= samples; gsum /= samples; bsum /= samples; // ABGR g_odata[y * imgw + x] = rgbToInt(rsum, gsum, bsum); // g_odata[y*imgw+x] = rgbToInt(x,y,0); #endif } extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, cudaArray *g_data_array, unsigned int *g_odata, int imgw, int imgh, int tilew, int radius, float threshold, float highlight) { struct cudaChannelFormatDesc desc; checkCudaErrors(cudaGetChannelDesc(&desc, g_data_array)); cudaResourceDesc texRes; memset(&texRes, 0, sizeof(cudaResourceDesc)); texRes.resType = cudaResourceTypeArray; texRes.res.array.array = g_data_array; cudaTextureDesc texDescr; memset(&texDescr, 0, sizeof(cudaTextureDesc)); texDescr.normalizedCoords = false; texDescr.filterMode = cudaFilterModePoint; texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.readMode = cudaReadModeElementType; checkCudaErrors( cudaCreateTextureObject(&inTexObject, &texRes, &texDescr, NULL)); #if 0 printf("CUDA Array channel descriptor, bits per component:\n"); printf("X %d Y %d Z %d W %d, kind %d\n", desc.x,desc.y,desc.z,desc.w,desc.f); printf("Possible values for channel format kind: i %d, u%d, f%d:\n", cudaChannelFormatKindSigned, cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat); #endif // printf("\n"); #ifdef GPU_PROFILING StopWatchInterface *timer = 0; sdkCreateTimer(&timer); int nIter = 30; for (int i = -1; i < nIter; ++i) { if (i == 0) { sdkStartTimer(&timer); } #endif cudaProcess<<>>(g_odata, imgw, imgh, block.x + (2 * radius), radius, 0.8f, 4.0f, inTexObject); #ifdef GPU_PROFILING } cudaDeviceSynchronize(); sdkStopTimer(&timer); double dSeconds = sdkGetTimerValue(&timer) / ((double)nIter * 1000.0); double dNumTexels = (double)imgw * (double)imgh; double mtexps = 1.0e-6 * dNumTexels / dSeconds; if (radius == 4) { printf("\n"); printf( "postprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = " "%.0f Texels, NumDevsUsed = %d, Workgroup = %u\n", mtexps, dSeconds, dNumTexels, 1, block.x * block.y); } #endif }