cuda-samples/Samples/postProcessGL/postProcessGL.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// Utilities and system includes

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

#include <helper_cuda.h>

cudaTextureObject_t inTexObject;

// clamp x to range [a, b]
__device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }

__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b) {
  r = clamp(r, 0.0f, 255.0f);
  g = clamp(g, 0.0f, 255.0f);
  b = clamp(b, 0.0f, 255.0f);
  return (int(b) << 16) | (int(g) << 8) | int(r);
}

// get pixel from 2D image, with clamping to border
__device__ uchar4 getPixel(int x, int y, cudaTextureObject_t inTex) {
#ifndef USE_TEXTURE_RGBA8UI
  float4 res = tex2D<float4>(inTex, x, y);
  uchar4 ucres = make_uchar4(res.x * 255.0f, res.y * 255.0f, res.z * 255.0f,
                             res.w * 255.0f);
#else
  uchar4 ucres = tex2D<uchar4>(inTex, x, y);
#endif
  return ucres;
}

// macros to make indexing shared memory easier
#define SMEM(X, Y) sdata[(Y)*tilew + (X)]

/*
    2D convolution using shared memory
    - operates on 8-bit RGB data stored in 32-bit int
    - assumes kernel radius is less than or equal to block size
    - not optimized for performance
     _____________
    |   :     :   |
    |_ _:_____:_ _|
    |   |     |   |
    |   |     |   |
    |_ _|_____|_ _|
  r |   :     :   |
    |___:_____:___|
      r    bw   r
    <----tilew---->
*/

__global__ void cudaProcess(unsigned int *g_odata, int imgw, int imgh,
                            int tilew, int r, float threshold, float highlight,
                            cudaTextureObject_t inTex) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  extern __shared__ uchar4 sdata[];

  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int bw = blockDim.x;
  int bh = blockDim.y;
  int x = blockIdx.x * bw + tx;
  int y = blockIdx.y * bh + ty;

#if 0
    uchar4 c4 = getPixel(x, y);
    g_odata[y*imgw+x] = rgbToInt(c4.z, c4.y, c4.x);
#else
  // copy tile to shared memory
  // center region
  SMEM(r + tx, r + ty) = getPixel(x, y, inTex);

  // borders
  if (threadIdx.x < r) {
    // left
    SMEM(tx, r + ty) = getPixel(x - r, y, inTex);
    // right
    SMEM(r + bw + tx, r + ty) = getPixel(x + bw, y, inTex);
  }

  if (threadIdx.y < r) {
    // top
    SMEM(r + tx, ty) = getPixel(x, y - r, inTex);
    // bottom
    SMEM(r + tx, r + bh + ty) = getPixel(x, y + bh, inTex);
  }

  // load corners
  if ((threadIdx.x < r) && (threadIdx.y < r)) {
    // tl
    SMEM(tx, ty) = getPixel(x - r, y - r, inTex);
    // bl
    SMEM(tx, r + bh + ty) = getPixel(x - r, y + bh, inTex);
    // tr
    SMEM(r + bw + tx, ty) = getPixel(x + bh, y - r, inTex);
    // br
    SMEM(r + bw + tx, r + bh + ty) = getPixel(x + bw, y + bh, inTex);
  }

  // wait for loads to complete
  cg::sync(cta);

  // perform convolution
  float rsum = 0.0f;
  float gsum = 0.0f;
  float bsum = 0.0f;
  float samples = 0.0f;

  for (int dy = -r; dy <= r; dy++) {
    for (int dx = -r; dx <= r; dx++) {
#if 0
            // try this to see the benefit of using shared memory
            uchar4 pixel = getPixel(x+dx, y+dy);
#else
      uchar4 pixel = SMEM(r + tx + dx, r + ty + dy);
#endif

      // only sum pixels within disc-shaped kernel
      float l = dx * dx + dy * dy;

      if (l <= r * r) {
        float r = float(pixel.x);
        float g = float(pixel.y);
        float b = float(pixel.z);
#if 1
        // brighten highlights
        float lum = (r + g + b) / (255 * 3);

        if (lum > threshold) {
          r *= highlight;
          g *= highlight;
          b *= highlight;
        }

#endif
        rsum += r;
        gsum += g;
        bsum += b;
        samples += 1.0f;
      }
    }
  }

  rsum /= samples;
  gsum /= samples;
  bsum /= samples;
  // ABGR
  g_odata[y * imgw + x] = rgbToInt(rsum, gsum, bsum);
// g_odata[y*imgw+x] = rgbToInt(x,y,0);
#endif
}

extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
                                   cudaArray *g_data_array,
                                   unsigned int *g_odata, int imgw, int imgh,
                                   int tilew, int radius, float threshold,
                                   float highlight) {
  struct cudaChannelFormatDesc desc;
  checkCudaErrors(cudaGetChannelDesc(&desc, g_data_array));

  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = g_data_array;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = false;
  texDescr.filterMode = cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;

  checkCudaErrors(
      cudaCreateTextureObject(&inTexObject, &texRes, &texDescr, NULL));

#if 0
    printf("CUDA Array channel descriptor, bits per component:\n");
    printf("X %d Y %d Z %d W %d, kind %d\n",
           desc.x,desc.y,desc.z,desc.w,desc.f);

    printf("Possible values for channel format kind: i %d, u%d, f%d:\n",
           cudaChannelFormatKindSigned, cudaChannelFormatKindUnsigned,
           cudaChannelFormatKindFloat);
#endif

// printf("\n");
#ifdef GPU_PROFILING
  StopWatchInterface *timer = 0;
  sdkCreateTimer(&timer);

  int nIter = 30;

  for (int i = -1; i < nIter; ++i) {
    if (i == 0) {
      sdkStartTimer(&timer);
    }

#endif

    cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw, imgh,
                                         block.x + (2 * radius), radius, 0.8f,
                                         4.0f, inTexObject);

#ifdef GPU_PROFILING
  }

  cudaDeviceSynchronize();
  sdkStopTimer(&timer);
  double dSeconds = sdkGetTimerValue(&timer) / ((double)nIter * 1000.0);
  double dNumTexels = (double)imgw * (double)imgh;
  double mtexps = 1.0e-6 * dNumTexels / dSeconds;

  if (radius == 4) {
    printf("\n");
    printf(
        "postprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = "
        "%.0f Texels, NumDevsUsed = %d, Workgroup = %u\n",
        mtexps, dSeconds, dNumTexels, 1, block.x * block.y);
  }

#endif
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`// Utilities and system includes`

			`#include <cooperative_groups.h>`

			`namespace cg = cooperative_groups;`

			`#include <helper_cuda.h>`

			`cudaTextureObject_t inTexObject;`

			`// clamp x to range [a, b]`
			`__device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }`

			`__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }`

			`// convert floating point rgb color to 8-bit integer`
			`__device__ int rgbToInt(float r, float g, float b) {`
			`r = clamp(r, 0.0f, 255.0f);`
			`g = clamp(g, 0.0f, 255.0f);`
			`b = clamp(b, 0.0f, 255.0f);`
			`return (int(b) << 16) \| (int(g) << 8) \| int(r);`
			`}`

			`// get pixel from 2D image, with clamping to border`
			`__device__ uchar4 getPixel(int x, int y, cudaTextureObject_t inTex) {`
			`#ifndef USE_TEXTURE_RGBA8UI`
			`float4 res = tex2D<float4>(inTex, x, y);`
			`uchar4 ucres = make_uchar4(res.x * 255.0f, res.y * 255.0f, res.z * 255.0f,`
			`res.w * 255.0f);`
			`#else`
			`uchar4 ucres = tex2D<uchar4>(inTex, x, y);`
			`#endif`
			`return ucres;`
			`}`

			`// macros to make indexing shared memory easier`
			`#define SMEM(X, Y) sdata[(Y)*tilew + (X)]`

			`/*`
			`2D convolution using shared memory`
			`- operates on 8-bit RGB data stored in 32-bit int`
			`- assumes kernel radius is less than or equal to block size`
			`- not optimized for performance`
			`_____________`
			`\| : : \|`
			`\|_ _:_____:_ _\|`
			`\| \| \| \|`
			`\| \| \| \|`
			`\|_ _\|_____\|_ _\|`
			`r \| : : \|`
			`\|___:_____:___\|`
			`r bw r`
			`<----tilew---->`
			`*/`

			`__global__ void cudaProcess(unsigned int *g_odata, int imgw, int imgh,`
			`int tilew, int r, float threshold, float highlight,`
			`cudaTextureObject_t inTex) {`
			`// Handle to thread block group`
			`cg::thread_block cta = cg::this_thread_block();`
			`extern __shared__ uchar4 sdata[];`

			`int tx = threadIdx.x;`
			`int ty = threadIdx.y;`
			`int bw = blockDim.x;`
			`int bh = blockDim.y;`
			`int x = blockIdx.x * bw + tx;`
			`int y = blockIdx.y * bh + ty;`

			`#if 0`
			`uchar4 c4 = getPixel(x, y);`
			`g_odata[y*imgw+x] = rgbToInt(c4.z, c4.y, c4.x);`
			`#else`
			`// copy tile to shared memory`
			`// center region`
			`SMEM(r + tx, r + ty) = getPixel(x, y, inTex);`

			`// borders`
			`if (threadIdx.x < r) {`
			`// left`
			`SMEM(tx, r + ty) = getPixel(x - r, y, inTex);`
			`// right`
			`SMEM(r + bw + tx, r + ty) = getPixel(x + bw, y, inTex);`
			`}`

			`if (threadIdx.y < r) {`
			`// top`
			`SMEM(r + tx, ty) = getPixel(x, y - r, inTex);`
			`// bottom`
			`SMEM(r + tx, r + bh + ty) = getPixel(x, y + bh, inTex);`
			`}`

			`// load corners`
			`if ((threadIdx.x < r) && (threadIdx.y < r)) {`
			`// tl`
			`SMEM(tx, ty) = getPixel(x - r, y - r, inTex);`
			`// bl`
			`SMEM(tx, r + bh + ty) = getPixel(x - r, y + bh, inTex);`
			`// tr`
			`SMEM(r + bw + tx, ty) = getPixel(x + bh, y - r, inTex);`
			`// br`
			`SMEM(r + bw + tx, r + bh + ty) = getPixel(x + bw, y + bh, inTex);`
			`}`

			`// wait for loads to complete`
			`cg::sync(cta);`

			`// perform convolution`
			`float rsum = 0.0f;`
			`float gsum = 0.0f;`
			`float bsum = 0.0f;`
			`float samples = 0.0f;`

			`for (int dy = -r; dy <= r; dy++) {`
			`for (int dx = -r; dx <= r; dx++) {`
			`#if 0`
			`// try this to see the benefit of using shared memory`
			`uchar4 pixel = getPixel(x+dx, y+dy);`
			`#else`
			`uchar4 pixel = SMEM(r + tx + dx, r + ty + dy);`
			`#endif`

			`// only sum pixels within disc-shaped kernel`
			`float l = dx * dx + dy * dy;`

			`if (l <= r * r) {`
			`float r = float(pixel.x);`
			`float g = float(pixel.y);`
			`float b = float(pixel.z);`
			`#if 1`
			`// brighten highlights`
			`float lum = (r + g + b) / (255 * 3);`

			`if (lum > threshold) {`
			`r *= highlight;`
			`g *= highlight;`
			`b *= highlight;`
			`}`

			`#endif`
			`rsum += r;`
			`gsum += g;`
			`bsum += b;`
			`samples += 1.0f;`
			`}`
			`}`
			`}`

			`rsum /= samples;`
			`gsum /= samples;`
			`bsum /= samples;`
			`// ABGR`
			`g_odata[y * imgw + x] = rgbToInt(rsum, gsum, bsum);`
			`// g_odata[y*imgw+x] = rgbToInt(x,y,0);`
			`#endif`
			`}`

			`extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,`
			`cudaArray *g_data_array,`
			`unsigned int *g_odata, int imgw, int imgh,`
			`int tilew, int radius, float threshold,`
			`float highlight) {`
			`struct cudaChannelFormatDesc desc;`
			`checkCudaErrors(cudaGetChannelDesc(&desc, g_data_array));`

			`cudaResourceDesc texRes;`
			`memset(&texRes, 0, sizeof(cudaResourceDesc));`

			`texRes.resType = cudaResourceTypeArray;`
			`texRes.res.array.array = g_data_array;`

			`cudaTextureDesc texDescr;`
			`memset(&texDescr, 0, sizeof(cudaTextureDesc));`

			`texDescr.normalizedCoords = false;`
			`texDescr.filterMode = cudaFilterModePoint;`
			`texDescr.addressMode[0] = cudaAddressModeWrap;`
			`texDescr.readMode = cudaReadModeElementType;`

			`checkCudaErrors(`
			`cudaCreateTextureObject(&inTexObject, &texRes, &texDescr, NULL));`

			`#if 0`
			`printf("CUDA Array channel descriptor, bits per component:\n");`
			`printf("X %d Y %d Z %d W %d, kind %d\n",`
			`desc.x,desc.y,desc.z,desc.w,desc.f);`

			`printf("Possible values for channel format kind: i %d, u%d, f%d:\n",`
			`cudaChannelFormatKindSigned, cudaChannelFormatKindUnsigned,`
			`cudaChannelFormatKindFloat);`
			`#endif`

			`// printf("\n");`
			`#ifdef GPU_PROFILING`
			`StopWatchInterface *timer = 0;`
			`sdkCreateTimer(&timer);`

			`int nIter = 30;`

			`for (int i = -1; i < nIter; ++i) {`
			`if (i == 0) {`
			`sdkStartTimer(&timer);`
			`}`

			`#endif`

			`cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw, imgh,`
			`block.x + (2 * radius), radius, 0.8f,`
			`4.0f, inTexObject);`

			`#ifdef GPU_PROFILING`
			`}`

			`cudaDeviceSynchronize();`
			`sdkStopTimer(&timer);`
			`double dSeconds = sdkGetTimerValue(&timer) / ((double)nIter * 1000.0);`
			`double dNumTexels = (double)imgw * (double)imgh;`
			`double mtexps = 1.0e-6 * dNumTexels / dSeconds;`

			`if (radius == 4) {`
			`printf("\n");`
			`printf(`
			`"postprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = "`
			`"%.0f Texels, NumDevsUsed = %d, Workgroup = %u\n",`
			`mtexps, dSeconds, dNumTexels, 1, block.x * block.y);`
			`}`

			`#endif`
			`}`