cuda-samples/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity_kernel.cuh

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Simple kernel computes a Stereo Disparity using CUDA SIMD SAD intrinsics. */

#ifndef _STEREODISPARITY_KERNEL_H_
#define _STEREODISPARITY_KERNEL_H_

// blockSize_x / blockSize_y size the kernel's shared-memory cost tile
// (blockSize_y + 2 * RAD rows by blockSize_x + 2 * RAD columns). The kernel
// indexes that tile with threadIdx, so it presumably must be launched with
// blockSize_x by blockSize_y threads per block -- confirm against the host
// launch code.
#define blockSize_x 32
#define blockSize_y 8

// RAD is the radius of the region of support for the search
// (the matching window spans -RAD..RAD in both x and y)
#define RAD 8
// STEPS is the number of loads we must perform to initialize the shared memory
// area (see convolution CUDA Sample for example)
// Load i covers the tile row at vertical offset -RAD + i * RAD.
#define STEPS 3

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

////////////////////////////////////////////////////////////////////////////////
// This function applies the video intrinsic operations to compute a
// sum of absolute differences.  The absolute differences are computed
// and the optional .add instruction is used to sum the lanes.
//
// For more information, see also the documents:
//  "Using_Inline_PTX_Assembly_In_CUDA.pdf"
// and also the PTX ISA documentation for the architecture in question, e.g.:
//  "ptx_isa_3.0K.pdf"
// included in the NVIDIA GPU Computing Toolkit
////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int __usad4(unsigned int A, unsigned int B,
                                unsigned int C = 0) {
  // A and B are each treated as four packed 8-bit lanes. The instruction
  // takes the absolute difference lane by lane; its .add form folds the
  // lane results together with the accumulator C into one 32-bit value.
  unsigned int sad;

  // 4-way SIMD SAD video instruction, available on Kepler (SM 3.x) and later.
  asm("vabsdiff4.u32.u32.u32.add %0, %1, %2, %3;"
      : "=r"(sad)
      : "r"(A), "r"(B), "r"(C));

  return sad;
}

////////////////////////////////////////////////////////////////////////////////
//! Simple stereo disparity kernel built on the SIMD sum-of-absolute-differences
//! helper (__usad4)
//! Algorithm Explanation:
//! For stereo disparity this performs a basic block matching scheme.
//! The sum of abs. diffs between an area around the candidate pixel in the
//! left image
//! is computed against different horizontal shifts of areas from the right.
//! The shift at which the difference is minimum is taken as how far that pixel
//! moved between left/right image pairs.   The recovered motion is the
//! disparity map
//! More motion indicates more parallax indicates a closer object.
//! @param g_img0  image 0 in global memory, RGBA, 4 bytes/pixel (not read by
//! the kernel body; pixels are fetched through tex2Dleft)
//! @param g_img1  image 1 in global memory (not read by the kernel body;
//! pixels are fetched through tex2Dright)
//! @param g_odata disparity map output in global memory,  unsigned int
//! output/pixel
//! @param w image width in pixels
//! @param h image height in pixels
//! @param minDisparity leftmost search range
//! @param maxDisparity rightmost search range
//! @param tex2Dleft  texture object sampled for the left image
//! @param tex2Dright texture object sampled for the right image
////////////////////////////////////////////////////////////////////////////////
__global__ void stereoDisparityKernel(unsigned int *g_img0,
                                      unsigned int *g_img1,
                                      unsigned int *g_odata, int w, int h,
                                      int minDisparity, int maxDisparity,
                                      cudaTextureObject_t tex2Dleft,
                                      cudaTextureObject_t tex2Dright) {
  // Every thread of the block takes part in all barriers below, including
  // threads whose pixel lies outside the image; only the final store is
  // bounds-checked.
  cg::thread_block block = cg::this_thread_block();

  // Image-space coordinates of the pixel owned by this thread.
  const int px = blockDim.x * blockIdx.x + threadIdx.x;
  const int py = blockDim.y * blockIdx.y + threadIdx.y;

  // Location of that pixel inside the shared tile, which carries a RAD-wide
  // apron on every side.
  const unsigned int tileCol = threadIdx.x + RAD;
  const unsigned int tileRow = threadIdx.y + RAD;

  // Per-pixel cost tile. Indexing with threadIdx stays in range only when the
  // block is launched as blockSize_x by blockSize_y threads (not checked
  // here).
  __shared__ unsigned int diff[blockSize_y + 2 * RAD][blockSize_x + 2 * RAD];

  unsigned int lowestCost = 9999999;
  unsigned int winner = 0;

  // The left image does not depend on the disparity, so its samples are
  // fetched once up front: leftMain feeds tile columns 0..blockSize_x-1 and
  // leftHalo feeds the extra 2 * RAD columns to the right of them.
  unsigned int leftMain[STEPS];
  unsigned int leftHalo[STEPS];

  for (int s = 0; s < STEPS; s++) {
    const int dy = (s - 1) * RAD;
    leftMain[s] = tex2D<unsigned int>(tex2Dleft, px - RAD, py + dy);
    leftHalo[s] =
        tex2D<unsigned int>(tex2Dleft, px - RAD + blockSize_x, py + dy);
  }

  // for a fixed camera system this could be hardcoded and loop unrolled
  for (int d = minDisparity; d <= maxDisparity; d++) {
    // Stage 1a: per-pixel SAD for the first blockSize_x tile columns.
#pragma unroll
    for (int s = 0; s < STEPS; s++) {
      const int dy = (s - 1) * RAD;
      const unsigned int rightPix =
          tex2D<unsigned int>(tex2Dright, px - RAD + d, py + dy);
      diff[tileRow + dy][threadIdx.x] = __usad4(leftMain[s], rightPix);
    }

    // Stage 1b: the first 2 * RAD threads of each row also fill the
    // remaining columns of the tile.
    if (threadIdx.x < 2 * RAD) {
#pragma unroll
      for (int s = 0; s < STEPS; s++) {
        const int dy = (s - 1) * RAD;
        const unsigned int rightPix = tex2D<unsigned int>(
            tex2Dright, px - RAD + blockSize_x + d, py + dy);
        diff[tileRow + dy][threadIdx.x + blockSize_x] =
            __usad4(leftHalo[s], rightPix);
      }
    }

    // The whole tile must be written before any thread reads a neighbour.
    block.sync();

    // Stage 2: horizontal window sums, one group of tile rows per step.
#pragma unroll
    for (int s = 0; s < STEPS; s++) {
      const unsigned int r = tileRow + (s - 1) * RAD;
      unsigned int rowSum = 0;

#pragma unroll
      for (int k = -RAD; k <= RAD; k++) {
        rowSum += diff[r][tileCol + k];
      }

      // All reads of this row finish before it is overwritten in place, and
      // the overwrite is published before anyone moves on.
      block.sync();
      diff[r][tileCol] = rowSum;
      block.sync();
    }

    // Stage 3: add the horizontal sums down the column to get the cost of
    // the full (2 * RAD + 1) x (2 * RAD + 1) window.
    unsigned int windowCost = 0;

#pragma unroll
    for (int k = -RAD; k <= RAD; k++) {
      windowCost += diff[tileRow + k][tileCol];
    }

    // Strict '<' keeps the earliest disparity when costs tie.
    if (windowCost < lowestCost) {
      lowestCost = windowCost;
      winner = d + 8;
    }

    // The tile is reused by the next disparity; wait until everyone is done
    // reading it.
    block.sync();
  }

  // Threads that fall outside the image took part in the barriers above but
  // must not store a result.
  if (py < h && px < w) {
    g_odata[py * w + px] = winner;
  }
}

// Host reference implementation of the block matcher.
// For every pixel, each disparity in [minDisparity, maxDisparity] is scored
// by the sum of absolute per-channel differences over a
// (2 * RAD + 1) x (2 * RAD + 1) window, with coordinates clamped to the image
// border. The first disparity reaching the lowest score is stored, offset by
// 8, in odata.
void cpu_gold_stereo(unsigned int *img0, unsigned int *img1,
                     unsigned int *odata, int w, int h, int minDisparity,
                     int maxDisparity) {
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      unsigned int lowestCost = 9999999;
      unsigned int chosen = 0;

      for (int shift = minDisparity; shift <= maxDisparity; ++shift) {
        unsigned int windowCost = 0;

        for (int dy = -RAD; dy <= RAD; ++dy) {
          // The clamped row is the same for the whole inner loop.
          int sy = row + dy;
          sy = (sy < 0) ? 0 : sy;
          sy = (sy >= h) ? (h - 1) : sy;

          for (int dx = -RAD; dx <= RAD; ++dx) {
            // Clamp the column in img0 ...
            int sxLeft = col + dx;
            sxLeft = (sxLeft < 0) ? 0 : sxLeft;
            sxLeft = (sxLeft >= w) ? (w - 1) : sxLeft;

            // ... and the shifted column in img1.
            int sxRight = col + dx + shift;
            sxRight = (sxRight < 0) ? 0 : sxRight;
            sxRight = (sxRight >= w) ? (w - 1) : sxRight;

            const unsigned int leftPix = img0[sy * w + sxLeft];
            const unsigned int rightPix = img1[sy * w + sxRight];

            // Each pixel packs four 8-bit channels; accumulate |a - b| for
            // every channel.
            for (int bit = 0; bit < 32; bit += 8) {
              const int a = (int)((leftPix >> bit) & 0xFFu);
              const int b = (int)((rightPix >> bit) & 0xFFu);
              windowCost += (unsigned int)((a > b) ? (a - b) : (b - a));
            }
          }
        }

        // Strict '<' keeps the earliest disparity when costs tie.
        if (windowCost < lowestCost) {
          lowestCost = windowCost;
          chosen = shift + 8;
        }
      }  // end for disparities

      odata[row * w + col] = chosen;
    }
  }
}
#endif  // #ifndef _STEREODISPARITY_KERNEL_H_
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/* Simple kernel computes a Stereo Disparity using CUDA SIMD SAD intrinsics. */`

			`#ifndef _STEREODISPARITY_KERNEL_H_`
			`#define _STEREODISPARITY_KERNEL_H_`

			`#define blockSize_x 32`
			`#define blockSize_y 8`

			`// RAD is the radius of the region of support for the search`
			`#define RAD 8`
			`// STEPS is the number of loads we must perform to initialize the shared memory`
			`// area (see convolution CUDA Sample for example)`
			`#define STEPS 3`

			`#include <cooperative_groups.h>`

			`namespace cg = cooperative_groups;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// This function applies the video intrinsic operations to compute a`
			`// sum of absolute differences. The absolute differences are computed`
			`// and the optional .add instruction is used to sum the lanes.`
			`//`
			`// For more information, see also the documents:`
			`// "Using_Inline_PTX_Assembly_In_CUDA.pdf"`
			`// and also the PTX ISA documentation for the architecture in question, e.g.:`
			`// "ptx_isa_3.0K.pdf"`
			`// included in the NVIDIA GPU Computing Toolkit`
			`////////////////////////////////////////////////////////////////////////////////`
			`__device__ unsigned int __usad4(unsigned int A, unsigned int B,`
			`unsigned int C = 0) {`
			`unsigned int result;`

			`// Kepler (SM 3.x) and higher supports a 4 vector SAD SIMD`
			`asm(`
			`"vabsdiff4.u32.u32.u32.add"`
			`" %0, %1, %2, %3;"`
			`: "=r"(result)`
			`: "r"(A), "r"(B), "r"(C));`

			`return result;`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Simple stereo disparity kernel to test atomic instructions`
			`//! Algorithm Explanation:`
			`//! For stereo disparity this performs a basic block matching scheme.`
			`//! The sum of abs. diffs between and area of the candidate pixel in the left`
			`//! images`
			`//! is computed against different horizontal shifts of areas from the right.`
			`//! The shift at which the difference is minimum is taken as how far that pixel`
			`//! moved between left/right image pairs. The recovered motion is the`
			`//! disparity map`
			`//! More motion indicates more parallax indicates a closer object.`
			`//! @param g_img1 image 1 in global memory, RGBA, 4 bytes/pixel`
			`//! @param g_img2 image 2 in global memory`
			`//! @param g_odata disparity map output in global memory, unsigned int`
			`//! output/pixel`
			`//! @param w image width in pixels`
			`//! @param h image height in pixels`
			`//! @param minDisparity leftmost search range`
			`//! @param maxDisparity rightmost search range`
			`////////////////////////////////////////////////////////////////////////////////`
			`__global__ void stereoDisparityKernel(unsigned int *g_img0,`
			`unsigned int *g_img1,`
			`unsigned int *g_odata, int w, int h,`
			`int minDisparity, int maxDisparity,`
			`cudaTextureObject_t tex2Dleft,`
			`cudaTextureObject_t tex2Dright) {`
			`// Handle to thread block group`
			`cg::thread_block cta = cg::this_thread_block();`
			`// access thread id`
			`const int tidx = blockDim.x * blockIdx.x + threadIdx.x;`
			`const int tidy = blockDim.y * blockIdx.y + threadIdx.y;`
			`const unsigned int sidx = threadIdx.x + RAD;`
			`const unsigned int sidy = threadIdx.y + RAD;`

			`unsigned int imLeft;`
			`unsigned int imRight;`
			`unsigned int cost;`
			`unsigned int bestCost = 9999999;`
			`unsigned int bestDisparity = 0;`
			`__shared__ unsigned int diff[blockSize_y + 2 * RAD][blockSize_x + 2 * RAD];`

			`// store needed values for left image into registers (constant indexed local`
			`// vars)`
			`unsigned int imLeftA[STEPS];`
			`unsigned int imLeftB[STEPS];`

			`for (int i = 0; i < STEPS; i++) {`
			`int offset = -RAD + i * RAD;`
			`imLeftA[i] = tex2D<unsigned int>(tex2Dleft, tidx - RAD, tidy + offset);`
			`imLeftB[i] =`
			`tex2D<unsigned int>(tex2Dleft, tidx - RAD + blockSize_x, tidy + offset);`
			`}`

			`// for a fixed camera system this could be hardcoded and loop unrolled`
			`for (int d = minDisparity; d <= maxDisparity; d++) {`
			`// LEFT`
			`#pragma unroll`
			`for (int i = 0; i < STEPS; i++) {`
			`int offset = -RAD + i * RAD;`
			`// imLeft = tex2D( tex2Dleft, tidx-RAD, tidy+offset );`
			`imLeft = imLeftA[i];`
			`imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + d, tidy + offset);`
			`cost = __usad4(imLeft, imRight);`
			`diff[sidy + offset][sidx - RAD] = cost;`
			`}`

			`// RIGHT`
			`#pragma unroll`

			`for (int i = 0; i < STEPS; i++) {`
			`int offset = -RAD + i * RAD;`

			`if (threadIdx.x < 2 * RAD) {`
			`// imLeft = tex2D( tex2Dleft, tidx-RAD+blockSize_x, tidy+offset );`
			`imLeft = imLeftB[i];`
			`imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + blockSize_x + d,`
			`tidy + offset);`
			`cost = __usad4(imLeft, imRight);`
			`diff[sidy + offset][sidx - RAD + blockSize_x] = cost;`
			`}`
			`}`

			`cg::sync(cta);`

			`// sum cost horizontally`
			`#pragma unroll`

			`for (int j = 0; j < STEPS; j++) {`
			`int offset = -RAD + j * RAD;`
			`cost = 0;`
			`#pragma unroll`

			`for (int i = -RAD; i <= RAD; i++) {`
			`cost += diff[sidy + offset][sidx + i];`
			`}`

			`cg::sync(cta);`
			`diff[sidy + offset][sidx] = cost;`
			`cg::sync(cta);`
			`}`

			`// sum cost vertically`
			`cost = 0;`
			`#pragma unroll`

			`for (int i = -RAD; i <= RAD; i++) {`
			`cost += diff[sidy + i][sidx];`
			`}`

			`// see if it is better or not`
			`if (cost < bestCost) {`
			`bestCost = cost;`
			`bestDisparity = d + 8;`
			`}`

			`cg::sync(cta);`
			`}`

			`if (tidy < h && tidx < w) {`
			`g_odata[tidy * w + tidx] = bestDisparity;`
			`}`
			`}`

			`void cpu_gold_stereo(unsigned int img0, unsigned int img1,`
			`unsigned int *odata, int w, int h, int minDisparity,`
			`int maxDisparity) {`
			`for (int y = 0; y < h; y++) {`
			`for (int x = 0; x < w; x++) {`
			`unsigned int bestCost = 9999999;`
			`unsigned int bestDisparity = 0;`

			`for (int d = minDisparity; d <= maxDisparity; d++) {`
			`unsigned int cost = 0;`

			`for (int i = -RAD; i <= RAD; i++) {`
			`for (int j = -RAD; j <= RAD; j++) {`
			`// border clamping`
			`int yy, xx, xxd;`
			`yy = y + i;`

			`if (yy < 0) yy = 0;`

			`if (yy >= h) yy = h - 1;`

			`xx = x + j;`

			`if (xx < 0) xx = 0;`

			`if (xx >= w) xx = w - 1;`

			`xxd = x + j + d;`

			`if (xxd < 0) xxd = 0;`

			`if (xxd >= w) xxd = w - 1;`

			`// sum abs diff across components`
			`unsigned char A = (unsigned char )&img0[yy * w + xx];`
			`unsigned char B = (unsigned char )&img1[yy * w + xxd];`
			`unsigned int absdiff = 0;`

			`for (int k = 0; k < 4; k++) {`
			`absdiff += abs((int)(A[k] - B[k]));`
			`}`

			`cost += absdiff;`
			`}`
			`}`

			`if (cost < bestCost) {`
			`bestCost = cost;`
			`bestDisparity = d + 8;`
			`}`

			`} // end for disparities`

			`// store to best disparity`
			`odata[y * w + x] = bestDisparity;`
			`}`
			`}`
			`}`
			`#endif // #ifndef _STEREODISPARITY_KERNEL_H_`