// cuda-samples/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity_kernel.cuh
// Repository listing metadata: 2022-01-13 11:35:24 +05:30, 253 lines, 8.4 KiB,
// Plaintext

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
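/* Simple kernel that computes a stereo disparity map using a CUDA SIMD SAD
 * (sum of absolute differences) instruction, plus a CPU reference
 * implementation of the same computation. */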
#ifndef _STEREODISPARITY_KERNEL_H_
#define _STEREODISPARITY_KERNEL_H_
// Thread-block tile dimensions; the kernel below sizes its shared-memory tile
// from these two values.
#define blockSize_x 32
#define blockSize_y 8
// RAD is the radius of the region of support for the search: the matching
// window runs from -RAD to +RAD (2 * RAD + 1 pixels) in each direction.
#define RAD 8
// STEPS is the number of loads each thread must perform to initialize the
// shared memory area; the loads are made at row offsets -RAD, 0 and +RAD
// (see convolution CUDA Sample for example)
#define STEPS 3
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
////////////////////////////////////////////////////////////////////////////////
// Four-lane sum of absolute differences (SAD) implemented with a single PTX
// video instruction issued through inline assembly.
//
// A and B are each treated as four packed unsigned bytes. The instruction
// takes the absolute difference of every byte pair, and its optional .add
// modifier sums the four lane results together with the accumulator C.
//
// Background reading:
// "Using_Inline_PTX_Assembly_In_CUDA.pdf"
// and the PTX ISA documentation for the target architecture, e.g.:
// "ptx_isa_3.0K.pdf"
// both shipped with the NVIDIA GPU Computing Toolkit.
//
// A: first operand, four packed bytes
// B: second operand, four packed bytes
// C: value added to the summed differences (defaults to 0)
// returns: C plus the four per-byte absolute differences of A and B
////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int __usad4(unsigned int A, unsigned int B,
                                unsigned int C = 0) {
  unsigned int sad;

  // Kepler (SM 3.x) and higher supports a 4 vector SAD SIMD
  asm("vabsdiff4.u32.u32.u32.add %0, %1, %2, %3;"
      : "=r"(sad)
      : "r"(A), "r"(B), "r"(C));

  return sad;
}
////////////////////////////////////////////////////////////////////////////////
//! Simple stereo disparity kernel built on the __usad4 SIMD SAD helper.
//! Algorithm Explanation:
//! For stereo disparity this performs a basic block matching scheme.
//! The sum of absolute differences (SAD) over an area around the candidate
//! pixel in the left image is computed against different horizontal shifts of
//! that area in the right image.
//! The shift at which the difference is minimum is taken as how far that pixel
//! moved between left/right image pairs. The recovered motion is the
//! disparity map.
//! More motion indicates more parallax indicates a closer object.
//!
//! Per disparity, the block fills a shared tile with per-pixel SAD values,
//! sums it horizontally in place, then each thread sums its column vertically
//! to obtain the cost of the (2 * RAD + 1) x (2 * RAD + 1) window.
//! No thread exits early, so every thread of the block reaches every barrier;
//! only the final store is guarded by the image bounds.
//! NOTE(review): the shared tile is sized from blockSize_x / blockSize_y, so
//! the launch presumably has to use exactly that block shape - confirm against
//! the host code.
//!
//! @param g_img0 image 0 in global memory, RGBA, 4 bytes/pixel (not referenced
//! in the kernel body; pixels are fetched through tex2Dleft)
//! @param g_img1 image 1 in global memory (not referenced in the kernel body;
//! pixels are fetched through tex2Dright)
//! @param g_odata disparity map output in global memory, unsigned int
//! output/pixel, indexed as tidy * w + tidx
//! @param w image width in pixels
//! @param h image height in pixels
//! @param minDisparity leftmost search range (inclusive)
//! @param maxDisparity rightmost search range (inclusive)
//! @param tex2Dleft texture object the left image is sampled from
//! @param tex2Dright texture object the shifted right image is sampled from
////////////////////////////////////////////////////////////////////////////////
__global__ void stereoDisparityKernel(unsigned int *g_img0,
unsigned int *g_img1,
unsigned int *g_odata, int w, int h,
int minDisparity, int maxDisparity,
cudaTextureObject_t tex2Dleft,
cudaTextureObject_t tex2Dright) {
// Handle to thread block group (used for the block-wide barriers below)
cg::thread_block cta = cg::this_thread_block();
// Global pixel coordinates handled by this thread
const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
const int tidy = blockDim.y * blockIdx.y + threadIdx.y;
// Position of this thread's pixel inside the shared tile, shifted past the
// RAD-wide apron
const unsigned int sidx = threadIdx.x + RAD;
const unsigned int sidy = threadIdx.y + RAD;
unsigned int imLeft;
unsigned int imRight;
unsigned int cost;
// Sentinel start value; any window cost below it replaces it
unsigned int bestCost = 9999999;
unsigned int bestDisparity = 0;
// Per-pixel SAD values for the block's pixels plus a RAD-wide apron on every
// side; the horizontal pass later overwrites part of it with row sums
__shared__ unsigned int diff[blockSize_y + 2 * RAD][blockSize_x + 2 * RAD];
// store needed values for left image into registers (constant indexed local
// vars); these do not depend on the disparity, so they are fetched once.
// imLeftA holds the samples at column tidx - RAD, imLeftB the samples one
// block width further right, each at row offsets -RAD, 0 and +RAD.
// NOTE(review): coordinates can fall outside the image; how such fetches
// behave depends on the texture objects' configuration, which is not visible
// here.
unsigned int imLeftA[STEPS];
unsigned int imLeftB[STEPS];
for (int i = 0; i < STEPS; i++) {
int offset = -RAD + i * RAD;
imLeftA[i] = tex2D<unsigned int>(tex2Dleft, tidx - RAD, tidy + offset);
imLeftB[i] =
tex2D<unsigned int>(tex2Dleft, tidx - RAD + blockSize_x, tidy + offset);
}
// for a fixed camera system this could be hardcoded and loop unrolled
for (int d = minDisparity; d <= maxDisparity; d++) {
// LEFT: every thread fills tile column threadIdx.x (= sidx - RAD) in rows
// threadIdx.y, threadIdx.y + RAD and threadIdx.y + 2 * RAD with the SAD
// between the cached left sample and the right sample shifted by d
#pragma unroll
for (int i = 0; i < STEPS; i++) {
int offset = -RAD + i * RAD;
// imLeft = tex2D( tex2Dleft, tidx-RAD, tidy+offset );
imLeft = imLeftA[i];
imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + d, tidy + offset);
cost = __usad4(imLeft, imRight);
diff[sidy + offset][sidx - RAD] = cost;
}
// RIGHT: the first 2 * RAD threads of each row additionally fill the
// remaining 2 * RAD tile columns to the right of the first blockSize_x ones
#pragma unroll
for (int i = 0; i < STEPS; i++) {
int offset = -RAD + i * RAD;
if (threadIdx.x < 2 * RAD) {
// imLeft = tex2D( tex2Dleft, tidx-RAD+blockSize_x, tidy+offset );
imLeft = imLeftB[i];
imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + blockSize_x + d,
tidy + offset);
cost = __usad4(imLeft, imRight);
diff[sidy + offset][sidx - RAD + blockSize_x] = cost;
}
}
// The whole tile must be written before any thread reads its neighbours'
// entries
cg::sync(cta);
// sum cost horizontally: for each of the three rows this thread owns, add up
// the 2 * RAD + 1 entries centred on column sidx and store the sum in place
#pragma unroll
for (int j = 0; j < STEPS; j++) {
int offset = -RAD + j * RAD;
cost = 0;
#pragma unroll
for (int i = -RAD; i <= RAD; i++) {
cost += diff[sidy + offset][sidx + i];
}
// All threads must finish reading the raw row values before any of them is
// overwritten with a row sum
cg::sync(cta);
diff[sidy + offset][sidx] = cost;
// Make the row sums visible to the whole block before they are read
cg::sync(cta);
}
// sum cost vertically: adding 2 * RAD + 1 row sums in column sidx gives the
// SAD over the full square window around this thread's pixel
cost = 0;
#pragma unroll
for (int i = -RAD; i <= RAD; i++) {
cost += diff[sidy + i][sidx];
}
// see if it is better or not; the strict comparison keeps the earliest
// disparity on ties.
// NOTE(review): the stored value is d + 8, the same bias cpu_gold_stereo
// applies; the reason for the constant 8 is not stated here - confirm.
if (cost < bestCost) {
bestCost = cost;
bestDisparity = d + 8;
}
// Keep the tile intact until every thread has finished its vertical sum; the
// next iteration overwrites it
cg::sync(cta);
}
// Only threads whose pixel lies inside the image write a result
if (tidy < h && tidx < w) {
g_odata[tidy * w + tidx] = bestDisparity;
}
}
// CPU reference implementation of the block-matching stereo disparity search.
//
// For every pixel (x, y) and every disparity d in [minDisparity, maxDisparity]
// the cost is the sum, over the (2 * RAD + 1) x (2 * RAD + 1) window centred
// on the pixel, of the per-byte absolute differences between the img0 pixel
// and the img1 pixel d columns to its right. Window coordinates are clamped
// to the image borders. The disparity with the lowest cost wins (the earliest
// one on ties) and is stored as d + 8.
//
// img0:  reference image, w * h pixels, 4 bytes per pixel
// img1:  image searched along the horizontal shift, same layout as img0
// odata: output, one unsigned int per pixel, indexed as y * w + x
// w, h:  image width and height in pixels
// minDisparity, maxDisparity: inclusive range of shifts to test
void cpu_gold_stereo(unsigned int *img0, unsigned int *img1,
                     unsigned int *odata, int w, int h, int minDisparity,
                     int maxDisparity) {
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      unsigned int lowestCost = 9999999;
      unsigned int chosenDisparity = 0;

      for (int shift = minDisparity; shift <= maxDisparity; ++shift) {
        unsigned int windowCost = 0;

        for (int dy = -RAD; dy <= RAD; ++dy) {
          // border clamping of the window row, done once per row
          int srcRow = row + dy;
          srcRow = (srcRow < 0) ? 0 : ((srcRow >= h) ? h - 1 : srcRow);
          const unsigned int *leftRow = img0 + srcRow * w;
          const unsigned int *rightRow = img1 + srcRow * w;

          for (int dx = -RAD; dx <= RAD; ++dx) {
            // border clamping of the column in each image
            int leftCol = col + dx;
            leftCol = (leftCol < 0) ? 0 : ((leftCol >= w) ? w - 1 : leftCol);
            int rightCol = col + dx + shift;
            rightCol =
                (rightCol < 0) ? 0 : ((rightCol >= w) ? w - 1 : rightCol);

            // sum abs diff across the four byte components of the pixel
            const unsigned char *lhs =
                reinterpret_cast<const unsigned char *>(leftRow + leftCol);
            const unsigned char *rhs =
                reinterpret_cast<const unsigned char *>(rightRow + rightCol);
            for (int c = 0; c < 4; ++c) {
              const int delta = lhs[c] - rhs[c];
              windowCost += static_cast<unsigned int>(delta < 0 ? -delta : delta);
            }
          }
        }

        // strict comparison: the first disparity reaching the minimum is kept
        if (windowCost < lowestCost) {
          lowestCost = windowCost;
          chosenDisparity = shift + 8;
        }
      }  // end for disparities

      // store the best disparity for this pixel
      odata[row * w + col] = chosenDisparity;
    }
  }
}
#endif // #ifndef _STEREODISPARITY_KERNEL_H_