/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Simple kernel computes a Stereo Disparity using CUDA SIMD SAD intrinsics. */

#ifndef _STEREODISPARITY_KERNEL_H_
#define _STEREODISPARITY_KERNEL_H_

#define blockSize_x 32
#define blockSize_y 8

// RAD is the radius of the region of support for the search
#define RAD 8

// STEPS is the number of loads we must perform to initialize the shared memory
// area (see convolution CUDA Sample for example)
#define STEPS 3

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

////////////////////////////////////////////////////////////////////////////////
// This function applies the video intrinsic operations to compute a
// sum of absolute differences. The absolute differences are computed
// and the optional .add instruction is used to sum the lanes.
//
// For more information, see also the documents:
//  "Using_Inline_PTX_Assembly_In_CUDA.pdf"
// and also the PTX ISA documentation for the architecture in question, e.g.:
//  "ptx_isa_3.0K.pdf"
// included in the NVIDIA GPU Computing Toolkit
////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int __usad4(unsigned int A, unsigned int B,
                                unsigned int C = 0) {
  unsigned int result;

  // Kepler (SM 3.x) and higher supports a 4-vector SAD SIMD instruction
  asm("vabsdiff4.u32.u32.u32.add"
      " %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(A), "r"(B), "r"(C));

  return result;
}
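
// A minimal host-side sketch of what __usad4() computes, added here only to
// document the semantics of the vabsdiff4 instruction above: the bytewise sum
// of absolute differences of the four packed 8-bit lanes of A and B, plus the
// accumulator C. This helper is not part of the original sample and the name
// usad4_reference is a placeholder.
inline unsigned int usad4_reference(unsigned int A, unsigned int B,
                                    unsigned int C = 0) {
  unsigned int sum = C;

  for (int k = 0; k < 4; k++) {
    // extract the k-th byte lane of each packed operand
    int a = (A >> (8 * k)) & 0xFF;
    int b = (B >> (8 * k)) & 0xFF;
    sum += (unsigned int)(a > b ? a - b : b - a);
  }

  return sum;
}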

////////////////////////////////////////////////////////////////////////////////
//! Simple stereo disparity kernel using the SIMD SAD intrinsic above
//! Algorithm Explanation:
//! For stereo disparity this performs a basic block matching scheme.
//! The sum of abs. diffs between an area around the candidate pixel in the
//! left image is computed against different horizontal shifts of areas from
//! the right image. The shift at which the difference is minimum is taken as
//! how far that pixel moved between the left/right image pair. The recovered
//! motion is the disparity map. More motion indicates more parallax, which
//! indicates a closer object.
//! @param g_img0 image 0 in global memory, RGBA, 4 bytes/pixel
//! @param g_img1 image 1 in global memory
//! @param g_odata disparity map output in global memory, unsigned int
//!                output/pixel
//! @param w image width in pixels
//! @param h image height in pixels
//! @param minDisparity leftmost search range
//! @param maxDisparity rightmost search range
////////////////////////////////////////////////////////////////////////////////
__global__ void stereoDisparityKernel(unsigned int *g_img0,
                                      unsigned int *g_img1,
                                      unsigned int *g_odata, int w, int h,
                                      int minDisparity, int maxDisparity,
                                      cudaTextureObject_t tex2Dleft,
                                      cudaTextureObject_t tex2Dright) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();

  // access thread id
  const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
  const int tidy = blockDim.y * blockIdx.y + threadIdx.y;
  const unsigned int sidx = threadIdx.x + RAD;
  const unsigned int sidy = threadIdx.y + RAD;

  unsigned int imLeft;
  unsigned int imRight;
  unsigned int cost;
  unsigned int bestCost = 9999999;
  unsigned int bestDisparity = 0;
  __shared__ unsigned int diff[blockSize_y + 2 * RAD][blockSize_x + 2 * RAD];

  // store needed values for left image into registers (constant indexed local
  // vars)
  unsigned int imLeftA[STEPS];
  unsigned int imLeftB[STEPS];

  for (int i = 0; i < STEPS; i++) {
    int offset = -RAD + i * RAD;
    imLeftA[i] = tex2D<unsigned int>(tex2Dleft, tidx - RAD, tidy + offset);
    imLeftB[i] = tex2D<unsigned int>(tex2Dleft, tidx - RAD + blockSize_x,
                                     tidy + offset);
  }

  // for a fixed camera system this could be hardcoded and loop unrolled
  for (int d = minDisparity; d <= maxDisparity; d++) {
    // LEFT
#pragma unroll
    for (int i = 0; i < STEPS; i++) {
      int offset = -RAD + i * RAD;
      // imLeft = tex2D( tex2Dleft, tidx-RAD, tidy+offset );
      imLeft = imLeftA[i];
      imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + d, tidy + offset);
      cost = __usad4(imLeft, imRight);
      diff[sidy + offset][sidx - RAD] = cost;
    }

    // RIGHT
#pragma unroll
    for (int i = 0; i < STEPS; i++) {
      int offset = -RAD + i * RAD;

      if (threadIdx.x < 2 * RAD) {
        // imLeft = tex2D( tex2Dleft, tidx-RAD+blockSize_x, tidy+offset );
        imLeft = imLeftB[i];
        imRight = tex2D<unsigned int>(tex2Dright,
                                      tidx - RAD + blockSize_x + d,
                                      tidy + offset);
        cost = __usad4(imLeft, imRight);
        diff[sidy + offset][sidx - RAD + blockSize_x] = cost;
      }
    }

    cg::sync(cta);

    // sum cost horizontally
#pragma unroll
    for (int j = 0; j < STEPS; j++) {
      int offset = -RAD + j * RAD;
      cost = 0;
#pragma unroll
      for (int i = -RAD; i <= RAD; i++) {
        cost += diff[sidy + offset][sidx + i];
      }

      cg::sync(cta);
      diff[sidy + offset][sidx] = cost;
      cg::sync(cta);
    }

    // sum cost vertically
    cost = 0;
#pragma unroll
    for (int i = -RAD; i <= RAD; i++) {
      cost += diff[sidy + i][sidx];
    }

    // see if it is better or not
    if (cost < bestCost) {
      bestCost = cost;
      bestDisparity = d + 8;
    }

    cg::sync(cta);
  }

  if (tidy < h && tidx < w) {
    g_odata[tidy * w + tidx] = bestDisparity;
  }
}
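
// A minimal launch sketch (not part of the original sample) showing the block
// shape the kernel above assumes: blockSize_x x blockSize_y threads per block
// and a grid that covers the w x h image. Creating the cudaTextureObject_t
// views of the two input images is omitted; d_img0, d_img1, d_odata, texLeft
// and texRight are placeholder names, not identifiers from the sample.
//
//   dim3 threads(blockSize_x, blockSize_y);
//   dim3 grid((w + blockSize_x - 1) / blockSize_x,
//             (h + blockSize_y - 1) / blockSize_y);
//   stereoDisparityKernel<<<grid, threads>>>(d_img0, d_img1, d_odata, w, h,
//                                            minDisparity, maxDisparity,
//                                            texLeft, texRight);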

void cpu_gold_stereo(unsigned int *img0, unsigned int *img1,
                     unsigned int *odata, int w, int h, int minDisparity,
                     int maxDisparity) {
  for (int y = 0; y < h; y++) {
    for (int x = 0; x < w; x++) {
      unsigned int bestCost = 9999999;
      unsigned int bestDisparity = 0;

      for (int d = minDisparity; d <= maxDisparity; d++) {
        unsigned int cost = 0;

        for (int i = -RAD; i <= RAD; i++) {
          for (int j = -RAD; j <= RAD; j++) {
            // border clamping
            int yy, xx, xxd;
            yy = y + i;

            if (yy < 0) yy = 0;

            if (yy >= h) yy = h - 1;

            xx = x + j;

            if (xx < 0) xx = 0;

            if (xx >= w) xx = w - 1;

            xxd = x + j + d;

            if (xxd < 0) xxd = 0;

            if (xxd >= w) xxd = w - 1;

            // sum abs diff across components
            unsigned char *A = (unsigned char *)&img0[yy * w + xx];
            unsigned char *B = (unsigned char *)&img1[yy * w + xxd];
            unsigned int absdiff = 0;

            for (int k = 0; k < 4; k++) {
              absdiff += abs((int)(A[k] - B[k]));
            }

            cost += absdiff;
          }
        }

        if (cost < bestCost) {
          bestCost = cost;
          bestDisparity = d + 8;
        }
      }  // end for disparities

      // store to best disparity
      odata[y * w + x] = bestDisparity;
    }
  }
}
#endif  // #ifndef _STEREODISPARITY_KERNEL_H_