/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <cooperative_groups.h>
#include <helper_cuda.h>

namespace cg = cooperative_groups;
#include "convolutionSeparable_common.h"

////////////////////////////////////////////////////////////////////////////////
// Convolution kernel storage
////////////////////////////////////////////////////////////////////////////////
__constant__ float c_Kernel[KERNEL_LENGTH];

extern "C" void setConvolutionKernel(float *h_Kernel) {
  // Copy the filter coefficients into constant memory once; both kernels
  // read them from there.
  cudaMemcpyToSymbol(c_Kernel, h_Kernel, KERNEL_LENGTH * sizeof(float));
}

////////////////////////////////////////////////////////////////////////////////
// Row convolution filter
////////////////////////////////////////////////////////////////////////////////
#define ROWS_BLOCKDIM_X 16
#define ROWS_BLOCKDIM_Y 4
#define ROWS_RESULT_STEPS 8
#define ROWS_HALO_STEPS 1

__global__ void convolutionRowsKernel(float *d_Dst, float *d_Src, int imageW,
                                      int imageH, int pitch) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ float
      s_Data[ROWS_BLOCKDIM_Y]
            [(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];

  // Offset to the left halo edge
  const int baseX =
      (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X +
      threadIdx.x;
  const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;

  d_Src += baseY * pitch + baseX;
  d_Dst += baseY * pitch + baseX;

  // Load main data
#pragma unroll
  for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
    s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
        d_Src[i * ROWS_BLOCKDIM_X];
  }

  // Load left halo (zero outside the image)
#pragma unroll
  for (int i = 0; i < ROWS_HALO_STEPS; i++) {
    s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
        (baseX >= -i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
  }

  // Load right halo (zero outside the image)
#pragma unroll
  for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS;
       i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) {
    s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
        (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X]
                                               : 0;
  }

  // Compute and store results
  cg::sync(cta);
#pragma unroll
  for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
    float sum = 0;
#pragma unroll
    for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
      sum += c_Kernel[KERNEL_RADIUS - j] *
             s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
    }

    d_Dst[i * ROWS_BLOCKDIM_X] = sum;
  }
}
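/*
 * Illustrative reference only, not part of the original sample: a scalar
 * sketch of what convolutionRowsKernel computes per pixel, using the same
 * KERNEL_RADIUS and coefficient layout (c_Kernel[KERNEL_RADIUS - j]).
 * Out-of-range taps are treated as zero, matching the halo loads above.
 * The function name is hypothetical.
 */
static void convolutionRowsReferenceSketch(float *h_Dst, const float *h_Src,
                                           const float *h_Kernel, int imageW,
                                           int imageH) {
  for (int y = 0; y < imageH; y++) {
    for (int x = 0; x < imageW; x++) {
      float sum = 0;

      for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
        const int sx = x + j;

        if (sx >= 0 && sx < imageW) {
          sum += h_Kernel[KERNEL_RADIUS - j] * h_Src[y * imageW + sx];
        }
      }

      h_Dst[y * imageW + x] = sum;
    }
  }
}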
extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW,
                                   int imageH) {
  assert(ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS);
  assert(imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0);
  assert(imageH % ROWS_BLOCKDIM_Y == 0);

  dim3 blocks(imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X),
              imageH / ROWS_BLOCKDIM_Y);
  dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y);

  convolutionRowsKernel<<<blocks, threads>>>(d_Dst, d_Src, imageW, imageH,
                                             imageW);
  getLastCudaError("convolutionRowsKernel() execution failed\n");
}

////////////////////////////////////////////////////////////////////////////////
// Column convolution filter
////////////////////////////////////////////////////////////////////////////////
#define COLUMNS_BLOCKDIM_X 16
#define COLUMNS_BLOCKDIM_Y 8
#define COLUMNS_RESULT_STEPS 8
#define COLUMNS_HALO_STEPS 1

__global__ void convolutionColumnsKernel(float *d_Dst, float *d_Src,
                                         int imageW, int imageH, int pitch) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  // The +1 column of padding avoids shared memory bank conflicts for the
  // column-wise access pattern.
  __shared__ float
      s_Data[COLUMNS_BLOCKDIM_X]
            [(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) *
                 COLUMNS_BLOCKDIM_Y +
             1];

  // Offset to the upper halo edge
  const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x;
  const int baseY =
      (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) *
          COLUMNS_BLOCKDIM_Y +
      threadIdx.y;
  d_Src += baseY * pitch + baseX;
  d_Dst += baseY * pitch + baseX;

  // Main data
#pragma unroll
  for (int i = COLUMNS_HALO_STEPS;
       i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
        d_Src[i * COLUMNS_BLOCKDIM_Y * pitch];
  }

  // Upper halo (zero outside the image)
#pragma unroll
  for (int i = 0; i < COLUMNS_HALO_STEPS; i++) {
    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
        (baseY >= -i * COLUMNS_BLOCKDIM_Y)
            ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]
            : 0;
  }

  // Lower halo (zero outside the image)
#pragma unroll
  for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS;
       i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS;
       i++) {
    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
        (imageH - baseY > i * COLUMNS_BLOCKDIM_Y)
            ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]
            : 0;
  }

  // Compute and store results
  cg::sync(cta);
#pragma unroll
  for (int i = COLUMNS_HALO_STEPS;
       i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
    float sum = 0;
#pragma unroll
    for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
      sum += c_Kernel[KERNEL_RADIUS - j] *
             s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];
    }

    d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum;
  }
}

extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW,
                                      int imageH) {
  assert(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS);
  assert(imageW % COLUMNS_BLOCKDIM_X == 0);
  assert(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0);

  dim3 blocks(imageW / COLUMNS_BLOCKDIM_X,
              imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y));
  dim3 threads(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y);

  convolutionColumnsKernel<<<blocks, threads>>>(d_Dst, d_Src, imageW, imageH,
                                                imageW);
  getLastCudaError("convolutionColumnsKernel() execution failed\n");
}
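
/*
 * Usage sketch, not part of the original sample: one plausible way a host
 * program could drive the three entry points above. KERNEL_LENGTH and
 * KERNEL_RADIUS come from convolutionSeparable_common.h; checkCudaErrors is
 * from helper_cuda.h. The guard macro, the function name, and the box-filter
 * coefficients are all assumptions made for illustration; compile with
 * -DCONVOLUTION_SEPARABLE_USAGE_EXAMPLE to include it.
 */
#ifdef CONVOLUTION_SEPARABLE_USAGE_EXAMPLE
extern "C" void exampleSeparableConvolution(int imageW, int imageH) {
  // imageW and imageH must satisfy the asserts in the wrappers above:
  // imageW divisible by ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X (128) and by
  // COLUMNS_BLOCKDIM_X (16); imageH divisible by ROWS_BLOCKDIM_Y (4) and by
  // COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y (64).
  float h_Kernel[KERNEL_LENGTH];

  for (int i = 0; i < KERNEL_LENGTH; i++) {
    h_Kernel[i] = 1.0f / KERNEL_LENGTH;  // simple box filter for illustration
  }

  setConvolutionKernel(h_Kernel);

  float *d_Input, *d_Buffer, *d_Output;
  const size_t bytes = (size_t)imageW * imageH * sizeof(float);
  checkCudaErrors(cudaMalloc((void **)&d_Input, bytes));
  checkCudaErrors(cudaMalloc((void **)&d_Buffer, bytes));
  checkCudaErrors(cudaMalloc((void **)&d_Output, bytes));
  // ... fill d_Input with image data, e.g. cudaMemcpy from host ...

  // The row pass writes into the intermediate buffer; the column pass
  // consumes it, completing the separable 2D convolution.
  convolutionRowsGPU(d_Buffer, d_Input, imageW, imageH);
  convolutionColumnsGPU(d_Output, d_Buffer, imageW, imageH);
  checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaFree(d_Output));
  checkCudaErrors(cudaFree(d_Buffer));
  checkCudaErrors(cudaFree(d_Input));
}
#endif  // CONVOLUTION_SEPARABLE_USAGE_EXAMPLE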