cuda-samples/Samples/convolutionSeparable/convolutionSeparable.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include <helper_cuda.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include "convolutionSeparable_common.h"
////////////////////////////////////////////////////////////////////////////////
// Convolution kernel storage
////////////////////////////////////////////////////////////////////////////////
__constant__ float c_Kernel[KERNEL_LENGTH];
extern "C" void setConvolutionKernel(float *h_Kernel) {
cudaMemcpyToSymbol(c_Kernel, h_Kernel, KERNEL_LENGTH * sizeof(float));
}
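// A minimal usage sketch (illustrative, not part of the original sample):
// filling the KERNEL_LENGTH taps with a normalized box filter and uploading
// them. The helper name is hypothetical; any KERNEL_LENGTH-tap filter is
// uploaded the same way.
static void setBoxKernelExample(void) {
  float h_Kernel[KERNEL_LENGTH];

  for (int i = 0; i < KERNEL_LENGTH; i++) {
    h_Kernel[i] = 1.0f / (float)KERNEL_LENGTH;  // uniform taps, sum to 1
  }

  setConvolutionKernel(h_Kernel);
}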
////////////////////////////////////////////////////////////////////////////////
// Row convolution filter
////////////////////////////////////////////////////////////////////////////////
#define ROWS_BLOCKDIM_X 16
#define ROWS_BLOCKDIM_Y 4
#define ROWS_RESULT_STEPS 8
#define ROWS_HALO_STEPS 1
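// Tiling geometry: each block computes ROWS_RESULT_STEPS output tiles of
// ROWS_BLOCKDIM_X pixels per row, plus ROWS_HALO_STEPS halo tiles on each
// side, so every thread loads and stores multiple pixels. The halo must
// cover the filter reach, hence the host-side assertion
// ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS.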
__global__ void convolutionRowsKernel(float *d_Dst, float *d_Src, int imageW,
int imageH, int pitch) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float
s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) *
ROWS_BLOCKDIM_X];
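  // Consecutive threadIdx.x values map to consecutive shared-memory words
  // here, so accesses are already bank-conflict-free and no padding column
  // is needed (contrast with the columns kernel below).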
// Offset to the left halo edge
const int baseX =
(blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X +
threadIdx.x;
const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;
d_Src += baseY * pitch + baseX;
d_Dst += baseY * pitch + baseX;
// Load main data
#pragma unroll
for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
d_Src[i * ROWS_BLOCKDIM_X];
}
// Load left halo
#pragma unroll
for (int i = 0; i < ROWS_HALO_STEPS; i++) {
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
(baseX >= -i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
}
// Load right halo
#pragma unroll
for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS;
i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) {
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
(imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
}
// Compute and store results
cg::sync(cta);
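  // After the barrier the whole tile (main data plus halos) is resident in
  // shared memory, so each thread can safely read its neighbors while
  // convolving its ROWS_RESULT_STEPS output pixels against the filter taps
  // held in constant memory.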
#pragma unroll
for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
float sum = 0;
#pragma unroll
for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
sum += c_Kernel[KERNEL_RADIUS - j] *
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
}
d_Dst[i * ROWS_BLOCKDIM_X] = sum;
}
}
extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW,
int imageH) {
assert(ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS);
assert(imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0);
assert(imageH % ROWS_BLOCKDIM_Y == 0);
dim3 blocks(imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X),
imageH / ROWS_BLOCKDIM_Y);
dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y);
convolutionRowsKernel<<<blocks, threads>>>(d_Dst, d_Src, imageW, imageH,
imageW);
getLastCudaError("convolutionRowsKernel() execution failed\n");
}
////////////////////////////////////////////////////////////////////////////////
// Column convolution filter
////////////////////////////////////////////////////////////////////////////////
#define COLUMNS_BLOCKDIM_X 16
#define COLUMNS_BLOCKDIM_Y 8
#define COLUMNS_RESULT_STEPS 8
#define COLUMNS_HALO_STEPS 1
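// Same tiling idea as the rows kernel, transposed: each block computes
// COLUMNS_RESULT_STEPS tiles of COLUMNS_BLOCKDIM_Y pixels per column, with
// COLUMNS_HALO_STEPS halo tiles above and below; the halo must satisfy
// COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS.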
__global__ void convolutionColumnsKernel(float *d_Dst, float *d_Src, int imageW,
int imageH, int pitch) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float s_Data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS +
2 * COLUMNS_HALO_STEPS) *
COLUMNS_BLOCKDIM_Y +
1];
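  // The +1 padding column staggers the rows of s_Data so that threads with
  // consecutive threadIdx.x, which index the first dimension here, hit
  // different shared-memory banks instead of conflicting.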
// Offset to the upper halo edge
const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x;
const int baseY = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) *
COLUMNS_BLOCKDIM_Y +
threadIdx.y;
d_Src += baseY * pitch + baseX;
d_Dst += baseY * pitch + baseX;
// Load main data
#pragma unroll
for (int i = COLUMNS_HALO_STEPS;
i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
d_Src[i * COLUMNS_BLOCKDIM_Y * pitch];
}
// Load upper halo
#pragma unroll
for (int i = 0; i < COLUMNS_HALO_STEPS; i++) {
s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
(baseY >= -i * COLUMNS_BLOCKDIM_Y)
? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]
: 0;
}
// Load lower halo
#pragma unroll
for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS;
i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS;
i++) {
s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
(imageH - baseY > i * COLUMNS_BLOCKDIM_Y)
? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]
: 0;
}
// Compute and store results
cg::sync(cta);
#pragma unroll
for (int i = COLUMNS_HALO_STEPS;
i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
float sum = 0;
#pragma unroll
for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
sum += c_Kernel[KERNEL_RADIUS - j] *
s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];
}
d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum;
}
}
extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW,
int imageH) {
assert(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS);
assert(imageW % COLUMNS_BLOCKDIM_X == 0);
assert(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0);
dim3 blocks(imageW / COLUMNS_BLOCKDIM_X,
imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y));
dim3 threads(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y);
convolutionColumnsKernel<<<blocks, threads>>>(d_Dst, d_Src, imageW, imageH,
imageW);
getLastCudaError("convolutionColumnsKernel() execution failed\n");
}
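// End-to-end usage sketch (illustrative, not part of the original sample).
// Buffer names and the function name are assumptions; it presumes
// setConvolutionKernel() was already called, and that imageW/imageH satisfy
// the divisibility asserts of both wrappers (e.g. 3072 x 3072). A separable
// 2D convolution is the row pass into a temporary image followed by the
// column pass.
static void exampleSeparableConvolution(const float *h_Input, float *h_Output,
                                        int imageW, int imageH) {
  float *d_Input, *d_Buffer, *d_Output;
  size_t bytes = (size_t)imageW * imageH * sizeof(float);

  checkCudaErrors(cudaMalloc((void **)&d_Input, bytes));
  checkCudaErrors(cudaMalloc((void **)&d_Buffer, bytes));
  checkCudaErrors(cudaMalloc((void **)&d_Output, bytes));
  checkCudaErrors(cudaMemcpy(d_Input, h_Input, bytes, cudaMemcpyHostToDevice));

  convolutionRowsGPU(d_Buffer, d_Input, imageW, imageH);      // horizontal pass
  convolutionColumnsGPU(d_Output, d_Buffer, imageW, imageH);  // vertical pass
  checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(
      cudaMemcpy(h_Output, d_Output, bytes, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaFree(d_Output));
  checkCudaErrors(cudaFree(d_Buffer));
  checkCudaErrors(cudaFree(d_Input));
}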