/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <assert.h>
#include <helper_cuda.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;
#include "convolutionSeparable_common.h"

////////////////////////////////////////////////////////////////////////////////
// Convolution kernel storage
////////////////////////////////////////////////////////////////////////////////
__constant__ float c_Kernel[KERNEL_LENGTH];

extern "C" void setConvolutionKernel(float *h_Kernel) {
  cudaMemcpyToSymbol(c_Kernel, h_Kernel, KERNEL_LENGTH * sizeof(float));
}
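
// Usage sketch (not part of the original sample): the host is expected to fill
// a KERNEL_LENGTH-tap filter and upload it once before launching either
// convolution pass. A truncated, normalized Gaussian is a typical choice;
// `sigma` below is an assumed, caller-chosen parameter.
//
//   float h_Kernel[KERNEL_LENGTH];
//   float sum = 0.0f;
//   for (int i = 0; i < KERNEL_LENGTH; i++) {
//     float x = (float)(i - KERNEL_RADIUS);
//     h_Kernel[i] = expf(-x * x / (2.0f * sigma * sigma));
//     sum += h_Kernel[i];
//   }
//   for (int i = 0; i < KERNEL_LENGTH; i++) {
//     h_Kernel[i] /= sum;  // normalize so the filter preserves mean intensity
//   }
//   setConvolutionKernel(h_Kernel);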

////////////////////////////////////////////////////////////////////////////////
// Row convolution filter
////////////////////////////////////////////////////////////////////////////////
#define ROWS_BLOCKDIM_X 16
#define ROWS_BLOCKDIM_Y 4
#define ROWS_RESULT_STEPS 8
#define ROWS_HALO_STEPS 1
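
// Tile geometry (worked out from the default constants above): each 16x4
// thread block produces ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X = 8 * 16 = 128
// output pixels per row, i.e. a 128x4 output tile, and stages
// (8 + 2 * 1) * 16 = 160 floats per row in shared memory so that one 16-pixel
// halo tile is available on each side of the results.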

__global__ void convolutionRowsKernel(float *d_Dst, float *d_Src, int imageW,
                                      int imageH, int pitch) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ float
      s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) *
                              ROWS_BLOCKDIM_X];

  // Offset to the left halo edge
  const int baseX =
      (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X +
      threadIdx.x;
  const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;

  d_Src += baseY * pitch + baseX;
  d_Dst += baseY * pitch + baseX;

  // Load main data
#pragma unroll

  for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
    s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
        d_Src[i * ROWS_BLOCKDIM_X];
  }

  // Load left halo
#pragma unroll

  for (int i = 0; i < ROWS_HALO_STEPS; i++) {
    s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
        (baseX >= -i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
  }

  // Load right halo
#pragma unroll

  for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS;
       i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) {
    s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
        (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
  }

  // Compute and store results
  cg::sync(cta);
#pragma unroll

  for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
    float sum = 0;

#pragma unroll

    for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
      sum += c_Kernel[KERNEL_RADIUS - j] *
             s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
    }

    d_Dst[i * ROWS_BLOCKDIM_X] = sum;
  }
}

extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW,
                                   int imageH) {
  assert(ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS);
  assert(imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0);
  assert(imageH % ROWS_BLOCKDIM_Y == 0);

  dim3 blocks(imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X),
              imageH / ROWS_BLOCKDIM_Y);
  dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y);

  convolutionRowsKernel<<<blocks, threads>>>(d_Dst, d_Src, imageW, imageH,
                                             imageW);
  getLastCudaError("convolutionRowsKernel() execution failed\n");
}
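
// Launch arithmetic (a worked example assuming the default constants above):
// a 3072x3072 image launches a grid of 3072 / (8 * 16) = 24 by 3072 / 4 = 768
// blocks of 16x4 threads. The pitch argument is passed as imageW, i.e. this
// wrapper assumes a tightly packed row-major float image.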

////////////////////////////////////////////////////////////////////////////////
// Column convolution filter
////////////////////////////////////////////////////////////////////////////////
#define COLUMNS_BLOCKDIM_X 16
#define COLUMNS_BLOCKDIM_Y 8
#define COLUMNS_RESULT_STEPS 8
#define COLUMNS_HALO_STEPS 1
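
// Tile geometry (worked out from the default constants above): each 16x8
// thread block produces a 16-wide by 8 * 8 = 64-tall output tile and stages a
// 16 x ((8 + 2 * 1) * 8 + 1) = 16 x 81 float shared-memory array; the extra
// "+ 1" element of per-row padding presumably staggers consecutive rows so
// that column-wise accesses do not all hit the same shared-memory bank.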

__global__ void convolutionColumnsKernel(float *d_Dst, float *d_Src, int imageW,
                                         int imageH, int pitch) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ float s_Data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS +
                                               2 * COLUMNS_HALO_STEPS) *
                                                  COLUMNS_BLOCKDIM_Y +
                                              1];

  // Offset to the upper halo edge
  const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x;
  const int baseY = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) *
                        COLUMNS_BLOCKDIM_Y +
                    threadIdx.y;
  d_Src += baseY * pitch + baseX;
  d_Dst += baseY * pitch + baseX;

  // Main data
#pragma unroll

  for (int i = COLUMNS_HALO_STEPS;
       i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
        d_Src[i * COLUMNS_BLOCKDIM_Y * pitch];
  }

  // Upper halo
#pragma unroll

  for (int i = 0; i < COLUMNS_HALO_STEPS; i++) {
    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
        (baseY >= -i * COLUMNS_BLOCKDIM_Y)
            ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]
            : 0;
  }

  // Lower halo
#pragma unroll

  for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS;
       i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS;
       i++) {
    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
        (imageH - baseY > i * COLUMNS_BLOCKDIM_Y)
            ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]
            : 0;
  }

  // Compute and store results
  cg::sync(cta);
#pragma unroll

  for (int i = COLUMNS_HALO_STEPS;
       i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
    float sum = 0;
#pragma unroll

    for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
      sum += c_Kernel[KERNEL_RADIUS - j] *
             s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];
    }

    d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum;
  }
}

extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW,
                                      int imageH) {
  assert(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS);
  assert(imageW % COLUMNS_BLOCKDIM_X == 0);
  assert(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0);

  dim3 blocks(imageW / COLUMNS_BLOCKDIM_X,
              imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y));
  dim3 threads(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y);

  convolutionColumnsKernel<<<blocks, threads>>>(d_Dst, d_Src, imageW, imageH,
                                                imageW);
  getLastCudaError("convolutionColumnsKernel() execution failed\n");
}
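
// End-to-end sketch (assumed host-side usage; d_Input, d_Buffer and d_Output
// are hypothetical names): a full 2D separable convolution runs the row pass
// into an intermediate buffer and then the column pass from that buffer, with
// both passes reusing the same constant-memory filter.
//
//   float *d_Input, *d_Buffer, *d_Output;
//   checkCudaErrors(cudaMalloc((void **)&d_Input, imageW * imageH * sizeof(float)));
//   checkCudaErrors(cudaMalloc((void **)&d_Buffer, imageW * imageH * sizeof(float)));
//   checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)));
//   // ... upload the image into d_Input and build h_Kernel ...
//   setConvolutionKernel(h_Kernel);
//   convolutionRowsGPU(d_Buffer, d_Input, imageW, imageH);
//   convolutionColumnsGPU(d_Output, d_Buffer, imageW, imageH);
//   checkCudaErrors(cudaDeviceSynchronize());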