cuda-samples/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cuh

422 lines
13 KiB
Plaintext
Raw Normal View History

2022-01-13 14:05:24 +08:00
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2021-10-21 19:04:49 +08:00
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define USE_TEXTURE 1
#define POWER_OF_TWO 1
#if (USE_TEXTURE)
#define LOAD_FLOAT(i) tex1Dfetch<float>(texFloat, i)
#define SET_FLOAT_BASE
#else
#define LOAD_FLOAT(i) d_Src[i]
#define SET_FLOAT_BASE
#endif
////////////////////////////////////////////////////////////////////////////////
/// Position convolution kernel center at (0, 0) in the image
////////////////////////////////////////////////////////////////////////////////
__global__ void padKernel_kernel(float *d_Dst, float *d_Src, int fftH, int fftW,
int kernelH, int kernelW, int kernelY,
int kernelX
#if (USE_TEXTURE)
,
cudaTextureObject_t texFloat
#endif
) {
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
if (y < kernelH && x < kernelW) {
int ky = y - kernelY;
if (ky < 0) {
ky += fftH;
}
int kx = x - kernelX;
if (kx < 0) {
kx += fftW;
}
d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x);
}
}
////////////////////////////////////////////////////////////////////////////////
// Prepare data for "pad to border" addressing mode
////////////////////////////////////////////////////////////////////////////////
__global__ void padDataClampToBorder_kernel(float *d_Dst, float *d_Src,
int fftH, int fftW, int dataH,
int dataW, int kernelH, int kernelW,
int kernelY, int kernelX
#if (USE_TEXTURE)
,
cudaTextureObject_t texFloat
#endif
) {
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int borderH = dataH + kernelY;
const int borderW = dataW + kernelX;
if (y < fftH && x < fftW) {
int dy, dx;
if (y < dataH) {
dy = y;
}
if (x < dataW) {
dx = x;
}
if (y >= dataH && y < borderH) {
dy = dataH - 1;
}
if (x >= dataW && x < borderW) {
dx = dataW - 1;
}
if (y >= borderH) {
dy = 0;
}
if (x >= borderW) {
dx = 0;
}
d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx);
}
}
////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////
inline __device__ void mulAndScale(fComplex &a, const fComplex &b,
const float &c) {
fComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
a = t;
}
__global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src,
int dataSize, float c) {
const int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= dataSize) {
return;
}
fComplex a = d_Src[i];
fComplex b = d_Dst[i];
mulAndScale(a, b, c);
d_Dst[i] = a;
}
////////////////////////////////////////////////////////////////////////////////
// 2D R2C / C2R post/preprocessing kernels
////////////////////////////////////////////////////////////////////////////////
#if (USE_TEXTURE)
#define LOAD_FCOMPLEX(i) tex1Dfetch<fComplex>(texComplex, i)
#define LOAD_FCOMPLEX_A(i) tex1Dfetch<fComplex>(texComplexA, i)
#define LOAD_FCOMPLEX_B(i) tex1Dfetch<fComplex>(texComplexB, i)
#define SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#else
#define LOAD_FCOMPLEX(i) d_Src[i]
#define LOAD_FCOMPLEX_A(i) d_SrcA[i]
#define LOAD_FCOMPLEX_B(i) d_SrcB[i]
#define SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#endif
inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2,
const fComplex &twiddle) {
float A1 = 0.5f * (D1.x + D2.x);
float B1 = 0.5f * (D1.y - D2.y);
float A2 = 0.5f * (D1.y + D2.y);
float B2 = 0.5f * (D1.x - D2.x);
D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
}
// Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2,
const fComplex &twiddle) {
float A1 = /* 0.5f * */ (D1.x + D2.x);
float B1 = /* 0.5f * */ (D1.y - D2.y);
float A2 = /* 0.5f * */ (D1.y + D2.y);
float B2 = /* 0.5f * */ (D1.x - D2.x);
D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
}
inline __device__ void getTwiddle(fComplex &twiddle, float phase) {
__sincosf(phase, &twiddle.y, &twiddle.x);
}
inline __device__ uint mod(uint a, uint DA) {
//(DA - a) % DA, assuming a <= DA
return a ? (DA - a) : a;
}
static inline uint factorRadix2(uint &log2N, uint n) {
if (!n) {
log2N = 0;
return 0;
} else {
for (log2N = 0; n % 2 == 0; n /= 2, log2N++)
;
return n;
}
}
inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem) {
#if (!POWER_OF_TWO)
rem = dividend % divisor;
dividend /= divisor;
#else
rem = dividend & (divisor - 1);
dividend >>= (__ffs(divisor) - 1);
#endif
}
__global__ void spPostprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src,
uint DY, uint DX, uint threadCount,
uint padding, float phaseBase
#if (USE_TEXTURE)
,
cudaTextureObject_t texComplex
#endif
) {
const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
if (threadId >= threadCount) {
return;
}
uint x, y, i = threadId;
udivmod(i, DX / 2, x);
udivmod(i, DY, y);
// Avoid overwrites in columns DX / 2 by different threads
if ((x == 0) && (y > DY / 2)) {
return;
}
const uint srcOffset = i * DY * DX;
const uint dstOffset = i * DY * (DX + padding);
// Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
{
const uint loadPos1 = srcOffset + y * DX + x;
const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);
const uint storePos1 = dstOffset + y * (DX + padding) + x;
const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);
fComplex D1 = LOAD_FCOMPLEX(loadPos1);
fComplex D2 = LOAD_FCOMPLEX(loadPos2);
fComplex twiddle;
getTwiddle(twiddle, phaseBase * (float)x);
spPostprocessC2C(D1, D2, twiddle);
d_Dst[storePos1] = D1;
d_Dst[storePos2] = D2;
}
// Process x = DX / 2
if (x == 0) {
const uint loadPos1 = srcOffset + y * DX + DX / 2;
const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2;
const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2;
const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2;
fComplex D1 = LOAD_FCOMPLEX(loadPos1);
fComplex D2 = LOAD_FCOMPLEX(loadPos2);
// twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)
fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
spPostprocessC2C(D1, D2, twiddle);
d_Dst[storePos1] = D1;
d_Dst[storePos2] = D2;
}
}
__global__ void spPreprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src, uint DY,
uint DX, uint threadCount, uint padding,
float phaseBase
#if (USE_TEXTURE)
,
cudaTextureObject_t texComplex
#endif
) {
const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
if (threadId >= threadCount) {
return;
}
uint x, y, i = threadId;
udivmod(i, DX / 2, x);
udivmod(i, DY, y);
// Avoid overwrites in columns 0 and DX / 2 by different threads (lower and
// upper halves)
if ((x == 0) && (y > DY / 2)) {
return;
}
const uint srcOffset = i * DY * (DX + padding);
const uint dstOffset = i * DY * DX;
// Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
{
const uint loadPos1 = srcOffset + y * (DX + padding) + x;
const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x);
const uint storePos1 = dstOffset + y * DX + x;
const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX);
fComplex D1 = LOAD_FCOMPLEX(loadPos1);
fComplex D2 = LOAD_FCOMPLEX(loadPos2);
fComplex twiddle;
getTwiddle(twiddle, phaseBase * (float)x);
spPreprocessC2C(D1, D2, twiddle);
d_Dst[storePos1] = D1;
d_Dst[storePos2] = D2;
}
// Process x = DX / 2
if (x == 0) {
const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2;
const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2;
const uint storePos1 = dstOffset + y * DX + DX / 2;
const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2;
fComplex D1 = LOAD_FCOMPLEX(loadPos1);
fComplex D2 = LOAD_FCOMPLEX(loadPos2);
// twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)
fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
spPreprocessC2C(D1, D2, twiddle);
d_Dst[storePos1] = D1;
d_Dst[storePos2] = D2;
}
}
////////////////////////////////////////////////////////////////////////////////
// Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D
////////////////////////////////////////////////////////////////////////////////
__global__ void spProcess2D_kernel(fComplex *d_Dst, fComplex *d_SrcA,
fComplex *d_SrcB, uint DY, uint DX,
uint threadCount, float phaseBase, float c
#if (USE_TEXTURE)
,
cudaTextureObject_t texComplexA,
cudaTextureObject_t texComplexB
#endif
) {
const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
if (threadId >= threadCount) {
return;
}
uint x, y, i = threadId;
udivmod(i, DX, x);
udivmod(i, DY / 2, y);
const uint offset = i * DY * DX;
// Avoid overwrites in rows 0 and DY / 2 by different threads (left and right
// halves) Otherwise correctness for in-place transformations is affected
if ((y == 0) && (x > DX / 2)) {
return;
}
fComplex twiddle;
// Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]
{
const uint pos1 = offset + y * DX + x;
const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX);
fComplex D1 = LOAD_FCOMPLEX_A(pos1);
fComplex D2 = LOAD_FCOMPLEX_A(pos2);
fComplex K1 = LOAD_FCOMPLEX_B(pos1);
fComplex K2 = LOAD_FCOMPLEX_B(pos2);
getTwiddle(twiddle, phaseBase * (float)x);
spPostprocessC2C(D1, D2, twiddle);
spPostprocessC2C(K1, K2, twiddle);
mulAndScale(D1, K1, c);
mulAndScale(D2, K2, c);
spPreprocessC2C(D1, D2, twiddle);
d_Dst[pos1] = D1;
d_Dst[pos2] = D2;
}
if (y == 0) {
const uint pos1 = offset + (DY / 2) * DX + x;
const uint pos2 = offset + (DY / 2) * DX + mod(x, DX);
fComplex D1 = LOAD_FCOMPLEX_A(pos1);
fComplex D2 = LOAD_FCOMPLEX_A(pos2);
fComplex K1 = LOAD_FCOMPLEX_B(pos1);
fComplex K2 = LOAD_FCOMPLEX_B(pos2);
spPostprocessC2C(D1, D2, twiddle);
spPostprocessC2C(K1, K2, twiddle);
mulAndScale(D1, K1, c);
mulAndScale(D2, K2, c);
spPreprocessC2C(D1, D2, twiddle);
d_Dst[pos1] = D1;
d_Dst[pos2] = D2;
}
}