// Mirror of https://github.com/NVIDIA/cuda-samples.git
// (synced 2024-11-28 14:59:19 +08:00; 422 lines, 13 KiB, plaintext)
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// Build-time switches used throughout this file.
//   POWER_OF_TWO : non-zero selects the mask/shift implementation of udivmod
//                  instead of the % and / implementation.
//   USE_TEXTURE  : non-zero makes the LOAD_* macros fetch through a texture
//                  object; zero makes them index the source pointer directly.
#define POWER_OF_TWO 1
#define USE_TEXTURE 1

// SET_FLOAT_BASE expands to nothing in both configurations.
#define SET_FLOAT_BASE

// LOAD_FLOAT(i) reads element i of the float source. It relies on a variable
// named texFloat (texture path) or d_Src (pointer path) being in scope at the
// point of use.
#if !(USE_TEXTURE)
#define LOAD_FLOAT(i) d_Src[i]
#else
#define LOAD_FLOAT(i) tex1Dfetch<float>(texFloat, i)
#endif

////////////////////////////////////////////////////////////////////////////////
/// Position convolution kernel center at (0, 0) in the image
////////////////////////////////////////////////////////////////////////////////
// Copies a kernelH x kernelW source into the fftH x fftW destination, shifting
// every element by (-kernelY, -kernelX). Coordinates that become negative are
// wrapped by adding fftH / fftW.
//
// Launch layout: 2D; each thread handles the element at
// (blockIdx.y * blockDim.y + threadIdx.y, blockIdx.x * blockDim.x + threadIdx.x).
// Threads outside the kernelH x kernelW rectangle do nothing.
//
// The source is read through LOAD_FLOAT, i.e. via texFloat when USE_TEXTURE is
// non-zero and via d_Src otherwise.
__global__ void padKernel_kernel(float *d_Dst, float *d_Src, int fftH, int fftW,
                                 int kernelH, int kernelW, int kernelY,
                                 int kernelX
#if (USE_TEXTURE)
                                 ,
                                 cudaTextureObject_t texFloat
#endif
                                 ) {
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;

  // Nothing to copy for threads outside the source rectangle.
  if (row >= kernelH || col >= kernelW) {
    return;
  }

  // Shift so that (kernelY, kernelX) lands on (0, 0); wrap negative results.
  const int shiftedRow = row - kernelY;
  const int shiftedCol = col - kernelX;
  const int dstRow = (shiftedRow < 0) ? (shiftedRow + fftH) : shiftedRow;
  const int dstCol = (shiftedCol < 0) ? (shiftedCol + fftW) : shiftedCol;

  d_Dst[dstRow * fftW + dstCol] = LOAD_FLOAT(row * kernelW + col);
}

////////////////////////////////////////////////////////////////////////////////
// Prepare data for "pad to border" addressing mode
////////////////////////////////////////////////////////////////////////////////
// Fills the whole fftH x fftW destination from a dataH x dataW source.
// For each destination row y (columns are handled the same way with
// dataW / kernelX):
//   y >= dataH + kernelY          -> source row 0
//   dataH <= y < dataH + kernelY  -> source row dataH - 1
//   otherwise                     -> source row y
//
// Launch layout: 2D; each thread handles the destination element at
// (blockIdx.y * blockDim.y + threadIdx.y, blockIdx.x * blockDim.x + threadIdx.x).
// Threads outside the fftH x fftW rectangle do nothing.
//
// kernelH and kernelW are accepted but not used by this kernel.
__global__ void padDataClampToBorder_kernel(float *d_Dst, float *d_Src,
                                            int fftH, int fftW, int dataH,
                                            int dataW, int kernelH, int kernelW,
                                            int kernelY, int kernelX
#if (USE_TEXTURE)
                                            ,
                                            cudaTextureObject_t texFloat
#endif
                                            ) {
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int borderW = dataW + kernelX;
  const int borderH = dataH + kernelY;

  if (row >= fftH || col >= fftW) {
    return;
  }

  // The "past the border" test is checked first so that it takes precedence
  // over the other two cases.
  int srcRow;
  if (row >= borderH) {
    srcRow = 0;
  } else if (row >= dataH) {
    srcRow = dataH - 1;
  } else {
    srcRow = row;
  }

  int srcCol;
  if (col >= borderW) {
    srcCol = 0;
  } else if (col >= dataW) {
    srcCol = dataW - 1;
  } else {
    srcCol = col;
  }

  d_Dst[row * fftW + col] = LOAD_FLOAT(srcRow * dataW + srcCol);
}

////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////
// In-place complex multiply with scaling: a <- c * (a * b), where .x is the
// real part and .y the imaginary part.
inline __device__ void mulAndScale(fComplex &a, const fComplex &b,
                                   const float &c) {
  // Both components are computed from the old value of a before it is
  // overwritten.
  const float scaledRe = c * (a.x * b.x - a.y * b.y);
  const float scaledIm = c * (a.y * b.x + a.x * b.y);
  a.x = scaledRe;
  a.y = scaledIm;
}

// Element-wise d_Dst[i] <- c * (d_Src[i] * d_Dst[i]) for i in [0, dataSize).
//
// Launch layout: 1D; thread index is blockDim.x * blockIdx.x + threadIdx.x.
// Threads with an index >= dataSize do nothing.
__global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src,
                                            int dataSize, float c) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;

  if (idx < dataSize) {
    fComplex product = d_Src[idx];
    const fComplex factor = d_Dst[idx];

    mulAndScale(product, factor, c);

    d_Dst[idx] = product;
  }
}

////////////////////////////////////////////////////////////////////////////////
// 2D R2C / C2R post/preprocessing kernels
////////////////////////////////////////////////////////////////////////////////
// The SET_FCOMPLEX_BASE* macros expand to nothing in both configurations.
#define SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B

// LOAD_FCOMPLEX*(i) read complex element i. They rely on variables named
// texComplex / texComplexA / texComplexB (texture path) or
// d_Src / d_SrcA / d_SrcB (pointer path) being in scope at the point of use.
#if !(USE_TEXTURE)
#define LOAD_FCOMPLEX(i) d_Src[i]
#define LOAD_FCOMPLEX_A(i) d_SrcA[i]
#define LOAD_FCOMPLEX_B(i) d_SrcB[i]
#else
#define LOAD_FCOMPLEX(i) tex1Dfetch<fComplex>(texComplex, i)
#define LOAD_FCOMPLEX_A(i) tex1Dfetch<fComplex>(texComplexA, i)
#define LOAD_FCOMPLEX_B(i) tex1Dfetch<fComplex>(texComplexB, i)
#endif

// Combines the pair (D1, D2) in place using the given twiddle factor.
// The half-sums and half-differences of the inputs are formed first; D1 and D2
// are then overwritten with the twiddle-rotated combination.
inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2,
                                        const fComplex &twiddle) {
  // All four terms are read from the inputs before any output is written.
  const float halfDiffRe = 0.5f * (D1.x - D2.x);
  const float halfSumIm = 0.5f * (D1.y + D2.y);
  const float halfDiffIm = 0.5f * (D1.y - D2.y);
  const float halfSumRe = 0.5f * (D1.x + D2.x);

  D1.x = halfSumRe + (halfSumIm * twiddle.x + halfDiffRe * twiddle.y);
  D1.y = (halfSumIm * twiddle.y - halfDiffRe * twiddle.x) + halfDiffIm;
  D2.x = halfSumRe - (halfSumIm * twiddle.x + halfDiffRe * twiddle.y);
  D2.y = (halfSumIm * twiddle.y - halfDiffRe * twiddle.x) - halfDiffIm;
}

// Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
// Counterpart of spPostprocessC2C with the 0.5f factors left out (that is the
// "premultiply by 2") and with the opposite sign on the twiddle cross terms.
// D1 and D2 are overwritten in place.
inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2,
                                       const fComplex &twiddle) {
  // All four terms are read from the inputs before any output is written.
  const float diffRe = D1.x - D2.x;
  const float sumIm = D1.y + D2.y;
  const float diffIm = D1.y - D2.y;
  const float sumRe = D1.x + D2.x;

  D1.x = sumRe - (sumIm * twiddle.x - diffRe * twiddle.y);
  D1.y = (diffRe * twiddle.x + sumIm * twiddle.y) + diffIm;
  D2.x = sumRe + (sumIm * twiddle.x - diffRe * twiddle.y);
  D2.y = (diffRe * twiddle.x + sumIm * twiddle.y) - diffIm;
}

// Sets twiddle = (cos(phase), sin(phase)) using the __sincosf intrinsic:
// the cosine goes to .x and the sine to .y.
inline __device__ void getTwiddle(fComplex &twiddle, float phase) {
  float *const sinOut = &twiddle.y;
  float *const cosOut = &twiddle.x;
  __sincosf(phase, sinOut, cosOut);
}

// Returns (DA - a) % DA under the assumption a <= DA:
// 0 maps to 0, any other a maps to DA - a.
inline __device__ uint mod(uint a, uint DA) {
  if (a == 0) {
    return a;
  }

  return DA - a;
}

// Splits n into a power of two and an odd remainder: n == result << log2N.
// log2N receives the number of factors of two stripped from n, and the
// remaining odd factor is returned. For n == 0 both log2N and the result are 0.
static inline uint factorRadix2(uint &log2N, uint n) {
  log2N = 0;

  if (n == 0) {
    return 0;
  }

  // Strip one factor of two per iteration while the lowest bit is clear.
  while ((n & 1u) == 0) {
    n >>= 1;
    ++log2N;
  }

  return n;
}

// Combined unsigned divide and modulo.
//   dividend : in/out; replaced by dividend / divisor
//   divisor  : the divisor
//   rem      : out; receives dividend % divisor (computed from the incoming
//              value of dividend)
//
// When POWER_OF_TWO is non-zero the result is computed with a mask and a
// shift, which equals % and / only if divisor is a non-zero power of two.
inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem) {
#if (!POWER_OF_TWO)
  rem = dividend % divisor;
  dividend /= divisor;
#else
  // divisor - 1 is a mask of the low log2(divisor) bits.
  rem = dividend & (divisor - 1);
  // __ffs gives the 1-based position of the lowest set bit, so
  // __ffs(divisor) - 1 is log2(divisor) for a power of two.
  dividend >>= (__ffs(divisor) - 1);
#endif
}

// Post-processing pass: reads pairs of complex elements from a source with row
// length DX and writes the combined pairs (see spPostprocessC2C) to a
// destination with row length DX + padding.
//
// Launch layout: 1D; thread index is blockIdx.x * blockDim.x + threadIdx.x.
// Threads with an index >= threadCount exit immediately. The index is
// decomposed with udivmod into x (modulo DX / 2), then y (modulo DY); the
// remaining quotient selects the DY-row image the thread works on.
//
// Each thread processes the element at (y, x) together with the mirrored
// element at (mod(y, DY), mod(x, DX)). Threads with x == 0 additionally process
// column DX / 2.
//
// The source is read through LOAD_FCOMPLEX, i.e. via texComplex when
// USE_TEXTURE is non-zero and via d_Src otherwise.
__global__ void spPostprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src,
                                       uint DY, uint DX, uint threadCount,
                                       uint padding, float phaseBase
#if (USE_TEXTURE)
                                       ,
                                       cudaTextureObject_t texComplex
#endif
                                       ) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadId >= threadCount) {
    return;
  }

  // After the two calls, image holds the remaining quotient.
  uint x, y, image = threadId;
  udivmod(image, DX / 2, x);
  udivmod(image, DY, y);

  // Avoid overwrites in columns DX / 2 by different threads
  if ((x == 0) && (y > DY / 2)) {
    return;
  }

  const uint dstPitch = DX + padding;
  const uint mirrorY = mod(y, DY);
  const uint srcOffset = image * DY * DX;
  const uint dstOffset = image * DY * dstPitch;

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * DX + x;
    const uint loadPos2 = srcOffset + mirrorY * DX + mod(x, DX);
    const uint storePos1 = dstOffset + y * dstPitch + x;
    const uint storePos2 = dstOffset + mirrorY * dstPitch + (DX - x);

    fComplex first = LOAD_FCOMPLEX(loadPos1);
    fComplex second = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPostprocessC2C(first, second, twiddle);

    d_Dst[storePos1] = first;
    d_Dst[storePos2] = second;
  }

  // Only the x == 0 threads go on to handle column DX / 2.
  if (x != 0) {
    return;
  }

  // Process x = DX / 2
  {
    const uint halfDX = DX / 2;
    const uint loadPos1 = srcOffset + y * DX + halfDX;
    const uint loadPos2 = srcOffset + mirrorY * DX + halfDX;
    const uint storePos1 = dstOffset + y * dstPitch + halfDX;
    const uint storePos2 = dstOffset + mirrorY * dstPitch + halfDX;

    fComplex first = LOAD_FCOMPLEX(loadPos1);
    fComplex second = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)
    const fComplex twiddle = {0.0f, (phaseBase > 0) ? 1.0f : -1.0f};
    spPostprocessC2C(first, second, twiddle);

    d_Dst[storePos1] = first;
    d_Dst[storePos2] = second;
  }
}

// Pre-processing pass: reads pairs of complex elements from a source with row
// length DX + padding and writes the combined pairs (see spPreprocessC2C) to a
// destination with row length DX.
//
// Launch layout: 1D; thread index is blockIdx.x * blockDim.x + threadIdx.x.
// Threads with an index >= threadCount exit immediately. The index is
// decomposed with udivmod into x (modulo DX / 2), then y (modulo DY); the
// remaining quotient selects the DY-row image the thread works on.
//
// Each thread processes the element at (y, x) together with its mirrored
// partner. Threads with x == 0 additionally process column DX / 2.
//
// The source is read through LOAD_FCOMPLEX, i.e. via texComplex when
// USE_TEXTURE is non-zero and via d_Src otherwise.
__global__ void spPreprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src, uint DY,
                                      uint DX, uint threadCount, uint padding,
                                      float phaseBase
#if (USE_TEXTURE)
                                      ,
                                      cudaTextureObject_t texComplex
#endif
                                      ) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadId >= threadCount) {
    return;
  }

  // After the two calls, image holds the remaining quotient.
  uint x, y, image = threadId;
  udivmod(image, DX / 2, x);
  udivmod(image, DY, y);

  // Avoid overwrites in columns 0 and DX / 2 by different threads (lower and
  // upper halves)
  if ((x == 0) && (y > DY / 2)) {
    return;
  }

  const uint srcPitch = DX + padding;
  const uint mirrorY = mod(y, DY);
  const uint srcOffset = image * DY * srcPitch;
  const uint dstOffset = image * DY * DX;

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * srcPitch + x;
    const uint loadPos2 = srcOffset + mirrorY * srcPitch + (DX - x);
    const uint storePos1 = dstOffset + y * DX + x;
    const uint storePos2 = dstOffset + mirrorY * DX + mod(x, DX);

    fComplex first = LOAD_FCOMPLEX(loadPos1);
    fComplex second = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPreprocessC2C(first, second, twiddle);

    d_Dst[storePos1] = first;
    d_Dst[storePos2] = second;
  }

  // Only the x == 0 threads go on to handle column DX / 2.
  if (x != 0) {
    return;
  }

  // Process x = DX / 2
  {
    const uint halfDX = DX / 2;
    const uint loadPos1 = srcOffset + y * srcPitch + halfDX;
    const uint loadPos2 = srcOffset + mirrorY * srcPitch + halfDX;
    const uint storePos1 = dstOffset + y * DX + halfDX;
    const uint storePos2 = dstOffset + mirrorY * DX + halfDX;

    fComplex first = LOAD_FCOMPLEX(loadPos1);
    fComplex second = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)
    const fComplex twiddle = {0.0f, (phaseBase > 0) ? 1.0f : -1.0f};
    spPreprocessC2C(first, second, twiddle);

    d_Dst[storePos1] = first;
    d_Dst[storePos2] = second;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D
////////////////////////////////////////////////////////////////////////////////
// For each pair of mirrored positions, loads the pair from source A and from
// source B, post-processes both pairs, multiplies A by B with scale c
// (mulAndScale), pre-processes the product pair and stores it to d_Dst at the
// same two positions.
//
// Launch layout: 1D; thread index is blockIdx.x * blockDim.x + threadIdx.x.
// Threads with an index >= threadCount exit immediately. The index is
// decomposed with udivmod into x (modulo DX), then y (modulo DY / 2); the
// remaining quotient selects the DY x DX image the thread works on.
//
// Threads with y == 0 additionally process row DY / 2, reusing the twiddle
// factor computed from x.
//
// Sources are read through LOAD_FCOMPLEX_A / LOAD_FCOMPLEX_B, i.e. via
// texComplexA / texComplexB when USE_TEXTURE is non-zero and via
// d_SrcA / d_SrcB otherwise.
__global__ void spProcess2D_kernel(fComplex *d_Dst, fComplex *d_SrcA,
                                   fComplex *d_SrcB, uint DY, uint DX,
                                   uint threadCount, float phaseBase, float c
#if (USE_TEXTURE)
                                   ,
                                   cudaTextureObject_t texComplexA,
                                   cudaTextureObject_t texComplexB
#endif
                                   ) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadId >= threadCount) {
    return;
  }

  // After the two calls, image holds the remaining quotient.
  uint x, y, image = threadId;
  udivmod(image, DX, x);
  udivmod(image, DY / 2, y);

  // Avoid overwrites in rows 0 and DY / 2 by different threads (left and right
  // halves) Otherwise correctness for in-place transformations is affected
  if ((y == 0) && (x > DX / 2)) {
    return;
  }

  const uint offset = image * DY * DX;
  const uint mirrorX = mod(x, DX);

  // The twiddle depends only on x and is shared by both sections below.
  fComplex twiddle;
  getTwiddle(twiddle, phaseBase * (float)x);

  // Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]
  {
    const uint pos1 = offset + y * DX + x;
    const uint pos2 = offset + mod(y, DY) * DX + mirrorX;

    fComplex dataA = LOAD_FCOMPLEX_A(pos1);
    fComplex dataB = LOAD_FCOMPLEX_A(pos2);
    fComplex kernA = LOAD_FCOMPLEX_B(pos1);
    fComplex kernB = LOAD_FCOMPLEX_B(pos2);

    spPostprocessC2C(dataA, dataB, twiddle);
    spPostprocessC2C(kernA, kernB, twiddle);
    mulAndScale(dataA, kernA, c);
    mulAndScale(dataB, kernB, c);
    spPreprocessC2C(dataA, dataB, twiddle);

    d_Dst[pos1] = dataA;
    d_Dst[pos2] = dataB;
  }

  // Only the y == 0 threads go on to handle row DY / 2.
  if (y != 0) {
    return;
  }

  // Process y = DY / 2
  {
    const uint rowBase = offset + (DY / 2) * DX;
    const uint pos1 = rowBase + x;
    const uint pos2 = rowBase + mirrorX;

    fComplex dataA = LOAD_FCOMPLEX_A(pos1);
    fComplex dataB = LOAD_FCOMPLEX_A(pos2);
    fComplex kernA = LOAD_FCOMPLEX_B(pos1);
    fComplex kernB = LOAD_FCOMPLEX_B(pos2);

    spPostprocessC2C(dataA, dataB, twiddle);
    spPostprocessC2C(kernA, kernB, twiddle);
    mulAndScale(dataA, kernA, c);
    mulAndScale(dataB, kernB, c);
    spPreprocessC2C(dataA, dataB, twiddle);

    d_Dst[pos1] = dataA;
    d_Dst[pos2] = dataB;
  }
}