cuda-samples/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cuh

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#define USE_TEXTURE 1
#define POWER_OF_TWO 1

#if (USE_TEXTURE)
#define LOAD_FLOAT(i) tex1Dfetch<float>(texFloat, i)
#define SET_FLOAT_BASE
#else
#define LOAD_FLOAT(i) d_Src[i]
#define SET_FLOAT_BASE
#endif

////////////////////////////////////////////////////////////////////////////////
/// Position convolution kernel center at (0, 0) in the image
////////////////////////////////////////////////////////////////////////////////
__global__ void padKernel_kernel(float *d_Dst, float *d_Src, int fftH, int fftW,
                                 int kernelH, int kernelW, int kernelY,
                                 int kernelX
#if (USE_TEXTURE)
                                 ,
                                 cudaTextureObject_t texFloat
#endif
                                 ) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;

  if (y < kernelH && x < kernelW) {
    int ky = y - kernelY;

    if (ky < 0) {
      ky += fftH;
    }

    int kx = x - kernelX;

    if (kx < 0) {
      kx += fftW;
    }

    d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Prepare data for "pad to border" addressing mode
////////////////////////////////////////////////////////////////////////////////
__global__ void padDataClampToBorder_kernel(float *d_Dst, float *d_Src,
                                            int fftH, int fftW, int dataH,
                                            int dataW, int kernelH, int kernelW,
                                            int kernelY, int kernelX
#if (USE_TEXTURE)
                                            ,
                                            cudaTextureObject_t texFloat
#endif
                                            ) {
  const int y = blockDim.y * blockIdx.y + threadIdx.y;
  const int x = blockDim.x * blockIdx.x + threadIdx.x;
  const int borderH = dataH + kernelY;
  const int borderW = dataW + kernelX;

  if (y < fftH && x < fftW) {
    int dy, dx;

    if (y < dataH) {
      dy = y;
    }

    if (x < dataW) {
      dx = x;
    }

    if (y >= dataH && y < borderH) {
      dy = dataH - 1;
    }

    if (x >= dataW && x < borderW) {
      dx = dataW - 1;
    }

    if (y >= borderH) {
      dy = 0;
    }

    if (x >= borderW) {
      dx = 0;
    }

    d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////
inline __device__ void mulAndScale(fComplex &a, const fComplex &b,
                                   const float &c) {
  fComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
  a = t;
}

__global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src,
                                            int dataSize, float c) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i >= dataSize) {
    return;
  }

  fComplex a = d_Src[i];
  fComplex b = d_Dst[i];

  mulAndScale(a, b, c);

  d_Dst[i] = a;
}

////////////////////////////////////////////////////////////////////////////////
// 2D R2C / C2R post/preprocessing kernels
////////////////////////////////////////////////////////////////////////////////
#if (USE_TEXTURE)
#define LOAD_FCOMPLEX(i) tex1Dfetch<fComplex>(texComplex, i)
#define LOAD_FCOMPLEX_A(i) tex1Dfetch<fComplex>(texComplexA, i)
#define LOAD_FCOMPLEX_B(i) tex1Dfetch<fComplex>(texComplexB, i)

#define SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#else
#define LOAD_FCOMPLEX(i) d_Src[i]
#define LOAD_FCOMPLEX_A(i) d_SrcA[i]
#define LOAD_FCOMPLEX_B(i) d_SrcB[i]

#define SET_FCOMPLEX_BASE
#define SET_FCOMPLEX_BASE_A
#define SET_FCOMPLEX_BASE_B
#endif

inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2,
                                        const fComplex &twiddle) {
  float A1 = 0.5f * (D1.x + D2.x);
  float B1 = 0.5f * (D1.y - D2.y);
  float A2 = 0.5f * (D1.y + D2.y);
  float B2 = 0.5f * (D1.x - D2.x);

  D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
  D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
  D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
  D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
}

// Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2,
                                       const fComplex &twiddle) {
  float A1 = /* 0.5f * */ (D1.x + D2.x);
  float B1 = /* 0.5f * */ (D1.y - D2.y);
  float A2 = /* 0.5f * */ (D1.y + D2.y);
  float B2 = /* 0.5f * */ (D1.x - D2.x);

  D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
  D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
  D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
  D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
}

inline __device__ void getTwiddle(fComplex &twiddle, float phase) {
  __sincosf(phase, &twiddle.y, &twiddle.x);
}

inline __device__ uint mod(uint a, uint DA) {
  //(DA - a) % DA, assuming a <= DA
  return a ? (DA - a) : a;
}

static inline uint factorRadix2(uint &log2N, uint n) {
  if (!n) {
    log2N = 0;
    return 0;
  } else {
    for (log2N = 0; n % 2 == 0; n /= 2, log2N++)
      ;

    return n;
  }
}

inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem) {
#if (!POWER_OF_TWO)
  rem = dividend % divisor;
  dividend /= divisor;
#else
  rem = dividend & (divisor - 1);
  dividend >>= (__ffs(divisor) - 1);
#endif
}

__global__ void spPostprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src,
                                       uint DY, uint DX, uint threadCount,
                                       uint padding, float phaseBase
#if (USE_TEXTURE)
                                       ,
                                       cudaTextureObject_t texComplex
#endif
                                       ) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadId >= threadCount) {
    return;
  }

  uint x, y, i = threadId;
  udivmod(i, DX / 2, x);
  udivmod(i, DY, y);

  // Avoid overwrites in columns DX / 2 by different threads
  if ((x == 0) && (y > DY / 2)) {
    return;
  }

  const uint srcOffset = i * DY * DX;
  const uint dstOffset = i * DY * (DX + padding);

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * DX + x;
    const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);
    const uint storePos1 = dstOffset + y * (DX + padding) + x;
    const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPostprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }

  // Process x = DX / 2
  if (x == 0) {
    const uint loadPos1 = srcOffset + y * DX + DX / 2;
    const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2;
    const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2;
    const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2;

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)
    fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
    spPostprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }
}

__global__ void spPreprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src, uint DY,
                                      uint DX, uint threadCount, uint padding,
                                      float phaseBase
#if (USE_TEXTURE)
                                      ,
                                      cudaTextureObject_t texComplex
#endif
                                      ) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadId >= threadCount) {
    return;
  }

  uint x, y, i = threadId;
  udivmod(i, DX / 2, x);
  udivmod(i, DY, y);

  // Avoid overwrites in columns 0 and DX / 2 by different threads (lower and
  // upper halves)
  if ((x == 0) && (y > DY / 2)) {
    return;
  }

  const uint srcOffset = i * DY * (DX + padding);
  const uint dstOffset = i * DY * DX;

  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
  {
    const uint loadPos1 = srcOffset + y * (DX + padding) + x;
    const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x);
    const uint storePos1 = dstOffset + y * DX + x;
    const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    fComplex twiddle;
    getTwiddle(twiddle, phaseBase * (float)x);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }

  // Process x = DX / 2
  if (x == 0) {
    const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2;
    const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2;
    const uint storePos1 = dstOffset + y * DX + DX / 2;
    const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2;

    fComplex D1 = LOAD_FCOMPLEX(loadPos1);
    fComplex D2 = LOAD_FCOMPLEX(loadPos2);

    // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)
    fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[storePos1] = D1;
    d_Dst[storePos2] = D2;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D
////////////////////////////////////////////////////////////////////////////////
__global__ void spProcess2D_kernel(fComplex *d_Dst, fComplex *d_SrcA,
                                   fComplex *d_SrcB, uint DY, uint DX,
                                   uint threadCount, float phaseBase, float c
#if (USE_TEXTURE)
                                   ,
                                   cudaTextureObject_t texComplexA,
                                   cudaTextureObject_t texComplexB
#endif
                                   ) {
  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadId >= threadCount) {
    return;
  }

  uint x, y, i = threadId;
  udivmod(i, DX, x);
  udivmod(i, DY / 2, y);

  const uint offset = i * DY * DX;

  // Avoid overwrites in rows 0 and DY / 2 by different threads (left and right
  // halves) Otherwise correctness for in-place transformations is affected
  if ((y == 0) && (x > DX / 2)) {
    return;
  }

  fComplex twiddle;

  // Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]
  {
    const uint pos1 = offset + y * DX + x;
    const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX_A(pos1);
    fComplex D2 = LOAD_FCOMPLEX_A(pos2);
    fComplex K1 = LOAD_FCOMPLEX_B(pos1);
    fComplex K2 = LOAD_FCOMPLEX_B(pos2);
    getTwiddle(twiddle, phaseBase * (float)x);

    spPostprocessC2C(D1, D2, twiddle);
    spPostprocessC2C(K1, K2, twiddle);
    mulAndScale(D1, K1, c);
    mulAndScale(D2, K2, c);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[pos1] = D1;
    d_Dst[pos2] = D2;
  }

  if (y == 0) {
    const uint pos1 = offset + (DY / 2) * DX + x;
    const uint pos2 = offset + (DY / 2) * DX + mod(x, DX);

    fComplex D1 = LOAD_FCOMPLEX_A(pos1);
    fComplex D2 = LOAD_FCOMPLEX_A(pos2);
    fComplex K1 = LOAD_FCOMPLEX_B(pos1);
    fComplex K2 = LOAD_FCOMPLEX_B(pos2);

    spPostprocessC2C(D1, D2, twiddle);
    spPostprocessC2C(K1, K2, twiddle);
    mulAndScale(D1, K1, c);
    mulAndScale(D2, K2, c);
    spPreprocessC2C(D1, D2, twiddle);

    d_Dst[pos1] = D1;
    d_Dst[pos2] = D2;
  }
}
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`#define USE_TEXTURE 1`
			`#define POWER_OF_TWO 1`

			`#if (USE_TEXTURE)`
			`#define LOAD_FLOAT(i) tex1Dfetch<float>(texFloat, i)`
			`#define SET_FLOAT_BASE`
			`#else`
			`#define LOAD_FLOAT(i) d_Src[i]`
			`#define SET_FLOAT_BASE`
			`#endif`

			`////////////////////////////////////////////////////////////////////////////////`
			`/// Position convolution kernel center at (0, 0) in the image`
			`////////////////////////////////////////////////////////////////////////////////`
			`__global__ void padKernel_kernel(float d_Dst, float d_Src, int fftH, int fftW,`
			`int kernelH, int kernelW, int kernelY,`
			`int kernelX`
			`#if (USE_TEXTURE)`
			`,`
			`cudaTextureObject_t texFloat`
			`#endif`
			`) {`
			`const int y = blockDim.y * blockIdx.y + threadIdx.y;`
			`const int x = blockDim.x * blockIdx.x + threadIdx.x;`

			`if (y < kernelH && x < kernelW) {`
			`int ky = y - kernelY;`

			`if (ky < 0) {`
			`ky += fftH;`
			`}`

			`int kx = x - kernelX;`

			`if (kx < 0) {`
			`kx += fftW;`
			`}`

			`d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x);`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Prepare data for "pad to border" addressing mode`
			`////////////////////////////////////////////////////////////////////////////////`
			`__global__ void padDataClampToBorder_kernel(float d_Dst, float d_Src,`
			`int fftH, int fftW, int dataH,`
			`int dataW, int kernelH, int kernelW,`
			`int kernelY, int kernelX`
			`#if (USE_TEXTURE)`
			`,`
			`cudaTextureObject_t texFloat`
			`#endif`
			`) {`
			`const int y = blockDim.y * blockIdx.y + threadIdx.y;`
			`const int x = blockDim.x * blockIdx.x + threadIdx.x;`
			`const int borderH = dataH + kernelY;`
			`const int borderW = dataW + kernelX;`

			`if (y < fftH && x < fftW) {`
			`int dy, dx;`

			`if (y < dataH) {`
			`dy = y;`
			`}`

			`if (x < dataW) {`
			`dx = x;`
			`}`

			`if (y >= dataH && y < borderH) {`
			`dy = dataH - 1;`
			`}`

			`if (x >= dataW && x < borderW) {`
			`dx = dataW - 1;`
			`}`

			`if (y >= borderH) {`
			`dy = 0;`
			`}`

			`if (x >= borderW) {`
			`dx = 0;`
			`}`

			`d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx);`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Modulate Fourier image of padded data by Fourier image of padded kernel`
			`// and normalize by FFT size`
			`////////////////////////////////////////////////////////////////////////////////`
			`inline __device__ void mulAndScale(fComplex &a, const fComplex &b,`
			`const float &c) {`
			`fComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};`
			`a = t;`
			`}`

			`__global__ void modulateAndNormalize_kernel(fComplex d_Dst, fComplex d_Src,`
			`int dataSize, float c) {`
			`const int i = blockDim.x * blockIdx.x + threadIdx.x;`

			`if (i >= dataSize) {`
			`return;`
			`}`

			`fComplex a = d_Src[i];`
			`fComplex b = d_Dst[i];`

			`mulAndScale(a, b, c);`

			`d_Dst[i] = a;`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// 2D R2C / C2R post/preprocessing kernels`
			`////////////////////////////////////////////////////////////////////////////////`
			`#if (USE_TEXTURE)`
			`#define LOAD_FCOMPLEX(i) tex1Dfetch<fComplex>(texComplex, i)`
			`#define LOAD_FCOMPLEX_A(i) tex1Dfetch<fComplex>(texComplexA, i)`
			`#define LOAD_FCOMPLEX_B(i) tex1Dfetch<fComplex>(texComplexB, i)`

			`#define SET_FCOMPLEX_BASE`
			`#define SET_FCOMPLEX_BASE_A`
			`#define SET_FCOMPLEX_BASE_B`
			`#else`
			`#define LOAD_FCOMPLEX(i) d_Src[i]`
			`#define LOAD_FCOMPLEX_A(i) d_SrcA[i]`
			`#define LOAD_FCOMPLEX_B(i) d_SrcB[i]`

			`#define SET_FCOMPLEX_BASE`
			`#define SET_FCOMPLEX_BASE_A`
			`#define SET_FCOMPLEX_BASE_B`
			`#endif`

			`inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2,`
			`const fComplex &twiddle) {`
			`float A1 = 0.5f * (D1.x + D2.x);`
			`float B1 = 0.5f * (D1.y - D2.y);`
			`float A2 = 0.5f * (D1.y + D2.y);`
			`float B2 = 0.5f * (D1.x - D2.x);`

			`D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);`
			`D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;`
			`D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);`
			`D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;`
			`}`

			`// Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization`
			`inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2,`
			`const fComplex &twiddle) {`
			`float A1 = /* 0.5f * */ (D1.x + D2.x);`
			`float B1 = /* 0.5f * */ (D1.y - D2.y);`
			`float A2 = /* 0.5f * */ (D1.y + D2.y);`
			`float B2 = /* 0.5f * */ (D1.x - D2.x);`

			`D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);`
			`D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;`
			`D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);`
			`D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;`
			`}`

			`inline __device__ void getTwiddle(fComplex &twiddle, float phase) {`
			`__sincosf(phase, &twiddle.y, &twiddle.x);`
			`}`

			`inline __device__ uint mod(uint a, uint DA) {`
			`//(DA - a) % DA, assuming a <= DA`
			`return a ? (DA - a) : a;`
			`}`

			`static inline uint factorRadix2(uint &log2N, uint n) {`
			`if (!n) {`
			`log2N = 0;`
			`return 0;`
			`} else {`
			`for (log2N = 0; n % 2 == 0; n /= 2, log2N++)`
			`;`

			`return n;`
			`}`
			`}`

			`inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem) {`
			`#if (!POWER_OF_TWO)`
			`rem = dividend % divisor;`
			`dividend /= divisor;`
			`#else`
			`rem = dividend & (divisor - 1);`
			`dividend >>= (__ffs(divisor) - 1);`
			`#endif`
			`}`

			`__global__ void spPostprocess2D_kernel(fComplex d_Dst, fComplex d_Src,`
			`uint DY, uint DX, uint threadCount,`
			`uint padding, float phaseBase`
			`#if (USE_TEXTURE)`
			`,`
			`cudaTextureObject_t texComplex`
			`#endif`
			`) {`
			`const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (threadId >= threadCount) {`
			`return;`
			`}`

			`uint x, y, i = threadId;`
			`udivmod(i, DX / 2, x);`
			`udivmod(i, DY, y);`

			`// Avoid overwrites in columns DX / 2 by different threads`
			`if ((x == 0) && (y > DY / 2)) {`
			`return;`
			`}`

			`const uint srcOffset = i * DY * DX;`
			`const uint dstOffset = i * DY * (DX + padding);`

			`// Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]`
			`{`
			`const uint loadPos1 = srcOffset + y * DX + x;`
			`const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);`
			`const uint storePos1 = dstOffset + y * (DX + padding) + x;`
			`const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);`

			`fComplex D1 = LOAD_FCOMPLEX(loadPos1);`
			`fComplex D2 = LOAD_FCOMPLEX(loadPos2);`

			`fComplex twiddle;`
			`getTwiddle(twiddle, phaseBase * (float)x);`
			`spPostprocessC2C(D1, D2, twiddle);`

			`d_Dst[storePos1] = D1;`
			`d_Dst[storePos2] = D2;`
			`}`

			`// Process x = DX / 2`
			`if (x == 0) {`
			`const uint loadPos1 = srcOffset + y * DX + DX / 2;`
			`const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2;`
			`const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2;`
			`const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2;`

			`fComplex D1 = LOAD_FCOMPLEX(loadPos1);`
			`fComplex D2 = LOAD_FCOMPLEX(loadPos2);`

			`// twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2)`
			`fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};`
			`spPostprocessC2C(D1, D2, twiddle);`

			`d_Dst[storePos1] = D1;`
			`d_Dst[storePos2] = D2;`
			`}`
			`}`

			`__global__ void spPreprocess2D_kernel(fComplex d_Dst, fComplex d_Src, uint DY,`
			`uint DX, uint threadCount, uint padding,`
			`float phaseBase`
			`#if (USE_TEXTURE)`
			`,`
			`cudaTextureObject_t texComplex`
			`#endif`
			`) {`
			`const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (threadId >= threadCount) {`
			`return;`
			`}`

			`uint x, y, i = threadId;`
			`udivmod(i, DX / 2, x);`
			`udivmod(i, DY, y);`

			`// Avoid overwrites in columns 0 and DX / 2 by different threads (lower and`
			`// upper halves)`
			`if ((x == 0) && (y > DY / 2)) {`
			`return;`
			`}`

			`const uint srcOffset = i * DY * (DX + padding);`
			`const uint dstOffset = i * DY * DX;`

			`// Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]`
			`{`
			`const uint loadPos1 = srcOffset + y * (DX + padding) + x;`
			`const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x);`
			`const uint storePos1 = dstOffset + y * DX + x;`
			`const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX);`

			`fComplex D1 = LOAD_FCOMPLEX(loadPos1);`
			`fComplex D2 = LOAD_FCOMPLEX(loadPos2);`

			`fComplex twiddle;`
			`getTwiddle(twiddle, phaseBase * (float)x);`
			`spPreprocessC2C(D1, D2, twiddle);`

			`d_Dst[storePos1] = D1;`
			`d_Dst[storePos2] = D2;`
			`}`

			`// Process x = DX / 2`
			`if (x == 0) {`
			`const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2;`
			`const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2;`
			`const uint storePos1 = dstOffset + y * DX + DX / 2;`
			`const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2;`

			`fComplex D1 = LOAD_FCOMPLEX(loadPos1);`
			`fComplex D2 = LOAD_FCOMPLEX(loadPos2);`

			`// twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2)`
			`fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f};`
			`spPreprocessC2C(D1, D2, twiddle);`

			`d_Dst[storePos1] = D1;`
			`d_Dst[storePos2] = D2;`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D`
			`////////////////////////////////////////////////////////////////////////////////`
			`__global__ void spProcess2D_kernel(fComplex d_Dst, fComplex d_SrcA,`
			`fComplex *d_SrcB, uint DY, uint DX,`
			`uint threadCount, float phaseBase, float c`
			`#if (USE_TEXTURE)`
			`,`
			`cudaTextureObject_t texComplexA,`
			`cudaTextureObject_t texComplexB`
			`#endif`
			`) {`
			`const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (threadId >= threadCount) {`
			`return;`
			`}`

			`uint x, y, i = threadId;`
			`udivmod(i, DX, x);`
			`udivmod(i, DY / 2, y);`

			`const uint offset = i * DY * DX;`

			`// Avoid overwrites in rows 0 and DY / 2 by different threads (left and right`
			`// halves) Otherwise correctness for in-place transformations is affected`
			`if ((y == 0) && (x > DX / 2)) {`
			`return;`
			`}`

			`fComplex twiddle;`

			`// Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1]`
			`{`
			`const uint pos1 = offset + y * DX + x;`
			`const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX);`

			`fComplex D1 = LOAD_FCOMPLEX_A(pos1);`
			`fComplex D2 = LOAD_FCOMPLEX_A(pos2);`
			`fComplex K1 = LOAD_FCOMPLEX_B(pos1);`
			`fComplex K2 = LOAD_FCOMPLEX_B(pos2);`
			`getTwiddle(twiddle, phaseBase * (float)x);`

			`spPostprocessC2C(D1, D2, twiddle);`
			`spPostprocessC2C(K1, K2, twiddle);`
			`mulAndScale(D1, K1, c);`
			`mulAndScale(D2, K2, c);`
			`spPreprocessC2C(D1, D2, twiddle);`

			`d_Dst[pos1] = D1;`
			`d_Dst[pos2] = D2;`
			`}`

			`if (y == 0) {`
			`const uint pos1 = offset + (DY / 2) * DX + x;`
			`const uint pos2 = offset + (DY / 2) * DX + mod(x, DX);`

			`fComplex D1 = LOAD_FCOMPLEX_A(pos1);`
			`fComplex D2 = LOAD_FCOMPLEX_A(pos2);`
			`fComplex K1 = LOAD_FCOMPLEX_B(pos1);`
			`fComplex K2 = LOAD_FCOMPLEX_B(pos2);`

			`spPostprocessC2C(D1, D2, twiddle);`
			`spPostprocessC2C(K1, K2, twiddle);`
			`mulAndScale(D1, K1, c);`
			`mulAndScale(D2, K2, c);`
			`spPreprocessC2C(D1, D2, twiddle);`

			`d_Dst[pos1] = D1;`
			`d_Dst[pos2] = D2;`
			`}`
			`}`