cuda-samples/Samples/fastWalshTransform/fastWalshTransform_gold.cpp
2021-10-21 16:34:49 +05:30

96 lines
3.8 KiB
C++

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
///////////////////////////////////////////////////////////////////////////////
// CPU Fast Walsh Transform
///////////////////////////////////////////////////////////////////////////////
// Computes the unnormalized Walsh-Hadamard transform of h_Input and stores it
// in h_Output, using log2N butterfly stages (no scaling is applied).
//
// h_Output : receives 1 << log2N floats.
// h_Input  : source of 1 << log2N floats; only read by this function.
// log2N    : base-2 logarithm of the transform length. Must be non-negative
//            and small enough that 1 << log2N fits in an int.
extern "C" void fwtCPU(float *h_Output, float *h_Input, int log2N) {
  const int length = 1 << log2N;

  // Seed the output with the input; every butterfly below then operates
  // in place on h_Output.
  for (int k = 0; k < length; ++k) {
    h_Output[k] = h_Input[k];
  }

  // One pass per stage, starting with the widest butterflies and halving the
  // distance between partners each time.
  for (int half = length / 2; half > 0; half >>= 1) {
    const int span = 2 * half;

    // Each group of `span` consecutive elements is processed independently.
    for (int start = 0; start < length; start += span) {
      float *lower = h_Output + start;
      float *upper = lower + half;

      // Pair element k of the lower half with element k of the upper half.
      for (int k = 0; k < half; ++k) {
        const float a = lower[k];
        const float b = upper[k];
        lower[k] = a + b;
        upper[k] = a - b;
      }
    }
  }
}
///////////////////////////////////////////////////////////////////////////////
// Straightforward Walsh Transform: used to test both CPU and GPU FWT
// Slow. Uses doubles because of straightforward accumulation
///////////////////////////////////////////////////////////////////////////////
// Evaluates the Walsh-Hadamard transform directly from its definition:
//   h_Output[i] = sum over j of s(i, j) * h_Input[j],
// where s(i, j) is +1 when (i & j) has an even number of set bits and -1
// otherwise. Each output is accumulated in double and converted to float once.
//
// h_Output : receives 1 << log2N floats.
// h_Input  : source of 1 << log2N floats; only read by this function.
// log2N    : base-2 logarithm of the transform length. Must be non-negative
//            and small enough that 1 << log2N fits in an int.
extern "C" void slowWTcpu(float *h_Output, float *h_Input, int log2N) {
  const int length = 1 << log2N;

  for (int row = 0; row < length; ++row) {
    double acc = 0;

    for (int col = 0; col < length; ++col) {
      // Walk the bits shared by row and col, flipping the parity flag for
      // every set bit encountered.
      bool oddParity = false;
      int shared = row & col;
      while (shared != 0) {
        if (shared & 1) {
          oddParity = !oddParity;
        }
        shared >>= 1;
      }

      // Matrix entry for (row, col): -1 for odd parity, +1 for even.
      const double sign = oddParity ? -1.0 : 1.0;
      acc += sign * h_Input[col];
    }

    h_Output[row] = (float)acc;
  }
}
////////////////////////////////////////////////////////////////////////////////
// Reference CPU dyadic convolution.
// Extremely slow because of non-linear memory access patterns (cache thrashing)
////////////////////////////////////////////////////////////////////////////////
// Computes, for every i in [0, dataN):
//   h_Result[i] = sum over j in [0, kernelN) of h_Data[i ^ j] * h_Kernel[j],
// with dataN = 1 << log2dataN and kernelN = 1 << log2kernelN.
//
// h_Result    : receives dataN floats.
// h_Data      : input signal of dataN floats; only read by this function.
// h_Kernel    : input kernel of kernelN floats; only read by this function.
// log2dataN   : base-2 logarithm of the data length.
// log2kernelN : base-2 logarithm of the kernel length.
// Both logarithms must be non-negative and small enough that the shifted
// value fits in an int.
//
// When kernelN > dataN, (i ^ j) can land at or beyond dataN. Those taps are
// skipped, i.e. the data is treated as zero outside [0, dataN), instead of
// being read past the end of h_Data.
extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data,
                                     float *h_Kernel, int log2dataN,
                                     int log2kernelN) {
  const int dataN = 1 << log2dataN;
  const int kernelN = 1 << log2kernelN;

  for (int i = 0; i < dataN; i++) {
    double sum = 0;

    for (int j = 0; j < kernelN; j++) {
      const int src = i ^ j;

      // Only possible when kernelN > dataN; never index outside h_Data.
      if (src >= dataN) continue;

      // Promote before multiplying: the product of two floats is exactly
      // representable in double, so no rounding happens until the final
      // conversion of the accumulated sum back to float.
      sum += static_cast<double>(h_Data[src]) *
             static_cast<double>(h_Kernel[j]);
    }

    h_Result[i] = (float)sum;
  }
}