/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// */ // closes the license block comment that is still open on the previous line

///////////////////////////////////////////////////////////////////////////////
// CPU Fast Walsh Transform
///////////////////////////////////////////////////////////////////////////////
// Copies N = 2^log2N floats from h_Input to h_Output and then transforms
// h_Output in place with log2N butterfly stages (strides N/2, N/4, ..., 1).
// Every butterfly replaces a pair (a, b) by (a + b, a - b); no scaling is
// applied, so the result is the unnormalized transform.
//
//   h_Output  receives the N transformed values
//   h_Input   N input values; only read by this function
//   log2N     base-2 logarithm of the vector length
//
// Preconditions (not checked): both arrays hold at least N floats, and
// log2N is non-negative and small enough that (1 << log2N) fits in an int.
// With log2N == 0 no butterfly stage runs and the single value is copied.
#ifdef __cplusplus
extern "C"
#endif
void fwtCPU(float *h_Output, float *h_Input, int log2N)
{
  const int N = 1 << log2N;

  for (int pos = 0; pos < N; pos++) {
    h_Output[pos] = h_Input[pos];
  }

  // Cycle through stages with different butterfly strides
  for (int stride = N / 2; stride >= 1; stride >>= 1) {
    // Cycle through subvectors of (2 * stride) elements
    for (int base = 0; base < N; base += 2 * stride) {
      // Butterfly index within subvector of (2 * stride) size
      for (int j = 0; j < stride; j++) {
        int i0 = base + j + 0;
        int i1 = base + j + stride;

        float T1 = h_Output[i0];
        float T2 = h_Output[i1];
        h_Output[i0] = T1 + T2;
        h_Output[i1] = T1 - T2;
      }
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
// Straightforward Walsh Transform: used to test both CPU and GPU FWT
// Slow. Uses doubles because of straightforward accumulation
///////////////////////////////////////////////////////////////////////////////
// For every output index i this sums sign(i, j) * h_Input[j] over all j,
// where sign(i, j) is -1 when (i & j) has an odd number of set bits and +1
// otherwise. The sum is kept in a double and narrowed to float on store.
//
//   h_Output  receives the N = 2^log2N results
//   h_Input   N input values; only read by this function
//   log2N     base-2 logarithm of the vector length
//
// Preconditions (not checked): same size and log2N limits as fwtCPU. The
// two arrays must not overlap, because h_Output[i] is written while later
// iterations still read every element of h_Input.
#ifdef __cplusplus
extern "C"
#endif
void slowWTcpu(float *h_Output, float *h_Input, int log2N)
{
  const int N = 1 << log2N;

  for (int i = 0; i < N; i++) {
    double sum = 0;

    for (int j = 0; j < N; j++) {
      // Sign of the Walsh-Hadamard matrix entry (i, j): flip once for
      // every set bit of (i & j)
      double q = 1.0;

      for (int t = i & j; t != 0; t >>= 1) {
        if (t & 1) {
          q = -q;
        }
      }

      sum += q * h_Input[j];
    }

    h_Output[i] = (float)sum;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Reference CPU dyadic convolution.
// Extremely slow because of non-linear memory access patterns (cache thrashing)
////////////////////////////////////////////////////////////////////////////////
// Computes, for every i in [0, dataN):
//
//   h_Result[i] = sum over j in [0, kernelN) of h_Data[i ^ j] * h_Kernel[j]
//
// with dataN = 2^log2dataN and kernelN = 2^log2kernelN. The sum is kept in
// a double and narrowed to float on store.
//
//   h_Result     receives dataN results
//   h_Data       dataN input values; only read
//   h_Kernel     kernelN kernel values; only read
//   log2dataN    base-2 logarithm of the data length
//   log2kernelN  base-2 logarithm of the kernel length
//
// Preconditions (not checked):
//   - log2kernelN <= log2dataN; otherwise (i ^ j) can reach dataN or beyond
//     and h_Data is read out of bounds.
//   - both exponents are non-negative and small enough that the shifts fit
//     in an int.
//   - h_Result does not overlap h_Data, because results are stored while
//     later iterations still read h_Data.
#ifdef __cplusplus
extern "C"
#endif
void dyadicConvolutionCPU(float *h_Result, float *h_Data, float *h_Kernel,
                          int log2dataN, int log2kernelN)
{
  const int dataN = 1 << log2dataN;
  const int kernelN = 1 << log2kernelN;

  for (int i = 0; i < dataN; i++) {
    double sum = 0;

    for (int j = 0; j < kernelN; j++) {
      sum += h_Data[i ^ j] * h_Kernel[j];
    }

    h_Result[i] = (float)sum;
  }
}