cuda-samples/Samples/fastWalshTransform/fastWalshTransform_gold.cpp

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

///////////////////////////////////////////////////////////////////////////////
// CPU Fast Walsh Transform
//
// Copies the 2^log2N floats of h_Input into h_Output and then transforms
// h_Output in place with log2N butterfly passes (sum / difference pairs),
// starting with the widest pair distance and halving it each pass.
///////////////////////////////////////////////////////////////////////////////
extern "C" void fwtCPU(float *h_Output, float *h_Input, int log2N) {
  const int N = 1 << log2N;

  // All butterflies operate on the output buffer, so seed it with the input.
  for (int k = 0; k < N; ++k) {
    h_Output[k] = h_Input[k];
  }

  // One pass per butterfly width: N/2, N/4, ..., 1
  for (int half = N / 2; half >= 1; half /= 2) {
    const int span = 2 * half;

    // Each contiguous group of `span` elements is processed independently
    for (int start = 0; start < N; start += span) {
      float *lower = h_Output + start;
      float *upper = lower + half;

      // Pair element k of the lower half with element k of the upper half
      for (int k = 0; k < half; ++k) {
        const float a = lower[k];
        const float b = upper[k];
        lower[k] = a + b;
        upper[k] = a - b;
      }
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
// Straightforward Walsh Transform: used to test both CPU and GPU FWT
//
// Evaluates every output element directly as a signed sum over all 2^log2N
// inputs, so the cost grows quadratically with the vector length. The running
// sum is kept in double precision and narrowed to float only when stored.
///////////////////////////////////////////////////////////////////////////////
extern "C" void slowWTcpu(float *h_Output, float *h_Input, int log2N) {
  const int N = 1 << log2N;

  for (int row = 0; row < N; ++row) {
    double acc = 0;

    for (int col = 0; col < N; ++col) {
      // The weight of input `col` in output `row` is +1 when (row & col) has
      // an even number of set bits and -1 when that number is odd.
      int oddBits = 0;
      for (int bits = row & col; bits != 0; bits >>= 1) {
        oddBits ^= (bits & 1);
      }

      const double sign = oddBits ? -1.0 : 1.0;
      acc += sign * h_Input[col];
    }

    h_Output[row] = (float)acc;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Reference CPU dyadic convolution.
// Extremely slow because of non-linear memory access patterns (cache thrashing)
//
// For every i in [0, 2^log2dataN):
//   h_Result[i] = sum over j in [0, 2^log2kernelN) of
//                 h_Data[(i ^ j) mod 2^log2dataN] * h_Kernel[j]
//
// h_Result and h_Data hold 2^log2dataN floats, h_Kernel holds 2^log2kernelN.
// When the kernel is no longer than the data, (i ^ j) is already below the
// data length and the modulo has no effect. When the kernel is longer, the
// XOR can set bits above the data length; those indices are wrapped back
// into the data vector instead of reading past the end of h_Data.
////////////////////////////////////////////////////////////////////////////////
extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data,
                                     float *h_Kernel, int log2dataN,
                                     int log2kernelN) {
  const int dataN = 1 << log2dataN;
  const int kernelN = 1 << log2kernelN;

  // dataN is a power of two, so (x & indexMask) == (x mod dataN)
  const int indexMask = dataN - 1;

  for (int i = 0; i < dataN; i++) {
    // Accumulate in double; narrowed to float only on store
    double sum = 0;

    for (int j = 0; j < kernelN; j++)
      sum += h_Data[(i ^ j) & indexMask] * h_Kernel[j];

    h_Result[i] = (float)sum;
  }
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`///////////////////////////////////////////////////////////////////////////////`
			`// CPU Fast Walsh Transform`
			`///////////////////////////////////////////////////////////////////////////////`
			`extern "C" void fwtCPU(float h_Output, float h_Input, int log2N) {`
			`const int N = 1 << log2N;`

			`for (int pos = 0; pos < N; pos++) h_Output[pos] = h_Input[pos];`

			`// Cycle through stages with different butterfly strides`
			`for (int stride = N / 2; stride >= 1; stride >>= 1) {`
			`// Cycle through subvectors of (2 * stride) elements`
			`for (int base = 0; base < N; base += 2 * stride)`

			`// Butterfly index within subvector of (2 * stride) size`
			`for (int j = 0; j < stride; j++) {`
			`int i0 = base + j + 0;`
			`int i1 = base + j + stride;`

			`float T1 = h_Output[i0];`
			`float T2 = h_Output[i1];`
			`h_Output[i0] = T1 + T2;`
			`h_Output[i1] = T1 - T2;`
			`}`
			`}`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`// Straightforward Walsh Transform: used to test both CPU and GPU FWT`
			`// Slow. Uses doubles because of straightforward accumulation`
			`///////////////////////////////////////////////////////////////////////////////`
			`extern "C" void slowWTcpu(float h_Output, float h_Input, int log2N) {`
			`const int N = 1 << log2N;`

			`for (int i = 0; i < N; i++) {`
			`double sum = 0;`

			`for (int j = 0; j < N; j++) {`
			`// Walsh-Hadamard quotient`
			`double q = 1.0;`

			`for (int t = i & j; t != 0; t >>= 1)`
			`if (t & 1) q = -q;`

			`sum += q * h_Input[j];`
			`}`

			`h_Output[i] = (float)sum;`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Reference CPU dyadic convolution.`
			`// Extremely slow because of non-linear memory access patterns (cache thrashing)`
			`////////////////////////////////////////////////////////////////////////////////`
			`extern "C" void dyadicConvolutionCPU(float h_Result, float h_Data,`
			`float *h_Kernel, int log2dataN,`
			`int log2kernelN) {`
			`const int dataN = 1 << log2dataN;`
			`const int kernelN = 1 << log2kernelN;`

			`for (int i = 0; i < dataN; i++) {`
			`double sum = 0;`

			`for (int j = 0; j < kernelN; j++) sum += h_Data[i ^ j] * h_Kernel[j];`

			`h_Result[i] = (float)sum;`
			`}`
			`}`