mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-24 23:59:18 +08:00
96 lines
3.8 KiB
C++
96 lines
3.8 KiB
C++
|
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
* * Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* * Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||
|
* contributors may be used to endorse or promote products derived
|
||
|
* from this software without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
///////////////////////////////////////////////////////////////////////////////
|
||
|
// CPU Fast Walsh Transform
|
||
|
///////////////////////////////////////////////////////////////////////////////
|
||
|
// Computes the fast Walsh transform of the first 2^log2N elements of h_Input
// and stores the result in h_Output. h_Input itself is only read.
//   h_Output - destination buffer, written for indices [0, 2^log2N)
//   h_Input  - source buffer, read for indices [0, 2^log2N)
//   log2N    - base-2 logarithm of the transform length
extern "C" void fwtCPU(float *h_Output, float *h_Input, int log2N) {
  const int length = 1 << log2N;

  // Seed the output with the input; every butterfly below then works in place
  // on h_Output.
  int idx = 0;
  while (idx < length) {
    h_Output[idx] = h_Input[idx];
    ++idx;
  }

  // One pass per stage: the butterfly half-span starts at length / 2 and is
  // halved after each pass until it has been processed at 1.
  int half = length / 2;
  while (half >= 1) {
    const int span = 2 * half;

    // Walk over consecutive groups of `span` elements.
    for (int start = 0; start < length; start += span) {
      float *lower = h_Output + start;
      float *upper = lower + half;

      // Pair element k of the lower half with element k of the upper half.
      for (int k = 0; k < half; ++k) {
        const float a = lower[k];
        const float b = upper[k];
        lower[k] = a + b;
        upper[k] = a - b;
      }
    }

    half >>= 1;
  }
}
|
||
|
|
||
|
///////////////////////////////////////////////////////////////////////////////
|
||
|
// Straightforward Walsh Transform: used to test both CPU and GPU FWT
|
||
|
// Slow. Uses doubles because of straightforward accumulation
|
||
|
///////////////////////////////////////////////////////////////////////////////
|
||
|
// Direct (quadratic-time) Walsh transform of the first 2^log2N elements of
// h_Input, written to h_Output. Each output element is a signed sum of all
// inputs, accumulated in double precision and narrowed to float on store.
//   h_Output - destination buffer, written for indices [0, 2^log2N)
//   h_Input  - source buffer, read for indices [0, 2^log2N)
//   log2N    - base-2 logarithm of the transform length
extern "C" void slowWTcpu(float *h_Output, float *h_Input, int log2N) {
  const int length = 1 << log2N;

  int row = 0;
  while (row < length) {
    double acc = 0;

    for (int col = 0; col < length; ++col) {
      // The coefficient is -1 when (row & col) has an odd number of set bits
      // and +1 otherwise; flip the flag once per set bit.
      bool negate = false;
      int bits = row & col;
      while (bits != 0) {
        if ((bits & 1) != 0) negate = !negate;
        bits >>= 1;
      }

      const double sign = negate ? -1.0 : 1.0;
      acc += sign * h_Input[col];
    }

    h_Output[row] = (float)acc;
    ++row;
  }
}
|
||
|
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
// Reference CPU dyadic convolution.
|
||
|
// Extremely slow because of non-linear memory access patterns (cache thrashing)
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
// Reference dyadic (XOR-indexed) convolution:
//   h_Result[i] = sum over j in [0, 2^log2kernelN) of h_Data[i ^ j] * h_Kernel[j]
// for every i in [0, 2^log2dataN). Each product is formed in float and the
// running sum is kept in double before being narrowed to float on store.
//   h_Result    - destination buffer, written for indices [0, 2^log2dataN)
//   h_Data      - data buffer, read at index (i ^ j)
//   h_Kernel    - kernel buffer, read for indices [0, 2^log2kernelN)
//   log2dataN   - base-2 logarithm of the data length
//   log2kernelN - base-2 logarithm of the kernel length
// NOTE(review): (i ^ j) stays below the data length only when the kernel is no
// longer than the data; presumably callers guarantee this - confirm.
extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data,
                                     float *h_Kernel, int log2dataN,
                                     int log2kernelN) {
  const int dataCount = 1 << log2dataN;
  const int kernelCount = 1 << log2kernelN;

  int out = 0;
  while (out < dataCount) {
    double acc = 0;

    // XOR of the output index with the tap index selects the data element.
    int tap = 0;
    while (tap < kernelCount) {
      acc += h_Data[out ^ tap] * h_Kernel[tap];
      ++tap;
    }

    h_Result[out] = (float)acc;
    ++out;
  }
}
|