cuda-samples/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
* 1D DWT for Haar wavelet and signals with a length which is a power of 2.
* The code reduces bank conflicts and non-coalesced reads / writes as
* appropriate but does not fully remove them because the computational
* overhead to achieve this would outweighs the benefit (see inline comments
* for more details).
* Large signals are subdivided into sub-signals with 512 elements and the
* wavelet transform for these is computed with one block over 10 decomposition
* levels. The resulting signal consisting of the approximation coefficients at
* level X is then processed in a subsequent step on the device. This requires
* interblock synchronization which is only possible on host side.
* Detail coefficients which have been computed are not further referenced
* during the decomposition so that they can be stored directly in their final
* position in global memory. The transform and its storing scheme preserve
* locality in the coefficients so that these writes are coalesced.
* Approximation coefficients are stored in shared memory because they are
* needed to compute the subsequent decomposition step. The top most
* approximation coefficient for a sub-signal processed by one block is stored
* in a special global memory location to simplify the processing after the
* interblock synchronization.
* Most books on wavelets explain the Haar wavelet decomposition. A good freely
* available resource is the Wavelet primer by Stollnitz et al.
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf
* The basic of all Wavelet transforms is to decompose a signal into
* approximation (a) and detail (d) coefficients where the detail tends to be
* small or zero which allows / simplifies compression. The following "graphs"
* demonstrate the transform for a signal
* of length eight. The index always describes the decomposition level where
* a coefficient arises. The input signal is interpreted as approximation signal
* at level 0. The coefficients computed on the device are stored in the same
* scheme as in the example. This data structure is particularly well suited for
* compression and also preserves the hierarchical structure of the
decomposition.

-------------------------------------------------
| a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 |
-------------------------------------------------

-------------------------------------------------
| a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------

-------------------------------------------------
| a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------

-------------------------------------------------
| a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------

* Host code.
*/

#ifdef _WIN32
#define NOMINMAX
#endif

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>

// includes, project
#include <helper_functions.h>
#include <helper_cuda.h>

// constants which are used in host and device code
#define INV_SQRT_2 0.70710678118654752440f;
const unsigned int LOG_NUM_BANKS = 4;
const unsigned int NUM_BANKS = 16;

////////////////////////////////////////////////////////////////////////////////
// includes, kernels
#include "dwtHaar1D_kernel.cuh"

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
bool getLevels(unsigned int len, unsigned int *levels);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  // run test
  runTest(argc, argv);
}

////////////////////////////////////////////////////////////////////////////////
//! Perform the wavelet decomposition
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  bool bResult = false;  // flag for final validation of the results

  char *s_fname = NULL, *r_gold_fname = NULL;
  char r_fname[256];
  const char usage[] = {
      "\nUsage:\n"
      "  dwtHaar1D --signal=<signal_file> --result=<result_file> "
      "--gold=<gold_file>\n\n"
      "  <signal_file> Input file containing the signal\n"
      "  <result_file> Output file storing the result of the wavelet "
      "decomposition\n"
      "  <gold_file>   Input file containing the reference result of the "
      "wavelet decomposition\n"
      "\nExample:\n"
      "  ./dwtHaar1D\n"
      "       --signal=signal.dat\n"
      "       --result=result.dat\n"
      "       --gold=regression.gold.dat\n"};

  printf("%s Starting...\n\n", argv[0]);

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  findCudaDevice(argc, (const char **)argv);

  // file names, either specified as cmd line args or use default
  if (argc == 4) {
    char *tmp_sfname, *tmp_rfname, *tmp_goldfname;

    if ((getCmdLineArgumentString(argc, (const char **)argv, "signal",
                                  &tmp_sfname) != true) ||
        (getCmdLineArgumentString(argc, (const char **)argv, "result",
                                  &tmp_rfname) != true) ||
        (getCmdLineArgumentString(argc, (const char **)argv, "gold",
                                  &tmp_goldfname) != true)) {
      fprintf(stderr, "Invalid input syntax.\n%s", usage);
      exit(EXIT_FAILURE);
    }

    s_fname = sdkFindFilePath(tmp_sfname, argv[0]);
    r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]);
    strcpy(r_fname, tmp_rfname);
  } else {
    s_fname = sdkFindFilePath("signal.dat", argv[0]);
    r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]);
    strcpy(r_fname, "result.dat");
  }

  printf("source file    = \"%s\"\n", s_fname);
  printf("reference file = \"%s\"\n", r_fname);
  printf("gold file      = \"%s\"\n", r_gold_fname);

  // read in signal
  unsigned int slength = 0;
  float *signal = NULL;

  if (s_fname == NULL) {
    fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage);

    exit(EXIT_FAILURE);
  }

  if (sdkReadFile(s_fname, &signal, &slength, false) == true) {
    printf("Reading signal from \"%s\"\n", s_fname);
  } else {
    exit(EXIT_FAILURE);
  }

  // get the number of decompositions necessary to perform a full decomposition
  unsigned int dlevels_complete = 0;

  if (true != getLevels(slength, &dlevels_complete)) {
    // error message
    fprintf(stderr, "Signal length not supported.\n");
    // cleanup and abort
    free(signal);
    exit(EXIT_FAILURE);
  }

  // device in data
  float *d_idata = NULL;
  // device out data
  float *d_odata = NULL;
  // device approx_final data
  float *approx_final = NULL;
  // The very final approximation coefficient has to be written to the output
  // data, all others are reused as input data in the next global step and
  // therefore have to be written to the input data again.
  // The following flag indicates where to copy approx_final data
  //   - 0 is input, 1 is output
  int approx_is_input;

  // allocate device mem
  const unsigned int smem_size = sizeof(float) * slength;
  checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size));
  checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size));
  checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size));
  // copy input data to device
  checkCudaErrors(
      cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice));

  // total number of threads
  // in the first decomposition step always one thread computes the average and
  // detail signal for one pair of adjacent values
  unsigned int num_threads_total_left = slength / 2;
  // decomposition levels performed in the current / next step
  unsigned int dlevels_step = dlevels_complete;

  // 1D signal so the arrangement of elements is also 1D
  dim3 block_size;
  dim3 grid_size;

  // number of decomposition levels left after one iteration on the device
  unsigned int dlevels_left = dlevels_complete;

  // if less or equal 1k elements, then the data can be processed in one block,
  // this avoids the Wait-For-Idle (WFI) on host side which is necessary if the
  // computation is split across multiple SM's if enough input data
  if (dlevels_complete <= 10) {
    // decomposition can be performed at once
    block_size.x = num_threads_total_left;
    approx_is_input = 0;
  } else {
    // 512 threads per block
    grid_size.x = (num_threads_total_left / 512);
    block_size.x = 512;

    // 512 threads corresponds to 10 decomposition steps
    dlevels_step = 10;
    dlevels_left -= 10;

    approx_is_input = 1;
  }

  // Initialize d_odata to 0.0f
  initValue<<<grid_size, block_size>>>(d_odata, 0.0f);

  // do until full decomposition is accomplished
  while (0 != num_threads_total_left) {
    // double the number of threads as bytes
    unsigned int mem_shared = (2 * block_size.x) * sizeof(float);
    // extra memory requirements to avoid bank conflicts
    mem_shared += ((2 * block_size.x) / NUM_BANKS) * sizeof(float);

    // run kernel
    dwtHaar1D<<<grid_size, block_size, mem_shared>>>(
        d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left,
        block_size.x);

    // Copy approx_final to appropriate location
    if (approx_is_input) {
      checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * 4,
                                 cudaMemcpyDeviceToDevice));
    } else {
      checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * 4,
                                 cudaMemcpyDeviceToDevice));
    }

    // update level variables
    if (dlevels_left < 10) {
      // approx_final = d_odata;
      approx_is_input = 0;
    }

    // more global steps necessary
    dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;
    dlevels_left -= 10;

    // after each step only half the threads are used any longer
    // therefore after 10 steps 2^10 less threads
    num_threads_total_left = num_threads_total_left >> 10;

    // update block and grid size
    grid_size.x =
        (num_threads_total_left / 512) + (0 != (num_threads_total_left % 512))
            ? 1
            : 0;

    if (grid_size.x <= 1) {
      block_size.x = num_threads_total_left;
    }
  }

  // get the result back from the server
  // allocate mem for the result
  float *odata = (float *)malloc(smem_size);
  checkCudaErrors(
      cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost));

  // post processing
  // write file for regression test
  if (r_fname == NULL) {
    fprintf(stderr,
            "Cannot write the output file storing the result of the wavelet "
            "decomposition.\n%s",
            usage);
    exit(EXIT_FAILURE);
  }

  if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) == true) {
    printf("Writing result to \"%s\"\n", r_fname);
  } else {
    exit(EXIT_FAILURE);
  }

  // load the reference solution
  unsigned int len_reference = 0;
  float *reference = NULL;

  if (r_gold_fname == NULL) {
    fprintf(stderr,
            "Cannot read the file containing the reference result of the "
            "wavelet decomposition.\n%s",
            usage);

    exit(EXIT_FAILURE);
  }

  if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) == true) {
    printf("Reading reference result from \"%s\"\n", r_gold_fname);
  } else {
    exit(EXIT_FAILURE);
  }

  assert(slength == len_reference);

  // compare the computed solution and the reference
  bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f);
  free(reference);

  // free allocated host and device memory
  checkCudaErrors(cudaFree(d_odata));
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(approx_final));

  free(signal);
  free(odata);
  free(s_fname);
  free(r_gold_fname);

  printf(bResult ? "Test success!\n" : "Test failure!\n");
}

////////////////////////////////////////////////////////////////////////////////
//! Get number of decomposition levels to perform a full decomposition
//! Also check if the input signal size is suitable
//! @return  true if the number of decomposition levels could be determined
//!          and the signal length is supported by the implementation,
//!          otherwise false
//! @param   len  length of input signal
//! @param   levels  number of decomposition levels necessary to perform a full
//!           decomposition
////////////////////////////////////////////////////////////////////////////////
bool getLevels(unsigned int len, unsigned int *levels) {
  bool retval = false;

  // currently signals up to a length of 2^20 supported
  for (unsigned int i = 0; i < 20; ++i) {
    if (len == (1 << i)) {
      *levels = i;
      retval = true;
      break;
    }
  }

  return retval;
}
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`* 1D DWT for Haar wavelet and signals with a length which is a power of 2.`
			`* The code reduces bank conflicts and non-coalesced reads / writes as`
			`* appropriate but does not fully remove them because the computational`
			`* overhead to achieve this would outweighs the benefit (see inline comments`
			`* for more details).`
			`* Large signals are subdivided into sub-signals with 512 elements and the`
			`* wavelet transform for these is computed with one block over 10 decomposition`
			`* levels. The resulting signal consisting of the approximation coefficients at`
			`* level X is then processed in a subsequent step on the device. This requires`
			`* interblock synchronization which is only possible on host side.`
			`* Detail coefficients which have been computed are not further referenced`
			`* during the decomposition so that they can be stored directly in their final`
			`* position in global memory. The transform and its storing scheme preserve`
			`* locality in the coefficients so that these writes are coalesced.`
			`* Approximation coefficients are stored in shared memory because they are`
			`* needed to compute the subsequent decomposition step. The top most`
			`* approximation coefficient for a sub-signal processed by one block is stored`
			`* in a special global memory location to simplify the processing after the`
			`* interblock synchronization.`
			`* Most books on wavelets explain the Haar wavelet decomposition. A good freely`
			`* available resource is the Wavelet primer by Stollnitz et al.`
			`* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf`
			`* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf`
			`* The basic of all Wavelet transforms is to decompose a signal into`
			`* approximation (a) and detail (d) coefficients where the detail tends to be`
			`* small or zero which allows / simplifies compression. The following "graphs"`
			`* demonstrate the transform for a signal`
			`* of length eight. The index always describes the decomposition level where`
			`* a coefficient arises. The input signal is interpreted as approximation signal`
			`* at level 0. The coefficients computed on the device are stored in the same`
			`* scheme as in the example. This data structure is particularly well suited for`
			`* compression and also preserves the hierarchical structure of the`
			`decomposition.`

			`-------------------------------------------------`
			`\| a_0 \| a_0 \| a_0 \| a_0 \| a_0 \| a_0 \| a_0 \| a_0 \|`
			`-------------------------------------------------`

			`-------------------------------------------------`
			`\| a_1 \| a_1 \| a_1 \| a_1 \| d_1 \| d_1 \| d_1 \| d_1 \|`
			`-------------------------------------------------`

			`-------------------------------------------------`
			`\| a_2 \| a_2 \| d_2 \| d_2 \| d_1 \| d_1 \| d_1 \| d_1 \|`
			`-------------------------------------------------`

			`-------------------------------------------------`
			`\| a_3 \| d_3 \| d_2 \| d_2 \| d_1 \| d_1 \| d_1 \| d_1 \|`
			`-------------------------------------------------`

			`* Host code.`
			`*/`

			`#ifdef _WIN32`
			`#define NOMINMAX`
			`#endif`

			`// includes, system`
			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <string.h>`
			`#include <math.h>`
			`#include <assert.h>`

			`// includes, project`
			`#include <helper_functions.h>`
			`#include <helper_cuda.h>`

			`// constants which are used in host and device code`
			`#define INV_SQRT_2 0.70710678118654752440f;`
			`const unsigned int LOG_NUM_BANKS = 4;`
			`const unsigned int NUM_BANKS = 16;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// includes, kernels`
			`#include "dwtHaar1D_kernel.cuh"`

			`////////////////////////////////////////////////////////////////////////////////`
			`// declaration, forward`
			`void runTest(int argc, char **argv);`
			`bool getLevels(unsigned int len, unsigned int *levels);`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Program main`
			`////////////////////////////////////////////////////////////////////////////////`
			`int main(int argc, char **argv) {`
			`// run test`
			`runTest(argc, argv);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Perform the wavelet decomposition`
			`////////////////////////////////////////////////////////////////////////////////`
			`void runTest(int argc, char **argv) {`
			`bool bResult = false; // flag for final validation of the results`

			`char s_fname = NULL, r_gold_fname = NULL;`
			`char r_fname[256];`
			`const char usage[] = {`
			`"\nUsage:\n"`
			`" dwtHaar1D --signal=<signal_file> --result=<result_file> "`
			`"--gold=<gold_file>\n\n"`
			`" <signal_file> Input file containing the signal\n"`
			`" <result_file> Output file storing the result of the wavelet "`
			`"decomposition\n"`
			`" <gold_file> Input file containing the reference result of the "`
			`"wavelet decomposition\n"`
			`"\nExample:\n"`
			`" ./dwtHaar1D\n"`
			`" --signal=signal.dat\n"`
			`" --result=result.dat\n"`
			`" --gold=regression.gold.dat\n"};`

			`printf("%s Starting...\n\n", argv[0]);`

			`// use command-line specified CUDA device, otherwise use device with highest`
			`// Gflops/s`
			`findCudaDevice(argc, (const char **)argv);`

			`// file names, either specified as cmd line args or use default`
			`if (argc == 4) {`
			`char tmp_sfname, tmp_rfname, *tmp_goldfname;`

			`if ((getCmdLineArgumentString(argc, (const char **)argv, "signal",`
			`&tmp_sfname) != true) \|\|`
			`(getCmdLineArgumentString(argc, (const char **)argv, "result",`
			`&tmp_rfname) != true) \|\|`
			`(getCmdLineArgumentString(argc, (const char **)argv, "gold",`
			`&tmp_goldfname) != true)) {`
			`fprintf(stderr, "Invalid input syntax.\n%s", usage);`
			`exit(EXIT_FAILURE);`
			`}`

			`s_fname = sdkFindFilePath(tmp_sfname, argv[0]);`
			`r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]);`
			`strcpy(r_fname, tmp_rfname);`
			`} else {`
			`s_fname = sdkFindFilePath("signal.dat", argv[0]);`
			`r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]);`
			`strcpy(r_fname, "result.dat");`
			`}`

			`printf("source file = \"%s\"\n", s_fname);`
			`printf("reference file = \"%s\"\n", r_fname);`
			`printf("gold file = \"%s\"\n", r_gold_fname);`

			`// read in signal`
			`unsigned int slength = 0;`
			`float *signal = NULL;`

			`if (s_fname == NULL) {`
			`fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage);`

			`exit(EXIT_FAILURE);`
			`}`

			`if (sdkReadFile(s_fname, &signal, &slength, false) == true) {`
			`printf("Reading signal from \"%s\"\n", s_fname);`
			`} else {`
			`exit(EXIT_FAILURE);`
			`}`

			`// get the number of decompositions necessary to perform a full decomposition`
			`unsigned int dlevels_complete = 0;`

			`if (true != getLevels(slength, &dlevels_complete)) {`
			`// error message`
			`fprintf(stderr, "Signal length not supported.\n");`
			`// cleanup and abort`
			`free(signal);`
			`exit(EXIT_FAILURE);`
			`}`

			`// device in data`
			`float *d_idata = NULL;`
			`// device out data`
			`float *d_odata = NULL;`
			`// device approx_final data`
			`float *approx_final = NULL;`
			`// The very final approximation coefficient has to be written to the output`
			`// data, all others are reused as input data in the next global step and`
			`// therefore have to be written to the input data again.`
			`// The following flag indicates where to copy approx_final data`
			`// - 0 is input, 1 is output`
			`int approx_is_input;`

			`// allocate device mem`
			`const unsigned int smem_size = sizeof(float) * slength;`
			`checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size));`
			`checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size));`
			`checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size));`
			`// copy input data to device`
			`checkCudaErrors(`
			`cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice));`

			`// total number of threads`
			`// in the first decomposition step always one thread computes the average and`
			`// detail signal for one pair of adjacent values`
			`unsigned int num_threads_total_left = slength / 2;`
			`// decomposition levels performed in the current / next step`
			`unsigned int dlevels_step = dlevels_complete;`

			`// 1D signal so the arrangement of elements is also 1D`
			`dim3 block_size;`
			`dim3 grid_size;`

			`// number of decomposition levels left after one iteration on the device`
			`unsigned int dlevels_left = dlevels_complete;`

			`// if less or equal 1k elements, then the data can be processed in one block,`
			`// this avoids the Wait-For-Idle (WFI) on host side which is necessary if the`
			`// computation is split across multiple SM's if enough input data`
			`if (dlevels_complete <= 10) {`
			`// decomposition can be performed at once`
			`block_size.x = num_threads_total_left;`
			`approx_is_input = 0;`
			`} else {`
			`// 512 threads per block`
			`grid_size.x = (num_threads_total_left / 512);`
			`block_size.x = 512;`

			`// 512 threads corresponds to 10 decomposition steps`
			`dlevels_step = 10;`
			`dlevels_left -= 10;`

			`approx_is_input = 1;`
			`}`

			`// Initialize d_odata to 0.0f`
			`initValue<<<grid_size, block_size>>>(d_odata, 0.0f);`

			`// do until full decomposition is accomplished`
			`while (0 != num_threads_total_left) {`
			`// double the number of threads as bytes`
			`unsigned int mem_shared = (2 * block_size.x) * sizeof(float);`
			`// extra memory requirements to avoid bank conflicts`
			`mem_shared += ((2 * block_size.x) / NUM_BANKS) * sizeof(float);`

			`// run kernel`
			`dwtHaar1D<<<grid_size, block_size, mem_shared>>>(`
			`d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left,`
			`block_size.x);`

			`// Copy approx_final to appropriate location`
			`if (approx_is_input) {`
			`checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * 4,`
			`cudaMemcpyDeviceToDevice));`
			`} else {`
			`checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * 4,`
			`cudaMemcpyDeviceToDevice));`
			`}`

			`// update level variables`
			`if (dlevels_left < 10) {`
			`// approx_final = d_odata;`
			`approx_is_input = 0;`
			`}`

			`// more global steps necessary`
			`dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;`
			`dlevels_left -= 10;`

			`// after each step only half the threads are used any longer`
			`// therefore after 10 steps 2^10 less threads`
			`num_threads_total_left = num_threads_total_left >> 10;`

			`// update block and grid size`
			`grid_size.x =`
			`(num_threads_total_left / 512) + (0 != (num_threads_total_left % 512))`
			`? 1`
			`: 0;`

			`if (grid_size.x <= 1) {`
			`block_size.x = num_threads_total_left;`
			`}`
			`}`

			`// get the result back from the server`
			`// allocate mem for the result`
			`float odata = (float )malloc(smem_size);`
			`checkCudaErrors(`
			`cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost));`

			`// post processing`
			`// write file for regression test`
			`if (r_fname == NULL) {`
			`fprintf(stderr,`
			`"Cannot write the output file storing the result of the wavelet "`
			`"decomposition.\n%s",`
			`usage);`
			`exit(EXIT_FAILURE);`
			`}`

			`if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) == true) {`
			`printf("Writing result to \"%s\"\n", r_fname);`
			`} else {`
			`exit(EXIT_FAILURE);`
			`}`

			`// load the reference solution`
			`unsigned int len_reference = 0;`
			`float *reference = NULL;`

			`if (r_gold_fname == NULL) {`
			`fprintf(stderr,`
			`"Cannot read the file containing the reference result of the "`
			`"wavelet decomposition.\n%s",`
			`usage);`

			`exit(EXIT_FAILURE);`
			`}`

			`if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) == true) {`
			`printf("Reading reference result from \"%s\"\n", r_gold_fname);`
			`} else {`
			`exit(EXIT_FAILURE);`
			`}`

			`assert(slength == len_reference);`

			`// compare the computed solution and the reference`
			`bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f);`
			`free(reference);`

			`// free allocated host and device memory`
			`checkCudaErrors(cudaFree(d_odata));`
			`checkCudaErrors(cudaFree(d_idata));`
			`checkCudaErrors(cudaFree(approx_final));`

			`free(signal);`
			`free(odata);`
			`free(s_fname);`
			`free(r_gold_fname);`

			`printf(bResult ? "Test success!\n" : "Test failure!\n");`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Get number of decomposition levels to perform a full decomposition`
			`//! Also check if the input signal size is suitable`
			`//! @return true if the number of decomposition levels could be determined`
			`//! and the signal length is supported by the implementation,`
			`//! otherwise false`
			`//! @param len length of input signal`
			`//! @param levels number of decomposition levels necessary to perform a full`
			`//! decomposition`
			`////////////////////////////////////////////////////////////////////////////////`
			`bool getLevels(unsigned int len, unsigned int *levels) {`
			`bool retval = false;`

			`// currently signals up to a length of 2^20 supported`
			`for (unsigned int i = 0; i < 20; ++i) {`
			`if (len == (1 << i)) {`
			`*levels = i;`
			`retval = true;`
			`break;`
			`}`
			`}`

			`return retval;`
			`}`