/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * 1D DWT for Haar wavelet and signals with a length which is a power of 2. * The code reduces bank conflicts and non-coalesced reads / writes as * appropriate but does not fully remove them because the computational * overhead to achieve this would outweigh the benefit (see inline comments * for more details). * Large signals are subdivided into sub-signals with 512 elements and the * wavelet transform for these is computed with one block over 10 decomposition * levels. 
The resulting signal consisting of the approximation coefficients at * level X is then processed in a subsequent step on the device. This requires * interblock synchronization which is only possible on host side. * Detail coefficients which have been computed are not further referenced * during the decomposition so that they can be stored directly in their final * position in global memory. The transform and its storing scheme preserve * locality in the coefficients so that these writes are coalesced. * Approximation coefficients are stored in shared memory because they are * needed to compute the subsequent decomposition step. The topmost * approximation coefficient for a sub-signal processed by one block is stored * in a special global memory location to simplify the processing after the * interblock synchronization. * Most books on wavelets explain the Haar wavelet decomposition. A good freely * available resource is the Wavelet primer by Stollnitz et al. * http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf * http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf * The basis of all wavelet transforms is to decompose a signal into * approximation (a) and detail (d) coefficients where the detail tends to be * small or zero which allows / simplifies compression. The following "graphs" * demonstrate the transform for a signal * of length eight. The index always describes the decomposition level where * a coefficient arises. The input signal is interpreted as approximation signal * at level 0. The coefficients computed on the device are stored in the same * scheme as in the example. This data structure is particularly well suited for * compression and also preserves the hierarchical structure of the decomposition. 
------------------------------------------------- | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | ------------------------------------------------- ------------------------------------------------- | a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 | ------------------------------------------------- ------------------------------------------------- | a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 | ------------------------------------------------- ------------------------------------------------- | a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 | ------------------------------------------------- * Host code. */ #ifdef _WIN32 #define NOMINMAX #endif // includes, system #include #include #include #include #include // includes, project #include #include // constants which are used in host and device code #define INV_SQRT_2 0.70710678118654752440f; const unsigned int LOG_NUM_BANKS = 4; const unsigned int NUM_BANKS = 16; //////////////////////////////////////////////////////////////////////////////// // includes, kernels #include "dwtHaar1D_kernel.cuh" //////////////////////////////////////////////////////////////////////////////// // declaration, forward void runTest(int argc, char **argv); bool getLevels(unsigned int len, unsigned int *levels); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // run test runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //! 
Perform the wavelet decomposition //////////////////////////////////////////////////////////////////////////////// void runTest(int argc, char **argv) { bool bResult = false; // flag for final validation of the results char *s_fname = NULL, *r_gold_fname = NULL; char r_fname[256]; const char usage[] = { "\nUsage:\n" " dwtHaar1D --signal= --result= " "--gold=\n\n" " Input file containing the signal\n" " Output file storing the result of the wavelet " "decomposition\n" " Input file containing the reference result of the " "wavelet decomposition\n" "\nExample:\n" " ./dwtHaar1D\n" " --signal=signal.dat\n" " --result=result.dat\n" " --gold=regression.gold.dat\n"}; printf("%s Starting...\n\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest // Gflops/s findCudaDevice(argc, (const char **)argv); // file names, either specified as cmd line args or use default if (argc == 4) { char *tmp_sfname, *tmp_rfname, *tmp_goldfname; if ((getCmdLineArgumentString(argc, (const char **)argv, "signal", &tmp_sfname) != true) || (getCmdLineArgumentString(argc, (const char **)argv, "result", &tmp_rfname) != true) || (getCmdLineArgumentString(argc, (const char **)argv, "gold", &tmp_goldfname) != true)) { fprintf(stderr, "Invalid input syntax.\n%s", usage); exit(EXIT_FAILURE); } s_fname = sdkFindFilePath(tmp_sfname, argv[0]); r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]); strcpy(r_fname, tmp_rfname); } else { s_fname = sdkFindFilePath("signal.dat", argv[0]); r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]); strcpy(r_fname, "result.dat"); } printf("source file = \"%s\"\n", s_fname); printf("reference file = \"%s\"\n", r_fname); printf("gold file = \"%s\"\n", r_gold_fname); // read in signal unsigned int slength = 0; float *signal = NULL; if (s_fname == NULL) { fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage); exit(EXIT_FAILURE); } if (sdkReadFile(s_fname, &signal, &slength, false) == true) { printf("Reading 
signal from \"%s\"\n", s_fname); } else { exit(EXIT_FAILURE); } // get the number of decompositions necessary to perform a full decomposition unsigned int dlevels_complete = 0; if (true != getLevels(slength, &dlevels_complete)) { // error message fprintf(stderr, "Signal length not supported.\n"); // cleanup and abort free(signal); exit(EXIT_FAILURE); } // device in data float *d_idata = NULL; // device out data float *d_odata = NULL; // device approx_final data float *approx_final = NULL; // The very final approximation coefficient has to be written to the output // data, all others are reused as input data in the next global step and // therefore have to be written to the input data again. // The following flag indicates where to copy approx_final data // - 0 is input, 1 is output int approx_is_input; // allocate device mem const unsigned int smem_size = sizeof(float) * slength; checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size)); checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size)); checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size)); // copy input data to device checkCudaErrors( cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice)); // total number of threads // in the first decomposition step always one thread computes the average and // detail signal for one pair of adjacent values unsigned int num_threads_total_left = slength / 2; // decomposition levels performed in the current / next step unsigned int dlevels_step = dlevels_complete; // 1D signal so the arrangement of elements is also 1D dim3 block_size; dim3 grid_size; // number of decomposition levels left after one iteration on the device unsigned int dlevels_left = dlevels_complete; // if less or equal 1k elements, then the data can be processed in one block, // this avoids the Wait-For-Idle (WFI) on host side which is necessary if the // computation is split across multiple SM's if enough input data if (dlevels_complete <= 10) { // decomposition can be performed at once 
block_size.x = num_threads_total_left; approx_is_input = 0; } else { // 512 threads per block grid_size.x = (num_threads_total_left / 512); block_size.x = 512; // 512 threads corresponds to 10 decomposition steps dlevels_step = 10; dlevels_left -= 10; approx_is_input = 1; } // Initialize d_odata to 0.0f initValue<<>>(d_odata, 0.0f); // do until full decomposition is accomplished while (0 != num_threads_total_left) { // double the number of threads as bytes unsigned int mem_shared = (2 * block_size.x) * sizeof(float); // extra memory requirements to avoid bank conflicts mem_shared += ((2 * block_size.x) / NUM_BANKS) * sizeof(float); // run kernel dwtHaar1D<<>>( d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left, block_size.x); // Copy approx_final to appropriate location if (approx_is_input) { checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * 4, cudaMemcpyDeviceToDevice)); } else { checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * 4, cudaMemcpyDeviceToDevice)); } // update level variables if (dlevels_left < 10) { // approx_final = d_odata; approx_is_input = 0; } // more global steps necessary dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left; dlevels_left -= 10; // after each step only half the threads are used any longer // therefore after 10 steps 2^10 less threads num_threads_total_left = num_threads_total_left >> 10; // update block and grid size grid_size.x = (num_threads_total_left / 512) + (0 != (num_threads_total_left % 512)) ? 
1 : 0; if (grid_size.x <= 1) { block_size.x = num_threads_total_left; } } // get the result back from the server // allocate mem for the result float *odata = (float *)malloc(smem_size); checkCudaErrors( cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost)); // post processing // write file for regression test if (r_fname == NULL) { fprintf(stderr, "Cannot write the output file storing the result of the wavelet " "decomposition.\n%s", usage); exit(EXIT_FAILURE); } if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) == true) { printf("Writing result to \"%s\"\n", r_fname); } else { exit(EXIT_FAILURE); } // load the reference solution unsigned int len_reference = 0; float *reference = NULL; if (r_gold_fname == NULL) { fprintf(stderr, "Cannot read the file containing the reference result of the " "wavelet decomposition.\n%s", usage); exit(EXIT_FAILURE); } if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) == true) { printf("Reading reference result from \"%s\"\n", r_gold_fname); } else { exit(EXIT_FAILURE); } assert(slength == len_reference); // compare the computed solution and the reference bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f); free(reference); // free allocated host and device memory checkCudaErrors(cudaFree(d_odata)); checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(approx_final)); free(signal); free(odata); free(s_fname); free(r_gold_fname); printf(bResult ? "Test success!\n" : "Test failure!\n"); } //////////////////////////////////////////////////////////////////////////////// //! Get number of decomposition levels to perform a full decomposition //! Also check if the input signal size is suitable //! @return true if the number of decomposition levels could be determined //! and the signal length is supported by the implementation, //! otherwise false //! @param len length of input signal //! @param levels number of decomposition levels necessary to perform a full //! 
decomposition //////////////////////////////////////////////////////////////////////////////// bool getLevels(unsigned int len, unsigned int *levels) { bool retval = false; // currently signals up to a length of 2^20 supported for (unsigned int i = 0; i < 20; ++i) { if (len == (1 << i)) { *levels = i; retval = true; break; } } return retval; }