cuda-samples/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Computation of eigenvalues of a large symmetric, tridiagonal matrix */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>

// includes, project
#include "helper_functions.h"
#include "helper_cuda.h"
#include "config.h"
#include "structs.h"
#include "util.h"
#include "matlab.h"

#include "bisect_large.cuh"

// includes, kernels
#include "bisect_kernel_large.cuh"
#include "bisect_kernel_large_onei.cuh"
#include "bisect_kernel_large_multi.cuh"

////////////////////////////////////////////////////////////////////////////////
//! Allocate the device buffers referenced by the result handles and clear
//! them to zero.
//!
//! The two counters (g_num_one, g_num_blocks_mult) are single unsigned ints;
//! every other buffer holds mat_size elements of float or unsigned int.
//! Any CUDA failure is handled by checkCudaErrors.
//!
//! @param  result    handles to memory; every device pointer is overwritten
//! @param  mat_size  size of the matrix (number of elements per buffer)
////////////////////////////////////////////////////////////////////////////////
void initResultDataLargeMatrix(ResultDataLarge &result,
                               const unsigned int mat_size) {
  // byte sizes are kept in size_t so that sizeof(T) * mat_size is not
  // truncated to 32 bits
  const size_t mat_size_f = sizeof(float) * mat_size;
  const size_t mat_size_ui = sizeof(unsigned int) * mat_size;

  // All buffers are cleared with cudaMemset directly on the device. An
  // all-zero byte pattern is both 0.0f and 0u, so no host staging buffers
  // are needed (the previous implementation allocated two and never freed
  // them).

  // number of intervals containing only one eigenvalue after the first step
  checkCudaErrors(cudaMalloc((void **)&result.g_num_one, sizeof(unsigned int)));
  checkCudaErrors(cudaMemset(result.g_num_one, 0, sizeof(unsigned int)));

  // number of (thread) blocks of intervals with multiple eigenvalues after
  // the first iteration
  checkCudaErrors(
      cudaMalloc((void **)&result.g_num_blocks_mult, sizeof(unsigned int)));
  checkCudaErrors(
      cudaMemset(result.g_num_blocks_mult, 0, sizeof(unsigned int)));

  // intervals that contain exactly one eigenvalue after the first step
  checkCudaErrors(cudaMalloc((void **)&result.g_left_one, mat_size_f));
  checkCudaErrors(cudaMemset(result.g_left_one, 0, mat_size_f));
  checkCudaErrors(cudaMalloc((void **)&result.g_right_one, mat_size_f));
  checkCudaErrors(cudaMemset(result.g_right_one, 0, mat_size_f));
  checkCudaErrors(cudaMalloc((void **)&result.g_pos_one, mat_size_ui));
  checkCudaErrors(cudaMemset(result.g_pos_one, 0, mat_size_ui));

  // intervals that contain more than one eigenvalue after the first step
  checkCudaErrors(cudaMalloc((void **)&result.g_left_mult, mat_size_f));
  checkCudaErrors(cudaMemset(result.g_left_mult, 0, mat_size_f));
  checkCudaErrors(cudaMalloc((void **)&result.g_right_mult, mat_size_f));
  checkCudaErrors(cudaMemset(result.g_right_mult, 0, mat_size_f));
  checkCudaErrors(cudaMalloc((void **)&result.g_left_count_mult, mat_size_ui));
  checkCudaErrors(cudaMemset(result.g_left_count_mult, 0, mat_size_ui));
  checkCudaErrors(cudaMalloc((void **)&result.g_right_count_mult, mat_size_ui));
  checkCudaErrors(cudaMemset(result.g_right_count_mult, 0, mat_size_ui));

  // block bookkeeping for the multi-eigenvalue intervals
  checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult, mat_size_ui));
  checkCudaErrors(cudaMemset(result.g_blocks_mult, 0, mat_size_ui));
  checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult_sum, mat_size_ui));
  checkCudaErrors(cudaMemset(result.g_blocks_mult_sum, 0, mat_size_ui));

  // eigenvalues and their positions produced by the second step
  checkCudaErrors(cudaMalloc((void **)&result.g_lambda_mult, mat_size_f));
  checkCudaErrors(cudaMemset(result.g_lambda_mult, 0, mat_size_f));
  checkCudaErrors(cudaMalloc((void **)&result.g_pos_mult, mat_size_ui));
  checkCudaErrors(cudaMemset(result.g_pos_mult, 0, mat_size_ui));

  // cudaMemset may return before the fill has finished; wait here so the
  // function still returns with fully initialized buffers, as it did when
  // blocking host-to-device copies were used.
  checkCudaErrors(cudaDeviceSynchronize());
}

////////////////////////////////////////////////////////////////////////////////
//! Cleanup result memory
//!
//! Releases every device buffer referenced by the handles and resets each
//! handle to NULL. cudaFree() performs no operation on a null pointer, so
//! calling this function a second time on the same structure is harmless
//! instead of being a double free.
//!
//! @param result  handles to memory; all device pointers are NULL on return
////////////////////////////////////////////////////////////////////////////////
void cleanupResultDataLargeMatrix(ResultDataLarge &result) {
  // counters
  checkCudaErrors(cudaFree(result.g_num_one));
  result.g_num_one = NULL;
  checkCudaErrors(cudaFree(result.g_num_blocks_mult));
  result.g_num_blocks_mult = NULL;

  // intervals containing one eigenvalue
  checkCudaErrors(cudaFree(result.g_left_one));
  result.g_left_one = NULL;
  checkCudaErrors(cudaFree(result.g_right_one));
  result.g_right_one = NULL;
  checkCudaErrors(cudaFree(result.g_pos_one));
  result.g_pos_one = NULL;

  // intervals containing multiple eigenvalues
  checkCudaErrors(cudaFree(result.g_left_mult));
  result.g_left_mult = NULL;
  checkCudaErrors(cudaFree(result.g_right_mult));
  result.g_right_mult = NULL;
  checkCudaErrors(cudaFree(result.g_left_count_mult));
  result.g_left_count_mult = NULL;
  checkCudaErrors(cudaFree(result.g_right_count_mult));
  result.g_right_count_mult = NULL;

  // block bookkeeping
  checkCudaErrors(cudaFree(result.g_blocks_mult));
  result.g_blocks_mult = NULL;
  checkCudaErrors(cudaFree(result.g_blocks_mult_sum));
  result.g_blocks_mult_sum = NULL;

  // second-step eigenvalues and positions
  checkCudaErrors(cudaFree(result.g_lambda_mult));
  result.g_lambda_mult = NULL;
  checkCudaErrors(cudaFree(result.g_pos_mult));
  result.g_pos_mult = NULL;
}

////////////////////////////////////////////////////////////////////////////////
//! Run the kernels to compute the eigenvalues for large matrices
//!
//! Each iteration launches bisectKernelLarge with a single thread block,
//! reads back the two counters it produced, and then launches
//! bisectKernelLarge_OneIntervals and bisectKernelLarge_MultIntervals with
//! grids sized from those counters. A second-step kernel is skipped when its
//! counter is zero, because a grid with zero blocks is not a valid launch
//! configuration. Accumulated timings are printed as per-iteration averages.
//!
//! @param  input   handles to input data
//! @param  result  handles to result data
//! @param  mat_size  matrix size
//! @param  precision  desired precision of eigenvalues
//! @param  lg  lower limit of Gerschgorin interval
//! @param  ug  upper limit of Gerschgorin interval
//! @param  iterations  number of iterations (for timing)
////////////////////////////////////////////////////////////////////////////////
void computeEigenvaluesLargeMatrix(const InputData &input,
                                   const ResultDataLarge &result,
                                   const unsigned int mat_size,
                                   const float precision, const float lg,
                                   const float ug,
                                   const unsigned int iterations) {
  // first step: one block with the maximum number of threads
  dim3 blocks(1, 1, 1);
  dim3 threads(MAX_THREADS_BLOCK, 1, 1);

  StopWatchInterface *timer_step1 = NULL;
  StopWatchInterface *timer_step2_one = NULL;
  StopWatchInterface *timer_step2_mult = NULL;
  StopWatchInterface *timer_total = NULL;
  sdkCreateTimer(&timer_step1);
  sdkCreateTimer(&timer_step2_one);
  sdkCreateTimer(&timer_step2_mult);
  sdkCreateTimer(&timer_total);

  sdkStartTimer(&timer_total);

  // do for multiple iterations to improve timing accuracy
  for (unsigned int iter = 0; iter < iterations; ++iter) {
    sdkStartTimer(&timer_step1);
    bisectKernelLarge<<<blocks, threads>>>(
        input.g_a, input.g_b, mat_size, lg, ug, 0, mat_size, precision,
        result.g_num_one, result.g_num_blocks_mult, result.g_left_one,
        result.g_right_one, result.g_pos_one, result.g_left_mult,
        result.g_right_mult, result.g_left_count_mult,
        result.g_right_count_mult, result.g_blocks_mult,
        result.g_blocks_mult_sum);

    getLastCudaError("Kernel launch failed.");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer_step1);

    // get the number of intervals containing one eigenvalue after the first
    // processing step
    unsigned int num_one_intervals = 0;
    checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one,
                               sizeof(unsigned int), cudaMemcpyDeviceToHost));

    // compute eigenvalues for intervals that contained only one eigenvalue
    // after the first processing step
    sdkStartTimer(&timer_step2_one);

    // nothing to refine when the first step produced no such interval; the
    // launch is skipped because the grid size would be zero
    if (num_one_intervals > 0) {
      // use always max number of available threads to better balance load
      // times for matrix data
      dim3 grid_onei(getNumBlocksLinear(num_one_intervals, MAX_THREADS_BLOCK),
                     1, 1);
      dim3 threads_onei(MAX_THREADS_BLOCK, 1, 1);

      bisectKernelLarge_OneIntervals<<<grid_onei, threads_onei>>>(
          input.g_a, input.g_b, mat_size, num_one_intervals, result.g_left_one,
          result.g_right_one, result.g_pos_one, precision);

      getLastCudaError("bisectKernelLarge_OneIntervals() FAILED.");
      checkCudaErrors(cudaDeviceSynchronize());
    }

    sdkStopTimer(&timer_step2_one);

    // process intervals that contained more than one eigenvalue after
    // the first processing step

    // get the number of blocks of intervals that contain, in total when
    // each interval contains only one eigenvalue, not more than
    // MAX_THREADS_BLOCK threads
    unsigned int num_blocks_mult = 0;
    checkCudaErrors(cudaMemcpy(&num_blocks_mult, result.g_num_blocks_mult,
                               sizeof(unsigned int), cudaMemcpyDeviceToHost));

    sdkStartTimer(&timer_step2_mult);

    // same guard as above: zero blocks means there is no work to launch
    if (num_blocks_mult > 0) {
      // setup the execution environment
      dim3 grid_mult(num_blocks_mult, 1, 1);
      dim3 threads_mult(MAX_THREADS_BLOCK, 1, 1);

      bisectKernelLarge_MultIntervals<<<grid_mult, threads_mult>>>(
          input.g_a, input.g_b, mat_size, result.g_blocks_mult,
          result.g_blocks_mult_sum, result.g_left_mult, result.g_right_mult,
          result.g_left_count_mult, result.g_right_count_mult,
          result.g_lambda_mult, result.g_pos_mult, precision);

      getLastCudaError("bisectKernelLarge_MultIntervals() FAILED.");
      checkCudaErrors(cudaDeviceSynchronize());
    }

    sdkStopTimer(&timer_step2_mult);
  }

  sdkStopTimer(&timer_total);

  printf("Average time step 1: %f ms\n",
         sdkGetTimerValue(&timer_step1) / (float)iterations);
  printf("Average time step 2, one intervals: %f ms\n",
         sdkGetTimerValue(&timer_step2_one) / (float)iterations);
  printf("Average time step 2, mult intervals: %f ms\n",
         sdkGetTimerValue(&timer_step2_mult) / (float)iterations);

  printf("Average time TOTAL: %f ms\n",
         sdkGetTimerValue(&timer_total) / (float)iterations);

  sdkDeleteTimer(&timer_step1);
  sdkDeleteTimer(&timer_step2_one);
  sdkDeleteTimer(&timer_step2_mult);
  sdkDeleteTimer(&timer_total);
}

////////////////////////////////////////////////////////////////////////////////
//! Copy values into the eigenvalue array at the slots given by 1-based
//! positions, rejecting any position outside 1..mat_size.
//! @param  eigenvals  destination array with mat_size elements
//! @param  mat_size   number of elements in eigenvals
//! @param  values     values to store (count elements)
//! @param  positions  1-based destination index of each value (count elements)
//! @param  count      number of (value, position) pairs to process
//! @param  label      name of the data set, used in the error message
//! @return true if every position was valid, false at the first invalid one
////////////////////////////////////////////////////////////////////////////////
static bool scatterEigenvaluesByPosition(float *eigenvals,
                                         const unsigned int mat_size,
                                         const float *values,
                                         const unsigned int *positions,
                                         const unsigned int count,
                                         const char *label) {
  for (unsigned int i = 0; i < count; ++i) {
    const unsigned int pos = positions[i];

    // pos == 0 would wrap around to a huge index in (pos - 1)
    if (pos < 1 || pos > mat_size) {
      fprintf(stderr,
              "Invalid eigenvalue position %u at entry %u of %s "
              "(expected 1..%u)\n",
              pos, i, label, mat_size);
      return false;
    }

    eigenvals[pos - 1] = values[i];
  }

  return true;
}

////////////////////////////////////////////////////////////////////////////////
//! Process the result, that is obtain result from device and do simple sanity
//! checking
//!
//! The eigenvalues are assembled from the two device result sets using the
//! 1-based positions stored with them. With user_defined == 1 they are written
//! to a file with writeTridiagSymMatlab and no comparison is done; otherwise
//! they are compared against the contents of "reference.dat".
//!
//! @param  input   handles to input data
//! @param  result  handles to result data
//! @param  mat_size  matrix size
//! @param  filename  output filename
//! @param  user_defined  1 to store the result and skip the self-check
//! @param  exec_path  path passed to sdkFindFilePath to locate reference.dat
//! @return true if the result was stored (user_defined == 1) or matches the
//!         reference; false on mismatch, invalid device data, a missing or
//!         wrongly sized reference file, or host allocation failure
////////////////////////////////////////////////////////////////////////////////
bool processResultDataLargeMatrix(const InputData &input,
                                  const ResultDataLarge &result,
                                  const unsigned int mat_size,
                                  const char *filename,
                                  const unsigned int user_defined,
                                  char *exec_path) {
  bool bCompareResult = false;
  const size_t mat_size_ui = sizeof(unsigned int) * mat_size;
  const size_t mat_size_f = sizeof(float) * mat_size;

  // host copies of the device data that is actually needed to assemble the
  // eigenvalues
  float *lambda_mult = (float *)malloc(mat_size_f);
  unsigned int *pos_mult = (unsigned int *)malloc(mat_size_ui);
  float *left_one = (float *)malloc(mat_size_f);
  unsigned int *pos_one = (unsigned int *)malloc(mat_size_ui);

  // zero-initialized so that a slot no position refers to has a defined value
  float *eigenvals = (float *)calloc(mat_size, sizeof(float));

  bool extracted = false;

  if (mat_size > 0 && (NULL == lambda_mult || NULL == pos_mult ||
                       NULL == left_one || NULL == pos_one ||
                       NULL == eigenvals)) {
    fprintf(stderr, "Failed to allocate host memory for the result data\n");
  } else {
    // data from intervals that contained more than one eigenvalue after
    // the first processing step
    checkCudaErrors(cudaMemcpy(lambda_mult, result.g_lambda_mult, mat_size_f,
                               cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(pos_mult, result.g_pos_mult, mat_size_ui,
                               cudaMemcpyDeviceToHost));

    // data for intervals that contained one eigenvalue after the first
    // processing step
    unsigned int num_one_intervals = 0;
    checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one,
                               sizeof(unsigned int), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(left_one, result.g_left_one, mat_size_f,
                               cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(pos_one, result.g_pos_one, mat_size_ui,
                               cudaMemcpyDeviceToHost));

    if (num_one_intervals > mat_size) {
      // the subtraction below would wrap around
      fprintf(stderr,
              "Invalid number of one-eigenvalue intervals %u "
              "(matrix size %u)\n",
              num_one_intervals, mat_size);
    } else {
      // all eigenvalues not found in the first step come from the second one
      const unsigned int sum_blocks_mult = mat_size - num_one_intervals;

      // singleton intervals generated in the second step, then those
      // generated in the first step
      extracted =
          scatterEigenvaluesByPosition(eigenvals, mat_size, lambda_mult,
                                       pos_mult, sum_blocks_mult,
                                       "multi-eigenvalue intervals") &&
          scatterEigenvaluesByPosition(eigenvals, mat_size, left_one, pos_one,
                                       num_one_intervals,
                                       "one-eigenvalue intervals");
    }
  }

  if (!extracted) {
    // error already reported; fall through to cleanup and return false
  } else if (1 == user_defined) {
    // store result
    writeTridiagSymMatlab(filename, input.a, input.b + 1, eigenvals, mat_size);
    // getLastCudaError( sdkWriteFilef( filename, eigenvals, mat_size, 0.0f));

    printf("User requests non-default argument(s), skipping self-check!\n");
    bCompareResult = true;
  } else {
    // compare with reference solution

    float *reference = NULL;
    unsigned int input_data_size = 0;

    // explicit checks instead of assert() so they are still active when
    // NDEBUG is defined
    char *ref_path = sdkFindFilePath("reference.dat", exec_path);

    if (NULL == ref_path) {
      fprintf(stderr, "Could not locate reference.dat\n");
    } else if (!sdkReadFile(ref_path, &reference, &input_data_size, false)) {
      fprintf(stderr, "Could not read reference data from %s\n", ref_path);
    } else if (input_data_size != mat_size) {
      fprintf(stderr,
              "Reference data has %u entries but the matrix size is %u\n",
              input_data_size, mat_size);
    } else {
      // there's an imprecision of Sturm count computation which makes an
      // additional offset necessary
      const float tolerance = 1.0e-5f + 5.0e-6f;

      bCompareResult =
          sdkCompareL2fe(reference, eigenvals, mat_size, tolerance);
    }

    free(ref_path);
    free(reference);
  }

  freePtr(eigenvals);
  freePtr(lambda_mult);
  freePtr(pos_mult);
  freePtr(left_one);
  freePtr(pos_one);

  return bCompareResult;
}
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/* Computation of eigenvalues of a large symmetric, tridiagonal matrix */`

			`// includes, system`
			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <string.h>`
			`#include <math.h>`
			`#include <float.h>`

			`// includes, project`
			`#include "helper_functions.h"`
			`#include "helper_cuda.h"`
			`#include "config.h"`
			`#include "structs.h"`
			`#include "util.h"`
			`#include "matlab.h"`

			`#include "bisect_large.cuh"`

			`// includes, kernels`
			`#include "bisect_kernel_large.cuh"`
			`#include "bisect_kernel_large_onei.cuh"`
			`#include "bisect_kernel_large_multi.cuh"`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Initialize variables and memory for result`
			`//! @param result handles to memory`
			`//! @param matrix_size size of the matrix`
			`////////////////////////////////////////////////////////////////////////////////`
			`void initResultDataLargeMatrix(ResultDataLarge &result,`
			`const unsigned int mat_size) {`
			`// helper variables to initialize memory`
			`unsigned int zero = 0;`
			`unsigned int mat_size_f = sizeof(float) * mat_size;`
			`unsigned int mat_size_ui = sizeof(unsigned int) * mat_size;`

			`float tempf = (float )malloc(mat_size_f);`
			`unsigned int tempui = (unsigned int )malloc(mat_size_ui);`

			`for (unsigned int i = 0; i < mat_size; ++i) {`
			`tempf[i] = 0.0f;`
			`tempui[i] = 0;`
			`}`

			`// number of intervals containing only one eigenvalue after the first step`
			`checkCudaErrors(cudaMalloc((void **)&result.g_num_one, sizeof(unsigned int)));`
			`checkCudaErrors(cudaMemcpy(result.g_num_one, &zero, sizeof(unsigned int),`
			`cudaMemcpyHostToDevice));`

			`// number of (thread) blocks of intervals with multiple eigenvalues after`
			`// the first iteration`
			`checkCudaErrors(`
			`cudaMalloc((void **)&result.g_num_blocks_mult, sizeof(unsigned int)));`
			`checkCudaErrors(cudaMemcpy(result.g_num_blocks_mult, &zero,`
			`sizeof(unsigned int), cudaMemcpyHostToDevice));`

			`checkCudaErrors(cudaMalloc((void **)&result.g_left_one, mat_size_f));`
			`checkCudaErrors(cudaMalloc((void **)&result.g_right_one, mat_size_f));`
			`checkCudaErrors(cudaMalloc((void **)&result.g_pos_one, mat_size_ui));`

			`checkCudaErrors(cudaMalloc((void **)&result.g_left_mult, mat_size_f));`
			`checkCudaErrors(cudaMalloc((void **)&result.g_right_mult, mat_size_f));`
			`checkCudaErrors(cudaMalloc((void **)&result.g_left_count_mult, mat_size_ui));`
			`checkCudaErrors(cudaMalloc((void **)&result.g_right_count_mult, mat_size_ui));`

			`checkCudaErrors(`
			`cudaMemcpy(result.g_left_one, tempf, mat_size_f, cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(result.g_right_one, tempf, mat_size_f,`
			`cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(result.g_pos_one, tempui, mat_size_ui,`
			`cudaMemcpyHostToDevice));`

			`checkCudaErrors(cudaMemcpy(result.g_left_mult, tempf, mat_size_f,`
			`cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(result.g_right_mult, tempf, mat_size_f,`
			`cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(result.g_left_count_mult, tempui, mat_size_ui,`
			`cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(result.g_right_count_mult, tempui, mat_size_ui,`
			`cudaMemcpyHostToDevice));`

			`checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult, mat_size_ui));`
			`checkCudaErrors(cudaMemcpy(result.g_blocks_mult, tempui, mat_size_ui,`
			`cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult_sum, mat_size_ui));`
			`checkCudaErrors(cudaMemcpy(result.g_blocks_mult_sum, tempui, mat_size_ui,`
			`cudaMemcpyHostToDevice));`

			`checkCudaErrors(cudaMalloc((void **)&result.g_lambda_mult, mat_size_f));`
			`checkCudaErrors(cudaMemcpy(result.g_lambda_mult, tempf, mat_size_f,`
			`cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMalloc((void **)&result.g_pos_mult, mat_size_ui));`
			`checkCudaErrors(cudaMemcpy(result.g_pos_mult, tempf, mat_size_ui,`
			`cudaMemcpyHostToDevice));`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Cleanup result memory`
			`//! @param result handles to memory`
			`////////////////////////////////////////////////////////////////////////////////`
			`void cleanupResultDataLargeMatrix(ResultDataLarge &result) {`
			`checkCudaErrors(cudaFree(result.g_num_one));`
			`checkCudaErrors(cudaFree(result.g_num_blocks_mult));`
			`checkCudaErrors(cudaFree(result.g_left_one));`
			`checkCudaErrors(cudaFree(result.g_right_one));`
			`checkCudaErrors(cudaFree(result.g_pos_one));`
			`checkCudaErrors(cudaFree(result.g_left_mult));`
			`checkCudaErrors(cudaFree(result.g_right_mult));`
			`checkCudaErrors(cudaFree(result.g_left_count_mult));`
			`checkCudaErrors(cudaFree(result.g_right_count_mult));`
			`checkCudaErrors(cudaFree(result.g_blocks_mult));`
			`checkCudaErrors(cudaFree(result.g_blocks_mult_sum));`
			`checkCudaErrors(cudaFree(result.g_lambda_mult));`
			`checkCudaErrors(cudaFree(result.g_pos_mult));`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Run the kernels to compute the eigenvalues for large matrices`
			`//! @param input handles to input data`
			`//! @param result handles to result data`
			`//! @param mat_size matrix size`
			`//! @param precision desired precision of eigenvalues`
			`//! @param lg lower limit of Gerschgorin interval`
			`//! @param ug upper limit of Gerschgorin interval`
			`//! @param iterations number of iterations (for timing)`
			`////////////////////////////////////////////////////////////////////////////////`
			`void computeEigenvaluesLargeMatrix(const InputData &input,`
			`const ResultDataLarge &result,`
			`const unsigned int mat_size,`
			`const float precision, const float lg,`
			`const float ug,`
			`const unsigned int iterations) {`
			`dim3 blocks(1, 1, 1);`
			`dim3 threads(MAX_THREADS_BLOCK, 1, 1);`

			`StopWatchInterface *timer_step1 = NULL;`
			`StopWatchInterface *timer_step2_one = NULL;`
			`StopWatchInterface *timer_step2_mult = NULL;`
			`StopWatchInterface *timer_total = NULL;`
			`sdkCreateTimer(&timer_step1);`
			`sdkCreateTimer(&timer_step2_one);`
			`sdkCreateTimer(&timer_step2_mult);`
			`sdkCreateTimer(&timer_total);`

			`sdkStartTimer(&timer_total);`

			`// do for multiple iterations to improve timing accuracy`
			`for (unsigned int iter = 0; iter < iterations; ++iter) {`
			`sdkStartTimer(&timer_step1);`
			`bisectKernelLarge<<<blocks, threads>>>(`
			`input.g_a, input.g_b, mat_size, lg, ug, 0, mat_size, precision,`
			`result.g_num_one, result.g_num_blocks_mult, result.g_left_one,`
			`result.g_right_one, result.g_pos_one, result.g_left_mult,`
			`result.g_right_mult, result.g_left_count_mult,`
			`result.g_right_count_mult, result.g_blocks_mult,`
			`result.g_blocks_mult_sum);`

			`getLastCudaError("Kernel launch failed.");`
			`checkCudaErrors(cudaDeviceSynchronize());`
			`sdkStopTimer(&timer_step1);`

			`// get the number of intervals containing one eigenvalue after the first`
			`// processing step`
			`unsigned int num_one_intervals;`
			`checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one,`
			`sizeof(unsigned int), cudaMemcpyDeviceToHost));`

			`dim3 grid_onei;`
			`grid_onei.x = getNumBlocksLinear(num_one_intervals, MAX_THREADS_BLOCK);`
			`dim3 threads_onei;`
			`// use always max number of available threads to better balance load times`
			`// for matrix data`
			`threads_onei.x = MAX_THREADS_BLOCK;`

			`// compute eigenvalues for intervals that contained only one eigenvalue`
			`// after the first processing step`
			`sdkStartTimer(&timer_step2_one);`

			`bisectKernelLarge_OneIntervals<<<grid_onei, threads_onei>>>(`
			`input.g_a, input.g_b, mat_size, num_one_intervals, result.g_left_one,`
			`result.g_right_one, result.g_pos_one, precision);`

			`getLastCudaError("bisectKernelLarge_OneIntervals() FAILED.");`
			`checkCudaErrors(cudaDeviceSynchronize());`
			`sdkStopTimer(&timer_step2_one);`

			`// process intervals that contained more than one eigenvalue after`
			`// the first processing step`

			`// get the number of blocks of intervals that contain, in total when`
			`// each interval contains only one eigenvalue, not more than`
			`// MAX_THREADS_BLOCK threads`
			`unsigned int num_blocks_mult = 0;`
			`checkCudaErrors(cudaMemcpy(&num_blocks_mult, result.g_num_blocks_mult,`
			`sizeof(unsigned int), cudaMemcpyDeviceToHost));`

			`// setup the execution environment`
			`dim3 grid_mult(num_blocks_mult, 1, 1);`
			`dim3 threads_mult(MAX_THREADS_BLOCK, 1, 1);`

			`sdkStartTimer(&timer_step2_mult);`

			`bisectKernelLarge_MultIntervals<<<grid_mult, threads_mult>>>(`
			`input.g_a, input.g_b, mat_size, result.g_blocks_mult,`
			`result.g_blocks_mult_sum, result.g_left_mult, result.g_right_mult,`
			`result.g_left_count_mult, result.g_right_count_mult,`
			`result.g_lambda_mult, result.g_pos_mult, precision);`

			`getLastCudaError("bisectKernelLarge_MultIntervals() FAILED.");`
			`checkCudaErrors(cudaDeviceSynchronize());`
			`sdkStopTimer(&timer_step2_mult);`
			`}`

			`sdkStopTimer(&timer_total);`

			`printf("Average time step 1: %f ms\n",`
			`sdkGetTimerValue(&timer_step1) / (float)iterations);`
			`printf("Average time step 2, one intervals: %f ms\n",`
			`sdkGetTimerValue(&timer_step2_one) / (float)iterations);`
			`printf("Average time step 2, mult intervals: %f ms\n",`
			`sdkGetTimerValue(&timer_step2_mult) / (float)iterations);`

			`printf("Average time TOTAL: %f ms\n",`
			`sdkGetTimerValue(&timer_total) / (float)iterations);`

			`sdkDeleteTimer(&timer_step1);`
			`sdkDeleteTimer(&timer_step2_one);`
			`sdkDeleteTimer(&timer_step2_mult);`
			`sdkDeleteTimer(&timer_total);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Process the result, that is obtain result from device and do simple sanity`
			`//! checking`
			`//! @param input handles to input data`
			`//! @param result handles to result data`
			`//! @param mat_size matrix size`
			`//! @param filename output filename`
			`////////////////////////////////////////////////////////////////////////////////`
			`bool processResultDataLargeMatrix(const InputData &input,`
			`const ResultDataLarge &result,`
			`const unsigned int mat_size,`
			`const char *filename,`
			`const unsigned int user_defined,`
			`char *exec_path) {`
			`bool bCompareResult = false;`
			`const unsigned int mat_size_ui = sizeof(unsigned int) * mat_size;`
			`const unsigned int mat_size_f = sizeof(float) * mat_size;`

			`// copy data from intervals that contained more than one eigenvalue after`
			`// the first processing step`
			`float lambda_mult = (float )malloc(sizeof(float) * mat_size);`
			`checkCudaErrors(cudaMemcpy(lambda_mult, result.g_lambda_mult,`
			`sizeof(float) * mat_size, cudaMemcpyDeviceToHost));`
			`unsigned int *pos_mult =`
			`(unsigned int )malloc(sizeof(unsigned int) mat_size);`
			`checkCudaErrors(cudaMemcpy(pos_mult, result.g_pos_mult,`
			`sizeof(unsigned int) * mat_size,`
			`cudaMemcpyDeviceToHost));`

			`unsigned int *blocks_mult_sum =`
			`(unsigned int )malloc(sizeof(unsigned int) mat_size);`
			`checkCudaErrors(cudaMemcpy(blocks_mult_sum, result.g_blocks_mult_sum,`
			`sizeof(unsigned int) * mat_size,`
			`cudaMemcpyDeviceToHost));`

			`unsigned int num_one_intervals;`
			`checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one,`
			`sizeof(unsigned int), cudaMemcpyDeviceToHost));`

			`unsigned int sum_blocks_mult = mat_size - num_one_intervals;`

			`// copy data for intervals that contained one eigenvalue after the first`
			`// processing step`
			`float left_one = (float )malloc(mat_size_f);`
			`float right_one = (float )malloc(mat_size_f);`
			`unsigned int pos_one = (unsigned int )malloc(mat_size_ui);`
			`checkCudaErrors(cudaMemcpy(left_one, result.g_left_one, mat_size_f,`
			`cudaMemcpyDeviceToHost));`
			`checkCudaErrors(cudaMemcpy(right_one, result.g_right_one, mat_size_f,`
			`cudaMemcpyDeviceToHost));`
			`checkCudaErrors(cudaMemcpy(pos_one, result.g_pos_one, mat_size_ui,`
			`cudaMemcpyDeviceToHost));`

			`// extract eigenvalues`
			`float eigenvals = (float )malloc(mat_size_f);`

			`// singleton intervals generated in the second step`
			`for (unsigned int i = 0; i < sum_blocks_mult; ++i) {`
			`eigenvals[pos_mult[i] - 1] = lambda_mult[i];`
			`}`

			`// singleton intervals generated in the first step`
			`unsigned int index = 0;`

			`for (unsigned int i = 0; i < num_one_intervals; ++i, ++index) {`
			`eigenvals[pos_one[i] - 1] = left_one[i];`
			`}`

			`if (1 == user_defined) {`
			`// store result`
			`writeTridiagSymMatlab(filename, input.a, input.b + 1, eigenvals, mat_size);`
			`// getLastCudaError( sdkWriteFilef( filename, eigenvals, mat_size, 0.0f));`

			`printf("User requests non-default argument(s), skipping self-check!\n");`
			`bCompareResult = true;`
			`} else {`
			`// compare with reference solution`

			`float *reference = NULL;`
			`unsigned int input_data_size = 0;`

			`char *ref_path = sdkFindFilePath("reference.dat", exec_path);`
			`assert(NULL != ref_path);`
			`sdkReadFile(ref_path, &reference, &input_data_size, false);`
			`assert(input_data_size == mat_size);`

			`// there's an imprecision of Sturm count computation which makes an`
			`// additional offset necessary`
			`float tolerance = 1.0e-5f + 5.0e-6f;`

			`if (sdkCompareL2fe(reference, eigenvals, mat_size, tolerance) == true) {`
			`bCompareResult = true;`
			`} else {`
			`bCompareResult = false;`
			`}`

			`free(ref_path);`
			`free(reference);`
			`}`

			`freePtr(eigenvals);`
			`freePtr(lambda_mult);`
			`freePtr(pos_mult);`
			`freePtr(blocks_mult_sum);`
			`freePtr(left_one);`
			`freePtr(right_one);`
			`freePtr(pos_one);`

			`return bCompareResult;`
			`}`