cuda-samples/Samples/eigenvalues/bisect_kernel_large_onei.cuh

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Determine eigenvalues for large matrices for intervals that contained one
 * eigenvalue after the first step
 */

#ifndef _BISECT_KERNEL_LARGE_ONEI_H_
#define _BISECT_KERNEL_LARGE_ONEI_H_

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// includes, project
#include "config.h"
#include "util.h"

// additional kernel
#include "bisect_util.cu"

////////////////////////////////////////////////////////////////////////////////
//! Determine eigenvalues for large matrices for intervals that contained one
//! eigenvalue after the first step.
//!
//! Every thread whose global index is smaller than \a num_intervals owns one
//! interval and refines it by repeated bisection. The whole thread block keeps
//! iterating until each of its threads has flagged its interval as converged;
//! the resulting eigenvalue is then written to \a g_left.
//!
//! Launch layout: only the x components of blockDim / blockIdx / threadIdx are
//! used to form the global thread index. The two shared scratch buffers hold
//! MAX_THREADS_BLOCK floats each, so blockDim.x is presumably expected to be
//! <= MAX_THREADS_BLOCK -- confirm against the launch site.
//!
//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
//!              (only forwarded to computeNumSmallerEigenvalsLarge)
//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
//!              (only forwarded to computeNumSmallerEigenvalsLarge)
//! @param  n    matrix size
//! @param  num_intervals  total number of intervals containing one eigenvalue
//!                         after the first step
//! @param g_left  left interval limits on input; overwritten with the
//!                converged eigenvalue of each interval on output
//! @param g_right  right interval limits (read only by this kernel)
//! @param g_pos  index of interval / number of intervals that are smaller than
//!               right interval limit (read only by this kernel)
//! @param  precision  desired precision of eigenvalues
////////////////////////////////////////////////////////////////////////////////
__global__ void bisectKernelLarge_OneIntervals(
    float *g_d, float *g_s, const unsigned int n, unsigned int num_intervals,
    float *g_left, float *g_right, unsigned int *g_pos, float precision) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;

  // scratch buffers handed to computeNumSmallerEigenvalsLarge
  __shared__ float s_left_scratch[MAX_THREADS_BLOCK];
  __shared__ float s_right_scratch[MAX_THREADS_BLOCK];

  // active interval of thread
  // left and right limit of current interval; zero-initialized so that threads
  // without an interval never carry indeterminate values
  float left = 0.0f;
  float right = 0.0f;
  // number of eigenvalues smaller than the right limit (also corresponds to
  // the global index of the eigenvalue contained in the active interval)
  unsigned int right_count = 0;
  // flag if current thread converged
  unsigned int converged = 0;
  // midpoint when current interval is subdivided
  float mid = 0.0f;
  // number of eigenvalues less than mid
  unsigned int mid_count = 0;

  // read data from global memory
  if (gtid < num_intervals) {
    left = g_left[gtid];
    right = g_right[gtid];
    right_count = g_pos[gtid];
  }

  // flag to determine if all threads converged to eigenvalue
  __shared__ unsigned int converged_all_threads;

  // initialize shared flag
  if (0 == threadIdx.x) {
    converged_all_threads = 0;
  }

  cg::sync(cta);

  // process until all threads converged to an eigenvalue
  while (true) {
    // optimistically assume convergence; any thread that still has work left
    // clears the flag again further down
    atomicExch(&converged_all_threads, 1);

    // update midpoint for all active threads
    if ((gtid < num_intervals) && (0 == converged)) {
      mid = computeMidpoint(left, right);
    }

    // find number of eigenvalues that are smaller than midpoint
    // NOTE: called by every thread of the block, including threads without an
    // interval and threads that already converged -- presumably because the
    // helper synchronizes the block internally; confirm in bisect_util.cu
    mid_count = computeNumSmallerEigenvalsLarge(
        g_d, g_s, n, mid, gtid, num_intervals, s_left_scratch, s_right_scratch,
        converged, cta);

    cg::sync(cta);

    // for all active threads
    if ((gtid < num_intervals) && (0 == converged)) {
      // The midpoint coinciding with one of the limits means the interval can
      // no longer be split: the update below either leaves it unchanged or
      // collapses it to a single point, and every later iteration would see
      // the same state. Without this guard the tolerance test below can fail
      // forever (e.g. when the interval touches zero, so that t1 underflows
      // towards 0, or when precision is below float resolution) and the whole
      // block would spin indefinitely.
      const bool stalled = (mid == left) || (mid == right);

      // update intervals -- always one child interval survives
      if (right_count == mid_count) {
        right = mid;
      } else {
        left = mid;
      }

      // check for convergence
      float t0 = right - left;
      float t1 = max(abs(right), abs(left)) * precision;

      if ((t0 < min(precision, t1)) || stalled) {
        float lambda = computeMidpoint(left, right);
        left = lambda;
        right = lambda;

        converged = 1;
      } else {
        // at least this thread needs another iteration
        atomicExch(&converged_all_threads, 0);
      }
    }

    // make all flag updates visible before the flag is inspected
    cg::sync(cta);

    if (1 == converged_all_threads) {
      break;
    }

    // keep the read above separated from the flag reset at the top of the
    // next iteration
    cg::sync(cta);
  }

  // write data back to global memory
  cg::sync(cta);

  if (gtid < num_intervals) {
    // intervals converged so left and right interval limit are both identical
    // and identical to the eigenvalue
    g_left[gtid] = left;
  }
}

#endif  // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/* Determine eigenvalues for large matrices for intervals that contained after`
			`* the first step one eigenvalue`
			`*/`

			`#ifndef _BISECT_KERNEL_LARGE_ONEI_H_`
			`#define _BISECT_KERNEL_LARGE_ONEI_H_`

			`#include <cooperative_groups.h>`

			`namespace cg = cooperative_groups;`

			`// includes, project`
			`#include "config.h"`
			`#include "util.h"`

			`// additional kernel`
			`#include "bisect_util.cu"`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Determine eigenvalues for large matrices for intervals that after`
			`//! the first step contained one eigenvalue`
			`//! @param g_d diagonal elements of symmetric, tridiagonal matrix`
			`//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix`
			`//! @param n matrix size`
			`//! @param num_intervals total number of intervals containing one eigenvalue`
			`//! after the first step`
			`//! @param g_left left interval limits`
			`//! @param g_right right interval limits`
			`//! @param g_pos index of interval / number of intervals that are smaller than`
			`//! right interval limit`
			`//! @param precision desired precision of eigenvalues`
			`////////////////////////////////////////////////////////////////////////////////`
			`__global__ void bisectKernelLarge_OneIntervals(`
			`float g_d, float g_s, const unsigned int n, unsigned int num_intervals,`
			`float g_left, float g_right, unsigned int *g_pos, float precision) {`
			`// Handle to thread block group`
			`cg::thread_block cta = cg::this_thread_block();`
			`const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;`

			`__shared__ float s_left_scratch[MAX_THREADS_BLOCK];`
			`__shared__ float s_right_scratch[MAX_THREADS_BLOCK];`

			`// active interval of thread`
			`// left and right limit of current interval`
			`float left, right;`
			`// number of threads smaller than the right limit (also corresponds to the`
			`// global index of the eigenvalues contained in the active interval)`
			`unsigned int right_count;`
			`// flag if current thread converged`
			`unsigned int converged = 0;`
			`// midpoint when current interval is subdivided`
			`float mid = 0.0f;`
			`// number of eigenvalues less than mid`
			`unsigned int mid_count = 0;`

			`// read data from global memory`
			`if (gtid < num_intervals) {`
			`left = g_left[gtid];`
			`right = g_right[gtid];`
			`right_count = g_pos[gtid];`
			`}`

			`// flag to determine if all threads converged to eigenvalue`
			`__shared__ unsigned int converged_all_threads;`

			`// initialized shared flag`
			`if (0 == threadIdx.x) {`
			`converged_all_threads = 0;`
			`}`

			`cg::sync(cta);`

			`// process until all threads converged to an eigenvalue`
			`// while( 0 == converged_all_threads) {`
			`while (true) {`
			`atomicExch(&converged_all_threads, 1);`

			`// update midpoint for all active threads`
			`if ((gtid < num_intervals) && (0 == converged)) {`
			`mid = computeMidpoint(left, right);`
			`}`

			`// find number of eigenvalues that are smaller than midpoint`
			`mid_count = computeNumSmallerEigenvalsLarge(`
			`g_d, g_s, n, mid, gtid, num_intervals, s_left_scratch, s_right_scratch,`
			`converged, cta);`

			`cg::sync(cta);`

			`// for all active threads`
			`if ((gtid < num_intervals) && (0 == converged)) {`
			`// udpate intervals -- always one child interval survives`
			`if (right_count == mid_count) {`
			`right = mid;`
			`} else {`
			`left = mid;`
			`}`

			`// check for convergence`
			`float t0 = right - left;`
			`float t1 = max(abs(right), abs(left)) * precision;`

			`if (t0 < min(precision, t1)) {`
			`float lambda = computeMidpoint(left, right);`
			`left = lambda;`
			`right = lambda;`

			`converged = 1;`
			`} else {`
			`atomicExch(&converged_all_threads, 0);`
			`}`
			`}`

			`cg::sync(cta);`

			`if (1 == converged_all_threads) {`
			`break;`
			`}`

			`cg::sync(cta);`
			`}`

			`// write data back to global memory`
			`cg::sync(cta);`

			`if (gtid < num_intervals) {`
			`// intervals converged so left and right interval limit are both identical`
			`// and identical to the eigenvalue`
			`g_left[gtid] = left;`
			`}`
			`}`

			`#endif // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_`