mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-28 13:49:16 +08:00
161 lines
5.5 KiB
Plaintext
161 lines
5.5 KiB
Plaintext
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/* Determine eigenvalues for large matrices for intervals that contained after
|
|
* the first step one eigenvalue
|
|
*/
|
|
|
|
#ifndef _BISECT_KERNEL_LARGE_ONEI_H_
|
|
#define _BISECT_KERNEL_LARGE_ONEI_H_
|
|
|
|
#include <cooperative_groups.h>
|
|
|
|
namespace cg = cooperative_groups;
|
|
|
|
// includes, project
|
|
#include "config.h"
|
|
#include "util.h"
|
|
|
|
// additional kernel
|
|
#include "bisect_util.cu"
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//! Determine eigenvalues for large matrices for intervals that after
|
|
//! the first step contained one eigenvalue
|
|
//! @param g_d diagonal elements of symmetric, tridiagonal matrix
|
|
//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix
|
|
//! @param n matrix size
|
|
//! @param num_intervals total number of intervals containing one eigenvalue
|
|
//! after the first step
|
|
//! @param g_left left interval limits
|
|
//! @param g_right right interval limits
|
|
//! @param g_pos index of interval / number of intervals that are smaller than
|
|
//! right interval limit
|
|
//! @param precision desired precision of eigenvalues
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
__global__ void bisectKernelLarge_OneIntervals(
|
|
float *g_d, float *g_s, const unsigned int n, unsigned int num_intervals,
|
|
float *g_left, float *g_right, unsigned int *g_pos, float precision) {
|
|
// Handle to thread block group
|
|
cg::thread_block cta = cg::this_thread_block();
|
|
const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;
|
|
|
|
__shared__ float s_left_scratch[MAX_THREADS_BLOCK];
|
|
__shared__ float s_right_scratch[MAX_THREADS_BLOCK];
|
|
|
|
// active interval of thread
|
|
// left and right limit of current interval
|
|
float left, right;
|
|
// number of threads smaller than the right limit (also corresponds to the
|
|
// global index of the eigenvalues contained in the active interval)
|
|
unsigned int right_count;
|
|
// flag if current thread converged
|
|
unsigned int converged = 0;
|
|
// midpoint when current interval is subdivided
|
|
float mid = 0.0f;
|
|
// number of eigenvalues less than mid
|
|
unsigned int mid_count = 0;
|
|
|
|
// read data from global memory
|
|
if (gtid < num_intervals) {
|
|
left = g_left[gtid];
|
|
right = g_right[gtid];
|
|
right_count = g_pos[gtid];
|
|
}
|
|
|
|
// flag to determine if all threads converged to eigenvalue
|
|
__shared__ unsigned int converged_all_threads;
|
|
|
|
// initialized shared flag
|
|
if (0 == threadIdx.x) {
|
|
converged_all_threads = 0;
|
|
}
|
|
|
|
cg::sync(cta);
|
|
|
|
// process until all threads converged to an eigenvalue
|
|
// while( 0 == converged_all_threads) {
|
|
while (true) {
|
|
atomicExch(&converged_all_threads, 1);
|
|
|
|
// update midpoint for all active threads
|
|
if ((gtid < num_intervals) && (0 == converged)) {
|
|
mid = computeMidpoint(left, right);
|
|
}
|
|
|
|
// find number of eigenvalues that are smaller than midpoint
|
|
mid_count = computeNumSmallerEigenvalsLarge(
|
|
g_d, g_s, n, mid, gtid, num_intervals, s_left_scratch, s_right_scratch,
|
|
converged, cta);
|
|
|
|
cg::sync(cta);
|
|
|
|
// for all active threads
|
|
if ((gtid < num_intervals) && (0 == converged)) {
|
|
// udpate intervals -- always one child interval survives
|
|
if (right_count == mid_count) {
|
|
right = mid;
|
|
} else {
|
|
left = mid;
|
|
}
|
|
|
|
// check for convergence
|
|
float t0 = right - left;
|
|
float t1 = max(abs(right), abs(left)) * precision;
|
|
|
|
if (t0 < min(precision, t1)) {
|
|
float lambda = computeMidpoint(left, right);
|
|
left = lambda;
|
|
right = lambda;
|
|
|
|
converged = 1;
|
|
} else {
|
|
atomicExch(&converged_all_threads, 0);
|
|
}
|
|
}
|
|
|
|
cg::sync(cta);
|
|
|
|
if (1 == converged_all_threads) {
|
|
break;
|
|
}
|
|
|
|
cg::sync(cta);
|
|
}
|
|
|
|
// write data back to global memory
|
|
cg::sync(cta);
|
|
|
|
if (gtid < num_intervals) {
|
|
// intervals converged so left and right interval limit are both identical
|
|
// and identical to the eigenvalue
|
|
g_left[gtid] = left;
|
|
}
|
|
}
|
|
|
|
#endif // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_
|