mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-24 20:39:17 +08:00
224 lines
8.6 KiB
Plaintext
224 lines
8.6 KiB
Plaintext
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/* Perform second step of bisection algorithm for large matrices for
|
|
 * intervals that contained more than one eigenvalue after the first step
|
|
*/
|
|
|
|
#ifndef _BISECT_KERNEL_LARGE_MULTI_H_
|
|
#define _BISECT_KERNEL_LARGE_MULTI_H_
|
|
|
|
#include <cooperative_groups.h>
|
|
|
|
namespace cg = cooperative_groups;
|
|
// includes, project
|
|
#include "config.h"
|
|
#include "util.h"
|
|
|
|
// additional kernel
|
|
#include "bisect_util.cu"
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
//! Perform second step of bisection algorithm for large matrices for
//! intervals that after the first step contained more than one eigenvalue.
//!
//! Each thread block handles one block of intervals: the intervals with
//! indices [blocks_mult[blockIdx.x], blocks_mult[blockIdx.x + 1]) are loaded
//! into shared memory, refined in a loop until the shared convergence flag
//! stays set, and then one (g_lambda, g_pos) pair per remaining interval is
//! written starting at offset blocks_mult_sum[blockIdx.x].
//!
//! Launch requirements (not checked by the kernel):
//!  - threads are indexed by threadIdx.x only;
//!  - blockDim.x <= MAX_THREADS_BLOCK, because the shared arrays hold
//!    2 * MAX_THREADS_BLOCK entries and the compaction list is indexed up to
//!    threadIdx.x + blockDim.x;
//!  - blocks_mult holds at least gridDim.x + 1 entries (element
//!    blockIdx.x + 1 is read).
//!  NOTE(review): loads and write-back are guarded by
//!  tid < num_threads_active, so presumably the number of intervals per block
//!  never exceeds blockDim.x -- confirm against the host-side setup.
//!
//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
//! @param  n    matrix size
//! @param  blocks_mult  start addresses of blocks of intervals that are
//!                      processed by one block of threads, each of the
//!                      intervals contains more than one eigenvalue
//! @param  blocks_mult_sum  per-block offset into g_lambda / g_pos at which
//!                          the results of a block are written (presumably
//!                          the running total of eigenvalues / singleton
//!                          intervals of the preceding blocks -- confirm
//!                          against the caller)
//! @param  g_left   left limits of intervals
//! @param  g_right  right limits of intervals
//! @param  g_left_count  number of eigenvalues less than left limits
//! @param  g_right_count  number of eigenvalues less than right limits
//! @param  g_lambda  output: final eigenvalues (left limit of each converged
//!                   interval)
//! @param  g_pos  output: index of eigenvalue (in ascending order), taken
//!                from the right count of each converged interval
//! @param  precision  desired precision of eigenvalues
////////////////////////////////////////////////////////////////////////////////
__global__ void bisectKernelLarge_MultIntervals(
    float *g_d, float *g_s, const unsigned int n, unsigned int *blocks_mult,
    unsigned int *blocks_mult_sum, float *g_left, float *g_right,
    unsigned int *g_left_count, unsigned int *g_right_count, float *g_lambda,
    unsigned int *g_pos, float precision) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  const unsigned int tid = threadIdx.x;

  // left and right limits of interval
  __shared__ float s_left[2 * MAX_THREADS_BLOCK];
  __shared__ float s_right[2 * MAX_THREADS_BLOCK];

  // number of eigenvalues smaller than interval limits
  __shared__ unsigned int s_left_count[2 * MAX_THREADS_BLOCK];
  __shared__ unsigned int s_right_count[2 * MAX_THREADS_BLOCK];

  // helper array for chunk compaction of second chunk
  __shared__ unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1];
  // compaction list helper for exclusive scan: shifted by one element so that
  // s_compaction_list[0] can stay 0
  unsigned int *s_compaction_list_exc = s_compaction_list + 1;

  // flag if all threads are converged
  __shared__ unsigned int all_threads_converged;
  // number of active threads
  __shared__ unsigned int num_threads_active;
  // number of threads to employ for compaction
  __shared__ unsigned int num_threads_compaction;
  // flag if second chunk has to be compacted
  __shared__ unsigned int compact_second_chunk;

  // parameters of block of intervals processed by this block of threads
  __shared__ unsigned int c_block_start;
  __shared__ unsigned int c_block_end;
  __shared__ unsigned int c_block_offset_output;

  // midpoint of currently active interval of the thread
  float mid = 0.0f;
  // number of eigenvalues smaller than \a mid
  unsigned int mid_count = 0;
  // current interval parameter; zero-initialized (like mid / mid_count)
  // because (left == right) is evaluated below by every thread, including
  // threads for which the subdivision helper may not assign these values
  float left = 0.0f;
  float right = 0.0f;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // helper for compaction, keep track which threads have a second child
  unsigned int is_active_second = 0;

  // initialize common start conditions (single writer, published to the
  // whole block by the barrier below)
  if (0 == tid) {
    c_block_start = blocks_mult[blockIdx.x];
    c_block_end = blocks_mult[blockIdx.x + 1];
    c_block_offset_output = blocks_mult_sum[blockIdx.x];

    num_threads_active = c_block_end - c_block_start;
    s_compaction_list[0] = 0;
    num_threads_compaction = ceilPow2(num_threads_active);

    all_threads_converged = 1;
    compact_second_chunk = 0;
  }

  cg::sync(cta);

  // read data into shared memory
  if (tid < num_threads_active) {
    s_left[tid] = g_left[c_block_start + tid];
    s_right[tid] = g_right[c_block_start + tid];
    s_left_count[tid] = g_left_count[c_block_start + tid];
    s_right_count[tid] = g_right_count[c_block_start + tid];
  }

  // clear the compaction list before the first iteration as well: the loop
  // only clears it at the end of each pass, while the compaction scan is
  // given num_threads_compaction (a power of two >= num_threads_active)
  // entries and the stores below are restricted to tid < num_threads_active
  s_compaction_list_exc[tid] = 0;
  s_compaction_list_exc[tid + blockDim.x] = 0;

  cg::sync(cta);

  // do until all threads converged
  while (true) {
    // subdivide interval if currently active and not already converged
    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                            num_threads_active, left, right, left_count,
                            right_count, mid, all_threads_converged);

    cg::sync(cta);

    // stop if all eigenvalues have been found; the flag lives in shared
    // memory and is read after a barrier, so the whole block leaves together
    if (1 == all_threads_converged) {
      break;
    }

    // compute number of eigenvalues smaller than mid for active and not
    // converged intervals, use all threads for loading data from gmem and
    // s_left and s_right as scratch space to store the data load from gmem
    // in shared memory
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, tid,
                                                num_threads_active, s_left,
                                                s_right, (left == right), cta);

    cg::sync(cta);

    if (tid < num_threads_active) {
      // store intervals
      if (left != right) {
        storeNonEmptyIntervals(tid, num_threads_active, s_left, s_right,
                               s_left_count, s_right_count, left, mid, right,
                               left_count, mid_count, right_count, precision,
                               compact_second_chunk, s_compaction_list_exc,
                               is_active_second);
      } else {
        storeIntervalConverged(
            s_left, s_right, s_left_count, s_right_count, left, mid, right,
            left_count, mid_count, right_count, s_compaction_list_exc,
            compact_second_chunk, num_threads_active, is_active_second);
      }
    }

    cg::sync(cta);

    // compact second chunk of intervals if any of the threads generated
    // two child intervals; the flag is shared, so the branch is uniform
    // across the block (required because the scan helper receives cta)
    if (1 == compact_second_chunk) {
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction,
                              cta);

      compactIntervals(s_left, s_right, s_left_count, s_right_count, mid, right,
                       mid_count, right_count, s_compaction_list,
                       num_threads_active, is_active_second);
    }

    cg::sync(cta);

    // update state variables
    if (0 == tid) {
      // the list entry at the old interval count is added to obtain the new
      // interval count
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    cg::sync(cta);

    // clear
    s_compaction_list_exc[threadIdx.x] = 0;
    s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;

    cg::sync(cta);

  }  // end until all threads converged

  // write data back to global memory
  if (tid < num_threads_active) {
    unsigned int addr = c_block_offset_output + tid;

    g_lambda[addr] = s_left[tid];
    g_pos[addr] = s_right_count[tid];
  }
}
|
|
|
|
#endif // #ifndef _BISECT_KERNEL_LARGE_MULTI_H_
|