mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-28 13:59:19 +08:00
806 lines
31 KiB
Plaintext
806 lines
31 KiB
Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/* Determine eigenvalues for large symmetric, tridiagonal matrix. First
|
|
step of the computation. */
|
|
|
|
#ifndef _BISECT_KERNEL_LARGE_H_
|
|
#define _BISECT_KERNEL_LARGE_H_
|
|
#include <cooperative_groups.h>
|
|
|
|
namespace cg = cooperative_groups;
|
|
// includes, project
|
|
#include "config.h"
|
|
#include "util.h"
|
|
|
|
// additional kernel
|
|
#include "bisect_util.cu"
|
|
|
|
// forward declarations of the device functions used by bisectKernelLarge()

////////////////////////////////////////////////////////////////////////////////
//! Copy the interval lists and the block lists held in shared memory to
//! global memory
////////////////////////////////////////////////////////////////////////////////
__device__ void writeToGmem(
    const unsigned int tid,
    const unsigned int tid_2,
    const unsigned int num_threads_active,
    const unsigned int num_blocks_mult,
    float *g_left_one, float *g_right_one,
    unsigned int *g_pos_one,
    float *g_left_mult, float *g_right_mult,
    unsigned int *g_left_count_mult, unsigned int *g_right_count_mult,
    float *s_left, float *s_right,
    unsigned short *s_left_count, unsigned short *s_right_count,
    unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum,
    unsigned short *s_compaction_list, unsigned short *s_cl_helper,
    unsigned int offset_mult_lambda);

////////////////////////////////////////////////////////////////////////////////
//! Final stream compaction step that is run before the data is written out
////////////////////////////////////////////////////////////////////////////////
__device__ void compactStreamsFinal(
    const unsigned int tid,
    const unsigned int tid_2,
    const unsigned int num_threads_active,
    unsigned int &offset_mult_lambda,
    float *s_left, float *s_right,
    unsigned short *s_left_count, unsigned short *s_right_count,
    unsigned short *s_cl_one, unsigned short *s_cl_mult,
    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
    float &left, float &right, float &left_2, float &right_2,
    unsigned int &left_count, unsigned int &right_count,
    unsigned int &left_count_2, unsigned int &right_count_2,
    unsigned int c_block_iend, unsigned int c_sum_block,
    unsigned int c_block_iend_2, unsigned int c_sum_block_2,
    cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Scan used to compact the list of block start addresses
////////////////////////////////////////////////////////////////////////////////
__device__ void scanCompactBlocksStartAddress(
    const unsigned int tid,
    const unsigned int tid_2,
    const unsigned int num_threads_compaction,
    unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper,
    cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Scan that yields the number of eigenvalues located before a given block
////////////////////////////////////////////////////////////////////////////////
__device__ void scanSumBlocks(
    const unsigned int tid,
    const unsigned int tid_2,
    const unsigned int num_threads_active,
    const unsigned int num_threads_compaction,
    unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper,
    cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Initial scan for the compaction of the intervals that contain one and
//! multiple eigenvalues; the same pass also performs the initial scan that
//! builds the blocks
////////////////////////////////////////////////////////////////////////////////
__device__ void scanInitial(
    const unsigned int tid,
    const unsigned int tid_2,
    const unsigned int num_threads_active,
    const unsigned int num_threads_compaction,
    unsigned short *s_cl_one, unsigned short *s_cl_mult,
    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
    cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Store every non-empty interval produced by subdividing the interval that
//! the calling thread currently processes
//! @param  addr  address where to store
////////////////////////////////////////////////////////////////////////////////
__device__ void storeNonEmptyIntervalsLarge(
    unsigned int addr,
    const unsigned int num_threads_active,
    float *s_left, float *s_right,
    unsigned short *s_left_count, unsigned short *s_right_count,
    float left, float mid, float right,
    const unsigned short left_count,
    const unsigned short mid_count,
    const unsigned short right_count,
    float epsilon,
    unsigned int &compact_second_chunk,
    unsigned short *s_compaction_list,
    unsigned int &is_active_second);

////////////////////////////////////////////////////////////////////////////////
//! Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
//!
//! The kernel subdivides the input interval level by level until either all
//! intervals are converged or the number of threads needed for the scan
//! (num_threads_compaction) exceeds blockDim.x. The resulting intervals are
//! then split into a list of intervals containing exactly one eigenvalue and a
//! list of intervals containing several eigenvalues; the latter are grouped
//! into blocks. Both lists and the block lists are written to global memory.
//!
//! Launch assumptions: only threadIdx.x is used and the shared arrays hold
//! 2 * MAX_THREADS_BLOCK + 1 entries that are indexed with up to
//! threadIdx.x + blockDim.x (+ 1), so blockDim.x must not exceed
//! MAX_THREADS_BLOCK.
//! NOTE(review): blockIdx is never read and the scalar results are written to
//! fixed addresses, so this presumably expects a single thread block --
//! confirm against the launch site.
//!
//! @param  g_d  diagonal elements in global memory
//! @param  g_s  superdiagonal elements in global memory (stored so that the
//!              element *(g_s - 1) can be accessed and equals 0)
//! @param  n    size of matrix
//! @param  lg   lower bound of input interval (e.g. Gerschgorin interval)
//! @param  ug   upper bound of input interval (e.g. Gerschgorin interval)
//! @param  lg_eig_count  number of eigenvalues that are smaller than \a lg
//! @param  ug_eig_count  number of eigenvalues that are smaller than \a ug
//! @param  epsilon  desired accuracy of eigenvalues to compute
//! @param  g_num_one  receives the number of intervals that contain exactly
//!                    one eigenvalue
//! @param  g_num_blocks_mult  receives the number of blocks of intervals that
//!                            contain multiple eigenvalues
//! @param  g_left_one, g_right_one, g_pos_one  output lists for the intervals
//!         with one eigenvalue (limits and right eigenvalue count)
//! @param  g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult
//!         output lists for the intervals with multiple eigenvalues
//! @param  g_blocks_mult, g_blocks_mult_sum  output block lists (filled by
//!         writeToGmem())
////////////////////////////////////////////////////////////////////////////////
__global__ void bisectKernelLarge(
    float *g_d, float *g_s, const unsigned int n, const float lg,
    const float ug, const unsigned int lg_eig_count,
    const unsigned int ug_eig_count, float epsilon, unsigned int *g_num_one,
    unsigned int *g_num_blocks_mult, float *g_left_one, float *g_right_one,
    unsigned int *g_pos_one, float *g_left_mult, float *g_right_mult,
    unsigned int *g_left_count_mult, unsigned int *g_right_count_mult,
    unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  const unsigned int tid = threadIdx.x;

  // intervals (store left and right because the subdivision tree is in general
  // not dense)
  __shared__ float s_left[2 * MAX_THREADS_BLOCK + 1];
  __shared__ float s_right[2 * MAX_THREADS_BLOCK + 1];

  // number of eigenvalues that are smaller than s_left / s_right
  // (correspondence is realized via indices)
  __shared__ unsigned short s_left_count[2 * MAX_THREADS_BLOCK + 1];
  __shared__ unsigned short s_right_count[2 * MAX_THREADS_BLOCK + 1];

  // helper for stream compaction
  __shared__ unsigned short s_compaction_list[2 * MAX_THREADS_BLOCK + 1];

  // state variables for whole block
  // if 0 then compaction of second chunk of child intervals is not necessary
  // (because all intervals had exactly one non-dead child)
  __shared__ unsigned int compact_second_chunk;
  // if 1 then all threads are converged
  __shared__ unsigned int all_threads_converged;

  // number of currently active threads
  __shared__ unsigned int num_threads_active;

  // number of threads to use for stream compaction
  __shared__ unsigned int num_threads_compaction;

  // helper for exclusive scan
  unsigned short *s_compaction_list_exc = s_compaction_list + 1;

  // variables for currently processed interval
  // left and right limit of active interval
  float left = 0.0f;
  float right = 0.0f;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // midpoint of active interval
  float mid = 0.0f;
  // number of eigenvalues smaller than mid
  unsigned int mid_count = 0;
  // helper for stream compaction (tracking of threads generating second child)
  unsigned int is_active_second = 0;

  // initialize lists
  s_compaction_list[tid] = 0;
  s_left[tid] = 0;
  s_right[tid] = 0;
  s_left_count[tid] = 0;
  s_right_count[tid] = 0;

  cg::sync(cta);

  // set up initial configuration
  if (0 == tid) {
    s_left[0] = lg;
    s_right[0] = ug;
    s_left_count[0] = lg_eig_count;
    s_right_count[0] = ug_eig_count;

    compact_second_chunk = 0;
    num_threads_active = 1;

    num_threads_compaction = 1;

    all_threads_converged = 1;
  }

  cg::sync(cta);

  // for all active threads read intervals from the last level
  // the number of (worst case) active threads per level l is 2^l
  while (true) {
    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                            num_threads_active, left, right, left_count,
                            right_count, mid, all_threads_converged);

    cg::sync(cta);

    // check if done (the flag lives in shared memory, so the whole block
    // leaves the loop together)
    if (1 == all_threads_converged) {
      break;
    }

    // compute number of eigenvalues smaller than mid
    // use all threads for reading the necessary matrix data from global
    // memory
    // use s_left and s_right as scratch space for diagonal and
    // superdiagonal of matrix
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, threadIdx.x,
                                                num_threads_active, s_left,
                                                s_right, (left == right), cta);

    cg::sync(cta);

    // store intervals
    // for all threads store the first child interval in a continuous chunk of
    // memory, and the second child interval -- if it exists -- in a second
    // chunk; it is likely that all threads reach convergence up to
    // \a epsilon at the same level; furthermore, for higher level most / all
    // threads will have only one child, storing the first child compactly will
    // (first) avoid to perform a compaction step on the first chunk, (second)
    // make it for higher levels (when all threads / intervals have
    // exactly one child) unnecessary to perform a compaction of the second
    // chunk
    if (tid < num_threads_active) {
      if (left != right) {
        // store intervals
        storeNonEmptyIntervalsLarge(tid, num_threads_active, s_left, s_right,
                                    s_left_count, s_right_count, left, mid,
                                    right, left_count, mid_count, right_count,
                                    epsilon, compact_second_chunk,
                                    s_compaction_list_exc, is_active_second);
      } else {
        // re-write converged interval (has to be stored again because s_left
        // and s_right are used as scratch space for
        // computeNumSmallerEigenvalsLarge())
        s_left[tid] = left;
        s_right[tid] = left;
        s_left_count[tid] = left_count;
        s_right_count[tid] = right_count;

        // a converged interval has no second child: clear both the register
        // flag and the compaction-list entry, exactly as
        // storeNonEmptyIntervalsLarge() does for an interval with a single
        // child; otherwise the entry would keep whatever an earlier scan left
        // there and corrupt the next compaction / active-thread update
        is_active_second = 0;
        s_compaction_list_exc[tid] = 0;
      }
    }

    // necessary so that compact_second_chunk is up-to-date
    cg::sync(cta);

    // perform compaction of chunk where second children are stored
    // scan of (num_threads_active / 2) elements, thus at most
    // (num_threads_active / 4) threads are needed
    if (compact_second_chunk > 0) {
      // create indices for compaction
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction,
                              cta);

      compactIntervals(s_left, s_right, s_left_count, s_right_count, mid, right,
                       mid_count, right_count, s_compaction_list,
                       num_threads_active, is_active_second);
    }

    cg::sync(cta);

    // update state variables
    if (0 == tid) {
      // update number of active threads with result of reduction
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    cg::sync(cta);

    // stop subdividing once the scan would need more threads than the block
    // provides
    if (num_threads_compaction > blockDim.x) {
      break;
    }
  }

  cg::sync(cta);

  // generate two lists of intervals; one with intervals that contain one
  // eigenvalue (or are converged), and one with intervals that need further
  // subdivision

  // perform two scans in parallel

  // data of the optional second interval handled by this thread; initialized
  // because the variables are handed to compactStreamsFinal() by reference
  // even when no second interval exists
  unsigned int left_count_2 = 0;
  unsigned int right_count_2 = 0;

  unsigned int tid_2 = tid + blockDim.x;

  // cache in per thread registers so that s_left_count and s_right_count
  // can be used for scans
  left_count = s_left_count[tid];
  right_count = s_right_count[tid];

  // some threads have to cache data for two intervals
  if (tid_2 < num_threads_active) {
    left_count_2 = s_left_count[tid_2];
    right_count_2 = s_right_count[tid_2];
  }

  // compaction list for intervals containing one and multiple eigenvalues
  // do not affect first element for exclusive scan
  unsigned short *s_cl_one = s_left_count + 1;
  unsigned short *s_cl_mult = s_right_count + 1;

  // compaction list for generating blocks of intervals containing multiple
  // eigenvalues
  unsigned short *s_cl_blocking = s_compaction_list_exc;
  // helper compaction list for generating blocks of intervals
  __shared__ unsigned short s_cl_helper[2 * MAX_THREADS_BLOCK + 1];

  if (0 == tid) {
    // set to 0 for exclusive scan
    s_left_count[0] = 0;
    s_right_count[0] = 0;
  }

  cg::sync(cta);

  // flag if interval contains one or multiple eigenvalues
  unsigned int is_one_lambda = 0;
  unsigned int is_one_lambda_2 = 0;

  // number of eigenvalues in the interval
  unsigned int multiplicity = right_count - left_count;
  is_one_lambda = (1 == multiplicity);

  s_cl_one[tid] = is_one_lambda;
  s_cl_mult[tid] = (!is_one_lambda);

  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
  s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
  s_cl_helper[tid] = 0;

  if (tid_2 < num_threads_active) {
    unsigned int multiplicity_2 = right_count_2 - left_count_2;
    is_one_lambda_2 = (1 == multiplicity_2);

    s_cl_one[tid_2] = is_one_lambda_2;
    s_cl_mult[tid_2] = (!is_one_lambda_2);

    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
    s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity_2;
    s_cl_helper[tid_2] = 0;
  } else if (tid_2 < (2 * MAX_THREADS_BLOCK)) {
    // clear the unused slot in every list that takes part in the scans so
    // that no scan reads shared memory that was never written; the bound is
    // 2 * MAX_THREADS_BLOCK because three of the lists are offset by one
    // element into arrays of 2 * MAX_THREADS_BLOCK + 1 entries
    s_cl_one[tid_2] = 0;
    s_cl_mult[tid_2] = 0;
    s_cl_blocking[tid_2] = 0;
    s_cl_helper[tid_2] = 0;
  }

  scanInitial(tid, tid_2, num_threads_active, num_threads_compaction, s_cl_one,
              s_cl_mult, s_cl_blocking, s_cl_helper, cta);

  scanSumBlocks(tid, tid_2, num_threads_active, num_threads_compaction,
                s_cl_blocking, s_cl_helper, cta);

  // end down sweep of scan
  cg::sync(cta);

  unsigned int c_block_iend = 0;
  unsigned int c_block_iend_2 = 0;
  unsigned int c_sum_block = 0;
  unsigned int c_sum_block_2 = 0;

  // for each thread / interval that corresponds to root node of interval block
  // store start address of block and total number of eigenvalues in all blocks
  // before this block (particular thread is irrelevant, constraint is to
  // have a subset of threads so that one and only one of them is in each
  // interval)
  if (1 == s_cl_helper[tid]) {
    c_block_iend = s_cl_mult[tid] + 1;
    c_sum_block = s_cl_blocking[tid];
  }

  if (1 == s_cl_helper[tid_2]) {
    c_block_iend_2 = s_cl_mult[tid_2] + 1;
    c_sum_block_2 = s_cl_blocking[tid_2];
  }

  scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
                                s_cl_blocking, s_cl_helper, cta);

  // finished second scan for s_cl_blocking
  cg::sync(cta);

  // determine the global results
  __shared__ unsigned int num_blocks_mult;
  __shared__ unsigned int num_mult;
  __shared__ unsigned int offset_mult_lambda;

  if (0 == tid) {
    num_blocks_mult = s_cl_blocking[num_threads_active - 1];
    offset_mult_lambda = s_cl_one[num_threads_active - 1];
    num_mult = s_cl_mult[num_threads_active - 1];

    *g_num_one = offset_mult_lambda;
    *g_num_blocks_mult = num_blocks_mult;
  }

  cg::sync(cta);

  // limits of the optional second interval; initialized for the same reason
  // as left_count_2 / right_count_2 above
  float left_2 = 0.0f;
  float right_2 = 0.0f;

  // step back by one element so that the lists are read as exclusive scans
  --s_cl_one;
  --s_cl_mult;
  --s_cl_blocking;
  compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
                      s_left, s_right, s_left_count, s_right_count, s_cl_one,
                      s_cl_mult, s_cl_blocking, s_cl_helper, is_one_lambda,
                      is_one_lambda_2, left, right, left_2, right_2, left_count,
                      right_count, left_count_2, right_count_2, c_block_iend,
                      c_sum_block, c_block_iend_2, c_sum_block_2, cta);

  cg::sync(cta);

  // final adjustment before writing out data to global memory
  if (0 == tid) {
    s_cl_blocking[num_blocks_mult] = num_mult;
    s_cl_helper[0] = 0;
  }

  cg::sync(cta);

  // write to global memory
  writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult, g_left_one,
              g_right_one, g_pos_one, g_left_mult, g_right_mult,
              g_left_count_mult, g_right_count_mult, s_left, s_right,
              s_left_count, s_right_count, g_blocks_mult, g_blocks_mult_sum,
              s_compaction_list, s_cl_helper, offset_mult_lambda);
}

////////////////////////////////////////////////////////////////////////////////
//! Write data to global memory
//!
//! Each thread copies the interval stored at index \a tid and, if it exists,
//! the interval stored at index \a tid_2 from shared to global memory.
//! Intervals with an index below \a offset_mult_lambda go to the "one" lists,
//! all others go to the "mult" lists at (index - offset_mult_lambda).
//! Afterwards the entries 0 .. num_blocks_mult (inclusive) of
//! \a s_compaction_list and \a s_cl_helper are copied to the block lists.
//!
//! Only indices smaller than \a num_threads_active are written as intervals;
//! this holds for \a tid as well as for \a tid_2, so threads without an
//! interval do not copy unused shared memory slots to global memory.
//!
//! @param  tid    first index handled by the calling thread
//! @param  tid_2  second index handled by the calling thread
//! @param  num_threads_active  number of valid intervals in shared memory
//! @param  num_blocks_mult  index of the last block list entry to write
//! @param  g_left_one, g_right_one  limits of the intervals in the "one" list
//! @param  g_pos_one  right eigenvalue count of the intervals in the "one"
//!                    list
//! @param  g_left_mult, g_right_mult  limits of the intervals in the "mult"
//!                                    list
//! @param  g_left_count_mult, g_right_count_mult  eigenvalue counts of the
//!         intervals in the "mult" list
//! @param  s_left, s_right, s_left_count, s_right_count  interval data to
//!         read
//! @param  g_blocks_mult      destination for \a s_compaction_list
//! @param  g_blocks_mult_sum  destination for \a s_cl_helper
//! @param  s_compaction_list, s_cl_helper  block lists to read
//! @param  offset_mult_lambda  index of the first interval that belongs to the
//!                             "mult" list
////////////////////////////////////////////////////////////////////////////////
__device__ void writeToGmem(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_active, const unsigned int num_blocks_mult,
    float *g_left_one, float *g_right_one, unsigned int *g_pos_one,
    float *g_left_mult, float *g_right_mult, unsigned int *g_left_count_mult,
    unsigned int *g_right_count_mult, float *s_left, float *s_right,
    unsigned short *s_left_count, unsigned short *s_right_count,
    unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum,
    unsigned short *s_compaction_list, unsigned short *s_cl_helper,
    unsigned int offset_mult_lambda) {
  // interval data: first the slot at tid, then the slot at tid_2
  for (unsigned int k = 0; k < 2; ++k) {
    const unsigned int idx = (0 == k) ? tid : tid_2;

    // no interval is stored at this index
    if (idx >= num_threads_active) {
      continue;
    }

    if (idx < offset_mult_lambda) {
      g_left_one[idx] = s_left[idx];
      g_right_one[idx] = s_right[idx];
      // right count can be used to order eigenvalues without sorting
      g_pos_one[idx] = s_right_count[idx];
    } else {
      const unsigned int dst = idx - offset_mult_lambda;

      g_left_mult[dst] = s_left[idx];
      g_right_mult[dst] = s_right[idx];
      g_left_count_mult[dst] = s_left_count[idx];
      g_right_count_mult[dst] = s_right_count[idx];
    }
  }  // end writing out data

  // note that s_cl_blocking = s_compaction_list + 1;, that is by writing out
  // s_compaction_list we write the exclusive scan result
  for (unsigned int k = 0; k < 2; ++k) {
    const unsigned int idx = (0 == k) ? tid : tid_2;

    if (idx <= num_blocks_mult) {
      g_blocks_mult[idx] = s_compaction_list[idx];
      g_blocks_mult_sum[idx] = s_cl_helper[idx];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Perform final stream compaction before writing data to global memory
//!
//! Every thread first caches the limits of its interval(s) in registers, then
//! looks up the destination index of each interval (from \a s_cl_one for an
//! interval flagged by \a is_one_lambda, otherwise from \a s_cl_mult plus
//! \a offset_mult_lambda), and finally writes limits and counts to that
//! destination. Threads with a non-zero \a c_block_iend / \a c_block_iend_2
//! additionally write one entry of the block lists.
//!
//! @param  tid, tid_2  first and second index handled by the calling thread;
//!                     the second one is only processed if it is smaller than
//!                     \a num_threads_active
//! @param  left, right, left_2, right_2  overwritten with the interval limits
//!                                       read from shared memory
//! @param  left_count, right_count, left_count_2, right_count_2  eigenvalue
//!         counts that are written to the destination slots
//! @param  c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2  block
//!         list values; a zero c_block_iend(_2) means "nothing to write"
//! @param  cta  thread block group used for the barriers
////////////////////////////////////////////////////////////////////////////////
__device__ void compactStreamsFinal(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_active, unsigned int &offset_mult_lambda,
    float *s_left, float *s_right, unsigned short *s_left_count,
    unsigned short *s_right_count, unsigned short *s_cl_one,
    unsigned short *s_cl_mult, unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper, unsigned int is_one_lambda,
    unsigned int is_one_lambda_2, float &left, float &right, float &left_2,
    float &right_2, unsigned int &left_count, unsigned int &right_count,
    unsigned int &left_count_2, unsigned int &right_count_2,
    unsigned int c_block_iend, unsigned int c_sum_block,
    unsigned int c_block_iend_2, unsigned int c_sum_block_2,
    cg::thread_block cta) {
  const bool has_second = (tid_2 < num_threads_active);
  const bool writes_block = (0 != c_block_iend);
  const bool writes_block_2 = (0 != c_block_iend_2);

  // keep the interval limits in registers, the shared arrays are overwritten
  // by the compaction below
  left = s_left[tid];
  right = s_right[tid];

  if (has_second) {
    left_2 = s_left[tid_2];
    right_2 = s_right[tid_2];
  }

  cg::sync(cta);

  // destination index of the interval(s) and of the block list entries
  unsigned int dst = 0;
  unsigned int dst_2 = 0;
  unsigned int dst_block = 0;
  unsigned int dst_block_2 = 0;

  if (1 == is_one_lambda) {
    dst = s_cl_one[tid];
  } else {
    dst = s_cl_mult[tid] + offset_mult_lambda;
  }

  if (writes_block) {
    dst_block = s_cl_blocking[tid];
  }

  if (has_second) {
    if (1 == is_one_lambda_2) {
      dst_2 = s_cl_one[tid_2];
    } else {
      dst_2 = s_cl_mult[tid_2] + offset_mult_lambda;
    }

    if (writes_block_2) {
      dst_block_2 = s_cl_blocking[tid_2];
    }
  }

  // all destinations have to be read before any of the lists is modified
  cg::sync(cta);

  // first interval: store compactly in shared memory
  s_left[dst] = left;
  s_right[dst] = right;
  s_left_count[dst] = left_count;
  s_right_count[dst] = right_count;

  if (writes_block) {
    s_cl_blocking[dst_block + 1] = c_block_iend - 1;
    s_cl_helper[dst_block + 1] = c_sum_block;
  }

  // second interval, if there is one
  if (has_second) {
    s_left[dst_2] = left_2;
    s_right[dst_2] = right_2;
    s_left_count[dst_2] = left_count_2;
    s_right_count[dst_2] = right_count_2;

    if (writes_block_2) {
      s_cl_blocking[dst_block_2 + 1] = c_block_iend_2 - 1;
      s_cl_helper[dst_block_2 + 1] = c_sum_block_2;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Compute addresses to obtain compact list of block start addresses
//!
//! Copies \a s_cl_helper into \a s_cl_blocking and then runs an in-place tree
//! scan (pairwise sums on the way up, partial sums added back on the way down)
//! over the first \a num_threads_compaction entries of \a s_cl_blocking.
//!
//! @param  tid, tid_2  indices copied by the calling thread; \a tid_2 is only
//!                     copied if it is smaller than \a num_threads_compaction
//! @param  num_threads_compaction  number of entries that are scanned
//! @param  s_cl_blocking  list that is overwritten with the scan result
//! @param  s_cl_helper    input of the scan (read only)
//! @param  cta  thread block group used for the barriers
////////////////////////////////////////////////////////////////////////////////
__device__ void scanCompactBlocksStartAddress(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_compaction, unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper, cg::thread_block cta) {
  // second step of the block generation: the block list itself is compacted
  // so that it can be written out efficiently; seed the scan with the helper
  // list
  s_cl_blocking[tid] = s_cl_helper[tid];

  if (tid_2 < num_threads_compaction) {
    s_cl_blocking[tid_2] = s_cl_helper[tid_2];
  }

  cg::sync(cta);

  // the scan of s_cl_blocking yields a compact list of eigenvalue blocks,
  // each one containing about MAX_THREADS_BLOCK eigenvalues (so that each of
  // these blocks may be processed by one thread block in a subsequent
  // processing step)

  unsigned int stride = 1;

  // way up: add the left partner to the right partner on every level
  unsigned int level_threads = num_threads_compaction >> 1;

  while (level_threads > 0) {
    cg::sync(cta);

    if (tid < level_threads) {
      const unsigned int src = stride * (2 * tid + 1) - 1;
      const unsigned int dst = stride * (2 * tid + 2) - 1;

      s_cl_blocking[dst] += s_cl_blocking[src];
    }

    stride <<= 1;
    level_threads >>= 1;
  }

  // way down: hand the partial sums to the entries skipped on the way up
  for (unsigned int d = 2; d < num_threads_compaction; d <<= 1) {
    stride >>= 1;
    cg::sync(cta);

    if (tid < (d - 1)) {
      const unsigned int src = stride * (tid + 1) - 1;
      const unsigned int dst = src + (stride >> 1);

      s_cl_blocking[dst] += s_cl_blocking[src];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Perform scan to obtain number of eigenvalues before a specific block
//!
//! Runs an in-place tree scan over \a s_cl_blocking and afterwards copies the
//! entry at index (num_threads_compaction - 1) of \a s_cl_helper and of
//! \a s_cl_blocking to index (num_threads_active - 1).
//!
//! @param  tid    index of the calling thread
//! @param  tid_2  not used by this function
//! @param  num_threads_active  number of valid entries
//! @param  num_threads_compaction  number of entries that are scanned
//! @param  s_cl_blocking  list that is scanned in place
//! @param  s_cl_helper    list whose last scanned entry is moved
//! @param  cta  thread block group used for the barriers
////////////////////////////////////////////////////////////////////////////////
__device__ void scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
                              const unsigned int num_threads_active,
                              const unsigned int num_threads_compaction,
                              unsigned short *s_cl_blocking,
                              unsigned short *s_cl_helper,
                              cg::thread_block cta) {
  unsigned int stride = 1;

  // first half of the scan that sums up the elements within each block:
  // go up the tree and add the left partner to the right partner
  unsigned int level_threads = num_threads_compaction >> 1;

  while (level_threads > 0) {
    cg::sync(cta);

    if (tid < level_threads) {
      const unsigned int src = stride * (2 * tid + 1) - 1;
      const unsigned int dst = stride * (2 * tid + 2) - 1;

      s_cl_blocking[dst] = s_cl_blocking[dst] + s_cl_blocking[src];
    }

    stride <<= 1;
    level_threads >>= 1;
  }

  // second half of the scan: go down the tree again
  for (unsigned int d = 2; d < (num_threads_compaction - 1); d <<= 1) {
    stride >>= 1;
    cg::sync(cta);

    if (tid < (d - 1)) {
      const unsigned int src = stride * (tid + 1) - 1;
      const unsigned int dst = src + (stride >> 1);

      s_cl_blocking[dst] = s_cl_blocking[dst] + s_cl_blocking[src];
    }
  }

  cg::sync(cta);

  if (0 == tid) {
    // the scan works on a power-of-two number of entries which is not
    // necessarily the number of active threads; move the last element of the
    // scan to the last element that is valid
    const unsigned int last_valid = num_threads_active - 1;
    const unsigned int last_scanned = num_threads_compaction - 1;

    s_cl_helper[last_valid] = s_cl_helper[last_scanned];
    s_cl_blocking[last_valid] = s_cl_blocking[last_scanned];
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Perform initial scan for compaction of intervals containing one and
//! multiple eigenvalues; also do initial scan to build blocks
//!
//! \a s_cl_one and \a s_cl_mult are scanned in place (way up and way down).
//! \a s_cl_blocking and \a s_cl_helper are only updated on the way up: two
//! sub-trees are joined as long as their combined value in \a s_cl_blocking
//! does not exceed MAX_THREADS_BLOCK, otherwise both are marked with 1 in
//! \a s_cl_helper.
//!
//! @param  tid    index of the calling thread
//! @param  tid_2  not used by this function
//! @param  num_threads_active  not used by this function
//! @param  num_threads_compaction  number of entries that are scanned
//! @param  s_cl_one, s_cl_mult  compaction lists, scanned in place
//! @param  s_cl_blocking  per sub-tree sums used to form the blocks
//! @param  s_cl_helper    1 marks a terminated sub-tree, i.e. a block
//! @param  cta  thread block group used for the barriers
////////////////////////////////////////////////////////////////////////////////
__device__ void scanInitial(const unsigned int tid, const unsigned int tid_2,
                            const unsigned int num_threads_active,
                            const unsigned int num_threads_compaction,
                            unsigned short *s_cl_one, unsigned short *s_cl_mult,
                            unsigned short *s_cl_blocking,
                            unsigned short *s_cl_helper, cg::thread_block cta) {
  // the scan allows to write out the intervals containing one and multiple
  // eigenvalues compactly; the way up additionally generates the tree for the
  // blocking of the intervals that contain multiple eigenvalues

  unsigned int stride = 1;

  // way up the tree
  for (unsigned int d = (num_threads_compaction >> 1); d > 0; d >>= 1) {
    cg::sync(cta);

    if (tid < d) {
      // the two partners that are joined on this level
      const unsigned int lo = stride * (2 * tid + 1) - 1;
      const unsigned int hi = stride * (2 * tid + 2) - 1;

      s_cl_one[hi] += s_cl_one[lo];
      s_cl_mult[hi] += s_cl_mult[lo];

      // s_cl_helper is binary and zero for an internal node and 1 for a
      // root node of a tree corresponding to a block
      // s_cl_blocking contains the number of nodes in each sub-tree at each
      // iteration, the data has to be kept to compute the total number of
      // eigenvalues per block that, in turn, is needed to efficiently
      // write out data in the second step
      const bool lo_done = (1 == s_cl_helper[lo]);
      const bool hi_done = (1 == s_cl_helper[hi]);

      if (lo_done && !hi_done) {
        // only the left child is terminated: terminate the right one as well
        s_cl_helper[hi] = 1;
      } else if (hi_done && !lo_done) {
        // only the right child is terminated: terminate the left one as well
        s_cl_helper[lo] = 1;
      } else if (!lo_done && !hi_done) {
        // both children are non-terminated
        unsigned int joined = s_cl_blocking[hi] + s_cl_blocking[lo];

        if (joined > MAX_THREADS_BLOCK) {
          // the two child trees have to form separate blocks, terminate trees
          s_cl_helper[lo] = 1;
          s_cl_helper[hi] = 1;
        } else {
          // build up tree by joining subtrees
          s_cl_blocking[hi] = joined;
          s_cl_blocking[lo] = 0;
        }
      }
      // nothing to do if both children are already terminated
    }

    stride <<= 1;
  }

  // way down the tree; needed for the stream compaction only, not for the
  // construction of the blocks
  for (unsigned int d = 2; d < num_threads_compaction; d <<= 1) {
    stride >>= 1;
    cg::sync(cta);

    if (tid < (d - 1)) {
      const unsigned int src = stride * (tid + 1) - 1;
      const unsigned int dst = src + (stride >> 1);

      s_cl_one[dst] += s_cl_one[src];
      s_cl_mult[dst] += s_cl_mult[src];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
//!
//! A child interval counts as non-empty if the eigenvalue counts at its two
//! limits differ. Exactly one child is stored at \a addr by this function:
//! the left child [left, mid] if it is non-empty, otherwise the right child
//! [mid, right]. If both children are non-empty the thread is flagged as
//! having a second child (\a is_active_second, \a s_compaction_list) and
//! \a compact_second_chunk is set to 1.
//!
//! @param  addr  address where to store
//! @param  num_threads_active  not used by this function
//! @param  s_left, s_right, s_left_count, s_right_count  interval lists that
//!         are handed to storeInterval()
//! @param  left, mid, right  limits and midpoint of the processed interval
//! @param  left_count, mid_count, right_count  eigenvalue counts at \a left,
//!         \a mid and \a right
//! @param  epsilon  handed to storeInterval()
//! @param  compact_second_chunk  set to 1 (atomically) if a second child
//!                               exists
//! @param  s_compaction_list  receives 1 / 0 at index threadIdx.x depending on
//!                            whether a second child exists
//! @param  is_active_second   receives 1 / 0 depending on whether a second
//!                            child exists
////////////////////////////////////////////////////////////////////////////////
__device__ void storeNonEmptyIntervalsLarge(
    unsigned int addr, const unsigned int num_threads_active, float *s_left,
    float *s_right, unsigned short *s_left_count, unsigned short *s_right_count,
    float left, float mid, float right, const unsigned short left_count,
    const unsigned short mid_count, const unsigned short right_count,
    float epsilon, unsigned int &compact_second_chunk,
    unsigned short *s_compaction_list, unsigned int &is_active_second) {
  const bool left_child_valid = (left_count != mid_count);
  const bool right_child_valid = (mid_count != right_count);
  const bool two_children = left_child_valid && right_child_valid;

  // record whether this thread generates a second child
  is_active_second = two_children ? 1 : 0;
  s_compaction_list[threadIdx.x] = two_children ? 1 : 0;

  // the child that goes to addr: the left one whenever it is non-empty, the
  // right one otherwise
  if (left_child_valid) {
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, mid,
                  left_count, mid_count, epsilon);
  } else {
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count, mid,
                  right, mid_count, right_count, epsilon);
  }

  // tell the block that the second chunk has to be compacted
  if (two_children) {
    atomicExch(&compact_second_chunk, 1);
  }
}

#endif // #ifndef _BISECT_KERNEL_LARGE_H_
|