/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Utility / shared functionality for bisection kernels */

#ifndef _BISECT_UTIL_H_
#define _BISECT_UTIL_H_

// includes, system
// NOTE(review): the header name was lost in the original (`#include` with no
// argument); restored to <cooperative_groups.h>, which provides the
// cooperative_groups namespace aliased as `cg` below.
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// includes, project
#include "config.h"
#include "util.h"

////////////////////////////////////////////////////////////////////////////////
//! Compute the next lower power of two of n
//!
//! @param n  number for which the next lower power of two is sought
////////////////////////////////////////////////////////////////////////////////
__device__ inline int floorPow2(int n) {
  // a power of two has no bits in common with its predecessor
  if (0 == (n & (n - 1))) {
    return n;
  }

  // frexp() factors n = m * 2^e with 0.5 <= m < 1, so 2^(e-1) <= n < 2^e
  int exponent;
  frexp((float)n, &exponent);

  return (1 << (exponent - 1));
}

////////////////////////////////////////////////////////////////////////////////
//! Compute the next higher power of two of n
//! @param n  number for which the next higher power of two is sought
////////////////////////////////////////////////////////////////////////////////
__device__ inline int ceilPow2(int n) {
  // a power of two is its own ceiling
  if (0 == (n & (n - 1))) {
    return n;
  }

  // frexp() factors n = m * 2^e with 0.5 <= m < 1, so 2^e is the next
  // power of two above n
  int exponent;
  frexp((float)n, &exponent);

  return (1 << exponent);
}

////////////////////////////////////////////////////////////////////////////////
//! Compute midpoint of interval [\a left, \a right] avoiding overflow if
//! possible
//! @param left   left / lower limit of interval
//! @param right  right / upper limit of interval
////////////////////////////////////////////////////////////////////////////////
__device__ inline float computeMidpoint(const float left, const float right) {
  // equal signs: the difference cannot overflow, so bisect via the offset
  // form; mixed signs: the plain average is safe
  return (sign_f(left) == sign_f(right)) ? left + (right - left) * 0.5f
                                         : (left + right) * 0.5f;
}

////////////////////////////////////////////////////////////////////////////////
//! Check if interval converged and store appropriately
//! @param addr address where to store the information of the interval
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param left lower limit of interval
//! @param right upper limit of interval
//!
//! @param left_count eigenvalues less than \a left
//! @param right_count eigenvalues less than \a right
//! @param precision desired precision for eigenvalues
////////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the uses of T and S below.
template <class S, class T>
__device__ void storeInterval(unsigned int addr, float *s_left, float *s_right,
                              T *s_left_count, T *s_right_count, float left,
                              float right, S left_count, S right_count,
                              float precision) {
  s_left_count[addr] = left_count;
  s_right_count[addr] = right_count;

  // check if interval converged: absolute width against the larger of the
  // absolute floor MIN_ABS_INTERVAL and the relative tolerance
  float t0 = abs(right - left);
  float t1 = max(abs(left), abs(right)) * precision;

  if (t0 <= max(MIN_ABS_INTERVAL, t1)) {
    // compute mid point
    float lambda = computeMidpoint(left, right);

    // mark as converged by collapsing the interval to a point
    s_left[addr] = lambda;
    s_right[addr] = lambda;
  } else {
    // not converged: store current limits unchanged
    s_left[addr] = left;
    s_right[addr] = right;
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Compute number of eigenvalues that are smaller than x given a symmetric,
//! real, and tridiagonal matrix
//! @param g_d diagonal elements stored in global memory
//! @param g_s superdiagonal elements stored in global memory
//! @param n size of matrix
//! @param x value for which the number of eigenvalues that are smaller is
//!          sought
//! @param tid thread identified (e.g. threadIdx.x or gtid)
//! @param num_intervals_active number of active intervals / threads that
//!                             currently process an interval
//! @param s_d scratch space to store diagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param converged flag if the current thread is already converged (that
//!
//!        is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////
__device__ inline unsigned int computeNumSmallerEigenvals(
    float *g_d, float *g_s, const unsigned int n, const float x,
    const unsigned int tid, const unsigned int num_intervals_active, float *s_d,
    float *s_s, unsigned int converged, cg::thread_block cta) {
  float pivot = 1.0f;
  unsigned int num_smaller = 0;

  cg::sync(cta);

  // cooperatively stage the matrix in shared memory; s_s[i] is the
  // superdiagonal element preceding row i, hence the (threadIdx.x - 1)
  // offset.
  // NOTE(review): for threadIdx.x == 0 this reads g_s[-1]; presumably the
  // caller passes a pointer with a valid (zero) element in front -- confirm.
  if (threadIdx.x < n) {
    s_d[threadIdx.x] = *(g_d + threadIdx.x);
    s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
  }

  cg::sync(cta);

  // only threads that still own an unconverged interval perform the count
  if ((tid < num_intervals_active) && (0 == converged)) {
    // perform (optimized) Gaussian elimination and count the negative
    // pivots to determine the number of eigenvalues smaller than x
    for (unsigned int k = 0; k < n; ++k) {
      pivot = s_d[k] - x - (s_s[k] * s_s[k]) / pivot;

      if (pivot < 0.0f) {
        ++num_smaller;
      }
    }
  }  // end if thread currently processing an interval

  return num_smaller;
}

////////////////////////////////////////////////////////////////////////////////
//! Compute number of eigenvalues that are smaller than x given a symmetric,
//! real, and tridiagonal matrix
//! @param g_d diagonal elements stored in global memory
//! @param g_s superdiagonal elements stored in global memory
//! @param n size of matrix
//! @param x value for which the number of eigenvalues that are smaller is
//!          sought
//! @param tid thread identified (e.g. threadIdx.x or gtid)
//! @param num_intervals_active number of active intervals / threads that
//!                             currently process an interval
//! @param s_d scratch space to store diagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param converged flag if the current thread is already converged (that
//!
//!        is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////
__device__ inline unsigned int computeNumSmallerEigenvalsLarge(
    float *g_d, float *g_s, const unsigned int n, const float x,
    const unsigned int tid, const unsigned int num_intervals_active, float *s_d,
    float *s_s, unsigned int converged, cg::thread_block cta) {
  float pivot = 1.0f;
  unsigned int num_smaller = 0;
  unsigned int remaining = n;

  // matrix is larger than one block: stream the diagonal / superdiagonal
  // through shared memory in chunks of blockDim.x elements
  for (unsigned int chunk_base = 0; chunk_base < n; chunk_base += blockDim.x) {
    cg::sync(cta);

    // stage the next chunk
    // NOTE(review): for chunk_base + threadIdx.x == 0 this reads g_s[-1];
    // presumably the caller passes a pointer with a valid (zero) element in
    // front -- confirm.
    if ((chunk_base + threadIdx.x) < n) {
      s_d[threadIdx.x] = *(g_d + chunk_base + threadIdx.x);
      s_s[threadIdx.x] = *(g_s + chunk_base + threadIdx.x - 1);
    }

    cg::sync(cta);

    if (tid < num_intervals_active) {
      // perform (optimized) Gaussian elimination and count the negative
      // pivots to determine the number of eigenvalues smaller than x
      for (unsigned int k = 0; k < min(remaining, blockDim.x); ++k) {
        pivot = s_d[k] - x - (s_s[k] * s_s[k]) / pivot;
        // pivot = (abs(pivot) < (1.0e-10)) ? -(1.0e-10) : pivot;

        if (pivot < 0.0f) {
          ++num_smaller;
        }
      }
    }  // end if thread currently processing an interval

    remaining -= blockDim.x;
  }

  return num_smaller;
}

////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
//! @param addr base address for storing intervals
//! @param num_threads_active number of threads / intervals in current sweep
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param left lower limit of interval
//! @param mid midpoint of interval
//!
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param mid_count eigenvalues less than \a mid
//! @param right_count eigenvalues less than \a right
//! @param precision desired precision for eigenvalues
//! @param compact_second_chunk shared mem flag if second chunk is used and
//!                             ergo requires compaction
//! @param s_compaction_list_exc helper array for stream compaction,
//!        s_compaction_list_exc[tid] = 1 when the thread generated two child
//!        intervals
//! @param is_active_second mark if thread has a second non-empty child interval
////////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the uses of T and S below.
template <class S, class T>
__device__ void storeNonEmptyIntervals(
    unsigned int addr, const unsigned int num_threads_active, float *s_left,
    float *s_right, T *s_left_count, T *s_right_count, float left, float mid,
    float right, const S left_count, const S mid_count, const S right_count,
    float precision, unsigned int &compact_second_chunk,
    T *s_compaction_list_exc, unsigned int &is_active_second) {
  // check if both child intervals are valid (each encloses eigenvalues)
  if ((left_count != mid_count) && (mid_count != right_count)) {
    // store the left child interval in place
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, mid,
                  left_count, mid_count, precision);

    // mark that a second interval has been generated, only stored after
    // stream compaction of second chunk
    is_active_second = 1;
    s_compaction_list_exc[threadIdx.x] = 1;
    atomicExch(&compact_second_chunk, 1);
  } else {
    // only one non-empty child interval

    // mark that no second child
    is_active_second = 0;
    s_compaction_list_exc[threadIdx.x] = 0;

    // store the one valid child interval
    if (left_count != mid_count) {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left,
                    mid, left_count, mid_count, precision);
    } else {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count, mid,
                    right, mid_count, right_count, precision);
    }
  }
}
////////////////////////////////////////////////////////////////////////////////
//! Create indices for compaction, that is process \a s_compaction_list_exc
//! which is 1 for intervals that generated a second child and 0 otherwise
//! and create for each of the non-zero elements the index where the new
//! interval belongs to in a compact representation of all generated second
//! childs
//! @param s_compaction_list_exc list containing the flags which threads
//!                              generated two children
//! @param num_threads_compaction number of threads to employ for compaction
////////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the use of T below.
template <class T>
__device__ void createIndicesCompaction(T *s_compaction_list_exc,
                                        unsigned int num_threads_compaction,
                                        cg::thread_block cta) {
  unsigned int offset = 1;
  const unsigned int tid = threadIdx.x;

  // higher levels of scan tree: reduce pairwise partial sums upwards
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) {
    cg::sync(cta);

    if (tid < d) {
      unsigned int ai = offset * (2 * tid + 1) - 1;
      unsigned int bi = offset * (2 * tid + 2) - 1;

      s_compaction_list_exc[bi] =
          s_compaction_list_exc[bi] + s_compaction_list_exc[ai];
    }

    offset <<= 1;
  }

  // traverse down tree: first down to level 2 across
  for (int d = 2; d < num_threads_compaction; d <<= 1) {
    offset >>= 1;
    cg::sync(cta);

    if (tid < (d - 1)) {
      unsigned int ai = offset * (tid + 1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_compaction_list_exc[bi] =
          s_compaction_list_exc[bi] + s_compaction_list_exc[ai];
    }
  }

  cg::sync(cta);
}

///////////////////////////////////////////////////////////////////////////////
//! Perform stream compaction for second child intervals
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!
//!                      than right interval limits
//! @param mid midpoint of current interval (left of new interval)
//! @param right upper limit of interval
//! @param mid_count eigenvalues less than \a mid
//! @param right_count eigenvalues less than \a right
//! @param s_compaction_list list containing the indices where the data has
//!                          to be stored
//! @param num_threads_active number of active threads / intervals
//! @param is_active_second mark if thread has a second non-empty child interval
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the use of T below.
template <class T>
__device__ void compactIntervals(float *s_left, float *s_right,
                                 T *s_left_count, T *s_right_count, float mid,
                                 float right, unsigned int mid_count,
                                 unsigned int right_count,
                                 T *s_compaction_list,
                                 unsigned int num_threads_active,
                                 unsigned int is_active_second) {
  const unsigned int tid = threadIdx.x;

  // perform compaction / copy data for all threads where the second
  // child is not dead; the compacted slot lives past the first
  // num_threads_active entries
  if ((tid < num_threads_active) && (1 == is_active_second)) {
    unsigned int addr_w = num_threads_active + s_compaction_list[tid];

    s_left[addr_w] = mid;
    s_right[addr_w] = right;
    s_left_count[addr_w] = mid_count;
    s_right_count[addr_w] = right_count;
  }
}

///////////////////////////////////////////////////////////////////////////////
//! Store intervals that have already converged (w.r.t. the desired precision),
//! duplicating intervals that contain multiple eigenvalues
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param left lower limit of interval
//! @param mid midpoint of interval (updated if split is necessary)
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param mid_count eigenvalues less than \a mid
//!
//! @param right_count eigenvalues less than \a right
//! @param s_compaction_list_exc helper array for stream compaction, updated
//!                              at tid if split is necessary
//! @param compact_second_chunk shared mem flag if second chunk is used and
//!                             ergo requires compaction
//! @param num_threads_active number of active threads / intervals
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter lists of both overloads were stripped
// in the original (`template` with no arguments); restored from the uses of
// T and S below.
template <class S, class T>
__device__ void storeIntervalConverged(float *s_left, float *s_right,
                                       T *s_left_count, T *s_right_count,
                                       float &left, float &mid, float &right,
                                       S &left_count, S &mid_count,
                                       S &right_count,
                                       T *s_compaction_list_exc,
                                       unsigned int &compact_second_chunk,
                                       const unsigned int num_threads_active) {
  const unsigned int tid = threadIdx.x;
  const unsigned int multiplicity = right_count - left_count;

  // check multiplicity of eigenvalue
  if (1 == multiplicity) {
    // just re-store intervals, simple eigenvalue
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = right_count;

    // mark that no second child / clear
    s_right_count[tid + num_threads_active] = 0;
    s_compaction_list_exc[tid] = 0;
  } else {
    // multiple eigenvalue: split the interval into two halves

    // number of eigenvalues after the split less than mid
    mid_count = left_count + (multiplicity >> 1);

    // store left interval
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = mid_count;

    mid = left;

    // mark that second child interval exists
    s_right_count[tid + num_threads_active] = right_count;
    s_compaction_list_exc[tid] = 1;
    compact_second_chunk = 1;
  }
}

///////////////////////////////////////////////////////////////////////////////
//! Overload that reports the second-child state through \a is_active_second
//! instead of writing the count past the active-thread range.
///////////////////////////////////////////////////////////////////////////////
template <class S, class T>
__device__ void storeIntervalConverged(float *s_left, float *s_right,
                                       T *s_left_count, T *s_right_count,
                                       float &left, float &mid, float &right,
                                       S &left_count, S &mid_count,
                                       S &right_count,
                                       T *s_compaction_list_exc,
                                       unsigned int &compact_second_chunk,
                                       const unsigned int num_threads_active,
                                       unsigned int &is_active_second) {
  const unsigned int tid = threadIdx.x;
  const unsigned int multiplicity = right_count - left_count;

  // check multiplicity of eigenvalue
  if (1 == multiplicity) {
    // just re-store intervals, simple eigenvalue
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = right_count;

    // mark that no second child / clear
    is_active_second = 0;
    s_compaction_list_exc[tid] = 0;
  } else {
    // multiple eigenvalue: split the interval into two halves

    // number of eigenvalues after the split less than mid
    mid_count = left_count + (multiplicity >> 1);

    // store left interval
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = mid_count;

    mid = left;

    // mark that second child interval exists
    is_active_second = 1;
    s_compaction_list_exc[tid] = 1;
    compact_second_chunk = 1;
  }
}

///////////////////////////////////////////////////////////////////////////////
//! Subdivide interval if active and not already converged
//! @param tid id of thread
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param num_threads_active number of active threads in warp
//! @param left lower limit of interval
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param right_count eigenvalues less than \a right
//! @param all_threads_converged shared memory flag if all threads are
//!        converged
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the use of T below.
template <class T>
__device__ void subdivideActiveInterval(
    const unsigned int tid, float *s_left, float *s_right, T *s_left_count,
    T *s_right_count, const unsigned int num_threads_active, float &left,
    float &right, unsigned int &left_count, unsigned int &right_count,
    float &mid, unsigned int &all_threads_converged) {
  // for all active threads
  if (tid < num_threads_active) {
    left = s_left[tid];
    right = s_right[tid];
    left_count = s_left_count[tid];
    right_count = s_right_count[tid];

    // check if thread already converged (interval collapsed to a point)
    if (left != right) {
      mid = computeMidpoint(left, right);
      atomicExch(&all_threads_converged, 0);
    } else if ((right_count - left_count) > 1) {
      // mark as not converged if multiple eigenvalues enclosed
      // duplicate interval in storeIntervalsConverged()
      atomicExch(&all_threads_converged, 0);
    }
  }  // end for all active threads
}

#endif  // #ifndef _BISECT_UTIL_H_