/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Determine eigenvalues for large matrices for intervals that contained after * the first step one eigenvalue */ #ifndef _BISECT_KERNEL_LARGE_ONEI_H_ #define _BISECT_KERNEL_LARGE_ONEI_H_ #include namespace cg = cooperative_groups; // includes, project #include "config.h" #include "util.h" // additional kernel #include "bisect_util.cu" //////////////////////////////////////////////////////////////////////////////// //! 
//! Determine eigenvalues for large matrices for intervals that after
//! the first step contained one eigenvalue.
//!
//! Each thread with global index below \a num_intervals owns one interval
//! and repeatedly bisects it until it has converged; the block leaves the
//! refinement loop only when no thread of the block reported an unconverged
//! interval in the current iteration. The resulting eigenvalue is written to
//! g_left[gtid].
//!
//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
//! @param  n    matrix size
//! @param  num_intervals  total number of intervals containing one eigenvalue
//!                         after the first step
//! @param  g_left   left interval limits; on return entry gtid holds the
//!                  eigenvalue of interval gtid
//! @param  g_right  right interval limits (only read by this kernel)
//! @param  g_pos  index of interval / number of intervals that are smaller
//!                than right interval limit
//! @param  precision  desired precision of eigenvalues
//!
//! @note The shared scratch arrays have MAX_THREADS_BLOCK entries, so the
//!       kernel presumably expects blockDim.x <= MAX_THREADS_BLOCK -- confirm
//!       against computeNumSmallerEigenvalsLarge() and the launch site.
////////////////////////////////////////////////////////////////////////////////
__global__ void bisectKernelLarge_OneIntervals(
    float *g_d, float *g_s, const unsigned int n, unsigned int num_intervals,
    float *g_left, float *g_right, unsigned int *g_pos, float precision) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();

  const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;

  // scratch space handed to computeNumSmallerEigenvalsLarge()
  __shared__ float s_left_scratch[MAX_THREADS_BLOCK];
  __shared__ float s_right_scratch[MAX_THREADS_BLOCK];

  // active interval of thread: left and right limit of current interval.
  // Initialized so that threads without an interval never carry
  // indeterminate values.
  float left = 0.0f;
  float right = 0.0f;
  // number of eigenvalues smaller than the right limit (also corresponds to
  // the global index of the eigenvalue contained in the active interval)
  unsigned int right_count = 0;
  // flag if current thread converged
  unsigned int converged = 0;
  // midpoint when current interval is subdivided
  float mid = 0.0f;
  // number of eigenvalues less than mid
  unsigned int mid_count = 0;

  // read data from global memory
  if (gtid < num_intervals) {
    left = g_left[gtid];
    right = g_right[gtid];
    right_count = g_pos[gtid];
  }

  // flag to determine if all threads converged to eigenvalue
  __shared__ unsigned int converged_all_threads;

  // initialize shared flag
  if (0 == threadIdx.x) {
    converged_all_threads = 0;
  }

  cg::sync(cta);

  // process until all threads converged to an eigenvalue
  while (true) {
    // optimistic default: every thread sets the flag, threads that are still
    // unconverged clear it again after the barrier below
    atomicExch(&converged_all_threads, 1);

    // update midpoint for all active threads
    if ((gtid < num_intervals) && (0 == converged)) {
      mid = computeMidpoint(left, right);
    }

    // find number of eigenvalues that are smaller than midpoint; called by
    // every thread of the block because the block group is passed along
    mid_count = computeNumSmallerEigenvalsLarge(
        g_d, g_s, n, mid, gtid, num_intervals, s_left_scratch, s_right_scratch,
        converged, cta);

    cg::sync(cta);

    // for all active threads
    if ((gtid < num_intervals) && (0 == converged)) {
      // If the midpoint coincides with one of the limits the interval cannot
      // be subdivided any further in single precision. Without this check an
      // interval that fails the tolerance test below (e.g. eigenvalue at 0
      // where t1 underflows to 0, or a precision below float resolution)
      // would stay unchanged forever and the whole block would never leave
      // the loop.
      const bool stalled = (mid == left) || (mid == right);

      // update intervals -- always one child interval survives
      if (right_count == mid_count) {
        right = mid;
      } else {
        left = mid;
      }

      // check for convergence: absolute and relative width of the interval
      const float t0 = right - left;
      const float t1 = fmaxf(fabsf(right), fabsf(left)) * precision;

      if ((t0 < fminf(precision, t1)) || stalled) {
        // collapse the interval onto the eigenvalue
        const float lambda = computeMidpoint(left, right);
        left = lambda;
        right = lambda;

        converged = 1;
      } else {
        // at least one interval of this block needs another iteration
        atomicExch(&converged_all_threads, 0);
      }
    }

    // make all flag updates of this iteration visible before reading
    cg::sync(cta);

    if (1 == converged_all_threads) {
      break;
    }

    // keep the read above separated from the flag write that starts the next
    // iteration
    cg::sync(cta);
  }

  // write data back to global memory
  cg::sync(cta);

  if (gtid < num_intervals) {
    // intervals converged so left and right interval limit are both identical
    // and identical to the eigenvalue
    g_left[gtid] = left;
  }
}

#endif  // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_