/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Utility / shared functionality for bisection kernels */

#ifndef _BISECT_UTIL_H_
#define _BISECT_UTIL_H_

// includes, system
// NOTE(review): the header name was lost in the original (`#include` with no
// argument); restored to <cooperative_groups.h>, which provides the
// cooperative_groups namespace aliased as `cg` below.
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// includes, project
#include "config.h"
#include "util.h"

////////////////////////////////////////////////////////////////////////////////
//! Compute the next lower power of two of n
//!
//! @param n  number for which the next lower power of two is sought
////////////////////////////////////////////////////////////////////////////////
__device__ inline int floorPow2(int n) {
  // a power of two has no bits in common with its predecessor
  if (0 == (n & (n - 1))) {
    return n;
  }

  // frexp() factors n = m * 2^e with 0.5 <= m < 1, so 2^(e-1) <= n < 2^e
  int exponent;
  frexp((float)n, &exponent);

  return (1 << (exponent - 1));
}

////////////////////////////////////////////////////////////////////////////////
//! Compute the next higher power of two of n
//! @param n  number for which the next higher power of two is sought
////////////////////////////////////////////////////////////////////////////////
__device__ inline int ceilPow2(int n) {
  // a power of two is its own ceiling
  if (0 == (n & (n - 1))) {
    return n;
  }

  // frexp() factors n = m * 2^e with 0.5 <= m < 1, so 2^e is the next
  // power of two above n
  int exponent;
  frexp((float)n, &exponent);

  return (1 << exponent);
}

////////////////////////////////////////////////////////////////////////////////
//! Compute midpoint of interval [\a left, \a right] avoiding overflow if
//! possible
//! @param left   left / lower limit of interval
//! @param right  right / upper limit of interval
////////////////////////////////////////////////////////////////////////////////
__device__ inline float computeMidpoint(const float left, const float right) {
  // equal signs: the difference cannot overflow, so bisect via the offset
  // form; mixed signs: the plain average is safe
  return (sign_f(left) == sign_f(right)) ? left + (right - left) * 0.5f
                                         : (left + right) * 0.5f;
}

////////////////////////////////////////////////////////////////////////////////
//! Check if interval converged and store appropriately
//! @param addr address where to store the information of the interval
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param left lower limit of interval
//! @param right upper limit of interval
//!
//! @param left_count eigenvalues less than \a left
//! @param right_count eigenvalues less than \a right
//! @param precision desired precision for eigenvalues
////////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the uses of T and S below.
template <class S, class T>
__device__ void storeInterval(unsigned int addr, float *s_left, float *s_right,
                              T *s_left_count, T *s_right_count, float left,
                              float right, S left_count, S right_count,
                              float precision) {
  s_left_count[addr] = left_count;
  s_right_count[addr] = right_count;

  // check if interval converged: absolute width against the larger of the
  // absolute floor MIN_ABS_INTERVAL and the relative tolerance
  float t0 = abs(right - left);
  float t1 = max(abs(left), abs(right)) * precision;

  if (t0 <= max(MIN_ABS_INTERVAL, t1)) {
    // compute mid point
    float lambda = computeMidpoint(left, right);

    // mark as converged by collapsing the interval to a point
    s_left[addr] = lambda;
    s_right[addr] = lambda;
  } else {
    // not converged: store current limits unchanged
    s_left[addr] = left;
    s_right[addr] = right;
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Compute number of eigenvalues that are smaller than x given a symmetric,
//! real, and tridiagonal matrix
//! @param g_d diagonal elements stored in global memory
//! @param g_s superdiagonal elements stored in global memory
//! @param n size of matrix
//! @param x value for which the number of eigenvalues that are smaller is
//!          sought
//! @param tid thread identified (e.g. threadIdx.x or gtid)
//! @param num_intervals_active number of active intervals / threads that
//!                             currently process an interval
//! @param s_d scratch space to store diagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param converged flag if the current thread is already converged (that
//!
//!        is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////
__device__ inline unsigned int computeNumSmallerEigenvals(
    float *g_d, float *g_s, const unsigned int n, const float x,
    const unsigned int tid, const unsigned int num_intervals_active, float *s_d,
    float *s_s, unsigned int converged, cg::thread_block cta) {
  float pivot = 1.0f;
  unsigned int num_smaller = 0;

  cg::sync(cta);

  // cooperatively stage the matrix in shared memory; s_s[i] is the
  // superdiagonal element preceding row i, hence the (threadIdx.x - 1)
  // offset.
  // NOTE(review): for threadIdx.x == 0 this reads g_s[-1]; presumably the
  // caller passes a pointer with a valid (zero) element in front -- confirm.
  if (threadIdx.x < n) {
    s_d[threadIdx.x] = *(g_d + threadIdx.x);
    s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
  }

  cg::sync(cta);

  // only threads that still own an unconverged interval perform the count
  if ((tid < num_intervals_active) && (0 == converged)) {
    // perform (optimized) Gaussian elimination and count the negative
    // pivots to determine the number of eigenvalues smaller than x
    for (unsigned int k = 0; k < n; ++k) {
      pivot = s_d[k] - x - (s_s[k] * s_s[k]) / pivot;

      if (pivot < 0.0f) {
        ++num_smaller;
      }
    }
  }  // end if thread currently processing an interval

  return num_smaller;
}

////////////////////////////////////////////////////////////////////////////////
//! Compute number of eigenvalues that are smaller than x given a symmetric,
//! real, and tridiagonal matrix
//! @param g_d diagonal elements stored in global memory
//! @param g_s superdiagonal elements stored in global memory
//! @param n size of matrix
//! @param x value for which the number of eigenvalues that are smaller is
//!          sought
//! @param tid thread identified (e.g. threadIdx.x or gtid)
//! @param num_intervals_active number of active intervals / threads that
//!                             currently process an interval
//! @param s_d scratch space to store diagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
//!            matrix in shared memory
//! @param converged flag if the current thread is already converged (that
//!
//!        is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////
__device__ inline unsigned int computeNumSmallerEigenvalsLarge(
    float *g_d, float *g_s, const unsigned int n, const float x,
    const unsigned int tid, const unsigned int num_intervals_active, float *s_d,
    float *s_s, unsigned int converged, cg::thread_block cta) {
  float pivot = 1.0f;
  unsigned int num_smaller = 0;
  unsigned int remaining = n;

  // matrix is larger than one block: stream the diagonal / superdiagonal
  // through shared memory in chunks of blockDim.x elements
  for (unsigned int chunk_base = 0; chunk_base < n; chunk_base += blockDim.x) {
    cg::sync(cta);

    // stage the next chunk
    // NOTE(review): for chunk_base + threadIdx.x == 0 this reads g_s[-1];
    // presumably the caller passes a pointer with a valid (zero) element in
    // front -- confirm.
    if ((chunk_base + threadIdx.x) < n) {
      s_d[threadIdx.x] = *(g_d + chunk_base + threadIdx.x);
      s_s[threadIdx.x] = *(g_s + chunk_base + threadIdx.x - 1);
    }

    cg::sync(cta);

    if (tid < num_intervals_active) {
      // perform (optimized) Gaussian elimination and count the negative
      // pivots to determine the number of eigenvalues smaller than x
      for (unsigned int k = 0; k < min(remaining, blockDim.x); ++k) {
        pivot = s_d[k] - x - (s_s[k] * s_s[k]) / pivot;
        // pivot = (abs(pivot) < (1.0e-10)) ? -(1.0e-10) : pivot;

        if (pivot < 0.0f) {
          ++num_smaller;
        }
      }
    }  // end if thread currently processing an interval

    remaining -= blockDim.x;
  }

  return num_smaller;
}

////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
//! @param addr base address for storing intervals
//! @param num_threads_active number of threads / intervals in current sweep
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param left lower limit of interval
//! @param mid midpoint of interval
//!
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param mid_count eigenvalues less than \a mid
//! @param right_count eigenvalues less than \a right
//! @param precision desired precision for eigenvalues
//! @param compact_second_chunk shared mem flag if second chunk is used and
//!                             ergo requires compaction
//! @param s_compaction_list_exc helper array for stream compaction,
//!        s_compaction_list_exc[tid] = 1 when the thread generated two child
//!        intervals
//! @param is_active_second mark if thread has a second non-empty child interval
////////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the uses of T and S below.
template <class S, class T>
__device__ void storeNonEmptyIntervals(
    unsigned int addr, const unsigned int num_threads_active, float *s_left,
    float *s_right, T *s_left_count, T *s_right_count, float left, float mid,
    float right, const S left_count, const S mid_count, const S right_count,
    float precision, unsigned int &compact_second_chunk,
    T *s_compaction_list_exc, unsigned int &is_active_second) {
  // check if both child intervals are valid (each encloses eigenvalues)
  if ((left_count != mid_count) && (mid_count != right_count)) {
    // store the left child interval in place
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, mid,
                  left_count, mid_count, precision);

    // mark that a second interval has been generated, only stored after
    // stream compaction of second chunk
    is_active_second = 1;
    s_compaction_list_exc[threadIdx.x] = 1;
    atomicExch(&compact_second_chunk, 1);
  } else {
    // only one non-empty child interval

    // mark that no second child
    is_active_second = 0;
    s_compaction_list_exc[threadIdx.x] = 0;

    // store the one valid child interval
    if (left_count != mid_count) {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left,
                    mid, left_count, mid_count, precision);
    } else {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count, mid,
                    right, mid_count, right_count, precision);
    }
  }
}
////////////////////////////////////////////////////////////////////////////////
//! Create indices for compaction, that is process \a s_compaction_list_exc
//! which is 1 for intervals that generated a second child and 0 otherwise
//! and create for each of the non-zero elements the index where the new
//! interval belongs to in a compact representation of all generated second
//! childs
//! @param s_compaction_list_exc list containing the flags which threads
//!                              generated two children
//! @param num_threads_compaction number of threads to employ for compaction
////////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the use of T below.
template <class T>
__device__ void createIndicesCompaction(T *s_compaction_list_exc,
                                        unsigned int num_threads_compaction,
                                        cg::thread_block cta) {
  unsigned int offset = 1;
  const unsigned int tid = threadIdx.x;

  // higher levels of scan tree: reduce pairwise partial sums upwards
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) {
    cg::sync(cta);

    if (tid < d) {
      unsigned int ai = offset * (2 * tid + 1) - 1;
      unsigned int bi = offset * (2 * tid + 2) - 1;

      s_compaction_list_exc[bi] =
          s_compaction_list_exc[bi] + s_compaction_list_exc[ai];
    }

    offset <<= 1;
  }

  // traverse down tree: first down to level 2 across
  for (int d = 2; d < num_threads_compaction; d <<= 1) {
    offset >>= 1;
    cg::sync(cta);

    if (tid < (d - 1)) {
      unsigned int ai = offset * (tid + 1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_compaction_list_exc[bi] =
          s_compaction_list_exc[bi] + s_compaction_list_exc[ai];
    }
  }

  cg::sync(cta);
}

///////////////////////////////////////////////////////////////////////////////
//! Perform stream compaction for second child intervals
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!
//!                      than right interval limits
//! @param mid midpoint of current interval (left of new interval)
//! @param right upper limit of interval
//! @param mid_count eigenvalues less than \a mid
//! @param right_count eigenvalues less than \a right
//! @param s_compaction_list list containing the indices where the data has
//!                          to be stored
//! @param num_threads_active number of active threads / intervals
//! @param is_active_second mark if thread has a second non-empty child interval
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the use of T below.
template <class T>
__device__ void compactIntervals(float *s_left, float *s_right,
                                 T *s_left_count, T *s_right_count, float mid,
                                 float right, unsigned int mid_count,
                                 unsigned int right_count,
                                 T *s_compaction_list,
                                 unsigned int num_threads_active,
                                 unsigned int is_active_second) {
  const unsigned int tid = threadIdx.x;

  // perform compaction / copy data for all threads where the second
  // child is not dead; the compacted slot lives past the first
  // num_threads_active entries
  if ((tid < num_threads_active) && (1 == is_active_second)) {
    unsigned int addr_w = num_threads_active + s_compaction_list[tid];

    s_left[addr_w] = mid;
    s_right[addr_w] = right;
    s_left_count[addr_w] = mid_count;
    s_right_count[addr_w] = right_count;
  }
}

///////////////////////////////////////////////////////////////////////////////
//! Store intervals that have already converged (w.r.t. the desired precision),
//! duplicating intervals that contain multiple eigenvalues
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param left lower limit of interval
//! @param mid midpoint of interval (updated if split is necessary)
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param mid_count eigenvalues less than \a mid
//!
//! @param right_count eigenvalues less than \a right
//! @param s_compaction_list_exc helper array for stream compaction, updated
//!                              at tid if split is necessary
//! @param compact_second_chunk shared mem flag if second chunk is used and
//!                             ergo requires compaction
//! @param num_threads_active number of active threads / intervals
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter lists of both overloads were stripped
// in the original (`template` with no arguments); restored from the uses of
// T and S below.
template <class S, class T>
__device__ void storeIntervalConverged(float *s_left, float *s_right,
                                       T *s_left_count, T *s_right_count,
                                       float &left, float &mid, float &right,
                                       S &left_count, S &mid_count,
                                       S &right_count,
                                       T *s_compaction_list_exc,
                                       unsigned int &compact_second_chunk,
                                       const unsigned int num_threads_active) {
  const unsigned int tid = threadIdx.x;
  const unsigned int multiplicity = right_count - left_count;

  // check multiplicity of eigenvalue
  if (1 == multiplicity) {
    // just re-store intervals, simple eigenvalue
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = right_count;

    // mark that no second child / clear
    s_right_count[tid + num_threads_active] = 0;
    s_compaction_list_exc[tid] = 0;
  } else {
    // multiple eigenvalue: split the interval into two halves

    // number of eigenvalues after the split less than mid
    mid_count = left_count + (multiplicity >> 1);

    // store left interval
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = mid_count;

    mid = left;

    // mark that second child interval exists
    s_right_count[tid + num_threads_active] = right_count;
    s_compaction_list_exc[tid] = 1;
    compact_second_chunk = 1;
  }
}

///////////////////////////////////////////////////////////////////////////////
//! Overload that reports the second-child state through \a is_active_second
//! instead of writing the count past the active-thread range.
///////////////////////////////////////////////////////////////////////////////
template <class S, class T>
__device__ void storeIntervalConverged(float *s_left, float *s_right,
                                       T *s_left_count, T *s_right_count,
                                       float &left, float &mid, float &right,
                                       S &left_count, S &mid_count,
                                       S &right_count,
                                       T *s_compaction_list_exc,
                                       unsigned int &compact_second_chunk,
                                       const unsigned int num_threads_active,
                                       unsigned int &is_active_second) {
  const unsigned int tid = threadIdx.x;
  const unsigned int multiplicity = right_count - left_count;

  // check multiplicity of eigenvalue
  if (1 == multiplicity) {
    // just re-store intervals, simple eigenvalue
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = right_count;

    // mark that no second child / clear
    is_active_second = 0;
    s_compaction_list_exc[tid] = 0;
  } else {
    // multiple eigenvalue: split the interval into two halves

    // number of eigenvalues after the split less than mid
    mid_count = left_count + (multiplicity >> 1);

    // store left interval
    s_left[tid] = left;
    s_right[tid] = right;
    s_left_count[tid] = left_count;
    s_right_count[tid] = mid_count;

    mid = left;

    // mark that second child interval exists
    is_active_second = 1;
    s_compaction_list_exc[tid] = 1;
    compact_second_chunk = 1;
  }
}

///////////////////////////////////////////////////////////////////////////////
//! Subdivide interval if active and not already converged
//! @param tid id of thread
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//!                     than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//!                      than right interval limits
//! @param num_threads_active number of active threads in warp
//! @param left lower limit of interval
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param right_count eigenvalues less than \a right
//! @param all_threads_converged shared memory flag if all threads are
//!        converged
///////////////////////////////////////////////////////////////////////////////
// NOTE(review): the template parameter list was stripped in the original
// (`template` with no arguments); restored from the use of T below.
template <class T>
__device__ void subdivideActiveInterval(
    const unsigned int tid, float *s_left, float *s_right, T *s_left_count,
    T *s_right_count, const unsigned int num_threads_active, float &left,
    float &right, unsigned int &left_count, unsigned int &right_count,
    float &mid, unsigned int &all_threads_converged) {
  // for all active threads
  if (tid < num_threads_active) {
    left = s_left[tid];
    right = s_right[tid];
    left_count = s_left_count[tid];
    right_count = s_right_count[tid];

    // check if thread already converged (interval collapsed to a point)
    if (left != right) {
      mid = computeMidpoint(left, right);
      atomicExch(&all_threads_converged, 0);
    } else if ((right_count - left_count) > 1) {
      // mark as not converged if multiple eigenvalues enclosed
      // duplicate interval in storeIntervalsConverged()
      atomicExch(&all_threads_converged, 0);
    }
  }  // end for all active threads
}

#endif  // #ifndef _BISECT_UTIL_H_