/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Determine eigenvalues for a large symmetric, tridiagonal matrix. First
   step of the computation. */

#ifndef _BISECT_KERNEL_LARGE_H_
#define _BISECT_KERNEL_LARGE_H_

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// includes, project
#include "config.h"
#include "util.h"

// additional kernel
#include "bisect_util.cu"

// declaration, forward

////////////////////////////////////////////////////////////////////////////////
//! Write data to global memory
////////////////////////////////////////////////////////////////////////////////
__device__ void writeToGmem(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_active, const unsigned int num_blocks_mult,
    float *g_left_one, float *g_right_one, unsigned int *g_pos_one,
    float *g_left_mult, float *g_right_mult, unsigned int *g_left_count_mult,
    unsigned int *g_right_count_mult, float *s_left, float *s_right,
    unsigned short *s_left_count, unsigned short *s_right_count,
    unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum,
    unsigned short *s_compaction_list, unsigned short *s_cl_helper,
    unsigned int offset_mult_lambda);
////////////////////////////////////////////////////////////////////////////////
//! Perform final stream compaction before writing out data
////////////////////////////////////////////////////////////////////////////////
__device__ void compactStreamsFinal(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_active, unsigned int &offset_mult_lambda,
    float *s_left, float *s_right, unsigned short *s_left_count,
    unsigned short *s_right_count, unsigned short *s_cl_one,
    unsigned short *s_cl_mult, unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper, unsigned int is_one_lambda,
    unsigned int is_one_lambda_2, float &left, float &right, float &left_2,
    float &right_2, unsigned int &left_count, unsigned int &right_count,
    unsigned int &left_count_2, unsigned int &right_count_2,
    unsigned int c_block_iend, unsigned int c_sum_block,
    unsigned int c_block_iend_2, unsigned int c_sum_block_2,
    cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Perform scan to compact list of block start addresses
////////////////////////////////////////////////////////////////////////////////
__device__ void scanCompactBlocksStartAddress(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_compaction, unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper, cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Perform scan to obtain number of eigenvalues before a specific block
////////////////////////////////////////////////////////////////////////////////
__device__ void scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
                              const unsigned int num_threads_active,
                              const unsigned int num_threads_compaction,
                              unsigned short *s_cl_blocking,
                              unsigned short *s_cl_helper,
                              cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Perform initial scan for compaction of intervals containing one and
//! multiple eigenvalues; also do initial scan to build blocks
////////////////////////////////////////////////////////////////////////////////
__device__ void scanInitial(const unsigned int tid, const unsigned int tid_2,
                            const unsigned int num_threads_active,
                            const unsigned int num_threads_compaction,
                            unsigned short *s_cl_one, unsigned short *s_cl_mult,
                            unsigned short *s_cl_blocking,
                            unsigned short *s_cl_helper, cg::thread_block cta);

////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
//! @param addr address where to store
////////////////////////////////////////////////////////////////////////////////
__device__ void storeNonEmptyIntervalsLarge(
    unsigned int addr, const unsigned int num_threads_active, float *s_left,
    float *s_right, unsigned short *s_left_count, unsigned short *s_right_count,
    float left, float mid, float right, const unsigned short left_count,
    const unsigned short mid_count, const unsigned short right_count,
    float epsilon, unsigned int &compact_second_chunk,
    unsigned short *s_compaction_list, unsigned int &is_active_second);
////////////////////////////////////////////////////////////////////////////////
//! Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
//! @param g_d  diagonal elements in global memory
//! @param g_s  superdiagonal elements in global memory (stored so that the
//!             element *(g_s - 1) can be accessed and equals 0)
//! @param n    size of matrix
//! @param lg   lower bound of input interval (e.g. Gerschgorin interval)
//! @param ug   upper bound of input interval (e.g. Gerschgorin interval)
//! @param lg_eig_count  number of eigenvalues that are smaller than \a lg
//! @param ug_eig_count  number of eigenvalues that are smaller than \a ug
//! @param epsilon       desired accuracy of eigenvalues to compute
////////////////////////////////////////////////////////////////////////////////
__global__ void bisectKernelLarge(
    float *g_d, float *g_s, const unsigned int n, const float lg,
    const float ug, const unsigned int lg_eig_count,
    const unsigned int ug_eig_count, float epsilon, unsigned int *g_num_one,
    unsigned int *g_num_blocks_mult, float *g_left_one, float *g_right_one,
    unsigned int *g_pos_one, float *g_left_mult, float *g_right_mult,
    unsigned int *g_left_count_mult, unsigned int *g_right_count_mult,
    unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    const unsigned int tid = threadIdx.x;

    // intervals (store left and right because the subdivision tree is in
    // general not dense)
    __shared__ float s_left[2 * MAX_THREADS_BLOCK + 1];
    __shared__ float s_right[2 * MAX_THREADS_BLOCK + 1];

    // number of eigenvalues that are smaller than s_left / s_right
    // (correspondence is realized via indices)
    __shared__ unsigned short s_left_count[2 * MAX_THREADS_BLOCK + 1];
    __shared__ unsigned short s_right_count[2 * MAX_THREADS_BLOCK + 1];

    // helper for stream compaction
    __shared__ unsigned short s_compaction_list[2 * MAX_THREADS_BLOCK + 1];

    // state variables for whole block
    // if 0 then compaction of second chunk of child intervals is not necessary
    // (because all intervals had exactly one non-dead child)
    __shared__ unsigned int compact_second_chunk;
    // if 1 then all threads are converged
    __shared__ unsigned int all_threads_converged;

    // number of currently active threads
    __shared__ unsigned int num_threads_active;

    // number of threads to use for stream compaction
    __shared__ unsigned int num_threads_compaction;

    // helper for exclusive scan
    unsigned short *s_compaction_list_exc = s_compaction_list + 1;

    // variables for currently processed interval
    // left and right limit of active interval
    float left = 0.0f;
    float right = 0.0f;
    unsigned int left_count = 0;
    unsigned int right_count = 0;
    // midpoint of active interval
    float mid = 0.0f;
    // number of eigenvalues smaller than mid
    unsigned int mid_count = 0;
    // helper for stream compaction (tracking of threads generating second child)
    unsigned int is_active_second = 0;

    // initialize lists
    s_compaction_list[tid] = 0;
    s_left[tid] = 0;
    s_right[tid] = 0;
    s_left_count[tid] = 0;
    s_right_count[tid] = 0;

    cg::sync(cta);

    // set up initial configuration
    if (0 == tid) {
        s_left[0] = lg;
        s_right[0] = ug;
        s_left_count[0] = lg_eig_count;
        s_right_count[0] = ug_eig_count;

        compact_second_chunk = 0;
        num_threads_active = 1;
        num_threads_compaction = 1;

        all_threads_converged = 1;
    }

    cg::sync(cta);

    // for all active threads read intervals from the last level
    // the number of (worst case) active threads per level l is 2^l
    while (true) {
        subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                                num_threads_active, left, right, left_count,
                                right_count, mid, all_threads_converged);

        cg::sync(cta);

        // check if done
        if (1 == all_threads_converged) {
            break;
        }
        // compute number of eigenvalues smaller than mid
        // use all threads for reading the necessary matrix data from global
        // memory
        // use s_left and s_right as scratch space for diagonal and
        // superdiagonal of matrix
        mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, threadIdx.x,
                                                    num_threads_active, s_left,
                                                    s_right, (left == right), cta);

        cg::sync(cta);

        // store intervals
        // for all threads store the first child interval in a continuous chunk
        // of memory, and the second child interval -- if it exists -- in a
        // second chunk; it is likely that all threads reach convergence up to
        // \a epsilon at the same level; furthermore, for higher levels most /
        // all threads will have only one child, so storing the first child
        // compactly (first) avoids a compaction step on the first chunk and
        // (second) makes a compaction of the second chunk unnecessary for
        // higher levels (when all threads / intervals have exactly one child)
        if (tid < num_threads_active) {
            if (left != right) {
                // store intervals
                storeNonEmptyIntervalsLarge(tid, num_threads_active, s_left, s_right,
                                            s_left_count, s_right_count, left, mid,
                                            right, left_count, mid_count, right_count,
                                            epsilon, compact_second_chunk,
                                            s_compaction_list_exc, is_active_second);
            }
            else {
                // re-write converged interval (has to be stored again because
                // s_left and s_right are used as scratch space for
                // computeNumSmallerEigenvalsLarge())
                s_left[tid] = left;
                s_right[tid] = left;
                s_left_count[tid] = left_count;
                s_right_count[tid] = right_count;

                is_active_second = 0;
            }
        }

        // necessary so that compact_second_chunk is up-to-date
        cg::sync(cta);

        // perform compaction of chunk where second children are stored
        // scan of (num_threads_active / 2) elements, thus at most
        // (num_threads_active / 4) threads are needed
        if (compact_second_chunk > 0) {
            // create indices for compaction
            createIndicesCompaction(s_compaction_list_exc, num_threads_compaction,
                                    cta);

            compactIntervals(s_left, s_right, s_left_count, s_right_count, mid,
                             right, mid_count, right_count, s_compaction_list,
                             num_threads_active, is_active_second);
        }

        cg::sync(cta);

        // update state variables
        if (0 == tid) {
            // update number of active threads with result of reduction
            num_threads_active += s_compaction_list[num_threads_active];
            num_threads_compaction = ceilPow2(num_threads_active);

            compact_second_chunk = 0;
            all_threads_converged = 1;
        }

        cg::sync(cta);

        if (num_threads_compaction > blockDim.x) {
            break;
        }
    }

    cg::sync(cta);

    // generate two lists of intervals; one with intervals that contain one
    // eigenvalue (or are converged), and one with intervals that need further
    // subdivision

    // perform two scans in parallel

    unsigned int left_count_2;
    unsigned int right_count_2;

    unsigned int tid_2 = tid + blockDim.x;

    // cache in per thread registers so that s_left_count and s_right_count
    // can be used for scans
    left_count = s_left_count[tid];
    right_count = s_right_count[tid];

    // some threads have to cache data for two intervals
    if (tid_2 < num_threads_active) {
        left_count_2 = s_left_count[tid_2];
        right_count_2 = s_right_count[tid_2];
    }

    // compaction list for intervals containing one and multiple eigenvalues
    // do not affect first element for exclusive scan
    unsigned short *s_cl_one = s_left_count + 1;
    unsigned short *s_cl_mult = s_right_count + 1;

    // compaction list for generating blocks of intervals containing multiple
    // eigenvalues
    unsigned short *s_cl_blocking = s_compaction_list_exc;
    // helper compaction list for generating blocks of intervals
    __shared__ unsigned short s_cl_helper[2 * MAX_THREADS_BLOCK + 1];

    if (0 == tid) {
        // set to 0 for exclusive scan
        s_left_count[0] = 0;
        s_right_count[0] = 0;
    }

    cg::sync(cta);
    // flag if interval contains one or multiple eigenvalues
    unsigned int is_one_lambda = 0;
    unsigned int is_one_lambda_2 = 0;

    // number of eigenvalues in the interval
    unsigned int multiplicity = right_count - left_count;
    is_one_lambda = (1 == multiplicity);

    s_cl_one[tid] = is_one_lambda;
    s_cl_mult[tid] = (!is_one_lambda);

    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
    s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
    s_cl_helper[tid] = 0;

    if (tid_2 < num_threads_active) {
        unsigned int multiplicity = right_count_2 - left_count_2;
        is_one_lambda_2 = (1 == multiplicity);

        s_cl_one[tid_2] = is_one_lambda_2;
        s_cl_mult[tid_2] = (!is_one_lambda_2);

        // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
        s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
        s_cl_helper[tid_2] = 0;
    }
    else if (tid_2 < (2 * MAX_THREADS_BLOCK + 1)) {
        // clear
        s_cl_blocking[tid_2] = 0;
        s_cl_helper[tid_2] = 0;
    }

    scanInitial(tid, tid_2, num_threads_active, num_threads_compaction, s_cl_one,
                s_cl_mult, s_cl_blocking, s_cl_helper, cta);

    scanSumBlocks(tid, tid_2, num_threads_active, num_threads_compaction,
                  s_cl_blocking, s_cl_helper, cta);

    // end down sweep of scan
    cg::sync(cta);

    unsigned int c_block_iend = 0;
    unsigned int c_block_iend_2 = 0;
    unsigned int c_sum_block = 0;
    unsigned int c_sum_block_2 = 0;

    // for each thread / interval that corresponds to the root node of an
    // interval block, store the start address of the block and the total
    // number of eigenvalues in all blocks before this block (which particular
    // thread does this is irrelevant; the constraint is that exactly one
    // thread covers each such interval)
    if (1 == s_cl_helper[tid]) {
        c_block_iend = s_cl_mult[tid] + 1;
        c_sum_block = s_cl_blocking[tid];
    }

    if (1 == s_cl_helper[tid_2]) {
        c_block_iend_2 = s_cl_mult[tid_2] + 1;
        c_sum_block_2 = s_cl_blocking[tid_2];
    }

    scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
                                  s_cl_blocking, s_cl_helper, cta);

    // finished second scan for s_cl_blocking
    cg::sync(cta);

    // determine the global results
    __shared__ unsigned int num_blocks_mult;
    __shared__ unsigned int num_mult;
    __shared__ unsigned int offset_mult_lambda;

    if (0 == tid) {
        num_blocks_mult = s_cl_blocking[num_threads_active - 1];
        offset_mult_lambda = s_cl_one[num_threads_active - 1];
        num_mult = s_cl_mult[num_threads_active - 1];

        *g_num_one = offset_mult_lambda;
        *g_num_blocks_mult = num_blocks_mult;
    }

    cg::sync(cta);

    float left_2, right_2;
    --s_cl_one;
    --s_cl_mult;
    --s_cl_blocking;
    compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda, s_left,
                        s_right, s_left_count, s_right_count, s_cl_one, s_cl_mult,
                        s_cl_blocking, s_cl_helper, is_one_lambda, is_one_lambda_2,
                        left, right, left_2, right_2, left_count, right_count,
                        left_count_2, right_count_2, c_block_iend, c_sum_block,
                        c_block_iend_2, c_sum_block_2, cta);

    cg::sync(cta);

    // final adjustment before writing out data to global memory
    if (0 == tid) {
        s_cl_blocking[num_blocks_mult] = num_mult;
        s_cl_helper[0] = 0;
    }

    cg::sync(cta);

    // write to global memory
    writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult, g_left_one,
                g_right_one, g_pos_one, g_left_mult, g_right_mult,
                g_left_count_mult, g_right_count_mult, s_left, s_right,
                s_left_count, s_right_count, g_blocks_mult, g_blocks_mult_sum,
                s_compaction_list, s_cl_helper, offset_mult_lambda);
}
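////////////////////////////////////////////////////////////////////////////////
//! Usage sketch (illustrative only, not part of the original sample): the
//! kernel is intended to run as a single thread block of MAX_THREADS_BLOCK
//! threads over the Gerschgorin interval [lg, ug] of the full matrix. The
//! host-side buffer names below are assumptions made for this example; for a
//! Gerschgorin interval of an n x n matrix, 0 eigenvalues lie below lg and n
//! eigenvalues lie below ug.
//!
//! \code
//! dim3 blocks(1, 1, 1);
//! dim3 threads(MAX_THREADS_BLOCK, 1, 1);
//! bisectKernelLarge<<<blocks, threads>>>(
//!     d_diag, d_superdiag, n, lg, ug, 0, n, epsilon,
//!     d_num_one, d_num_blocks_mult,
//!     d_left_one, d_right_one, d_pos_one,
//!     d_left_mult, d_right_mult, d_left_count_mult, d_right_count_mult,
//!     d_blocks_mult, d_blocks_mult_sum);
//! \endcode
////////////////////////////////////////////////////////////////////////////////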
////////////////////////////////////////////////////////////////////////////////
//! Write data to global memory
////////////////////////////////////////////////////////////////////////////////
__device__ void writeToGmem(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_active, const unsigned int num_blocks_mult,
    float *g_left_one, float *g_right_one, unsigned int *g_pos_one,
    float *g_left_mult, float *g_right_mult, unsigned int *g_left_count_mult,
    unsigned int *g_right_count_mult, float *s_left, float *s_right,
    unsigned short *s_left_count, unsigned short *s_right_count,
    unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum,
    unsigned short *s_compaction_list, unsigned short *s_cl_helper,
    unsigned int offset_mult_lambda)
{
    if (tid < offset_mult_lambda) {
        g_left_one[tid] = s_left[tid];
        g_right_one[tid] = s_right[tid];
        // right count can be used to order eigenvalues without sorting
        g_pos_one[tid] = s_right_count[tid];
    }
    else {
        g_left_mult[tid - offset_mult_lambda] = s_left[tid];
        g_right_mult[tid - offset_mult_lambda] = s_right[tid];
        g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
        g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
    }

    if (tid_2 < num_threads_active) {
        if (tid_2 < offset_mult_lambda) {
            g_left_one[tid_2] = s_left[tid_2];
            g_right_one[tid_2] = s_right[tid_2];
            // right count can be used to order eigenvalues without sorting
            g_pos_one[tid_2] = s_right_count[tid_2];
        }
        else {
            g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
            g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
            g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
            g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
        }
    }  // end writing out data

    // note that s_cl_blocking == s_compaction_list + 1; writing out
    // s_compaction_list therefore writes out the exclusive scan result
    if (tid <= num_blocks_mult) {
        g_blocks_mult[tid] = s_compaction_list[tid];
        g_blocks_mult_sum[tid] = s_cl_helper[tid];
    }

    if (tid_2 <= num_blocks_mult) {
        g_blocks_mult[tid_2] = s_compaction_list[tid_2];
        g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
    }
}

////////////////////////////////////////////////////////////////////////////////
//! Perform final stream compaction before writing data to global memory
////////////////////////////////////////////////////////////////////////////////
__device__ void compactStreamsFinal(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_active, unsigned int &offset_mult_lambda,
    float *s_left, float *s_right, unsigned short *s_left_count,
    unsigned short *s_right_count, unsigned short *s_cl_one,
    unsigned short *s_cl_mult, unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper, unsigned int is_one_lambda,
    unsigned int is_one_lambda_2, float &left, float &right, float &left_2,
    float &right_2, unsigned int &left_count, unsigned int &right_count,
    unsigned int &left_count_2, unsigned int &right_count_2,
    unsigned int c_block_iend, unsigned int c_sum_block,
    unsigned int c_block_iend_2, unsigned int c_sum_block_2,
    cg::thread_block cta)
{
    // cache data before performing compaction
    left = s_left[tid];
    right = s_right[tid];

    if (tid_2 < num_threads_active) {
        left_2 = s_left[tid_2];
        right_2 = s_right[tid_2];
    }

    cg::sync(cta);

    // determine addresses for intervals containing multiple eigenvalues and
    // addresses for blocks of intervals
    unsigned int ptr_w = 0;
    unsigned int ptr_w_2 = 0;
    unsigned int ptr_blocking_w = 0;
    unsigned int ptr_blocking_w_2 = 0;
    ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
                                 : s_cl_mult[tid] + offset_mult_lambda;

    if (0 != c_block_iend) {
        ptr_blocking_w = s_cl_blocking[tid];
    }

    if (tid_2 < num_threads_active) {
        ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
                                         : s_cl_mult[tid_2] + offset_mult_lambda;

        if (0 != c_block_iend_2) {
            ptr_blocking_w_2 = s_cl_blocking[tid_2];
        }
    }

    cg::sync(cta);

    // store compactly in shared mem
    s_left[ptr_w] = left;
    s_right[ptr_w] = right;
    s_left_count[ptr_w] = left_count;
    s_right_count[ptr_w] = right_count;

    if (0 != c_block_iend) {
        s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
        s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
    }

    if (tid_2 < num_threads_active) {
        // store compactly in shared mem
        s_left[ptr_w_2] = left_2;
        s_right[ptr_w_2] = right_2;
        s_left_count[ptr_w_2] = left_count_2;
        s_right_count[ptr_w_2] = right_count_2;

        if (0 != c_block_iend_2) {
            s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
            s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
//! Compute addresses to obtain compact list of block start addresses
////////////////////////////////////////////////////////////////////////////////
__device__ void scanCompactBlocksStartAddress(
    const unsigned int tid, const unsigned int tid_2,
    const unsigned int num_threads_compaction, unsigned short *s_cl_blocking,
    unsigned short *s_cl_helper, cg::thread_block cta)
{
    // prepare for second step of block generation: compaction of the block
    // list itself so that it can be written out efficiently
    s_cl_blocking[tid] = s_cl_helper[tid];

    if (tid_2 < num_threads_compaction) {
        s_cl_blocking[tid_2] = s_cl_helper[tid_2];
    }

    cg::sync(cta);

    // additional scan to compact s_cl_blocking; this permits generating a
    // compact list of eigenvalue blocks, each containing about
    // MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be
    // processed by one thread block in a subsequent processing step)

    unsigned int offset = 1;

    // build scan tree
    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) {
        cg::sync(cta);

        if (tid < d) {
            unsigned int ai = offset * (2 * tid + 1) - 1;
            unsigned int bi = offset * (2 * tid + 2) - 1;
            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
        }

        offset <<= 1;
    }

    // traverse down tree: first down to level 2 across
    for (int d = 2; d < num_threads_compaction; d <<= 1) {
        offset >>= 1;
        cg::sync(cta);

        if (tid < (d - 1)) {
            unsigned int ai = offset * (tid + 1) - 1;
            unsigned int bi = ai + (offset >> 1);
            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
        }
    }
}
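////////////////////////////////////////////////////////////////////////////////
//! Added note (worked example, not from the original sample): the scans in
//! this file follow the shared-memory up-sweep / down-sweep pattern. For a
//! hypothetical 8-element flag list [1,0,1,1,0,1,0,1], the up-sweep adds
//! elements at power-of-two strides and leaves the total (5) in the last
//! slot; the down-sweep then completes the inclusive prefix sum
//! [1,1,2,3,3,4,4,5]. Because the lists are addressed with a one-element
//! offset (see the "+ 1" pointers and the "--s_cl_*" adjustments in
//! bisectKernelLarge), this inclusive scan is read back as an exclusive scan,
//! i.e. as compact write addresses.
////////////////////////////////////////////////////////////////////////////////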
////////////////////////////////////////////////////////////////////////////////
//! Perform scan to obtain number of eigenvalues before a specific block
////////////////////////////////////////////////////////////////////////////////
__device__ void scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
                              const unsigned int num_threads_active,
                              const unsigned int num_threads_compaction,
                              unsigned short *s_cl_blocking,
                              unsigned short *s_cl_helper, cg::thread_block cta)
{
    unsigned int offset = 1;

    // first step of scan to build the sum of elements within each block
    // build up tree
    for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) {
        cg::sync(cta);

        if (tid < d) {
            unsigned int ai = offset * (2 * tid + 1) - 1;
            unsigned int bi = offset * (2 * tid + 2) - 1;

            s_cl_blocking[bi] += s_cl_blocking[ai];
        }

        offset *= 2;
    }

    // second step of scan to build the sum of elements within each block
    // traverse down tree
    for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) {
        offset >>= 1;
        cg::sync(cta);

        if (tid < (d - 1)) {
            unsigned int ai = offset * (tid + 1) - 1;
            unsigned int bi = ai + (offset >> 1);

            s_cl_blocking[bi] += s_cl_blocking[ai];
        }
    }

    cg::sync(cta);

    if (0 == tid) {
        // move last element of scan to last element that is valid
        // necessary because the number of threads employed for scan is a power
        // of two and not necessarily the number of active threads
        s_cl_helper[num_threads_active - 1] =
            s_cl_helper[num_threads_compaction - 1];
        s_cl_blocking[num_threads_active - 1] =
            s_cl_blocking[num_threads_compaction - 1];
    }
}

////////////////////////////////////////////////////////////////////////////////
//! Perform initial scan for compaction of intervals containing one and
//! multiple eigenvalues; also do initial scan to build blocks
////////////////////////////////////////////////////////////////////////////////
__device__ void scanInitial(const unsigned int tid, const unsigned int tid_2,
                            const unsigned int num_threads_active,
                            const unsigned int num_threads_compaction,
                            unsigned short *s_cl_one, unsigned short *s_cl_mult,
                            unsigned short *s_cl_blocking,
                            unsigned short *s_cl_helper, cg::thread_block cta)
{
    // perform scan to compactly write out the intervals containing one and
    // multiple eigenvalues
    // also generate tree for blocking of intervals containing multiple
    // eigenvalues

    unsigned int offset = 1;

    // build scan tree
    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) {
        cg::sync(cta);

        if (tid < d) {
            unsigned int ai = offset * (2 * tid + 1);
            unsigned int bi = offset * (2 * tid + 2) - 1;

            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];

            // s_cl_helper is binary: zero for an internal node and 1 for a
            // root node of a tree corresponding to a block
            // s_cl_blocking contains the number of nodes in each sub-tree at
            // each iteration; the data has to be kept to compute the total
            // number of eigenvalues per block which, in turn, is needed to
            // write out data efficiently in the second step
            if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1)) {
                // check how many children are not yet terminated
                if (s_cl_helper[ai - 1] == 1) {
                    // mark as terminated
                    s_cl_helper[bi] = 1;
                }
                else if (s_cl_helper[bi] == 1) {
                    // mark as terminated
                    s_cl_helper[ai - 1] = 1;
                }
                else {
                    // both children are not terminated
                    unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];

                    if (temp > MAX_THREADS_BLOCK) {
                        // the two child trees have to form separate blocks,
                        // terminate trees
                        s_cl_helper[ai - 1] = 1;
                        s_cl_helper[bi] = 1;
                    }
                    else {
                        // build up tree by joining subtrees
                        s_cl_blocking[bi] = temp;
                        s_cl_blocking[ai - 1] = 0;
                    }
                }
            }  // end s_cl_helper update
        }

        offset <<= 1;
    }

    // traverse down tree, this only for stream compaction, not for block
    // construction
    for (int d = 2; d < num_threads_compaction; d <<= 1) {
        offset >>= 1;
        cg::sync(cta);

        if (tid < (d - 1)) {
            unsigned int ai = offset * (tid + 1) - 1;
            unsigned int bi = ai + (offset >> 1);

            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
////////////////////////////////////////////////////////////////////////////////
__device__ void storeNonEmptyIntervalsLarge(
    unsigned int addr, const unsigned int num_threads_active, float *s_left,
    float *s_right, unsigned short *s_left_count, unsigned short *s_right_count,
    float left, float mid, float right, const unsigned short left_count,
    const unsigned short mid_count, const unsigned short right_count,
    float epsilon, unsigned int &compact_second_chunk,
    unsigned short *s_compaction_list, unsigned int &is_active_second)
{
    // check if both child intervals are valid
    if ((left_count != mid_count) && (mid_count != right_count)) {
        storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left,
                      mid, left_count, mid_count, epsilon);

        is_active_second = 1;
        s_compaction_list[threadIdx.x] = 1;
        atomicExch(&compact_second_chunk, 1);
    }
    else {
        // only one non-empty child interval

        // mark that no second child exists
        is_active_second = 0;
        s_compaction_list[threadIdx.x] = 0;

        // store the one valid child interval
        if (left_count != mid_count) {
            storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left,
                          mid, left_count, mid_count, epsilon);
        }
        else {
            storeInterval(addr, s_left, s_right, s_left_count, s_right_count, mid,
                          right, mid_count, right_count, epsilon);
        }
    }
}

#endif  // #ifndef _BISECT_KERNEL_LARGE_H_