// cuda-samples/Samples/interval/cpu_interval.h
// (web viewer header: 308 lines, 9.2 KiB, C; revision dated
// 2021-10-21 19:04:49 +08:00)
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Simple CPU implementation
* Depends on Boost.Interval
*/
#ifndef CPU_INTERVAL_H
#define CPU_INTERVAL_H
#ifndef __USE_ISOC99
#define __USE_ISOC99
#endif
#include <cmath>
#include <iostream>
#include <vector>
#include <boost/numeric/interval.hpp>
//#include <iomanip>
#define UNPROTECTED 0
#define USE_RECURSION_CPU 1
using boost::numeric::interval;
using namespace boost::numeric;
// Fixed-capacity LIFO stack that lives in a caller-provided buffer.
//
// Several stacks can share one buffer: the stack constructed with a given
// thread_id stores its k-th element at buf[thread_id + k * THREADS], so the
// stacks are interleaved with a stride of THREADS elements.
//
// Template parameters:
//   T       - element type
//   N       - capacity used by full() (elements per stack)
//   THREADS - stride, in elements, between consecutive entries of one stack
//
// push() and pop() perform no bounds checking; callers are responsible for
// consulting full() / empty() first.
template <class T, int N, int THREADS>
class global_stack_cpu {
 private:
  T *buf;          // shared storage, not owned by the stack
  int free_index;  // index in buf of the next free slot of this stack

 public:
  // buf should point to an allocated global buffer of size N * THREADS *
  // sizeof(T) bytes. thread_id selects which interleaved stack this object
  // represents; size() and empty() assume 0 <= thread_id < THREADS.
  global_stack_cpu(T *buf, int thread_id) : buf(buf), free_index(thread_id) {}

  // Stores a copy of v in the next free slot and advances by one stride.
  void push(T const &v) {
    buf[free_index] = v;
    free_index += THREADS;
  }

  // Steps back by one stride and returns the element stored there.
  T pop() {
    free_index -= THREADS;
    return buf[free_index];
  }

  // True once the next free slot lies at or beyond N * THREADS.
  bool full() const { return free_index >= N * THREADS; }

  // True when the next free slot is still within the first stride.
  bool empty() const { return free_index < THREADS; }

  // Number of strides advanced so far, i.e. the number of stored elements
  // when 0 <= thread_id < THREADS.
  int size() const { return free_index / THREADS; }
};
// Interval extension of the function F whose roots are searched for:
//   F(x) = (x - 1)^2 + alpha * x,  with alpha = -thread_id / THREADS.
// thread_id only selects the coefficient alpha; THREADS here is the
// file-external constant, not a template parameter.
template <class I>
I f_cpu(I const &x, int thread_id) {
  typedef typename I::base_type T;

  T const alpha = -T(thread_id) / T(THREADS);

  I const quadratic_term = square(x - I(1));
  I const linear_term = I(alpha) * x;
  return quadratic_term + linear_term;
}
// Interval extension of the first derivative of F:
//   F'(x) = 2 * x + (alpha - 2),  with alpha = -thread_id / THREADS.
template <class I>
I fd_cpu(I const &x, int thread_id) {
  typedef typename I::base_type T;

  T const alpha = -T(thread_id) / T(THREADS);
  T const constant_term = alpha - 2;

  I const doubled = I(2) * x;
  return doubled + I(constant_term);
}
// Is this interval small enough to stop iterating?
//
// Returns true for a non-empty interval x when either
//   - its width is at most epsilon_x relative to |median(x)|, or
//   - the width of F(x) is at most epsilon_y.
// An empty interval is never minimal.
template <class I>
bool is_minimal_cpu(I const &x, int thread_id) {
  typedef typename I::base_type T;
  // Bring the floating-point overloads into scope: a bare abs() on a scalar
  // may otherwise bind to the C int abs(int) and truncate the magnitude.
  // Argument-dependent lookup still applies for class-type T.
  using std::abs;

  T const epsilon_x = 1e-6f;  // relative tolerance on the interval width
  T const epsilon_y = 1e-6f;  // absolute tolerance on the width of F(x)

  if (empty(x)) {
    return false;
  }
  if (width(x) <= epsilon_x * abs(median(x))) {
    return true;
  }
  return width(f_cpu(x, thread_id)) <= epsilon_y;
}
// Newton iterations sometimes shrink the interval very slowly; in that case
// plain bisection makes faster progress.
//
// Returns true when either candidate sub-interval (x1 or x2) is wider than
// alpha times the width of the interval x it was derived from.
template <class I>
bool should_bisect_cpu(I const &x, I const &x1, I const &x2,
                       typename I::base_type alpha) {
  typedef typename I::base_type T;

  T const width_limit = alpha * width(x);
  if (width(x1) > width_limit) {
    return true;
  }
  return width(x2) > width_limit;
}
// Capacity, in intervals, of the local work stack that newton_interval_cpu
// allocates for intervals still waiting to be processed.
int const DEPTH_WORK = 128;
// Main interval Newton loop (iterative variant).
// Keep refining a list of intervals stored in a stack.
// The interval currently being refined is held in the local variable ix;
// only additional pending intervals go to the work stack.
//
// Parameters:
//   result    - stack that receives every interval accepted by
//               is_minimal_cpu()
//   ix0       - interval the search starts from
//   thread_id - forwarded to f_cpu / fd_cpu / is_minimal_cpu to select the
//               function being solved
//
// NOTE(review): neither result.push() nor work.push() is guarded by full();
// this relies on DEPTH_RESULT and DEPTH_WORK being large enough for the
// problem at hand.
template <class I, int THREADS, int DEPTH_RESULT>
void newton_interval_cpu(global_stack_cpu<I, DEPTH_RESULT, THREADS> &result,
I const &ix0, int thread_id) {
typedef typename I::base_type T;
T const alpha = .99f; // Threshold before switching to bisection
// Intervals to be processed
// (a private stack with stride 1 over a local array of DEPTH_WORK entries)
I local_buffer[DEPTH_WORK];
global_stack_cpu<I, DEPTH_WORK, 1> work(local_buffer, 0);
// We start with the whole domain
I ix = ix0;
while (true) {
// Compute (x - F({x})/F'(ix)) inter ix
// -> may yield 0, 1 or 2 intervals
// x is the midpoint of ix; F is evaluated on the point interval {x},
// F' on the whole of ix.
T x = median(ix);
I iq = f_cpu(I(x), thread_id);
I id = fd_cpu(ix, thread_id);
// has_part2 is written by division_part1(); part2 is only computed when
// it is set (presumably when the divisor id contains zero and the quotient
// splits in two - confirm against the Boost.Interval documentation).
bool has_part2;
I part1, part2;
part1 = division_part1(iq, id, has_part2);
part1 = intersect(I(x) - part1, ix);
if (has_part2) {
part2 = division_part2(iq, id);
part2 = intersect(I(x) - part2, ix);
}
// Do we have small-enough intervals?
// Accepted intervals are stored in result and then marked empty so they are
// not selected again below.
if (is_minimal_cpu(part1, thread_id)) {
result.push(part1);
part1 = I::empty();
}
if (has_part2 && is_minimal_cpu(part2, thread_id)) {
result.push(part2);
part2 = I::empty();
}
if (should_bisect_cpu(ix, part1, part2, alpha)) {
// Not so good improvement
// Switch to bisection method for this step
// Both halves of ix replace part1/part2, including a part that was just
// stored in result and emptied above.
part1 = I(ix.lower(), x);
part2 = I(x, ix.upper());
has_part2 = true;
}
// Choose the interval for the next iteration.
if ((part1.lower() <= part1.upper()) && !empty(part1)) {
// At least 1 solution
// We will compute part1 next
ix = part1;
if (has_part2 && !empty(part2)) {
// 2 solutions
// Save the second solution for later
work.push(part2);
}
} else if (has_part2 && !empty(part2)) {
// 1 solution
// Work on that next
ix = part2;
} else {
// No solution
// Do we still have work to do in the stack?
if (work.empty()) // If not, we are done
break;
else
ix = work.pop(); // Otherwise, pick an interval to work on
}
}
}
// Recursive variant of the interval Newton search.
//
// If ix already satisfies is_minimal_cpu() it is pushed onto result.
// Otherwise one Newton step (or a bisection, when the step does not shrink
// the interval enough) splits ix into up to two sub-intervals, each of which
// is searched recursively.
//
// Parameters:
//   result    - stack that receives the accepted intervals
//   ix        - interval to search
//   thread_id - forwarded to f_cpu / fd_cpu / is_minimal_cpu
template <class I, int THREADS, int DEPTH_RESULT>
void newton_interval_rec_cpu(global_stack_cpu<I, DEPTH_RESULT, THREADS> &result,
                             I const &ix, int thread_id) {
  typedef typename I::base_type T;

  // Base case: the interval is tight enough, report it and stop.
  if (is_minimal_cpu(ix, thread_id)) {
    result.push(ix);
    return;
  }

  // Relative width above which the Newton step is replaced by bisection.
  T const bisect_threshold = .99f;

  // Newton step: (mid - F({mid}) / F'(ix)) intersected with ix.
  // The division may produce one or two quotient intervals.
  T const mid = median(ix);
  I const f_at_mid = f_cpu(I(mid), thread_id);
  I const slope = fd_cpu(ix, thread_id);

  bool two_parts;
  I first, second;
  first = intersect(I(mid) - division_part1(f_at_mid, slope, two_parts), ix);
  if (two_parts) {
    second = intersect(I(mid) - division_part2(f_at_mid, slope), ix);
  }

  // Too little progress: fall back to the two halves of ix.
  if (should_bisect_cpu(ix, first, second, bisect_threshold)) {
    first = I(ix.lower(), mid);
    second = I(mid, ix.upper());
    two_parts = true;
  }

  // Descend into every non-empty candidate.
  bool const first_is_usable =
      (first.lower() <= first.upper()) && (!empty(first));
  if (first_is_usable) {
    newton_interval_rec_cpu<I, THREADS, DEPTH_RESULT>(result, first, thread_id);
  }

  if (two_parts && !empty(second)) {
    newton_interval_rec_cpu<I, THREADS, DEPTH_RESULT>(result, second,
                                                      thread_id);
  }
}
template <class I>
void test_interval_newton_cpu(I *buffer, int *nresults, I i) {
typedef typename I::base_type T;
// Intervals to return
// std::vector<I> local_buffer(BLOCK_SIZE * GRID_SIZE * DEPTH_WORK);
for (int thread_id = 0; thread_id != BLOCK_SIZE * GRID_SIZE; ++thread_id) {
global_stack_cpu<I, DEPTH_RESULT, THREADS> result(buffer, thread_id);
#if USE_RECURSION_CPU
newton_interval_rec_cpu<I, THREADS>(result, i, thread_id);
#else
newton_interval_cpu<I, THREADS>(result, i, thread_id);
#endif
nresults[thread_id] = result.size();
}
}
// Host interval type: a Boost.Interval over the scalar type T with the
// rounded_math rounding policy and the checking_base checking policy.
// T is not declared in this header; it is expected to come from the
// including code.
typedef interval<T, interval_lib::policies<interval_lib::rounded_math<T>,
interval_lib::checking_base<T> > >
Ibase;
// I_CPU is the interval type used by the host reference code. UNPROTECTED
// selects between the interval_lib::unprotect variant of Ibase and Ibase
// itself.
#if UNPROTECTED
// NOTE(review): rnd is a namespace-scope rounding object defined in a header;
// presumably it keeps the rounding mode set while unprotected intervals are
// in use - confirm against the Boost.Interval documentation.
typedef interval_lib::unprotect<Ibase>::type I_CPU;
Ibase::traits_type::rounding rnd;
#else
typedef Ibase I_CPU;
#endif
bool checkAgainstHost(int *h_nresults, int *h_nresults_cpu, I_CPU *h_result,
I_CPU *h_result_cpu) {
std::cout << "\nCheck against Host computation...\n\n";
int success = 1;
int success1 = 1;
int success2 = 1;
if (h_nresults_cpu[0] == h_nresults[0]) {
for (int i = 0; i != h_nresults[0]; ++i) {
TYPE diff1 = abs(h_result[THREADS * i + 0].lower() -
h_result_cpu[THREADS * i + 0].lower());
TYPE diff2 = abs(h_result[THREADS * i + 0].upper() -
h_result_cpu[THREADS * i + 0].upper());
if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) {
success1 = 0;
break;
}
}
// in case the two intervals are reversed
for (int i = 0; i != h_nresults[0]; ++i) {
TYPE diff1 =
abs(h_result[THREADS * i + 0].lower() -
h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].lower());
TYPE diff2 =
abs(h_result[THREADS * i + 0].upper() -
h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].upper());
if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) {
success2 = 0;
break;
}
}
success = success1 || success2;
} else
success = 0;
return (bool)success;
}
#endif