/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Simple CPU implementation
 * Depends on Boost.Interval
 */

#ifndef CPU_INTERVAL_H
#define CPU_INTERVAL_H

#ifndef __USE_ISOC99
#define __USE_ISOC99
#endif
#include <boost/numeric/interval.hpp>
#include <iostream>
#include <vector>
//#include <boost/numeric/interval/io.hpp>

#define UNPROTECTED 0
#define USE_RECURSION_CPU 1

using boost::numeric::interval;
using namespace boost::numeric;

template <class T, int N, int THREADS>
class global_stack_cpu {
 private:
  T *buf;
  int free_index;

 public:
  // buf should point to an allocated global buffer of
  // size N * THREADS * sizeof(T)
  global_stack_cpu(T *buf, int thread_id) : buf(buf), free_index(thread_id) {}
  void push(T const &v) {
    buf[free_index] = v;
    free_index += THREADS;
  }
  T pop() {
    free_index -= THREADS;
    return buf[free_index];
  }
  bool full() { return free_index >= N * THREADS; }
  bool empty() { return free_index < THREADS; }
  int size() { return free_index / THREADS; }
};

// The function F of which we want to find roots, defined on intervals.
// Should typically depend on thread_id (indexing an array of coefficients...)
template <class I, int THREADS>
I f_cpu(I const &x, int thread_id) {
  typedef typename I::base_type T;
  T alpha = -T(thread_id) / T(THREADS);
  return square(x - I(1)) + I(alpha) * x;
}

// First derivative of F, also defined on intervals
template <class I, int THREADS>
I fd_cpu(I const &x, int thread_id) {
  typedef typename I::base_type T;
  T alpha = -T(thread_id) / T(THREADS);
  return I(2) * x + I(alpha - 2);
}

// Is this interval small enough to stop iterating?
template <class I, int THREADS>
bool is_minimal_cpu(I const &x, int thread_id) {
  typedef typename I::base_type T;
  T const epsilon_x = 1e-6f;
  T const epsilon_y = 1e-6f;
  return !empty(x) && (width(x) <= epsilon_x * abs(median(x)) ||
                       width(f_cpu<I, THREADS>(x, thread_id)) <= epsilon_y);
}

// In some cases, Newton iterations converge slowly.
// Bisecting the interval accelerates convergence.
template <class I>
bool should_bisect_cpu(I const &x, I const &x1, I const &x2,
                       typename I::base_type alpha) {
  typedef typename I::base_type T;
  T wmax = alpha * width(x);
  return width(x1) > wmax || width(x2) > wmax;
}

int const DEPTH_WORK = 128;
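// Illustrative sketch (not part of the original sample) of the interleaved
// global_stack_cpu layout used by the loops below. With THREADS = 4, a stack
// constructed with thread_id = 1 stores its successive elements at buf[1],
// buf[5], buf[9], ... so consecutive threads write to consecutive addresses;
// the CPU version mirrors the coalesced layout of the GPU results so the two
// can be compared element-wise.
#if 0
inline void stack_layout_example() {
  float buf[8 * 4];                         // N = 8 slots for THREADS = 4
  global_stack_cpu<float, 8, 4> s(buf, 1);  // the stack owned by thread 1
  s.push(3.14f);                            // stored in buf[1]
  s.push(2.72f);                            // stored in buf[1 + 4] = buf[5]
  float top = s.pop();                      // returns 2.72f
  (void)top;
}
#endif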
// Main interval Newton loop.
// Keep refining a list of intervals stored in a stack.
// Always keep the next interval to work on in registers
// (avoids excessive spilling to local mem)
template <class I, int THREADS>
void newton_interval_cpu(global_stack_cpu<I, DEPTH_WORK, THREADS> &result,
                         I const &ix0, int thread_id) {
  typedef typename I::base_type T;
  T const alpha = .99f;  // Threshold before switching to bisection

  // Intervals to be processed
  I local_buffer[DEPTH_WORK];
  global_stack_cpu<I, DEPTH_WORK, 1> work(local_buffer, 0);

  // We start with the whole domain
  I ix = ix0;

  while (true) {
    // Compute (x - F({x})/F'(ix)) inter ix
    // -> may yield 0, 1 or 2 intervals
    T x = median(ix);
    I iq = f_cpu<I, THREADS>(I(x), thread_id);
    I id = fd_cpu<I, THREADS>(ix, thread_id);

    bool has_part2;
    I part1, part2;
    part1 = division_part1(iq, id, has_part2);
    part1 = intersect(I(x) - part1, ix);
    if (has_part2) {
      part2 = division_part2(iq, id);
      part2 = intersect(I(x) - part2, ix);
    }

    // Do we have small-enough intervals?
    if (is_minimal_cpu<I, THREADS>(part1, thread_id)) {
      result.push(part1);
      part1 = I::empty();
    }
    if (has_part2 && is_minimal_cpu<I, THREADS>(part2, thread_id)) {
      result.push(part2);
      part2 = I::empty();
    }

    if (should_bisect_cpu(ix, part1, part2, alpha)) {
      // The Newton step did not shrink the interval enough:
      // switch to the bisection method for this step
      part1 = I(ix.lower(), x);
      part2 = I(x, ix.upper());
      has_part2 = true;
    }

    if ((part1.lower() <= part1.upper()) && !empty(part1)) {
      // At least 1 solution
      // We will compute part1 next
      ix = part1;
      if (has_part2 && !empty(part2)) {
        // 2 solutions
        // Save the second solution for later
        work.push(part2);
      }
    } else if (has_part2 && !empty(part2)) {
      // 1 solution
      // Work on that next
      ix = part2;
    } else {
      // No solution
      // Do we still have work to do in the stack?
      if (work.empty())
        break;  // If not, we are done
      else
        ix = work.pop();  // Otherwise, pick an interval to work on
    }
  }
}

// Recursive variant of the same algorithm
template <class I, int THREADS>
void newton_interval_rec_cpu(global_stack_cpu<I, DEPTH_WORK, THREADS> &result,
                             I const &ix, int thread_id) {
  typedef typename I::base_type T;
  T const alpha = .99f;  // Threshold before switching to bisection

  if (is_minimal_cpu<I, THREADS>(ix, thread_id)) {
    result.push(ix);
    return;
  }

  // Compute (x - F({x})/F'(ix)) inter ix
  // -> may yield 0, 1 or 2 intervals
  T x = median(ix);
  I iq = f_cpu<I, THREADS>(I(x), thread_id);
  I id = fd_cpu<I, THREADS>(ix, thread_id);

  bool has_part2;
  I part1, part2;
  part1 = division_part1(iq, id, has_part2);
  part1 = intersect(I(x) - part1, ix);
  if (has_part2) {
    part2 = division_part2(iq, id);
    part2 = intersect(I(x) - part2, ix);
  }

  if (should_bisect_cpu(ix, part1, part2, alpha)) {
    // The Newton step did not shrink the interval enough:
    // switch to the bisection method for this step
    part1 = I(ix.lower(), x);
    part2 = I(x, ix.upper());
    has_part2 = true;
  }

  if ((part1.lower() <= part1.upper()) && !empty(part1)) {
    newton_interval_rec_cpu(result, part1, thread_id);
  }
  if (has_part2 && !empty(part2)) {
    newton_interval_rec_cpu(result, part2, thread_id);
  }
}

template <class I, int BLOCK_SIZE, int GRID_SIZE>
void test_interval_newton_cpu(I *buffer, int *nresults, I i) {
  typedef typename I::base_type T;

  // Intervals to return
  // std::vector<I> local_buffer(BLOCK_SIZE * GRID_SIZE * DEPTH_WORK);

  for (int thread_id = 0; thread_id != BLOCK_SIZE * GRID_SIZE; ++thread_id) {
    global_stack_cpu<I, DEPTH_WORK, BLOCK_SIZE * GRID_SIZE> result(buffer,
                                                                   thread_id);
#if USE_RECURSION_CPU
    newton_interval_rec_cpu(result, i, thread_id);
#else
    newton_interval_cpu(result, i, thread_id);
#endif
    nresults[thread_id] = result.size();
  }
}

typedef interval<TYPE,
                 interval_lib::policies<interval_lib::rounded_math<TYPE>,
                                        interval_lib::checking_base<TYPE> > >
    Ibase;

#if UNPROTECTED
typedef interval_lib::unprotect<Ibase>::type I_CPU;
Ibase::traits_type::rounding rnd;
#else
typedef Ibase I_CPU;
#endif
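// Illustrative usage sketch (not part of the original sample). The 64 x 16
// grid configuration and the [0.01, 4] search domain are assumptions; adapt
// them to the values used by the surrounding application.
#if 0
inline void run_cpu_reference_example() {
  int const block_size = 64;
  int const grid_size = 16;
  int const threads = block_size * grid_size;

  // One strided result stack per simulated thread, DEPTH_WORK slots each
  std::vector<I_CPU> buffer(threads * DEPTH_WORK);
  std::vector<int> nresults(threads);

  I_CPU domain(0.01f, 4.0f);  // search F for roots on [0.01, 4]
  test_interval_newton_cpu<I_CPU, block_size, grid_size>(
      buffer.data(), nresults.data(), domain);

  // Thread t found nresults[t] roots; root r of thread t is stored at
  // buffer[r * threads + t].
  std::cout << "thread 0 found " << nresults[0] << " interval(s)\n";
}
#endif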
"\nCheck against Host computation...\n\n"; int success = 1; int success1 = 1; int success2 = 1; if (h_nresults_cpu[0] == h_nresults[0]) { for (int i = 0; i != h_nresults[0]; ++i) { TYPE diff1 = abs(h_result[THREADS * i + 0].lower() - h_result_cpu[THREADS * i + 0].lower()); TYPE diff2 = abs(h_result[THREADS * i + 0].upper() - h_result_cpu[THREADS * i + 0].upper()); if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) { success1 = 0; break; } } // in case the two intervals are reversed for (int i = 0; i != h_nresults[0]; ++i) { TYPE diff1 = abs(h_result[THREADS * i + 0].lower() - h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].lower()); TYPE diff2 = abs(h_result[THREADS * i + 0].upper() - h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].upper()); if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) { success2 = 0; break; } } success = success1 || success2; } else success = 0; return (bool)success; } #endif