cuda-samples/Samples/2_Concepts_and_Techniques/interval/cuda_interval.h

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef CUDA_INTERVAL_H
#define CUDA_INTERVAL_H
#include "interval.h"
#include "cuda_interval_lib.h"
// Stack in local memory. Managed independently for each thread.
template <class T, int N>
class local_stack {
 private:
  T buf[N];
  int tos;

 public:
  __device__ local_stack() : tos(-1) {}
  __device__ T const &top() const { return buf[tos]; }
  __device__ T &top() { return buf[tos]; }
  __device__ void push(T const &v) { buf[++tos] = v; }
  __device__ T pop() { return buf[tos--]; }
  __device__ bool full() { return tos == (N - 1); }
  __device__ bool empty() { return tos == -1; }
};
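// Illustrative usage sketch (not part of the original sample): each thread
// manipulates its own stack, so no synchronization is required.
//
//   local_stack<int, 4> s;   // per-thread stack with room for 4 elements
//   s.push(1); s.push(2);    // s now holds {1, 2}, top() == 2
//   int t = s.pop();         // t == 2, one element left
//   bool done = s.empty();   // false: 1 is still on the stack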
// Stacks in global memory.
// Same functionality as local_stack, but backed by a buffer allocated by the
// host, so the contents remain accessible after the kernel finishes.
// Storage is interleaved between threads in blocks of THREADS elements;
// each thread still has an independent stack, no data is shared.
template <class T, int N, int THREADS>
class global_stack {
 private:
  T *buf;
  int free_index;

 public:
  // buf should point to an allocated global buffer of
  // size N * THREADS * sizeof(T)
  __device__ global_stack(T *buf, int thread_id)
      : buf(buf), free_index(thread_id) {}

  __device__ void push(T const &v) {
    buf[free_index] = v;
    free_index += THREADS;
  }

  __device__ T pop() {
    free_index -= THREADS;
    return buf[free_index];
  }

  __device__ bool full() { return free_index >= N * THREADS; }
  __device__ bool empty() { return free_index < THREADS; }
  __device__ int size() { return free_index / THREADS; }
};
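// Memory layout (a worked example matching the index arithmetic above):
// thread t stores its k-th element at buf[k * THREADS + t]. With THREADS = 4,
// thread 1 pushes to buf[1], buf[5], buf[9], ... so threads pushing at the
// same step write to consecutive addresses, keeping global memory accesses
// coalesced.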
// The function F whose roots we want to find, defined on intervals.
// It should typically depend on thread_id (e.g. by indexing an array of
// coefficients); here each thread gets a different value of alpha.
template <class T>
__device__ interval_gpu<T> f(interval_gpu<T> const &x, int thread_id) {
  typedef interval_gpu<T> I;
  T alpha = -T(thread_id) / T(THREADS);
  return square(x - I(1)) + I(alpha) * x;
}

// First derivative of F, also defined on intervals
template <class T>
__device__ interval_gpu<T> fd(interval_gpu<T> const &x, int thread_id) {
  typedef interval_gpu<T> I;
  T alpha = -T(thread_id) / T(THREADS);
  return I(2) * x + I(alpha - 2);
}
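// Worked example (illustrative): for thread_id == 0, alpha == 0 and
// f(x) = (x - 1)^2, which has a double root at x = 1. For thread_id > 0,
// f(x) = x^2 + (alpha - 2) x + 1 with alpha < 0, so the discriminant
// alpha^2 - 4 alpha is positive and each thread searches for two distinct
// real roots of its own, slightly different polynomial.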
// Is this interval small enough to stop iterating?
template <class T>
__device__ bool is_minimal(interval_gpu<T> const &x, int thread_id) {
  T const epsilon_x = 1e-6f;
  T const epsilon_y = 1e-6f;
  return !empty(x) && (width(x) <= epsilon_x * abs(median(x)) ||
                       width(f(x, thread_id)) <= epsilon_y);
}
// In some cases, Newton iterations converge slowly.
// When neither candidate interval is significantly narrower than the
// original, bisecting the interval instead guarantees progress.
template <class T>
__device__ bool should_bisect(interval_gpu<T> const &x,
                              interval_gpu<T> const &x1,
                              interval_gpu<T> const &x2, T alpha) {
  T wmax = alpha * width(x);
  return (!empty(x1) && width(x1) > wmax) || (!empty(x2) && width(x2) > wmax);
}
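// Example (illustrative): with alpha = 0.99 and width(x) = 1.0, any non-empty
// candidate wider than 0.99 triggers bisection, i.e. a Newton step is only
// accepted if it shrinks the interval by at least 1%.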
// Main interval Newton loop.
// Keep refining a list of intervals stored in a stack.
// Always keep the next interval to work on in registers
// (avoids excessive spilling to local mem)
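// Background note: the step computed below is the interval Newton operator
//   N(X) = x - F([x, x]) / F'(X),  with x = median(X),
// intersected with X. When F'(X) contains zero, the extended interval
// division is split by division_part1 / division_part2 into two pieces,
// which is why a single step can produce 0, 1 or 2 sub-intervals.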
template <class T, int THREADS, int DEPTH_RESULT>
__device__ void newton_interval(
    global_stack<interval_gpu<T>, DEPTH_RESULT, THREADS> &result,
    interval_gpu<T> const &ix0, int thread_id) {
  typedef interval_gpu<T> I;
  int const DEPTH_WORK = 128;
  T const alpha = .99f;  // Threshold before switching to bisection

  // Intervals to be processed
  local_stack<I, DEPTH_WORK> work;

  // We start with the whole domain
  I ix = ix0;

  while (true) {
    // Compute (x - F({x})/F'(ix)) inter ix
    // -> may yield 0, 1 or 2 intervals
    T x = median(ix);
    I iq = f(I(x), thread_id);
    I id = fd(ix, thread_id);

    bool has_part2;
    I part1, part2 = I::empty();
    part1 = division_part1(iq, id, has_part2);
    part1 = intersect(I(x) - part1, ix);

    if (has_part2) {
      part2 = division_part2(iq, id);
      part2 = intersect(I(x) - part2, ix);
    }

    // Do we have small-enough intervals?
    if (is_minimal(part1, thread_id)) {
      result.push(part1);
      part1 = I::empty();
    }

    if (has_part2 && is_minimal(part2, thread_id)) {
      result.push(part2);
      part2 = I::empty();
    }

    if (should_bisect(ix, part1, part2, alpha)) {
      // The Newton step did not shrink the interval enough:
      // switch to the bisection method for this step
      part1 = I(ix.lower(), x);
      part2 = I(x, ix.upper());
      has_part2 = true;
    }

    if (!empty(part1)) {
      // At least 1 candidate interval left
      // We will compute part1 next
      ix = part1;

      if (has_part2 && !empty(part2)) {
        // 2 candidate intervals
        // Save the second one for later
        work.push(part2);
      }
    } else if (has_part2 && !empty(part2)) {
      // 1 candidate interval
      // Work on that next
      ix = part2;
    } else {
      // No candidate left
      // Do we still have work to do in the stack?
      if (work.empty())  // If not, we are done
        break;
      else
        ix = work.pop();  // Otherwise, pick an interval to work on
    }
  }
}
// Recursive implementation.
// Device-side recursion requires compute capability 2.0 or higher, which is
// why the kernel below only calls this variant when __CUDA_ARCH__ >= 200.
template <class T, int THREADS, int DEPTH_RESULT>
__device__ void newton_interval_rec(
    global_stack<interval_gpu<T>, DEPTH_RESULT, THREADS> &result,
    interval_gpu<T> const &ix, int thread_id) {
  typedef interval_gpu<T> I;
  T const alpha = .99f;  // Threshold before switching to bisection

  if (is_minimal(ix, thread_id)) {
    result.push(ix);
    return;
  }

  // Compute (x - F({x})/F'(ix)) inter ix
  // -> may yield 0, 1 or 2 intervals
  T x = median(ix);
  I iq = f(I(x), thread_id);
  I id = fd(ix, thread_id);

  bool has_part2;
  I part1, part2 = I::empty();
  part1 = division_part1(iq, id, has_part2);
  part1 = intersect(I(x) - part1, ix);

  if (has_part2) {
    part2 = division_part2(iq, id);
    part2 = intersect(I(x) - part2, ix);
  }

  if (should_bisect(ix, part1, part2, alpha)) {
    // The Newton step did not shrink the interval enough:
    // switch to the bisection method for this step
    part1 = I(ix.lower(), x);
    part2 = I(x, ix.upper());
    has_part2 = true;
  }

  if (has_part2 && !empty(part2)) {
    newton_interval_rec<T, THREADS, DEPTH_RESULT>(result, part2, thread_id);
  }

  if (!empty(part1)) {
    newton_interval_rec<T, THREADS, DEPTH_RESULT>(result, part1, thread_id);
  }
}
// Naive implementation, no attempt to keep the top of the stack in registers
template <class T, int THREADS, int DEPTH_RESULT>
__device__ void newton_interval_naive(
    global_stack<interval_gpu<T>, DEPTH_RESULT, THREADS> &result,
    interval_gpu<T> const &ix0, int thread_id) {
  typedef interval_gpu<T> I;
  int const DEPTH_WORK = 128;
  T const alpha = .99f;  // Threshold before switching to bisection

  // Intervals to be processed
  local_stack<I, DEPTH_WORK> work;

  // We start with the whole domain
  work.push(ix0);

  while (!work.empty()) {
    I ix = work.pop();

    if (is_minimal(ix, thread_id)) {
      result.push(ix);
    } else {
      // Compute (x - F({x})/F'(ix)) inter ix
      // -> may yield 0, 1 or 2 intervals
      T x = median(ix);
      I iq = f(I(x), thread_id);
      I id = fd(ix, thread_id);

      bool has_part2;
      I part1, part2 = I::empty();
      part1 = division_part1(iq, id, has_part2);
      part1 = intersect(I(x) - part1, ix);

      if (has_part2) {
        part2 = division_part2(iq, id);
        part2 = intersect(I(x) - part2, ix);
      }

      if (should_bisect(ix, part1, part2, alpha)) {
        // The Newton step did not shrink the interval enough:
        // switch to the bisection method for this step
        part1 = I(ix.lower(), x);
        part2 = I(x, ix.upper());
        has_part2 = true;
      }

      if (!empty(part1)) {
        work.push(part1);
      }

      if (has_part2 && !empty(part2)) {
        work.push(part2);
      }
    }
  }
}
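// Kernel entry point. Each thread runs an independent interval Newton search
// (its own alpha, see f() above) and writes the intervals it finds to an
// interleaved global_stack. implementation_choice selects the variant:
//   0 = naive stack-based, 1 = stack with the top kept in registers,
//   2 = recursive (only compiled for __CUDA_ARCH__ >= 200).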
template <class T>
__global__ void test_interval_newton(interval_gpu<T> *buffer, int *nresults,
                                     interval_gpu<T> i,
                                     int implementation_choice) {
  int thread_id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  typedef interval_gpu<T> I;

  // Intervals to return
  global_stack<I, DEPTH_RESULT, THREADS> result(buffer, thread_id);

  switch (implementation_choice) {
    case 0:
      newton_interval_naive<T, THREADS>(result, i, thread_id);
      break;

    case 1:
      newton_interval<T, THREADS>(result, i, thread_id);
      break;

#if (__CUDA_ARCH__ >= 200)
    case 2:
      newton_interval_rec<T, THREADS>(result, i, thread_id);
      break;
#endif

    default:
      newton_interval_naive<T, THREADS>(result, i, thread_id);
  }

  nresults[thread_id] = result.size();
}
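// Host-side usage sketch (an assumption, not part of this header): the
// including .cu file is expected to define THREADS, BLOCK_SIZE and
// DEPTH_RESULT, allocate the result buffers with cudaMalloc, and launch e.g.
//
//   test_interval_newton<float><<<THREADS / BLOCK_SIZE, BLOCK_SIZE>>>(
//       d_buffer, d_nresults, interval_gpu<float>(0.01f, 4.0f), 1);
//
// where d_buffer holds DEPTH_RESULT * THREADS intervals and d_nresults holds
// THREADS ints; the domain bounds and implementation choice are illustrative.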
#endif  // CUDA_INTERVAL_H