/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef CUDA_INTERVAL_H
#define CUDA_INTERVAL_H

#include "interval.h"
#include "cuda_interval_lib.h"

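// Note: THREADS, BLOCK_SIZE and DEPTH_RESULT are used below without a local
// definition; they are assumed to be compile-time constants provided by
// interval.h.
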
// Stack in local memory. Managed independently for each thread.
template <class T, int N>
class local_stack {
 private:
  T buf[N];
  int tos;

 public:
  __device__ local_stack() : tos(-1) {}
  __device__ T const &top() const { return buf[tos]; }
  __device__ T &top() { return buf[tos]; }
  __device__ void push(T const &v) { buf[++tos] = v; }
  __device__ T pop() { return buf[tos--]; }
  __device__ bool full() { return tos == (N - 1); }
  __device__ bool empty() { return tos == -1; }
};

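// Usage sketch (illustrative only): a per-thread scratch stack.
//
//   local_stack<int, 4> s;   // capacity 4, starts empty
//   s.push(1); s.push(2);
//   int v = s.pop();         // v == 2, one element remains
//
// push() and pop() perform no bounds checking, so N must be chosen large
// enough for the workload (see DEPTH_WORK below).
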
// Stacks in global memory.
// Same interface as local_stack, but the backing buffer is host-accessible.
// Entries are interleaved between threads in blocks of THREADS elements,
// giving each thread an independent stack; no data is shared between threads.
template <class T, int N, int THREADS>
class global_stack {
 private:
  T *buf;
  int free_index;

 public:
  // buf should point to an allocated global buffer of
  // size N * THREADS * sizeof(T)
  __device__ global_stack(T *buf, int thread_id)
      : buf(buf), free_index(thread_id) {}

  __device__ void push(T const &v) {
    buf[free_index] = v;
    free_index += THREADS;
  }

  __device__ T pop() {
    free_index -= THREADS;
    return buf[free_index];
  }

  __device__ bool full() { return free_index >= N * THREADS; }
  __device__ bool empty() { return free_index < THREADS; }
  __device__ int size() { return free_index / THREADS; }
};

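// Layout sketch: element i of the stack owned by thread t lives at
// buf[t + i * THREADS], i.e. the buffer is interleaved as
//
//   | t0,e0 | t1,e0 | ... | t(THREADS-1),e0 | t0,e1 | t1,e1 | ...
//
// so threads pushing at the same stack depth write to consecutive
// addresses and the accesses coalesce.
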
// The function F whose roots we want to find, defined on intervals.
// Should typically depend on thread_id (indexing an array of coefficients...)
template <class T>
__device__ interval_gpu<T> f(interval_gpu<T> const &x, int thread_id) {
  typedef interval_gpu<T> I;
  T alpha = -T(thread_id) / T(THREADS);
  return square(x - I(1)) + I(alpha) * x;
}

// First derivative of F, also defined on intervals
template <class T>
__device__ interval_gpu<T> fd(interval_gpu<T> const &x, int thread_id) {
  typedef interval_gpu<T> I;
  T alpha = -T(thread_id) / T(THREADS);
  return I(2) * x + I(alpha - 2);
}

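// Sanity check for fd: f(x) = (x - 1)^2 + alpha * x
//                           = x^2 + (alpha - 2) * x + 1,
// so f'(x) = 2 * x + (alpha - 2), which is exactly what fd returns.
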
// Is this interval small enough to stop iterating?
template <class T>
__device__ bool is_minimal(interval_gpu<T> const &x, int thread_id) {
  T const epsilon_x = 1e-6f;
  T const epsilon_y = 1e-6f;
  return !empty(x) && (width(x) <= epsilon_x * abs(median(x)) ||
                       width(f(x, thread_id)) <= epsilon_y);
}

// In some cases, Newton iterations converge slowly.
// Bisecting the interval accelerates convergence.
template <class T>
__device__ bool should_bisect(interval_gpu<T> const &x,
                              interval_gpu<T> const &x1,
                              interval_gpu<T> const &x2, T alpha) {
  T wmax = alpha * width(x);
  return (!empty(x1) && width(x1) > wmax) || (!empty(x2) && width(x2) > wmax);
}

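// With the alpha = .99f used below, this bisects whenever a Newton step
// fails to shrink the interval by at least 1% (i.e. either remaining part
// still covers more than 99% of the width of x).
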
// Main interval Newton loop.
// Keep refining a list of intervals stored in a stack.
// Always keep the next interval to work on in registers
// (avoids excessive spilling to local mem)
template <class T, int THREADS, int DEPTH_RESULT>
__device__ void newton_interval(
    global_stack<interval_gpu<T>, DEPTH_RESULT, THREADS> &result,
    interval_gpu<T> const &ix0, int thread_id) {
  typedef interval_gpu<T> I;
  int const DEPTH_WORK = 128;

  T const alpha = .99f;  // Threshold before switching to bisection

  // Intervals to be processed
  local_stack<I, DEPTH_WORK> work;

  // We start with the whole domain
  I ix = ix0;

  while (true) {
    // Compute (x - F({x}) / F'(ix)) inter ix
    // -> may yield 0, 1 or 2 intervals
    T x = median(ix);
    I iq = f(I(x), thread_id);
    I id = fd(ix, thread_id);

    bool has_part2;
    I part1, part2 = I::empty();
    part1 = division_part1(iq, id, has_part2);
    part1 = intersect(I(x) - part1, ix);

    if (has_part2) {
      part2 = division_part2(iq, id);
      part2 = intersect(I(x) - part2, ix);
    }

    // Do we have small-enough intervals?
    if (is_minimal(part1, thread_id)) {
      result.push(part1);
      part1 = I::empty();
    }

    if (has_part2 && is_minimal(part2, thread_id)) {
      result.push(part2);
      part2 = I::empty();
    }

    if (should_bisect(ix, part1, part2, alpha)) {
      // The Newton step did not shrink the interval enough:
      // switch to bisection for this step
      part1 = I(ix.lower(), x);
      part2 = I(x, ix.upper());
      has_part2 = true;
    }

    if (!empty(part1)) {
      // At least one candidate interval left;
      // we will work on part1 next
      ix = part1;

      if (has_part2 && !empty(part2)) {
        // Two candidates:
        // save the second one for later
        work.push(part2);
      }
    } else if (has_part2 && !empty(part2)) {
      // One candidate:
      // work on it next
      ix = part2;
    } else {
      // No candidate left.
      // Do we still have work to do in the stack?
      if (work.empty())  // If not, we are done
        break;
      else
        ix = work.pop();  // Otherwise, pick an interval to work on
    }
  }
}

// Recursive implementation.
// Device-side recursion requires compute capability 2.0 or later, hence the
// __CUDA_ARCH__ guard in the kernel below.
template <class T, int THREADS, int DEPTH_RESULT>
__device__ void newton_interval_rec(
    global_stack<interval_gpu<T>, DEPTH_RESULT, THREADS> &result,
    interval_gpu<T> const &ix, int thread_id) {
  typedef interval_gpu<T> I;
  T const alpha = .99f;  // Threshold before switching to bisection

  if (is_minimal(ix, thread_id)) {
    result.push(ix);
    return;
  }

  // Compute (x - F({x}) / F'(ix)) inter ix
  // -> may yield 0, 1 or 2 intervals
  T x = median(ix);
  I iq = f(I(x), thread_id);
  I id = fd(ix, thread_id);

  bool has_part2;
  I part1, part2 = I::empty();
  part1 = division_part1(iq, id, has_part2);
  part1 = intersect(I(x) - part1, ix);

  if (has_part2) {
    part2 = division_part2(iq, id);
    part2 = intersect(I(x) - part2, ix);
  }

  if (should_bisect(ix, part1, part2, alpha)) {
    // The Newton step did not shrink the interval enough:
    // switch to bisection for this step
    part1 = I(ix.lower(), x);
    part2 = I(x, ix.upper());
    has_part2 = true;
  }

  if (has_part2 && !empty(part2)) {
    newton_interval_rec<T, THREADS, DEPTH_RESULT>(result, part2, thread_id);
  }

  if (!empty(part1)) {
    newton_interval_rec<T, THREADS, DEPTH_RESULT>(result, part1, thread_id);
  }
}

// Naive implementation, no attempt to keep the top of the stack in registers
template <class T, int THREADS, int DEPTH_RESULT>
__device__ void newton_interval_naive(
    global_stack<interval_gpu<T>, DEPTH_RESULT, THREADS> &result,
    interval_gpu<T> const &ix0, int thread_id) {
  typedef interval_gpu<T> I;
  int const DEPTH_WORK = 128;
  T const alpha = .99f;  // Threshold before switching to bisection

  // Intervals to be processed
  local_stack<I, DEPTH_WORK> work;

  // We start with the whole domain
  work.push(ix0);

  while (!work.empty()) {
    I ix = work.pop();

    if (is_minimal(ix, thread_id)) {
      result.push(ix);
    } else {
      // Compute (x - F({x}) / F'(ix)) inter ix
      // -> may yield 0, 1 or 2 intervals
      T x = median(ix);
      I iq = f(I(x), thread_id);
      I id = fd(ix, thread_id);

      bool has_part2;
      I part1, part2 = I::empty();
      part1 = division_part1(iq, id, has_part2);
      part1 = intersect(I(x) - part1, ix);

      if (has_part2) {
        part2 = division_part2(iq, id);
        part2 = intersect(I(x) - part2, ix);
      }

      if (should_bisect(ix, part1, part2, alpha)) {
        // The Newton step did not shrink the interval enough:
        // switch to bisection for this step
        part1 = I(ix.lower(), x);
        part2 = I(x, ix.upper());
        has_part2 = true;
      }

      if (!empty(part1)) {
        work.push(part1);
      }

      if (has_part2 && !empty(part2)) {
        work.push(part2);
      }
    }
  }
}

template <class T>
__global__ void test_interval_newton(interval_gpu<T> *buffer, int *nresults,
                                     interval_gpu<T> i,
                                     int implementation_choice) {
  int thread_id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  typedef interval_gpu<T> I;

  // Intervals to return
  global_stack<I, DEPTH_RESULT, THREADS> result(buffer, thread_id);

  switch (implementation_choice) {
    case 0:
      newton_interval_naive<T, THREADS>(result, i, thread_id);
      break;

    case 1:
      newton_interval<T, THREADS>(result, i, thread_id);
      break;

#if (__CUDA_ARCH__ >= 200)
    case 2:
      newton_interval_rec<T, THREADS>(result, i, thread_id);
      break;
#endif

    default:
      newton_interval_naive<T, THREADS>(result, i, thread_id);
  }

  nresults[thread_id] = result.size();
}

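// Host-side launch sketch (illustrative only; the sample's real host code
// lives elsewhere). Assumes the THREADS, BLOCK_SIZE and DEPTH_RESULT
// constants from interval.h, a host-callable interval_gpu(lower, upper)
// constructor (as the by-value kernel parameter suggests), and an arbitrary
// search domain of [0.01, 4] in single precision:
//
//   typedef interval_gpu<float> I;
//   I *d_buffer;
//   int *d_nresults;
//   cudaMalloc((void **)&d_buffer, DEPTH_RESULT * THREADS * sizeof(I));
//   cudaMalloc((void **)&d_nresults, THREADS * sizeof(int));
//   test_interval_newton<float><<<THREADS / BLOCK_SIZE, BLOCK_SIZE>>>(
//       d_buffer, d_nresults, I(0.01f, 4.0f), 1);
//   cudaDeviceSynchronize();
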
#endif  // CUDA_INTERVAL_H