2026-05-27 21:03:57 +00:00

249 lines
11 KiB
Plaintext

/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* This sample demonstrates the three device-wide search algorithms
* introduced in CCCL 3.3: cub::DeviceFind::FindIf for predicate search,
* and cub::DeviceFind::LowerBound / UpperBound for parallel binary
* search. Results are verified against std::find_if, std::lower_bound,
* and std::upper_bound on the host.
*/
/* Includes, system */
#include <algorithm>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <helper_cuda.h>
/* Includes, cccl */
#include <cub/device/device_find.cuh>
#include <cuda/std/functional>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
/* Unary predicate handed to cub::DeviceFind::FindIf. It holds for every value
 * strictly greater than `threshold` and is callable from host and device
 * code alike. */
struct is_greater_than_t
{
    int threshold;

    __host__ __device__ bool operator()(int candidate) const
    {
        return threshold < candidate;
    }
};
/* Runs cub::DeviceFind::FindIf over the sequence 0..15 with the predicate
 * "value > 9" and compares the index it reports with the one std::find_if
 * yields on the host.
 *
 * Prints both indices and returns true when they agree. */
static bool run_find_if()
{
    /* Input: 0, 1, ..., 15. Predicate: value > 9. Expected index: 10. */
    const int num_items = 16;

    /* Stage the input on the host and upload it with a single copy.
     * Assigning device_vector elements one by one would issue a separate
     * host-to-device transfer per element. The host copy doubles as the
     * input for the reference search below. */
    thrust::host_vector<int> h_in(num_items);
    for (int i = 0; i < num_items; ++i)
        h_in[i] = i;
    thrust::device_vector<int> d_in = h_in;
    thrust::device_vector<int> d_out(1);

    is_greater_than_t predicate{9};

    /* First call: a null temp-storage pointer only queries the scratch size. */
    size_t temp_bytes = 0;
    checkCudaErrors(
        cub::DeviceFind::FindIf(nullptr, temp_bytes, d_in.begin(), d_out.begin(), predicate, num_items));

    /* Keep the scratch allocation non-empty: an empty device_vector would
     * hand back a null pointer, which is the "size query" form of the call
     * above, so the search itself would not run. */
    thrust::device_vector<char> temp(temp_bytes > 0 ? temp_bytes : 1);

    /* Second call: performs the search using the scratch buffer. */
    checkCudaErrors(cub::DeviceFind::FindIf(thrust::raw_pointer_cast(temp.data()),
                                            temp_bytes,
                                            d_in.begin(),
                                            d_out.begin(),
                                            predicate,
                                            num_items));
    checkCudaErrors(cudaDeviceSynchronize());

    /* Reading a device_vector element copies it back to the host. */
    const int got = d_out[0];

    /* Host reference on the same input. */
    auto host_it = std::find_if(h_in.begin(), h_in.end(),
                                [&](int v) { return v > predicate.threshold; });
    const int expected = static_cast<int>(host_it - h_in.begin());

    printf("cub::DeviceFind::FindIf(value > %d) over [0..%d)\n", predicate.threshold, num_items);
    printf(" got index = %d, expected = %d %s\n", got, expected, (got == expected ? "OK" : "FAIL"));
    return got == expected;
}
/* Runs cub::DeviceFind::LowerBound for a batch of values over a sorted range
 * and checks every returned index against std::lower_bound on the host.
 *
 * Prints the inputs, the device results and the host reference, and returns
 * true when all indices match. */
static bool run_lower_bound()
{
    /* Sorted range: [0, 2, 4, 6, 8]. Values to locate: [1, 3, 5, 7]. */
    thrust::device_vector<int> d_range = {0, 2, 4, 6, 8};
    thrust::device_vector<int> d_values = {1, 3, 5, 7};
    thrust::device_vector<int> d_out(d_values.size());

    const int range_count = static_cast<int>(d_range.size());
    const int values_count = static_cast<int>(d_values.size());

    /* First call: a null temp-storage pointer only queries the scratch size. */
    size_t temp_bytes = 0;
    checkCudaErrors(cub::DeviceFind::LowerBound(nullptr,
                                                temp_bytes,
                                                d_range.begin(),
                                                range_count,
                                                d_values.begin(),
                                                values_count,
                                                d_out.begin(),
                                                cuda::std::less{}));

    /* Keep the scratch allocation non-empty: an empty device_vector would
     * hand back a null pointer, which is the "size query" form of the call
     * above, so the search itself would not run. */
    thrust::device_vector<char> temp(temp_bytes > 0 ? temp_bytes : 1);

    /* Second call: performs the searches using the scratch buffer. */
    checkCudaErrors(cub::DeviceFind::LowerBound(thrust::raw_pointer_cast(temp.data()),
                                                temp_bytes,
                                                d_range.begin(),
                                                range_count,
                                                d_values.begin(),
                                                values_count,
                                                d_out.begin(),
                                                cuda::std::less{}));
    checkCudaErrors(cudaDeviceSynchronize());

    thrust::host_vector<int> h_range = d_range;
    thrust::host_vector<int> h_values = d_values;
    thrust::host_vector<int> got = d_out;

    /* Host reference: one std::lower_bound per searched value. */
    std::vector<int> expected(h_values.size());
    for (size_t i = 0; i < h_values.size(); ++i) {
        expected[i] = static_cast<int>(
            std::lower_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
    }

    /* Compare before printing so the indexing below can never run past the
     * shorter of the two sequences. */
    bool ok = (got.size() == expected.size());
    for (size_t i = 0; ok && i < got.size(); ++i)
        ok = (got[i] == expected[i]);

    /* Prints " <label> = { a, b, c }" from the data that was actually
     * searched, so the report cannot drift from the initializers above. */
    auto print_input = [](const char *label, const thrust::host_vector<int> &items) {
        printf(" %s = {", label);
        for (size_t i = 0; i < items.size(); ++i)
            printf("%s %d", (i == 0 ? "" : ","), items[i]);
        printf(" }\n");
    };

    printf("cub::DeviceFind::LowerBound\n");
    print_input("range", h_range);
    print_input("values", h_values);
    printf(" got = {");
    for (size_t i = 0; i < got.size(); ++i)
        printf(" %d", got[i]);
    printf(" }\n expect = {");
    for (size_t i = 0; i < expected.size(); ++i)
        printf(" %d", expected[i]);
    printf(" } %s\n", ok ? "OK" : "FAIL");
    return ok;
}
/* Runs cub::DeviceFind::LowerBound and cub::DeviceFind::UpperBound over the
 * same sorted range and values, then checks both result sets against
 * std::lower_bound / std::upper_bound on the host.
 *
 * Prints the inputs plus device and host results for each algorithm, and
 * returns true when every index matches. */
static bool run_upper_bound()
{
    /* Range with duplicates so LowerBound and UpperBound differ on values
     * that appear in the range. */
    thrust::device_vector<int> d_range = {0, 2, 2, 4, 6, 8};
    thrust::device_vector<int> d_values = {2, 2};
    thrust::device_vector<int> d_lb(d_values.size());
    thrust::device_vector<int> d_ub(d_values.size());

    const int range_count = static_cast<int>(d_range.size());
    const int values_count = static_cast<int>(d_values.size());

    /* LowerBound: size query (null temp storage), then the actual search. */
    size_t temp_bytes_lb = 0;
    checkCudaErrors(cub::DeviceFind::LowerBound(nullptr,
                                                temp_bytes_lb,
                                                d_range.begin(),
                                                range_count,
                                                d_values.begin(),
                                                values_count,
                                                d_lb.begin(),
                                                cuda::std::less{}));

    /* Keep the scratch allocation non-empty: an empty device_vector would
     * hand back a null pointer, which is the "size query" form of the call
     * above, so the search itself would not run. */
    thrust::device_vector<char> temp_lb(temp_bytes_lb > 0 ? temp_bytes_lb : 1);
    checkCudaErrors(cub::DeviceFind::LowerBound(thrust::raw_pointer_cast(temp_lb.data()),
                                                temp_bytes_lb,
                                                d_range.begin(),
                                                range_count,
                                                d_values.begin(),
                                                values_count,
                                                d_lb.begin(),
                                                cuda::std::less{}));

    /* UpperBound: same two-step pattern with its own scratch buffer. */
    size_t temp_bytes_ub = 0;
    checkCudaErrors(cub::DeviceFind::UpperBound(nullptr,
                                                temp_bytes_ub,
                                                d_range.begin(),
                                                range_count,
                                                d_values.begin(),
                                                values_count,
                                                d_ub.begin(),
                                                cuda::std::less{}));
    thrust::device_vector<char> temp_ub(temp_bytes_ub > 0 ? temp_bytes_ub : 1);
    checkCudaErrors(cub::DeviceFind::UpperBound(thrust::raw_pointer_cast(temp_ub.data()),
                                                temp_bytes_ub,
                                                d_range.begin(),
                                                range_count,
                                                d_values.begin(),
                                                values_count,
                                                d_ub.begin(),
                                                cuda::std::less{}));
    checkCudaErrors(cudaDeviceSynchronize());

    thrust::host_vector<int> h_range = d_range;
    thrust::host_vector<int> h_values = d_values;
    thrust::host_vector<int> got_lb = d_lb;
    thrust::host_vector<int> got_ub = d_ub;

    /* Host references for both bounds. */
    std::vector<int> exp_lb(h_values.size());
    std::vector<int> exp_ub(h_values.size());
    for (size_t i = 0; i < h_values.size(); ++i) {
        exp_lb[i] =
            static_cast<int>(std::lower_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
        exp_ub[i] =
            static_cast<int>(std::upper_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
    }

    /* Compare before printing so the indexing below can never run past the
     * shorter of two sequences. */
    bool ok = (got_lb.size() == exp_lb.size()) && (got_ub.size() == exp_ub.size());
    for (size_t i = 0; ok && i < got_lb.size(); ++i)
        ok = (got_lb[i] == exp_lb[i]);
    for (size_t i = 0; ok && i < got_ub.size(); ++i)
        ok = (got_ub[i] == exp_ub[i]);

    /* Prints " <label> = { a, b, c }" from the data that was actually
     * searched, so the report cannot drift from the initializers above. */
    auto print_input = [](const char *label, const thrust::host_vector<int> &items) {
        printf(" %s = {", label);
        for (size_t i = 0; i < items.size(); ++i)
            printf("%s %d", (i == 0 ? "" : ","), items[i]);
        printf(" }\n");
    };

    printf("cub::DeviceFind::UpperBound (with duplicates in range)\n");
    print_input("range", h_range);
    print_input("values", h_values);

    printf(" lb = {");
    for (size_t i = 0; i < got_lb.size(); ++i)
        printf(" %d", got_lb[i]);
    printf(" } expected = {");
    for (size_t i = 0; i < exp_lb.size(); ++i)
        printf(" %d", exp_lb[i]);

    printf(" }\n ub = {");
    for (size_t i = 0; i < got_ub.size(); ++i)
        printf(" %d", got_ub[i]);
    printf(" } expected = {");
    for (size_t i = 0; i < exp_ub.size(); ++i)
        printf(" %d", exp_ub[i]);
    printf(" } %s\n", ok ? "OK" : "FAIL");
    return ok;
}
/* Entry point. Obtains a device id from findCudaDevice, prints the device
 * name and compute capability, then runs the FindIf, LowerBound and
 * UpperBound demos in that order. Exits with EXIT_SUCCESS only when all
 * three demos report success. */
int main(int argc, char **argv)
{
    const int device_id = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp device_props;
    checkCudaErrors(cudaGetDeviceProperties(&device_props, device_id));
    printf("Device: %s (Compute Capability %d.%d)\n\n",
           device_props.name,
           device_props.major,
           device_props.minor);

    /* Each demo runs unconditionally; an earlier failure does not skip the
     * later ones. */
    const bool find_if_ok = run_find_if();
    printf("\n");
    const bool lower_bound_ok = run_lower_bound();
    printf("\n");
    const bool upper_bound_ok = run_upper_bound();

    const bool all_ok = find_if_ok && lower_bound_ok && upper_bound_ok;
    const char *verdict = all_ok ? "Done" : "FAILED";
    printf("\n%s\n", verdict);

    if (all_ok)
        return EXIT_SUCCESS;
    return EXIT_FAILURE;
}