/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This sample demonstrates the three device-wide search algorithms
 * introduced in CCCL 3.3: cub::DeviceFind::FindIf for predicate search,
 * and cub::DeviceFind::LowerBound / UpperBound for parallel binary
 * search. Results are verified against std::find_if, std::lower_bound,
 * and std::upper_bound on the host.
 */

/* Includes, system */
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

/* Includes, cuda */
#include <cuda_runtime.h>
#include <helper_cuda.h>

/* Includes, cccl */
#include <cub/cub.cuh>
#include <cuda/std/functional>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

/* Predicate used with cub::DeviceFind::FindIf: true for values strictly
 * greater than `threshold`. Callable on both host and device so the same
 * object drives the GPU search and the std::find_if reference. */
struct is_greater_than_t
{
    int threshold;

    __host__ __device__ bool operator()(int value) const
    {
        return value > threshold;
    }
};

/* Byte buffer handed to CUB as temporary storage. */
using temp_storage_t = thrust::device_vector<unsigned char>;

/* Allocates the scratch buffer for the second (working) call of a CUB
 * device-wide algorithm.
 *
 * CUB treats a null d_temp_storage pointer as a size query and does no
 * work, so at least one byte is always allocated: that keeps the pointer
 * passed to the working call non-null even if the query reported zero
 * bytes. */
static temp_storage_t make_temp_storage(size_t temp_bytes)
{
    return temp_storage_t(temp_bytes > 0 ? temp_bytes : 1);
}

/* Prints every element of `v` as " <value>" (one leading space each). */
template <typename Vector>
static void print_ints(const Vector &v)
{
    for (size_t i = 0; i < v.size(); ++i)
        printf(" %d", static_cast<int>(v[i]));
}

/* Runs cub::DeviceFind::FindIf over 0, 1, ..., 15 with the predicate
 * "value > 9" and compares the reported index with std::find_if.
 *
 * Returns true when the device result matches the host reference
 * (expected index: 10). */
static bool run_find_if()
{
    const int num_items = 16;

    /* Build the input on the host and upload it with a single copy; the
     * host copy doubles as the input of the reference search below. */
    thrust::host_vector<int> h_in(num_items);
    for (int i = 0; i < num_items; ++i)
        h_in[i] = i;

    thrust::device_vector<int> d_in = h_in;
    thrust::device_vector<int> d_out(1);

    is_greater_than_t predicate{9};

    /* First call: query the temporary storage size. */
    size_t temp_bytes = 0;
    checkCudaErrors(
        cub::DeviceFind::FindIf(nullptr, temp_bytes, d_in.begin(), d_out.begin(), predicate, num_items));

    /* Second call: perform the search. */
    temp_storage_t temp = make_temp_storage(temp_bytes);
    checkCudaErrors(cub::DeviceFind::FindIf(
        thrust::raw_pointer_cast(temp.data()), temp_bytes, d_in.begin(), d_out.begin(), predicate, num_items));
    checkCudaErrors(cudaDeviceSynchronize());

    const int got = d_out[0];

    /* Host reference, using the very same predicate object. */
    const int expected = static_cast<int>(std::find_if(h_in.begin(), h_in.end(), predicate) - h_in.begin());

    printf("cub::DeviceFind::FindIf(value > %d) over [0..%d)\n", predicate.threshold, num_items);
    printf(" got index = %d, expected = %d %s\n", got, expected, (got == expected ? "OK" : "FAIL"));

    return got == expected;
}

/* Runs cub::DeviceFind::LowerBound for the values [1, 3, 5, 7] against the
 * sorted range [0, 2, 4, 6, 8] and compares every resulting index with
 * std::lower_bound.
 *
 * Returns true when all device results match the host reference. */
static bool run_lower_bound()
{
    thrust::device_vector<int> d_range  = {0, 2, 4, 6, 8};
    thrust::device_vector<int> d_values = {1, 3, 5, 7};
    thrust::device_vector<int> d_out(d_values.size());

    const int range_size  = static_cast<int>(d_range.size());
    const int values_size = static_cast<int>(d_values.size());

    /* First call: query the temporary storage size. */
    size_t temp_bytes = 0;
    checkCudaErrors(cub::DeviceFind::LowerBound(nullptr,
                                                temp_bytes,
                                                d_range.begin(),
                                                range_size,
                                                d_values.begin(),
                                                values_size,
                                                d_out.begin(),
                                                cuda::std::less<int>{}));

    /* Second call: perform the searches. */
    temp_storage_t temp = make_temp_storage(temp_bytes);
    checkCudaErrors(cub::DeviceFind::LowerBound(thrust::raw_pointer_cast(temp.data()),
                                                temp_bytes,
                                                d_range.begin(),
                                                range_size,
                                                d_values.begin(),
                                                values_size,
                                                d_out.begin(),
                                                cuda::std::less<int>{}));
    checkCudaErrors(cudaDeviceSynchronize());

    thrust::host_vector<int> h_range  = d_range;
    thrust::host_vector<int> h_values = d_values;
    thrust::host_vector<int> got      = d_out;

    /* Host reference: one std::lower_bound per query value. */
    std::vector<int> expected(h_values.size());
    for (size_t i = 0; i < h_values.size(); ++i) {
        expected[i] =
            static_cast<int>(std::lower_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
    }

    const bool ok = std::equal(got.begin(), got.end(), expected.begin(), expected.end());

    printf("cub::DeviceFind::LowerBound\n");
    printf(" range = { 0, 2, 4, 6, 8 }\n");
    printf(" values = { 1, 3, 5, 7 }\n");
    printf(" got = {");
    print_ints(got);
    printf(" }\n expect = {");
    print_ints(expected);
    printf(" } %s\n", ok ? "OK" : "FAIL");

    return ok;
}

/* Runs both cub::DeviceFind::LowerBound and cub::DeviceFind::UpperBound
 * against a range that contains duplicates, so the two algorithms return
 * different indices for values present in the range, and compares the
 * results with std::lower_bound / std::upper_bound.
 *
 * Returns true when both device result sets match the host reference. */
static bool run_upper_bound()
{
    thrust::device_vector<int> d_range  = {0, 2, 2, 4, 6, 8};
    thrust::device_vector<int> d_values = {2, 2};
    thrust::device_vector<int> d_lb(d_values.size());
    thrust::device_vector<int> d_ub(d_values.size());

    const int range_size  = static_cast<int>(d_range.size());
    const int values_size = static_cast<int>(d_values.size());

    /* LowerBound: size query, then the working call. */
    size_t temp_bytes_lb = 0;
    checkCudaErrors(cub::DeviceFind::LowerBound(nullptr,
                                                temp_bytes_lb,
                                                d_range.begin(),
                                                range_size,
                                                d_values.begin(),
                                                values_size,
                                                d_lb.begin(),
                                                cuda::std::less<int>{}));
    temp_storage_t temp_lb = make_temp_storage(temp_bytes_lb);
    checkCudaErrors(cub::DeviceFind::LowerBound(thrust::raw_pointer_cast(temp_lb.data()),
                                                temp_bytes_lb,
                                                d_range.begin(),
                                                range_size,
                                                d_values.begin(),
                                                values_size,
                                                d_lb.begin(),
                                                cuda::std::less<int>{}));

    /* UpperBound: size query, then the working call. */
    size_t temp_bytes_ub = 0;
    checkCudaErrors(cub::DeviceFind::UpperBound(nullptr,
                                                temp_bytes_ub,
                                                d_range.begin(),
                                                range_size,
                                                d_values.begin(),
                                                values_size,
                                                d_ub.begin(),
                                                cuda::std::less<int>{}));
    temp_storage_t temp_ub = make_temp_storage(temp_bytes_ub);
    checkCudaErrors(cub::DeviceFind::UpperBound(thrust::raw_pointer_cast(temp_ub.data()),
                                                temp_bytes_ub,
                                                d_range.begin(),
                                                range_size,
                                                d_values.begin(),
                                                values_size,
                                                d_ub.begin(),
                                                cuda::std::less<int>{}));
    checkCudaErrors(cudaDeviceSynchronize());

    thrust::host_vector<int> h_range  = d_range;
    thrust::host_vector<int> h_values = d_values;
    thrust::host_vector<int> got_lb   = d_lb;
    thrust::host_vector<int> got_ub   = d_ub;

    /* Host reference for both bounds. */
    std::vector<int> exp_lb(h_values.size());
    std::vector<int> exp_ub(h_values.size());
    for (size_t i = 0; i < h_values.size(); ++i) {
        exp_lb[i] =
            static_cast<int>(std::lower_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
        exp_ub[i] =
            static_cast<int>(std::upper_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
    }

    /* The single verdict printed at the end covers both result sets. */
    const bool lb_ok = std::equal(got_lb.begin(), got_lb.end(), exp_lb.begin(), exp_lb.end());
    const bool ub_ok = std::equal(got_ub.begin(), got_ub.end(), exp_ub.begin(), exp_ub.end());
    const bool ok    = lb_ok && ub_ok;

    printf("cub::DeviceFind::UpperBound (with duplicates in range)\n");
    printf(" range = { 0, 2, 2, 4, 6, 8 }\n");
    printf(" values = { 2, 2 }\n");
    printf(" lb = {");
    print_ints(got_lb);
    printf(" } expected = {");
    print_ints(exp_lb);
    printf(" }\n ub = {");
    print_ints(got_ub);
    printf(" } expected = {");
    print_ints(exp_ub);
    printf(" } %s\n", ok ? "OK" : "FAIL");

    return ok;
}

/* Selects a CUDA device, prints its name and compute capability, then runs
 * the three demonstrations. Every demonstration runs even if an earlier one
 * failed; the exit code is EXIT_SUCCESS only when all of them passed. */
int main(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);

    bool ok = true;

    ok &= run_find_if();
    printf("\n");
    ok &= run_lower_bound();
    printf("\n");
    ok &= run_upper_bound();

    printf("\n%s\n", ok ? "Done" : "FAILED");

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}