mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-06-04 00:06:52 +08:00
This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
148 lines
5.2 KiB
Python
148 lines
5.2 KiB
Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""
|
|
This sample demonstrates the parallel binary-search algorithms exposed
by cuda.compute (from the cuda-cccl package). Given a sorted ``d_data``
array and a batch of ``d_values`` to locate, cuda.compute provides:

- ``cuda.compute.lower_bound(d_data, num_items, d_values, num_values, d_out)``
  writes, for each value, the lowest index where it could be inserted
  into d_data without breaking the sort order. Matches
  ``numpy.searchsorted(..., side="left")``.

- ``cuda.compute.upper_bound(d_data, num_items, d_values, num_values, d_out)``
  is the analogous upper form, matching
  ``numpy.searchsorted(..., side="right")``.

The sample runs both algorithms on curated sorted inputs, one with
distinct elements and one with duplicates, so the lower/upper
distinction is visible, verifies the results against
``numpy.searchsorted``, and prints the computed and expected indices
for each algorithm side-by-side.
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
|
|
|
|
try:
|
|
import cuda.compute
|
|
import cupy as cp
|
|
import numpy as np
|
|
from cuda.core import Device
|
|
from cuda_samples_utils import print_gpu_info # noqa: E402
|
|
except ImportError as e:
|
|
print(f"Error: Required package not found: {e}")
|
|
print("Please install from requirements.txt:")
|
|
print(" pip install -r requirements.txt")
|
|
sys.exit(1)
|
|
|
|
|
|
def run_binary_search(h_data: np.ndarray, h_values: np.ndarray) -> bool:
    """Run ``lower_bound`` and ``upper_bound`` on the GPU and verify them.

    Both host arrays are copied to the device with ``cp.asarray``, the two
    ``cuda.compute`` searches are launched, and the resulting indices are
    copied back and compared with ``numpy.searchsorted`` (``side="left"``
    for the lower bound, ``side="right"`` for the upper bound). The inputs
    and both got/expected index lists are printed.

    Args:
        h_data: Host array to search in. The module docstring describes the
            searched array as sorted; this function does not check that.
        h_values: Host array of query values to locate in ``h_data``.

    Returns:
        True only when both GPU results equal their NumPy references.
    """

    def _report(label, got, expected, matched):
        # One result line per algorithm: GPU indices, reference, verdict.
        verdict = "OK" if matched else "FAIL"
        print(f" {label}: got {got.tolist()} expected {expected.tolist()} {verdict}")

    device_data = cp.asarray(h_data)
    device_queries = cp.asarray(h_values)

    # One np.uintp output slot per query, for each of the two searches.
    query_count = len(h_values)
    lower_out = cp.empty(query_count, dtype=np.uintp)
    upper_out = cp.empty(query_count, dtype=np.uintp)

    # Both searches take identical inputs and differ only in their output.
    shared_args = {
        "d_data": device_data,
        "num_items": len(device_data),
        "d_values": device_queries,
        "num_values": len(device_queries),
    }
    cuda.compute.lower_bound(**shared_args, d_out=lower_out)
    cuda.compute.upper_bound(**shared_args, d_out=upper_out)

    # Copy the device results back to the host for comparison.
    lower_host = cp.asnumpy(lower_out)
    upper_host = cp.asnumpy(upper_out)

    # NumPy references, cast to the same dtype as the device output.
    lower_ref = np.searchsorted(h_data, h_values, side="left").astype(np.uintp)
    upper_ref = np.searchsorted(h_data, h_values, side="right").astype(np.uintp)

    lower_matches = np.array_equal(lower_host, lower_ref)
    upper_matches = np.array_equal(upper_host, upper_ref)

    print(f" data = {h_data.tolist()}")
    print(f" values = {h_values.tolist()}")
    _report("lower_bound", lower_host, lower_ref, lower_matches)
    _report("upper_bound", upper_host, upper_ref, upper_matches)

    return lower_matches and upper_matches
|
|
|
|
|
|
def main():
    """Parse the command line, select the GPU, and run both sample cases.

    Accepts an optional ``--device`` integer (default 0), makes that device
    current, prints its information via ``print_gpu_info``, then runs
    ``run_binary_search`` on two hard-coded int32 inputs.

    Returns:
        0 when every case verified successfully, 1 otherwise.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Parallel upper_bound / lower_bound via cuda.compute"
    )
    cli.add_argument("--device", type=int, default=0, help="CUDA device id")
    options = cli.parse_args()

    gpu = Device(options.device)
    gpu.set_current()
    print_gpu_info(gpu)
    print()

    # (title, sorted data, query values)
    cases = (
        # Values both inside and outside the data range; no duplicates in
        # the data. lower_bound and upper_bound agree on values not present.
        ("Case 1: distinct data, mixed queries", [1, 3, 5, 7, 9], [0, 3, 4, 10]),
        # Duplicates in the data so lower_bound and upper_bound diverge on
        # present values.
        ("Case 2: duplicates in data", [1, 3, 3, 5, 7, 9], [3, 3, 5, 8]),
    )

    outcomes = []
    for title, data, queries in cases:
        print(title)
        host_data = np.array(data, dtype=np.int32)
        host_queries = np.array(queries, dtype=np.int32)
        # Every case runs even if an earlier one failed.
        outcomes.append(run_binary_search(host_data, host_queries))
        print()

    if not all(outcomes):
        print("FAILED")
        return 1

    print("Done")
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|