mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-06-04 00:06:52 +08:00
This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
180 lines
5.8 KiB
Python
180 lines
5.8 KiB
Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""
|
|
cuda.compute: Python lambdas as device-wide operators
|
|
|
|
This sample demonstrates how cuda.compute 1.0 (from the cuda-cccl
|
|
package) accepts plain Python callables, including lambdas, as the
|
|
operators that drive device-wide reductions, transforms, and scans.
|
|
Internally cuda.compute JIT-compiles the callable with Numba for the
|
|
device, so you can iterate on the operator in pure Python and still
|
|
get a fused GPU kernel.
|
|
|
|
The sample exercises three algorithm families with Python lambdas /
|
|
regular functions:
|
|
|
|
1. cuda.compute.reduce_into - sum via a lambda.
|
|
2. cuda.compute.unary_transform - elementwise y = x*x + 1 via a lambda.
|
|
3. cuda.compute.inclusive_scan - prefix sum over only the even values,
|
|
using a regular Python function as the binary operator.
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
|
|
|
|
try:
|
|
import cuda.compute
|
|
import cupy as cp
|
|
import numpy as np
|
|
from cuda.core import Device
|
|
from cuda_samples_utils import print_gpu_info # noqa: E402
|
|
except ImportError as e:
|
|
print(f"Error: Required package not found: {e}")
|
|
print("Please install from requirements.txt:")
|
|
print(" pip install -r requirements.txt")
|
|
sys.exit(1)
|
|
|
|
|
|
def demo_reduce_lambda() -> bool:
|
|
"""reduce_into driven by a lambda."""
|
|
dtype = np.int32
|
|
h_init = np.array([0], dtype=dtype)
|
|
d_in = cp.arange(1, 11, dtype=dtype) # 1..10
|
|
d_out = cp.empty(1, dtype=dtype)
|
|
|
|
cuda.compute.reduce_into(
|
|
d_in=d_in,
|
|
d_out=d_out,
|
|
num_items=int(d_in.size),
|
|
op=lambda a, b: a + b,
|
|
h_init=h_init,
|
|
)
|
|
|
|
got = int(d_out.get()[0])
|
|
expected = int(d_in.get().sum())
|
|
ok = got == expected
|
|
print(
|
|
f"reduce_into(lambda a,b: a+b) over 1..10 -> {got} "
|
|
f"(expected {expected}) {'OK' if ok else 'FAIL'}"
|
|
)
|
|
return ok
|
|
|
|
|
|
def demo_unary_transform_lambda() -> bool:
|
|
"""unary_transform driven by a lambda: y = x*x + 1."""
|
|
d_in = cp.arange(8, dtype=cp.int32)
|
|
d_out = cp.empty_like(d_in)
|
|
|
|
cuda.compute.unary_transform(
|
|
d_in=d_in,
|
|
d_out=d_out,
|
|
num_items=int(d_in.size),
|
|
op=lambda x: x * x + 1,
|
|
)
|
|
|
|
got = d_out.get()
|
|
expected = (d_in.get().astype(np.int64) ** 2 + 1).astype(np.int32)
|
|
ok = np.array_equal(got, expected)
|
|
print(
|
|
f"unary_transform(lambda x: x*x + 1):\n"
|
|
f" got = {got.tolist()}\n"
|
|
f" expected = {expected.tolist()} {'OK' if ok else 'FAIL'}"
|
|
)
|
|
return ok
|
|
|
|
|
|
def demo_scan_custom_op() -> bool:
|
|
"""inclusive_scan with a Python function that sums only even values.
|
|
|
|
This shows the same pattern that also works for reduce/transform:
|
|
the Python callable is JIT-compiled for the device by cuda.compute.
|
|
"""
|
|
dtype = np.int32
|
|
d_in = cp.array([1, 2, 3, 4, 5, 6], dtype=dtype)
|
|
d_out = cp.empty_like(d_in)
|
|
h_init = np.array([0], dtype=dtype)
|
|
|
|
def add_evens(a, b):
|
|
# Treat odd operands as zero; scan accumulates only even values.
|
|
return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0)
|
|
|
|
cuda.compute.inclusive_scan(
|
|
d_in=d_in,
|
|
d_out=d_out,
|
|
op=add_evens,
|
|
init_value=h_init,
|
|
num_items=int(d_in.size),
|
|
)
|
|
|
|
got = d_out.get()
|
|
# Host reference: running sum of even-only projection of the input.
|
|
h_in = d_in.get()
|
|
proj = np.where(h_in % 2 == 0, h_in, 0)
|
|
expected = np.cumsum(proj).astype(dtype)
|
|
ok = np.array_equal(got, expected)
|
|
print(
|
|
f"inclusive_scan(add-evens-only) over [1,2,3,4,5,6]:\n"
|
|
f" got = {got.tolist()}\n"
|
|
f" expected = {expected.tolist()} {'OK' if ok else 'FAIL'}"
|
|
)
|
|
return ok
|
|
|
|
|
|
def main():
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(
|
|
description="Drive cuda.compute device algorithms with Python lambdas / callables"
|
|
)
|
|
parser.add_argument("--device", type=int, default=0, help="CUDA device id")
|
|
args = parser.parse_args()
|
|
|
|
device = Device(args.device)
|
|
device.set_current()
|
|
print_gpu_info(device)
|
|
print()
|
|
|
|
ok = True
|
|
ok &= demo_reduce_lambda()
|
|
print()
|
|
ok &= demo_unary_transform_lambda()
|
|
print()
|
|
ok &= demo_scan_custom_op()
|
|
|
|
print()
|
|
if ok:
|
|
print("Done")
|
|
return 0
|
|
print("FAILED")
|
|
return 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|