Dheemanth aeab82ff30
CUDA 13.2 samples update (#432)
- Added Python samples for CUDA Python 1.0 release
- Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
2026-05-13 17:13:18 -05:00

200 lines
7.0 KiB
Python

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Prefix Sum (Scan)
Demonstrates parallel prefix sum algorithms using cuda.compute:
- Inclusive scan: output[i] = [init_value] + input[0] + ... + input[i]
- Exclusive scan: output[i] = init_value + input[0] + ... + input[i-1]
Uses cuda.compute APIs for optimized CUB-based scan operations.
Uses cuda.core APIs for device and stream management.
"""
import sys
from pathlib import Path

# Make the shared sample helpers (cuda_samples_utils) importable no matter
# which directory the sample is launched from.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import cupy as cp
    import numpy as np
    from cuda.compute import OpKind, exclusive_scan, inclusive_scan
    from cuda.core import Device, EventOptions
    from cuda_samples_utils import print_gpu_info, verify_array_result
except ImportError as exc:
    # Fail fast with an actionable message rather than a bare traceback.
    print(f"Error: Required package not found: {exc}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)
def _demo_inclusive_scan(stream, cp_stream, h_input) -> bool:
    """Run an inclusive prefix sum on ``h_input`` and verify against np.cumsum.

    Returns True when the device result matches the host reference exactly.
    """
    print("\n" + "-" * 60)
    print("INCLUSIVE SCAN")
    print("-" * 60)
    print(
        "Formula: output[i] = [init_value] + input[0] + input[1] + ... + input[i]"
    )
    with cp_stream:
        d_input = cp.asarray(h_input)
        d_output = cp.empty_like(d_input)
        print(f"\nInput: {h_input.tolist()}")
        # init_value=None: a plain inclusive scan with no seed value added.
        inclusive_scan(
            d_in=d_input,
            d_out=d_output,
            op=OpKind.PLUS,
            init_value=None,
            num_items=len(h_input),
            stream=stream,
        )
        stream.sync()
        print(f"Output: {cp.asnumpy(d_output).tolist()}")
        # Integer data: compare exactly (rtol=0, atol=0) against host cumsum.
        expected = cp.asarray(np.cumsum(h_input))
        return verify_array_result(d_output, expected, rtol=0, atol=0)


def _demo_exclusive_scan(stream, cp_stream, h_input, init_value) -> bool:
    """Run an exclusive prefix sum seeded with ``init_value`` and verify it.

    Returns True when the device result matches the host reference exactly.
    """
    print("\n" + "-" * 60)
    print("EXCLUSIVE SCAN")
    print("-" * 60)
    print("Formula: output[i] = init_value + input[0] + ... + input[i-1]")
    with cp_stream:
        d_input = cp.asarray(h_input)
        d_output = cp.empty_like(d_input)
        print(f"\nInput: {h_input.tolist()}")
        exclusive_scan(
            d_in=d_input,
            d_out=d_output,
            op=OpKind.PLUS,
            init_value=init_value,
            num_items=len(h_input),
            stream=stream,
        )
        stream.sync()
        print(f"Output: {cp.asnumpy(d_output).tolist()}")
        # Host reference: shift the cumulative sum right by one position and
        # prepend the seed value.
        expected = cp.asarray(
            np.concatenate([init_value, np.cumsum(h_input)[:-1]])
        )
        return verify_array_result(d_output, expected, rtol=0, atol=0)


def _benchmark_inclusive_scan(device, stream, cp_stream) -> None:
    """Time the inclusive scan over 10M int32 ones and print throughput."""
    print("\n" + "-" * 60)
    print("PERFORMANCE (10M elements)")
    print("-" * 60)
    num_items = 10_000_000
    with cp_stream:
        d_large_in = cp.ones(num_items, dtype=np.int32)
        d_large_out = cp.empty_like(d_large_in)

    def run_scan() -> None:
        # One benchmark launch; shared by the warm-up and the timed loop.
        inclusive_scan(
            d_in=d_large_in,
            d_out=d_large_out,
            op=OpKind.PLUS,
            init_value=None,
            num_items=num_items,
            stream=stream,
        )

    # Warm-up launch so one-time setup cost is excluded from the timing.
    run_scan()
    stream.sync()

    event_opts = EventOptions(enable_timing=True)
    start_event = device.create_event(options=event_opts)
    end_event = device.create_event(options=event_opts)
    num_iterations = 10
    stream.record(start_event)
    for _ in range(num_iterations):
        run_scan()
    stream.record(end_event)
    end_event.sync()
    # Event subtraction yields the elapsed time in milliseconds; average it
    # over the timed iterations.
    elapsed_ms = (end_event - start_event) / num_iterations
    print(f"Inclusive scan: {elapsed_ms:.3f} ms")
    print(f"Throughput: {num_items / elapsed_ms / 1e6:.1f} M elements/ms")


def _print_summary() -> None:
    """Print the key take-aways for this sample."""
    print("\n" + "=" * 60)
    print("KEY CONCEPTS")
    print("=" * 60)
    print("• Inclusive: output[i] includes input[i]")
    print("• Exclusive: output[i] excludes input[i], starts with init_value")
    print("• cuda.compute provides CUB-based optimized implementations")
    print("• cuda.core Stream integrates with CuPy via ExternalStream")
    print("• Applications: stream compaction, radix sort, histograms")
    print("=" * 60)


def main() -> bool:
    """Run prefix sum sample. Returns True if all tests passed."""
    print("=" * 60)
    print("Prefix Sum (Scan) - Using cuda.compute")
    print("=" * 60)
    device = Device(0)
    device.set_current()
    stream = device.create_stream()
    # Wrap the cuda.core stream so CuPy allocations/kernels use the same one.
    cp_stream = cp.cuda.ExternalStream(int(stream.handle))
    ok = True
    try:
        print()
        print_gpu_info(device)
        h_input = np.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)
        init_value = np.array([0], dtype=np.int32)
        ok &= _demo_inclusive_scan(stream, cp_stream, h_input)
        ok &= _demo_exclusive_scan(stream, cp_stream, h_input, init_value)
        _benchmark_inclusive_scan(device, stream, cp_stream)
        _print_summary()
        return ok
    finally:
        # Restore CuPy's default (null) stream before closing ours so later
        # CuPy work does not target a destroyed stream.
        cp.cuda.Stream.null.use()
        stream.close()
if __name__ == "__main__":
    # Surface verification failures to the shell as a non-zero exit status.
    if not main():
        raise SystemExit(1)