mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-05-14 14:06:53 +08:00
- Added Python samples for CUDA Python 1.0 release - Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
200 lines
7.0 KiB
Python
200 lines
7.0 KiB
Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""
|
|
Prefix Sum (Scan)
|
|
|
|
Demonstrates parallel prefix sum algorithms using cuda.compute:
|
|
- Inclusive scan: output[i] = [init_value] + input[0] + ... + input[i]
|
|
- Exclusive scan: output[i] = init_value + input[0] + ... + input[i-1]
|
|
|
|
Uses cuda.compute APIs for optimized CUB-based scan operations.
|
|
Uses cuda.core APIs for device and stream management.
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
|
|
|
|
try:
|
|
import cupy as cp
|
|
import numpy as np
|
|
from cuda.compute import OpKind, exclusive_scan, inclusive_scan
|
|
from cuda.core import Device, EventOptions
|
|
from cuda_samples_utils import print_gpu_info, verify_array_result
|
|
except ImportError as e:
|
|
print(f"Error: Required package not found: {e}")
|
|
print("Please install from requirements.txt:")
|
|
print(" pip install -r requirements.txt")
|
|
sys.exit(1)
|
|
|
|
|
|
def main() -> bool:
|
|
"""Run prefix sum sample. Returns True if all tests passed."""
|
|
print("=" * 60)
|
|
print("Prefix Sum (Scan) - Using cuda.compute")
|
|
print("=" * 60)
|
|
|
|
device = Device(0)
|
|
device.set_current()
|
|
stream = device.create_stream()
|
|
cp_stream = cp.cuda.ExternalStream(int(stream.handle))
|
|
|
|
ok = True
|
|
try:
|
|
print()
|
|
print_gpu_info(device)
|
|
|
|
h_input = np.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)
|
|
init_value = np.array([0], dtype=np.int32)
|
|
|
|
# =========================================================================
|
|
# Inclusive Scan
|
|
# =========================================================================
|
|
print("\n" + "-" * 60)
|
|
print("INCLUSIVE SCAN")
|
|
print("-" * 60)
|
|
print(
|
|
"Formula: output[i] = [init_value] + input[0] + input[1] + ... + input[i]"
|
|
)
|
|
|
|
with cp_stream:
|
|
d_input = cp.asarray(h_input)
|
|
d_output = cp.empty_like(d_input)
|
|
|
|
print(f"\nInput: {h_input.tolist()}")
|
|
|
|
inclusive_scan(
|
|
d_in=d_input,
|
|
d_out=d_output,
|
|
op=OpKind.PLUS,
|
|
init_value=None,
|
|
num_items=len(h_input),
|
|
stream=stream,
|
|
)
|
|
stream.sync()
|
|
print(f"Output: {cp.asnumpy(d_output).tolist()}")
|
|
|
|
with cp_stream:
|
|
expected = cp.asarray(np.cumsum(h_input))
|
|
ok &= verify_array_result(d_output, expected, rtol=0, atol=0)
|
|
|
|
# =========================================================================
|
|
# Exclusive Scan
|
|
# =========================================================================
|
|
print("\n" + "-" * 60)
|
|
print("EXCLUSIVE SCAN")
|
|
print("-" * 60)
|
|
print("Formula: output[i] = init_value + input[0] + ... + input[i-1]")
|
|
|
|
with cp_stream:
|
|
d_output = cp.empty_like(d_input)
|
|
|
|
print(f"\nInput: {h_input.tolist()}")
|
|
|
|
exclusive_scan(
|
|
d_in=d_input,
|
|
d_out=d_output,
|
|
op=OpKind.PLUS,
|
|
init_value=init_value,
|
|
num_items=len(h_input),
|
|
stream=stream,
|
|
)
|
|
stream.sync()
|
|
print(f"Output: {cp.asnumpy(d_output).tolist()}")
|
|
|
|
with cp_stream:
|
|
expected = cp.asarray(np.concatenate([init_value, np.cumsum(h_input)[:-1]]))
|
|
ok &= verify_array_result(d_output, expected, rtol=0, atol=0)
|
|
|
|
# =========================================================================
|
|
# Large Array Performance
|
|
# =========================================================================
|
|
print("\n" + "-" * 60)
|
|
print("PERFORMANCE (10M elements)")
|
|
print("-" * 60)
|
|
|
|
N = 10_000_000
|
|
with cp_stream:
|
|
d_large_in = cp.ones(N, dtype=np.int32)
|
|
d_large_out = cp.empty_like(d_large_in)
|
|
|
|
inclusive_scan(
|
|
d_in=d_large_in,
|
|
d_out=d_large_out,
|
|
op=OpKind.PLUS,
|
|
init_value=None,
|
|
num_items=N,
|
|
stream=stream,
|
|
)
|
|
stream.sync()
|
|
|
|
event_opts = EventOptions(enable_timing=True)
|
|
start_event = device.create_event(options=event_opts)
|
|
end_event = device.create_event(options=event_opts)
|
|
|
|
num_iterations = 10
|
|
stream.record(start_event)
|
|
for _ in range(num_iterations):
|
|
inclusive_scan(
|
|
d_in=d_large_in,
|
|
d_out=d_large_out,
|
|
op=OpKind.PLUS,
|
|
init_value=None,
|
|
num_items=N,
|
|
stream=stream,
|
|
)
|
|
stream.record(end_event)
|
|
end_event.sync()
|
|
elapsed_ms = (end_event - start_event) / num_iterations
|
|
|
|
print(f"Inclusive scan: {elapsed_ms:.3f} ms")
|
|
print(f"Throughput: {N / elapsed_ms / 1e6:.1f} M elements/ms")
|
|
|
|
# =========================================================================
|
|
# Summary
|
|
# =========================================================================
|
|
print("\n" + "=" * 60)
|
|
print("KEY CONCEPTS")
|
|
print("=" * 60)
|
|
print("• Inclusive: output[i] includes input[i]")
|
|
print("• Exclusive: output[i] excludes input[i], starts with init_value")
|
|
print("• cuda.compute provides CUB-based optimized implementations")
|
|
print("• cuda.core Stream integrates with CuPy via ExternalStream")
|
|
print("• Applications: stream compaction, radix sort, histograms")
|
|
print("=" * 60)
|
|
return ok
|
|
finally:
|
|
cp.cuda.Stream.null.use()
|
|
stream.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
success = main()
|
|
if not success:
|
|
sys.exit(1)
|