Dheemanth aeab82ff30
CUDA 13.2 samples update (#432)
- Added Python samples for CUDA Python 1.0 release
- Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
2026-05-13 17:13:18 -05:00

142 lines
4.7 KiB
Python

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# distribution and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import contextlib
import sys
import time
from pathlib import Path
try:
import cupy as cp
import numpy as np
from cuda.core import Device, EventOptions
except ImportError as e:
print(f"Error: Required package not found: {e}")
print("Please install from requirements.txt:")
print(" pip install -r requirements.txt")
sys.exit(1)
# Add the shared "Utilities" directory (two levels above this script's directory) to the import path
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result
@contextlib.contextmanager
def timer(message):
    """CPU timing context manager.

    Measures the host-side time spent inside the ``with`` body and prints it
    as ``"<message>: <seconds> seconds"`` with six decimal places.

    Args:
        message: Label printed in front of the measured duration.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, so the measurement is not disturbed by system clock
    # adjustments the way time.time() can be.
    start = time.perf_counter()
    yield
    end = time.perf_counter()
    print(f"{message}: {(end - start):.6f} seconds")
@contextlib.contextmanager
def gpu_timer(message, stream):
    """Time the ``with`` body using timing-enabled events recorded on ``stream``.

    One event is recorded on ``stream`` before the body and one after it; the
    difference between them is printed as ``"<message>: <seconds> seconds"``.

    Args:
        message: Label printed in front of the measured duration.
        stream: Stream object on which both events are recorded.
    """
    timing_opts = EventOptions(enable_timing=True)
    began = stream.record(options=timing_opts)
    yield
    finished = stream.record(options=timing_opts)
    # Wait for the closing event before reading the elapsed time.
    finished.sync()
    # Subtracting two events yields milliseconds; report seconds.
    seconds = (finished - began) / 1000.0
    print(f"{message}: {seconds:.6f} seconds")
def warmup():
    """Run a small CuPy matrix product so first-use overhead is not benchmarked.

    Returns:
        The CuPy array holding the product of two 16 x 16 matrices of ones.
    """
    print("Warmup...")
    shape = (16, 16)
    lhs = cp.ones(shape)
    rhs = cp.ones(shape)
    return cp.dot(lhs, rhs)
def run(n):
    """Benchmark NumPy (CPU) against CuPy (GPU) matrix multiplication.

    Multiplies two random ``n`` x ``n`` matrices with ``np.dot`` and with
    ``cp.dot``, prints the time taken by each step, and checks that both
    products agree according to ``verify_array_result``.

    Args:
        n: Number of rows and columns of each square input matrix.

    Raises:
        SystemExit: With status 1 when the NumPy and CuPy results do not
            match within tolerance.
    """
    device = Device()  # No ordinal passed: use cuda.core's default device
    device.set_current()
    major, minor = device.compute_capability

    print()
    print(f"Device Name: {device.name}, SM: {major}.{minor}")
    print()

    # Create explicit stream
    stream = device.create_stream()
    try:
        # CuPy enqueues its work on CuPy's own current stream. Make `stream`
        # that current stream so that the warm-up sync and the events recorded
        # by gpu_timer() on `stream` actually bracket the CuPy copies and the
        # matrix product instead of an otherwise idle stream.
        with cp.cuda.ExternalStream(int(stream.handle)):
            # Warm up GPU before measuring
            warmup()
            stream.sync()

            # Generate random matrices on CPU
            a_np = np.random.rand(n, n)
            b_np = np.random.rand(n, n)

            # NumPy dot product (CPU)
            with timer(f"NumPy dot of {n}*{n} arrays"):
                result_np = np.dot(a_np, b_np)

            # Transfer NumPy arrays to GPU (using events for timing)
            with gpu_timer("Transfer arrays to GPU", stream):
                a_cp = cp.asarray(a_np)
                b_cp = cp.asarray(b_np)

            # CuPy dot product (GPU) - using events for accurate GPU timing
            with gpu_timer(f"CuPy dot of {n}*{n} arrays", stream):
                result_cp = cp.dot(a_cp, b_cp)

            print()

            # Result validation (get() copies the GPU result back to the host)
            if not verify_array_result(result_np, result_cp.get()):
                print(
                    "Validation FAILED: NumPy and CuPy results do not match "
                    "within tolerance"
                )
                sys.exit(1)
            print("Validation PASSED: NumPy and CuPy results match within tolerance")
    finally:
        # Runs on every exit path, including the sys.exit(1) above.
        stream.close()
def main():
    """Parse the command line and run the benchmark.

    Accepts ``--n_size`` / ``-n`` (default 4096), the dimension of the square
    matrices to multiply. Non-positive sizes are rejected with an argparse
    usage error (exit status 2) instead of being passed on to the benchmark.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--n_size", "-n", default=4096, type=int, help="Size of the matrix(n * n)."
    )
    args = parser.parse_args()
    # A negative size makes NumPy raise and a zero size benchmarks nothing,
    # so fail early with a clear usage message.
    if args.n_size < 1:
        parser.error("--n_size must be a positive integer")

    run(args.n_size)
    print("Demo completed successfully!")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()