# Mirror of https://github.com/NVIDIA/cuda-samples.git
# Synced 2026-05-14 14:06:53 +08:00
# Added Python samples for CUDA Python 1.0 release; renamed top-level
# `Samples` directory to `cpp` to accommodate Python samples.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    distribution and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
import argparse
import contextlib
import sys
import time
from pathlib import Path

try:
    import cupy as cp
    import numpy as np
    from cuda.core import Device, EventOptions
except ImportError as e:
    # Third-party GPU packages are required; fail fast with install guidance.
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result
@contextlib.contextmanager
def timer(message):
    """CPU timing context manager.

    Measures wall-clock time spent inside the ``with`` block and prints
    it in seconds, prefixed by *message*.

    Args:
        message: Label printed alongside the elapsed time.
    """
    # perf_counter is monotonic and has the highest available resolution,
    # making it the correct clock for short benchmark intervals (time.time
    # can jump if the system clock is adjusted).
    start = time.perf_counter()
    yield
    end = time.perf_counter()
    print(f"{message}: {(end - start):.6f} seconds")
@contextlib.contextmanager
def gpu_timer(message, stream):
    """GPU timing context manager using cuda.core CUDA events.

    Records a timing-enabled CUDA event on *stream* before and after the
    ``with`` body, then prints the elapsed GPU time in seconds, prefixed
    by *message*.
    """
    opts = EventOptions(enable_timing=True)
    begin = stream.record(options=opts)
    yield
    finish = stream.record(options=opts)
    # Block the host until the end event completes so the measurement
    # covers all work enqueued on the stream inside the block.
    finish.sync()

    # Subtracting two timing-enabled events yields milliseconds.
    elapsed_ms = finish - begin
    elapsed_s = elapsed_ms / 1000.0
    print(f"{message}: {elapsed_s:.6f} seconds")
def warmup():
    """Run a small GPU matmul to absorb first-call initialization overhead.

    Returns the product array so the computation is kept alive.
    """
    print("Warmup...")
    lhs = cp.ones((16, 16))
    rhs = cp.ones((16, 16))
    return cp.dot(lhs, rhs)
def run(n):
    """Benchmark NumPy vs. CuPy matrix multiplication for n x n arrays.

    Generates random n x n matrices on the host, times the dot product on
    CPU (NumPy) and GPU (CuPy), validates that both results agree within
    tolerance, and prints the timings. Exits with status 1 on mismatch.

    Args:
        n: Edge length of the square matrices to multiply.
    """

    device = Device()  # NOTE(review): no ordinal passed — uses the runtime's default device
    device.set_current()
    major, minor = device.compute_capability
    print()
    print(f"Device Name: {device.name}, SM: {major}.{minor}")
    print()

    # Create explicit stream for event-based GPU timing
    stream = device.create_stream()

    try:
        # Warm up GPU before measuring to exclude one-time init costs
        warmup()
        stream.sync()

        # Generate random matrices on CPU
        a_np = np.random.rand(n, n)
        b_np = np.random.rand(n, n)

        # NumPy dot product (CPU)
        with timer(f"NumPy dot of {n}*{n} arrays"):
            result_np = np.dot(a_np, b_np)

        # Transfer NumPy arrays to GPU (using events for timing)
        # NOTE(review): cupy enqueues work on its own current stream, while the
        # timing events are recorded on `stream`; presumably default-stream
        # semantics keep them ordered — confirm against cuda.core/cupy docs.
        with gpu_timer("Transfer arrays to GPU", stream):
            a_cp = cp.asarray(a_np)
            b_cp = cp.asarray(b_np)

        # CuPy dot product (GPU) - using events for accurate GPU timing
        with gpu_timer(f"CuPy dot of {n}*{n} arrays", stream):
            result_cp = cp.dot(a_cp, b_cp)

        print()
        # Result validation: copy the GPU result back to host (.get())
        # and compare against the CPU reference
        if not verify_array_result(result_np, result_cp.get()):
            print(
                "Validation FAILED: NumPy and CuPy results do not match "
                "within tolerance"
            )
            sys.exit(1)

        print("Validation PASSED: NumPy and CuPy results match within tolerance")
    finally:
        # Always release the stream, even on validation failure or error
        stream.close()
def main():
    """Parse command-line arguments and launch the benchmark."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--n_size",
        "-n",
        type=int,
        default=4096,
        help="Size of the matrix(n * n).",
    )
    parsed = arg_parser.parse_args()
    run(parsed.n_size)
    print("Demo completed successfully!")


if __name__ == "__main__":
    main()