Dheemanth aeab82ff30
CUDA 13.2 samples update (#432)
- Added Python samples for CUDA Python 1.0 release
- Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
2026-05-13 17:13:18 -05:00

313 lines
11 KiB
Python

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Streaming Copy + Compute Overlap
Demonstrates how to overlap memory transfers with kernel computation using
CUDA streams to maximize GPU utilization.
Uses pure cuda.core APIs:
- Device, Stream for device and stream management
- PinnedMemoryResource, DeviceMemoryResource for memory allocation
- Buffer.copy_to() for async memory copies
- Program, LaunchConfig, launch for kernel compilation and execution
"""
import sys
from pathlib import Path

# Make the shared sample helpers (cuda_samples_utils) importable when this
# script is run directly from its own directory.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import numpy as np
    from cuda.core import (
        Device,
        DeviceMemoryResource,
        EventOptions,
        LaunchConfig,
        PinnedMemoryResource,
        Program,
        ProgramOptions,
        launch,
    )

    from cuda_samples_utils import print_gpu_info
except ImportError as e:
    # Fail fast with an actionable message instead of a raw traceback.
    print(f"Error: Required package not found: {e}")
    print("Install with: pip install -r requirements.txt")
    sys.exit(1)
# CUDA Kernel - compute-intensive vector operation (grid-stride loop).
# The inner 50-iteration sqrtf loop pads the kernel with extra arithmetic so
# compute time is non-trivial relative to the H2D/D2H transfers this sample
# tries to overlap. NOTE: the string is compiled by Program() at runtime, so
# its contents must not be altered casually.
VECTOR_SCALE_KERNEL = r"""
extern "C" __global__
void vector_scale(const float* input, float* output, float scale, size_t N) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = (size_t)gridDim.x * blockDim.x;
for (size_t i = tid; i < N; i += stride) {
float val = input[i] * scale;
// Add compute work to make kernel non-trivial
for (int j = 0; j < 50; j++) {
val = sqrtf(val * val + 1.0f);
}
output[i] = val;
}
}
"""
def buffer_to_numpy(buffer, n_elements, dtype=np.float32):
    """Create a zero-copy NumPy view of a DLPack-capable buffer.

    Generalized from the original float32-only helper: the element type is
    now a parameter (default unchanged, so existing callers behave the same).

    Args:
        buffer: A DLPack-capable object (e.g. a cuda.core ``Buffer`` backed
            by pinned host memory) whose raw bytes should be reinterpreted.
        n_elements: Number of ``dtype`` elements the resulting view holds.
        dtype: NumPy element type to reinterpret the raw buffer as.
            Defaults to ``np.float32`` to match this sample's kernel.

    Returns:
        A 1-D ``numpy.ndarray`` of shape ``(n_elements,)`` sharing memory
        with ``buffer`` (no copy is made).
    """
    return np.from_dlpack(buffer).view(dtype).reshape(n_elements)
def main() -> None:
    """Benchmark sequential vs. multi-stream copy/compute/copy pipelines.

    Runs the same vector-scale workload two ways and compares CUDA-event
    timings:
      1. Sequential: one stream performs the full-size H2D copy, kernel
         launch, and D2H copy back-to-back (no overlap possible).
      2. Streamed: the data is split into per-stream chunks so copies and
         kernels issued on different streams can overlap on hardware with
         independent copy engines.

    Results are verified against a CPU reimplementation of the kernel.
    """
    print("=" * 60)
    print("Streaming Copy + Compute Overlap")
    print("Using pure cuda.core APIs")
    print("=" * 60)

    # Initialize device
    device = Device(0)
    device.set_current()
    print()
    print_gpu_info(device)

    # Compile kernel for this GPU's compute architecture.
    arch = f"sm_{device.arch}"
    program = Program(
        VECTOR_SCALE_KERNEL, code_type="c++", options=ProgramOptions(arch=arch)
    )
    kernel = program.compile(target_type="cubin").get_kernel("vector_scale")
    print("Kernel compiled ✓")

    # Parameters
    N = 16_000_000  # 16M elements
    n_bytes = N * 4  # float32 payload size in bytes
    scale = 2.5
    n_runs = 10  # timed repetitions averaged per configuration
    print(f"\nProblem size: {N:,} elements ({n_bytes / 1024 / 1024:.0f} MB)")

    # Create memory resources. Pinned (page-locked) host memory is what
    # allows the H2D/D2H copies below to run asynchronously.
    pinned_mr = PinnedMemoryResource()
    device_mr = DeviceMemoryResource(device.device_id)
    default_stream = device.create_stream()

    # =========================================================================
    # Sequential Execution
    # =========================================================================
    print("\n--- Sequential (no overlap) ---")
    print("Timeline: [H2D][Compute][D2H]")

    # Pre-declare so the finally block can safely close whatever was allocated.
    h_in = h_out = d_in = d_out = None
    try:
        # Pre-allocate buffers
        h_in = pinned_mr.allocate(n_bytes, default_stream)
        h_out = pinned_mr.allocate(n_bytes, default_stream)
        d_in = device_mr.allocate(n_bytes, default_stream)
        d_out = device_mr.allocate(n_bytes, default_stream)

        # Sync before numpy access (numpy operations aren't stream ordered)
        default_stream.sync()

        # Initialize input
        np_in = buffer_to_numpy(h_in, N)
        np_in[:] = np.random.rand(N).astype(np.float32) * 100

        # One thread per element, rounded up to whole 256-thread blocks.
        config = LaunchConfig(grid=((N + 255) // 256,), block=(256,))
        event_opts = EventOptions(enable_timing=True)

        # Warm up (excluded from timing: first-use costs, caches, etc.)
        h_in.copy_to(d_in, stream=default_stream)
        launch(
            default_stream,
            config,
            kernel,
            d_in,
            d_out,
            np.float32(scale),
            np.uint64(N),
        )
        d_out.copy_to(h_out, stream=default_stream)
        default_stream.sync()

        # Benchmark with CUDA events
        times = []
        for _ in range(n_runs):
            start_ev = device.create_event(options=event_opts)
            end_ev = device.create_event(options=event_opts)
            default_stream.record(start_ev)
            h_in.copy_to(d_in, stream=default_stream)  # Async H2D
            launch(
                default_stream,
                config,
                kernel,
                d_in,
                d_out,
                np.float32(scale),
                np.uint64(N),
            )
            d_out.copy_to(h_out, stream=default_stream)  # Async D2H
            default_stream.record(end_ev)
            default_stream.sync()
            # Event subtraction yields the elapsed time (printed as ms below).
            times.append(end_ev - start_ev)
        seq_time = np.mean(times)
        print(f"Time: {seq_time:.2f} ms (±{np.std(times):.2f})")

        # Verification: compute expected on CPU and compare
        default_stream.sync()
        np_out = buffer_to_numpy(h_out, N)
        expected = np_in.astype(np.float32) * scale
        for _ in range(50):
            # Mirrors the kernel's 50-iteration sqrtf(val*val + 1) loop.
            expected = np.sqrt(expected * expected + 1.0).astype(np.float32)
        if np.allclose(np_out, expected, rtol=1e-4, atol=1e-4):
            print("Verification: PASSED")
        else:
            print("Verification: FAILED")
    finally:
        # Release buffers and the stream even if an error occurred above.
        for buf in (h_in, h_out, d_in, d_out):
            if buf is not None:
                buf.close()
        default_stream.close()

    # =========================================================================
    # Streamed Execution
    # =========================================================================
    print("\n--- Streamed (with overlap) ---")
    print("Stream 0: [H2D][Compute][D2H]")
    print("Stream 1: [H2D][Compute][D2H]")
    print("Stream 2: [H2D][Compute][D2H]")
    print("...")

    for n_streams in [2, 4, 8]:
        chunk_size = N // n_streams  # elements handled by each stream
        chunk_bytes = chunk_size * 4

        # Create streams
        streams = [device.create_stream() for _ in range(n_streams)]

        # Pre-allocate per-stream buffers
        h_ins, h_outs, d_ins, d_outs = [], [], [], []
        try:
            for i in range(n_streams):
                h_ins.append(pinned_mr.allocate(chunk_bytes, streams[i]))
                h_outs.append(pinned_mr.allocate(chunk_bytes, streams[i]))
                d_ins.append(device_mr.allocate(chunk_bytes, streams[i]))
                d_outs.append(device_mr.allocate(chunk_bytes, streams[i]))

            # Initialize input data (sync first: numpy isn't stream ordered)
            for i in range(n_streams):
                streams[i].sync()
                np_view = buffer_to_numpy(h_ins[i], chunk_size)
                np_view[:] = np.random.rand(chunk_size).astype(np.float32) * 100

            chunk_config = LaunchConfig(grid=((chunk_size + 255) // 256,), block=(256,))

            # Warm up
            for i in range(n_streams):
                h_ins[i].copy_to(d_ins[i], stream=streams[i])
                launch(
                    streams[i],
                    chunk_config,
                    kernel,
                    d_ins[i],
                    d_outs[i],
                    np.float32(scale),
                    np.uint64(chunk_size),
                )
                d_outs[i].copy_to(h_outs[i], stream=streams[i])
            for stream in streams:
                stream.sync()

            # Benchmark with CUDA events (use stream 0 for timing)
            times = []
            event_opts = EventOptions(enable_timing=True)
            for _ in range(n_runs):
                start_ev = device.create_event(options=event_opts)
                end_ev = device.create_event(options=event_opts)
                streams[0].record(start_ev)
                # Issue all operations - they overlap across streams
                for i in range(n_streams):
                    h_ins[i].copy_to(d_ins[i], stream=streams[i])  # Async H2D
                    launch(
                        streams[i],
                        chunk_config,
                        kernel,
                        d_ins[i],
                        d_outs[i],
                        np.float32(scale),
                        np.uint64(chunk_size),
                    )
                    d_outs[i].copy_to(h_outs[i], stream=streams[i])  # Async D2H
                # Wait for all streams, record end on stream 0
                for stream in streams:
                    stream.sync()
                streams[0].record(end_ev)
                streams[0].sync()
                times.append(end_ev - start_ev)
            avg = np.mean(times)
            speedup = seq_time / avg
            print(
                f"{n_streams} streams: {avg:.2f} ms (±{np.std(times):.2f}) "
                f"- speedup: {speedup:.2f}x"
            )

            # Verification (streamed): concatenate chunks and compare to expected
            for s in streams:
                s.sync()
            out_chunks = [
                buffer_to_numpy(h_outs[i], chunk_size) for i in range(n_streams)
            ]
            in_chunks = [
                buffer_to_numpy(h_ins[i], chunk_size) for i in range(n_streams)
            ]
            np_out = np.concatenate(out_chunks)
            np_in = np.concatenate(in_chunks)
            expected = np_in.astype(np.float32) * scale
            for _ in range(50):
                # Same CPU mirror of the kernel's inner loop as above.
                expected = np.sqrt(expected * expected + 1.0).astype(np.float32)
            if not np.allclose(np_out, expected, rtol=1e-4, atol=1e-4):
                print(f" Verification: FAILED for {n_streams} streams")
        finally:
            # Per-configuration cleanup before the next stream count.
            for buf in h_ins + h_outs + d_ins + d_outs:
                buf.close()
            for s in streams:
                s.close()

    print("\n" + "=" * 60)
    print("Key: Pinned memory + multiple streams = overlap transfers with compute")
    print("\nNote: Speedup depends on hardware characteristics. This technique")
    print("benefits most when transfer time is significant relative to compute.")
    print("=" * 60)
# Run the benchmark only when executed as a script (not on import).
if __name__ == "__main__":
    main()