# Mirror of https://github.com/NVIDIA/cuda-samples.git
# Synced 2026-05-14 14:06:53 +08:00
# Change note: Added Python samples for CUDA Python 1.0 release; renamed
# top-level `Samples` directory to `cpp` to accommodate Python samples.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""
|
|
Streaming Copy + Compute Overlap
|
|
|
|
Demonstrates how to overlap memory transfers with kernel computation using
|
|
CUDA streams to maximize GPU utilization.
|
|
|
|
Uses pure cuda.core APIs:
|
|
- Device, Stream for device and stream management
|
|
- PinnedMemoryResource, DeviceMemoryResource for memory allocation
|
|
- Buffer.copy_to() for async memory copies
|
|
- Program, LaunchConfig, launch for kernel compilation and execution
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
|
|
|
|
try:
|
|
import numpy as np
|
|
from cuda.core import (
|
|
Device,
|
|
DeviceMemoryResource,
|
|
EventOptions,
|
|
LaunchConfig,
|
|
PinnedMemoryResource,
|
|
Program,
|
|
ProgramOptions,
|
|
launch,
|
|
)
|
|
from cuda_samples_utils import print_gpu_info
|
|
except ImportError as e:
|
|
print(f"Error: Required package not found: {e}")
|
|
print("Install with: pip install -r requirements.txt")
|
|
sys.exit(1)
|
|
|
|
|
|
# CUDA Kernel - compute-intensive vector operation (grid-stride loop).
# Each thread starts at its global index and advances by the total thread
# count (gridDim.x * blockDim.x), so the kernel handles any N regardless of
# launch size. The inner 50-iteration sqrtf loop adds artificial compute so
# kernel time is comparable to transfer time, making overlap visible.
# Compiled at runtime via cuda.core's Program (see main()).
VECTOR_SCALE_KERNEL = r"""
extern "C" __global__
void vector_scale(const float* input, float* output, float scale, size_t N) {
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = tid; i < N; i += stride) {
        float val = input[i] * scale;
        // Add compute work to make kernel non-trivial
        for (int j = 0; j < 50; j++) {
            val = sqrtf(val * val + 1.0f);
        }
        output[i] = val;
    }
}
"""
|
|
|
|
|
|
def buffer_to_numpy(buffer, n_elements):
|
|
"""Create numpy view of cuda.core Buffer via DLPack."""
|
|
return np.from_dlpack(buffer).view(np.float32).reshape(n_elements)
|
|
|
|
|
|
def main() -> None:
    """Compare sequential vs. multi-stream copy/compute overlap on one GPU.

    Phase 1 (sequential): a single stream runs [H2D][Compute][D2H] back to
    back, timed with CUDA events, to establish a baseline.
    Phase 2 (streamed): the same work is split into per-stream chunks for
    2, 4 and 8 streams so transfers and kernels from different chunks can
    overlap; each configuration is timed and compared to the baseline.
    Both phases verify the GPU output against a CPU-computed reference.
    """
    print("=" * 60)
    print("Streaming Copy + Compute Overlap")
    print("Using pure cuda.core APIs")
    print("=" * 60)

    # Initialize device and make it current for this thread.
    device = Device(0)
    device.set_current()
    print()
    print_gpu_info(device)

    # Compile kernel for this device's architecture.
    # NOTE(review): assumes `device.arch` yields the compute-capability
    # digits used for the "sm_XX" target — confirm against the installed
    # cuda.core version.
    arch = f"sm_{device.arch}"
    program = Program(
        VECTOR_SCALE_KERNEL, code_type="c++", options=ProgramOptions(arch=arch)
    )
    kernel = program.compile(target_type="cubin").get_kernel("vector_scale")
    print("Kernel compiled ✓")

    # Parameters
    N = 16_000_000  # 16M elements
    n_bytes = N * 4  # float32 -> 4 bytes per element
    scale = 2.5
    n_runs = 10  # timed repetitions per configuration

    print(f"\nProblem size: {N:,} elements ({n_bytes / 1024 / 1024:.0f} MB)")

    # Create memory resources: pinned host memory is required for truly
    # asynchronous H2D/D2H copies; device resource allocates GPU buffers.
    pinned_mr = PinnedMemoryResource()
    device_mr = DeviceMemoryResource(device.device_id)
    default_stream = device.create_stream()

    # =========================================================================
    # Sequential Execution
    # =========================================================================
    print("\n--- Sequential (no overlap) ---")
    print("Timeline: [H2D][Compute][D2H]")

    # Pre-bind to None so the finally block can close only what was allocated.
    h_in = h_out = d_in = d_out = None
    try:
        # Pre-allocate buffers (allocations are stream-ordered).
        h_in = pinned_mr.allocate(n_bytes, default_stream)
        h_out = pinned_mr.allocate(n_bytes, default_stream)
        d_in = device_mr.allocate(n_bytes, default_stream)
        d_out = device_mr.allocate(n_bytes, default_stream)
        # Sync before numpy access (numpy operations aren't stream ordered)
        default_stream.sync()

        # Initialize input with random host data via the zero-copy view.
        np_in = buffer_to_numpy(h_in, N)
        np_in[:] = np.random.rand(N).astype(np.float32) * 100

        # One thread per element, 256 threads per block.
        config = LaunchConfig(grid=((N + 255) // 256,), block=(256,))
        event_opts = EventOptions(enable_timing=True)

        # Warm up: run the full pipeline once so one-time costs (module
        # load, allocator priming) don't pollute the timed runs.
        h_in.copy_to(d_in, stream=default_stream)
        launch(
            default_stream,
            config,
            kernel,
            d_in,
            d_out,
            np.float32(scale),
            np.uint64(N),
        )
        d_out.copy_to(h_out, stream=default_stream)
        default_stream.sync()

        # Benchmark with CUDA events
        times = []
        for _ in range(n_runs):
            start_ev = device.create_event(options=event_opts)
            end_ev = device.create_event(options=event_opts)
            default_stream.record(start_ev)
            h_in.copy_to(d_in, stream=default_stream)  # Async H2D
            launch(
                default_stream,
                config,
                kernel,
                d_in,
                d_out,
                np.float32(scale),
                np.uint64(N),
            )
            d_out.copy_to(h_out, stream=default_stream)  # Async D2H
            default_stream.record(end_ev)
            default_stream.sync()
            # Event subtraction yields the elapsed time between the two
            # recorded events (printed as ms below).
            times.append(end_ev - start_ev)

        seq_time = np.mean(times)
        print(f"Time: {seq_time:.2f} ms (±{np.std(times):.2f})")

        # Verification: compute expected on CPU and compare.
        # The CPU reference mirrors the kernel exactly: scale, then 50
        # iterations of sqrt(v*v + 1).
        default_stream.sync()
        np_out = buffer_to_numpy(h_out, N)
        expected = np_in.astype(np.float32) * scale
        for _ in range(50):
            expected = np.sqrt(expected * expected + 1.0).astype(np.float32)
        if np.allclose(np_out, expected, rtol=1e-4, atol=1e-4):
            print("Verification: PASSED")
        else:
            print("Verification: FAILED")
    finally:
        # Release buffers and the stream even if a launch or check failed.
        for buf in (h_in, h_out, d_in, d_out):
            if buf is not None:
                buf.close()
        default_stream.close()

    # =========================================================================
    # Streamed Execution
    # =========================================================================
    print("\n--- Streamed (with overlap) ---")
    print("Stream 0: [H2D][Compute][D2H]")
    print("Stream 1: [H2D][Compute][D2H]")
    print("Stream 2: [H2D][Compute][D2H]")
    print("...")

    for n_streams in [2, 4, 8]:
        # Split the problem evenly across streams.
        # NOTE(review): assumes n_streams divides N evenly (true for 16M
        # with 2/4/8); any remainder elements would silently be dropped.
        chunk_size = N // n_streams
        chunk_bytes = chunk_size * 4

        # Create streams
        streams = [device.create_stream() for _ in range(n_streams)]

        # Pre-allocate per-stream buffers
        h_ins, h_outs, d_ins, d_outs = [], [], [], []
        try:
            for i in range(n_streams):
                h_ins.append(pinned_mr.allocate(chunk_bytes, streams[i]))
                h_outs.append(pinned_mr.allocate(chunk_bytes, streams[i]))
                d_ins.append(device_mr.allocate(chunk_bytes, streams[i]))
                d_outs.append(device_mr.allocate(chunk_bytes, streams[i]))

            # Initialize input data (sync each stream before touching its
            # pinned buffer from numpy — numpy is not stream ordered).
            for i in range(n_streams):
                streams[i].sync()
                np_view = buffer_to_numpy(h_ins[i], chunk_size)
                np_view[:] = np.random.rand(chunk_size).astype(np.float32) * 100

            chunk_config = LaunchConfig(grid=((chunk_size + 255) // 256,), block=(256,))

            # Warm up
            for i in range(n_streams):
                h_ins[i].copy_to(d_ins[i], stream=streams[i])
                launch(
                    streams[i],
                    chunk_config,
                    kernel,
                    d_ins[i],
                    d_outs[i],
                    np.float32(scale),
                    np.uint64(chunk_size),
                )
                d_outs[i].copy_to(h_outs[i], stream=streams[i])
            for stream in streams:
                stream.sync()

            # Benchmark with CUDA events (use stream 0 for timing)
            times = []
            event_opts = EventOptions(enable_timing=True)
            for _ in range(n_runs):
                start_ev = device.create_event(options=event_opts)
                end_ev = device.create_event(options=event_opts)
                streams[0].record(start_ev)

                # Issue all operations - they overlap across streams
                for i in range(n_streams):
                    h_ins[i].copy_to(d_ins[i], stream=streams[i])  # Async H2D
                    launch(
                        streams[i],
                        chunk_config,
                        kernel,
                        d_ins[i],
                        d_outs[i],
                        np.float32(scale),
                        np.uint64(chunk_size),
                    )
                    d_outs[i].copy_to(h_outs[i], stream=streams[i])  # Async D2H

                # Wait for all streams, record end on stream 0.
                # NOTE(review): the end event is recorded only after
                # host-side sync of every stream, so the measurement
                # includes host synchronization overhead; cross-stream
                # event waits would give a tighter device-side timing.
                for stream in streams:
                    stream.sync()
                streams[0].record(end_ev)
                streams[0].sync()
                times.append(end_ev - start_ev)

            avg = np.mean(times)
            speedup = seq_time / avg
            print(
                f"{n_streams} streams: {avg:.2f} ms (±{np.std(times):.2f}) "
                f"- speedup: {speedup:.2f}x"
            )

            # Verification (streamed): concatenate chunks and compare to expected
            for s in streams:
                s.sync()
            out_chunks = [
                buffer_to_numpy(h_outs[i], chunk_size) for i in range(n_streams)
            ]
            in_chunks = [
                buffer_to_numpy(h_ins[i], chunk_size) for i in range(n_streams)
            ]
            np_out = np.concatenate(out_chunks)
            np_in = np.concatenate(in_chunks)
            # Same CPU reference as the sequential phase.
            expected = np_in.astype(np.float32) * scale
            for _ in range(50):
                expected = np.sqrt(expected * expected + 1.0).astype(np.float32)
            if not np.allclose(np_out, expected, rtol=1e-4, atol=1e-4):
                print(f" Verification: FAILED for {n_streams} streams")
        finally:
            # Always release per-stream buffers and streams.
            for buf in h_ins + h_outs + d_ins + d_outs:
                buf.close()
            for s in streams:
                s.close()

    print("\n" + "=" * 60)
    print("Key: Pinned memory + multiple streams = overlap transfers with compute")
    print("\nNote: Speedup depends on hardware characteristics. This technique")
    print("benefits most when transfer time is significant relative to compute.")
    print("=" * 60)
|
|
|
|
|
|
# Run the benchmark only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|