# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Streaming Copy + Compute Overlap

Demonstrates how to overlap memory transfers with kernel computation
using CUDA streams to maximize GPU utilization.

Uses pure cuda.core APIs:
- Device, Stream for device and stream management
- PinnedMemoryResource, DeviceMemoryResource for memory allocation
- Buffer.copy_to() for async memory copies
- Program, LaunchConfig, launch for kernel compilation and execution
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import numpy as np
    from cuda.core import (
        Device,
        DeviceMemoryResource,
        EventOptions,
        LaunchConfig,
        PinnedMemoryResource,
        Program,
        ProgramOptions,
        launch,
    )
    from cuda_samples_utils import print_gpu_info
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Install with: pip install -r requirements.txt")
    sys.exit(1)

# Threads per block used for every launch in this sample.
THREADS_PER_BLOCK = 256

# Number of sqrt iterations applied per element. This MUST match the inner
# loop bound hard-coded in VECTOR_SCALE_KERNEL below, because the CPU
# reference in _reference_result() repeats the same recurrence.
SQRT_ITERATIONS = 50

# CUDA Kernel - compute-intensive vector operation (grid-stride loop)
VECTOR_SCALE_KERNEL = r"""
extern "C" __global__
void vector_scale(const float* input, float* output, float scale, size_t N) {
    // Widen before multiplying so the index math is done in size_t,
    // consistent with the stride computation on the next line.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = tid; i < N; i += stride) {
        float val = input[i] * scale;
        // Add compute work to make kernel non-trivial
        for (int j = 0; j < 50; j++) {
            val = sqrtf(val * val + 1.0f);
        }
        output[i] = val;
    }
}
"""


def buffer_to_numpy(buffer, n_elements):
    """Create numpy view of cuda.core Buffer via DLPack.

    Args:
        buffer: Object accepted by ``np.from_dlpack``; in this sample a
            buffer allocated from ``PinnedMemoryResource``.
        n_elements: Number of float32 elements the view should expose.

    Returns:
        A 1-D ``float32`` numpy array of length ``n_elements`` obtained by
        reinterpreting the DLPack-exported array.
    """
    return np.from_dlpack(buffer).view(np.float32).reshape(n_elements)


def _launch_config(n_elements):
    """Return a 1-D LaunchConfig with one thread per element (rounded up)."""
    n_blocks = (n_elements + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
    return LaunchConfig(grid=(n_blocks,), block=(THREADS_PER_BLOCK,))


def _enqueue_pipeline(stream, config, kernel, h_in, d_in, d_out, h_out,
                      scale, n_elements):
    """Enqueue H2D copy, kernel launch and D2H copy on ``stream``.

    Nothing here synchronizes; the caller decides when to wait.
    """
    h_in.copy_to(d_in, stream=stream)  # Async H2D
    launch(
        stream,
        config,
        kernel,
        d_in,
        d_out,
        np.float32(scale),
        np.uint64(n_elements),
    )
    d_out.copy_to(h_out, stream=stream)  # Async D2H


def _reference_result(host_input, scale):
    """Compute on the CPU, in float32, what the kernel is expected to output."""
    expected = host_input.astype(np.float32) * scale
    for _ in range(SQRT_ITERATIONS):
        expected = np.sqrt(expected * expected + 1.0).astype(np.float32)
    return expected


def _verify(host_output, host_input, scale):
    """Return True when ``host_output`` matches the CPU reference."""
    expected = _reference_result(host_input, scale)
    return bool(np.allclose(host_output, expected, rtol=1e-4, atol=1e-4))


def main():
    """Benchmark sequential vs. multi-stream copy+compute and verify results.

    Runs the H2D -> kernel -> D2H pipeline once on a single stream, then
    again split across 2, 4 and 8 streams, timing each variant with CUDA
    events and checking the output against a CPU reference.

    Returns:
        0 if every verification passed, 1 otherwise (suitable for
        ``sys.exit``).
    """
    print("=" * 60)
    print("Streaming Copy + Compute Overlap")
    print("Using pure cuda.core APIs")
    print("=" * 60)

    # Initialize device
    device = Device(0)
    device.set_current()
    print()
    print_gpu_info(device)

    # Compile kernel
    arch = f"sm_{device.arch}"
    program = Program(
        VECTOR_SCALE_KERNEL, code_type="c++", options=ProgramOptions(arch=arch)
    )
    kernel = program.compile(target_type="cubin").get_kernel("vector_scale")
    print("Kernel compiled ✓")

    # Parameters
    N = 16_000_000  # 16M elements
    n_bytes = N * 4
    scale = 2.5
    n_runs = 10

    print(f"\nProblem size: {N:,} elements ({n_bytes / 1024 / 1024:.0f} MB)")

    # Create memory resources
    pinned_mr = PinnedMemoryResource()
    device_mr = DeviceMemoryResource(device.device_id)
    default_stream = device.create_stream()

    event_opts = EventOptions(enable_timing=True)
    all_passed = True

    # =========================================================================
    # Sequential Execution
    # =========================================================================
    print("\n--- Sequential (no overlap) ---")
    print("Timeline: [H2D][Compute][D2H]")

    h_in = h_out = d_in = d_out = None
    try:
        # Pre-allocate buffers
        h_in = pinned_mr.allocate(n_bytes, default_stream)
        h_out = pinned_mr.allocate(n_bytes, default_stream)
        d_in = device_mr.allocate(n_bytes, default_stream)
        d_out = device_mr.allocate(n_bytes, default_stream)

        # Sync before numpy access (numpy operations aren't stream ordered)
        default_stream.sync()

        # Initialize input
        np_in = buffer_to_numpy(h_in, N)
        np_in[:] = np.random.rand(N).astype(np.float32) * 100

        config = _launch_config(N)

        # Warm up
        _enqueue_pipeline(
            default_stream, config, kernel, h_in, d_in, d_out, h_out, scale, N
        )
        default_stream.sync()

        # Benchmark with CUDA events
        times = []
        for _ in range(n_runs):
            start_ev = device.create_event(options=event_opts)
            end_ev = device.create_event(options=event_opts)

            default_stream.record(start_ev)
            _enqueue_pipeline(
                default_stream, config, kernel, h_in, d_in, d_out, h_out,
                scale, N,
            )
            default_stream.record(end_ev)
            default_stream.sync()

            times.append(end_ev - start_ev)

        seq_time = np.mean(times)
        print(f"Time: {seq_time:.2f} ms (±{np.std(times):.2f})")

        # Verification: compute expected on CPU and compare
        default_stream.sync()
        np_out = buffer_to_numpy(h_out, N)
        if _verify(np_out, np_in, scale):
            print("Verification: PASSED")
        else:
            print("Verification: FAILED")
            all_passed = False
    finally:
        for buf in (h_in, h_out, d_in, d_out):
            if buf is not None:
                buf.close()
        default_stream.close()

    # =========================================================================
    # Streamed Execution
    # =========================================================================
    print("\n--- Streamed (with overlap) ---")
    print("Stream 0: [H2D][Compute][D2H]")
    print("Stream 1: [H2D][Compute][D2H]")
    print("Stream 2: [H2D][Compute][D2H]")
    print("...")

    for n_streams in [2, 4, 8]:
        # N is divisible by every stream count used here; with other values
        # the integer division would leave the remainder unprocessed.
        chunk_size = N // n_streams
        chunk_bytes = chunk_size * 4

        # Create streams
        streams = [device.create_stream() for _ in range(n_streams)]

        # Pre-allocate per-stream buffers
        h_ins, h_outs, d_ins, d_outs = [], [], [], []
        try:
            for stream in streams:
                h_ins.append(pinned_mr.allocate(chunk_bytes, stream))
                h_outs.append(pinned_mr.allocate(chunk_bytes, stream))
                d_ins.append(device_mr.allocate(chunk_bytes, stream))
                d_outs.append(device_mr.allocate(chunk_bytes, stream))

            # Initialize input data
            for stream, h_chunk in zip(streams, h_ins):
                stream.sync()
                np_view = buffer_to_numpy(h_chunk, chunk_size)
                np_view[:] = np.random.rand(chunk_size).astype(np.float32) * 100

            chunk_config = _launch_config(chunk_size)

            def enqueue_all_chunks():
                """Issue every chunk's pipeline on its own stream."""
                for i, stream in enumerate(streams):
                    _enqueue_pipeline(
                        stream, chunk_config, kernel,
                        h_ins[i], d_ins[i], d_outs[i], h_outs[i],
                        scale, chunk_size,
                    )

            # Warm up
            enqueue_all_chunks()
            for stream in streams:
                stream.sync()

            # Benchmark with CUDA events (use stream 0 for timing)
            times = []
            for _ in range(n_runs):
                start_ev = device.create_event(options=event_opts)
                end_ev = device.create_event(options=event_opts)

                streams[0].record(start_ev)

                # Issue all operations - they overlap across streams
                enqueue_all_chunks()

                # Wait for all streams, record end on stream 0
                for stream in streams:
                    stream.sync()
                streams[0].record(end_ev)
                streams[0].sync()

                times.append(end_ev - start_ev)

            avg = np.mean(times)
            speedup = seq_time / avg
            print(
                f"{n_streams} streams: {avg:.2f} ms (±{np.std(times):.2f}) "
                f"- speedup: {speedup:.2f}x"
            )

            # Verification (streamed): concatenate chunks and compare to expected
            for stream in streams:
                stream.sync()
            np_out = np.concatenate(
                [buffer_to_numpy(buf, chunk_size) for buf in h_outs]
            )
            np_in = np.concatenate(
                [buffer_to_numpy(buf, chunk_size) for buf in h_ins]
            )
            if not _verify(np_out, np_in, scale):
                print(f"  Verification: FAILED for {n_streams} streams")
                all_passed = False
        finally:
            for buf in h_ins + h_outs + d_ins + d_outs:
                buf.close()
            for stream in streams:
                stream.close()

    print("\n" + "=" * 60)
    print("Key: Pinned memory + multiple streams = overlap transfers with compute")
    print("\nNote: Speedup depends on hardware characteristics. This technique")
    print("benefits most when transfer time is significant relative to compute.")
    print("=" * 60)

    # Surface verification failures through the process exit status.
    return 0 if all_passed else 1


if __name__ == "__main__":
    sys.exit(main())