# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import ctypes
import sys
from pathlib import Path

try:
    import numpy as np
    from cuda.bindings import runtime as cuda_rt
    from cuda.core import (
        Device,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)

# Add the samples' Utilities directory (three levels up) to the import path.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))


def _mapped_host_alloc(num_floats, stream):
    """
    Allocate page-locked host memory mapped for device access; return
    (host_ptr, device_ptr) for CPU views and for ``launch()``.
    """
    nbytes = int(num_floats) * np.dtype(np.float32).itemsize
    if nbytes <= 0:
        return 0, 0
    err, h_ptr = cuda_rt.cudaHostAlloc(
        nbytes, cuda_rt.cudaHostAllocMapped | cuda_rt.cudaHostAllocPortable
    )
    if err != cuda_rt.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaHostAlloc failed: {err}")
    err, d_ptr = cuda_rt.cudaHostGetDevicePointer(h_ptr, 0)
    if err != cuda_rt.cudaError_t.cudaSuccess:
        cuda_rt.cudaFreeHost(h_ptr)
        raise RuntimeError(f"cudaHostGetDevicePointer failed: {err}")
    # Ensure prior work on this stream is visible before the host fills buffers.
    if stream is not None:
        stream.sync()
    return h_ptr, d_ptr


def _float_view(host_ptr, num_floats):
    """Return a NumPy float32 view over mapped host memory (no copy)."""
    return np.frombuffer(
        (ctypes.c_float * num_floats).from_address(host_ptr),
        dtype=np.float32,
        count=num_floats,
    )


# CUDA C++: vector add with a grid-stride loop so any grid size covers all N.
VECTOR_ADD_KERNEL = """
extern "C" __global__ void vectorAddGPU(float* c, const float* a, const float* b, int N)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = gridDim.x * blockDim.x;
    for (size_t i = tid; i < (size_t)N; i += stride) {
        c[i] = a[i] + b[i];
    }
}
"""
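
# The classic C version of this sample checks the device's canMapHostMemory
# attribute before mapping host memory. Below is a minimal sketch of the same
# check via cuda.bindings.runtime; the helper name is illustrative and not
# part of the original sample.
def _device_can_map_host_memory(device_id=0):
    """Return True if ``device_id`` can map host memory into its address space."""
    err, can_map = cuda_rt.cudaDeviceGetAttribute(
        cuda_rt.cudaDeviceAttr.cudaDevAttrCanMapHostMemory, device_id
    )
    if err != cuda_rt.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaDeviceGetAttribute failed: {err}")
    return bool(can_map)
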

def run(num_elements=1048576):
    """
    Zero-copy vector add: map host memory, launch kernel with device
    pointers, validate on CPU.

    This function shows how to:
    1. Allocate pinned (page-locked) host memory
    2. Map host memory into GPU address space (zero-copy)
    3. Access host memory directly from GPU kernel
    4. Validate results

    Parameters
    ----------
    num_elements : int
        Number of elements in vectors (default: 1048576)
    """
    print("\n" + "=" * 70)
    print("simpleZeroCopy - CUDA Python Sample")
    print("=" * 70)

    # Initialize device
    device = Device()
    device.set_current()
    major, minor = device.compute_capability

    print("\nDevice Information:")
    print(f"  Name: {device.name}")
    print(f"  Compute Capability: {major}.{minor}")

    # Create stream
    stream = device.create_stream()
    mapped_host_ptrs = []

    try:
        print(
            "\n> Memory: mapped pinned host "
            "(cudaHostAlloc + cudaHostGetDevicePointer)"
        )

        print("\nCompiling CUDA kernel...")
        program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        prog = Program(VECTOR_ADD_KERNEL, code_type="c++", options=program_options)
        mod = prog.compile("cubin")
        kernel = mod.get_kernel("vectorAddGPU")
        print("  Kernel compiled successfully")

        bytes_total = num_elements * np.dtype(np.float32).itemsize
        print("\nAllocating memory:")
        print(f"  Vector size: {num_elements:,} elements")
        print(f"  Memory per vector: {bytes_total / (1024**2):.2f} MB")
        print(f"  Total memory: {3 * bytes_total / (1024**2):.2f} MB")

        print("\n> Allocating mapped pinned host memory...")
        h_a, d_a = _mapped_host_alloc(num_elements, stream)
        mapped_host_ptrs.append(h_a)
        h_b, d_b = _mapped_host_alloc(num_elements, stream)
        mapped_host_ptrs.append(h_b)
        h_c, d_c = _mapped_host_alloc(num_elements, stream)
        mapped_host_ptrs.append(h_c)

        a = _float_view(h_a, num_elements)
        b = _float_view(h_b, num_elements)
        c = _float_view(h_c, num_elements)
        print("  Mapped host memory allocated successfully")

        print("\n> Initializing vectors on host...")
        rng = np.random.default_rng(42)
        a[:] = rng.random(num_elements).astype(np.float32)
        b[:] = rng.random(num_elements).astype(np.float32)
        c[:] = 0

        print("> Computing reference result on CPU...")
        reference = a + b

        print("\n> Launching vectorAddGPU kernel...")
        print("  Note: GPU accesses host memory directly (zero-copy)")
        block_size = 256
        grid_size = (num_elements + block_size - 1) // block_size
        config = LaunchConfig(grid=grid_size, block=block_size)

        # Pass device pointers from cudaHostGetDevicePointer, not raw host VAs.
        launch(
            stream,
            config,
            kernel,
            int(d_c),
            int(d_a),
            int(d_b),
            np.int32(num_elements),
        )
        stream.sync()
        print("  Kernel execution complete")

        print("\n> Checking results from vectorAddGPU()...")
        print(f"  Comparing {num_elements:,} elements...")
        # ``c`` is a host view of the same buffer; no cudaMemcpy D2H is needed.
        if np.allclose(c, reference, rtol=1e-5, atol=1e-6):
            error_norm = np.linalg.norm(c - reference)
            ref_norm = np.linalg.norm(reference)
            relative_error = error_norm / ref_norm
            print(f"  Relative error: {relative_error:.6e}")
            print("  Validation PASSED")
            success = True
        else:
            max_error = np.max(np.abs(c - reference))
            print(f"  Max error: {max_error}")
            print("  Validation FAILED")
            success = False

        print("\n" + "=" * 70)
        if success:
            print("simpleZeroCopy completed successfully!")
        else:
            print("simpleZeroCopy FAILED!")
        print("=" * 70 + "\n")

        return 0 if success else 1

    finally:
        for h in reversed(mapped_host_ptrs):
            if h:
                cuda_rt.cudaFreeHost(h)
        stream.close()
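
# For contrast with the zero-copy path in run(), here is a minimal sketch of
# the traditional explicit-transfer flow that mapped memory avoids. The helper
# name is illustrative and not part of the original sample; error handling is
# elided for brevity.
def _explicit_copy_vector_add_sketch(a, b, kernel, stream, config):
    """Hypothetical helper: the same vector add via cudaMalloc + cudaMemcpy."""
    nbytes = a.nbytes
    _, d_a = cuda_rt.cudaMalloc(nbytes)
    _, d_b = cuda_rt.cudaMalloc(nbytes)
    _, d_c = cuda_rt.cudaMalloc(nbytes)
    h2d = cuda_rt.cudaMemcpyKind.cudaMemcpyHostToDevice
    d2h = cuda_rt.cudaMemcpyKind.cudaMemcpyDeviceToHost
    # Two explicit host-to-device copies before the launch...
    cuda_rt.cudaMemcpy(d_a, a.ctypes.data, nbytes, h2d)
    cuda_rt.cudaMemcpy(d_b, b.ctypes.data, nbytes, h2d)
    launch(stream, config, kernel, int(d_c), int(d_a), int(d_b), np.int32(a.size))
    stream.sync()
    # ...and one device-to-host copy afterwards; zero-copy needs none of these.
    c = np.empty_like(a)
    cuda_rt.cudaMemcpy(c.ctypes.data, d_c, nbytes, d2h)
    for p in (d_a, d_b, d_c):
        cuda_rt.cudaFree(p)
    return c
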

def main():
    """Parse CLI, call ``run()``, and exit with validation status."""
    parser = argparse.ArgumentParser(
        description="Demonstrate zero-copy memory access with CUDA",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python simpleZeroCopy.py
  python simpleZeroCopy.py --num_elements 2097152

What is Zero-Copy Memory?
  Zero-copy allows the GPU to directly access host (CPU) memory without
  explicit memory transfers. This is useful for:
  - Small data that doesn't benefit from explicit transfers
  - Data that is accessed infrequently
  - Integrated GPUs that share memory with the CPU

Trade-offs:
  - Slower than device memory (limited by PCIe bandwidth)
  - No explicit transfers needed (simpler code)
  - Good for discrete GPUs with small data
  - Excellent for integrated GPUs (e.g., Tegra)
        """,
    )
    parser.add_argument(
        "--num_elements",
        type=int,
        default=1048576,
        help="Number of elements in vectors (default: 1048576)",
    )

    args = parser.parse_args()

    if args.num_elements <= 0:
        print("Error: num_elements must be positive")
        sys.exit(1)

    try:
        exit_code = run(num_elements=args.num_elements)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        exit_code = 1

    sys.exit(exit_code)


if __name__ == "__main__":
    main()