# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import sys
from pathlib import Path

try:
    import numpy as np
    from cuda.core import (
        Device,
        DeviceMemoryResource,
        EventOptions,
        LaunchConfig,
        PinnedMemoryResource,
        Program,
        ProgramOptions,
        launch,
        system,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result  # noqa: E402

# CUDA kernel for the simple P2P operation: doubles every element of src
# into dst. Because dst may live on a different GPU than the one running
# the kernel, the store can be a peer-to-peer write.
SIMPLE_P2P_KERNEL = """
extern "C" __global__ void SimpleKernel(float *src, float *dst, int N)
{
    // Canonical grid-stride loop: each thread strides across the array,
    // so any grid size covers all N elements
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = gridDim.x * blockDim.x;
    for (size_t i = tid; i < N; i += stride) {
        dst[i] = src[i] * 2.0f;
    }
}
"""


def run(num_elements=1024 * 1024 * 16):
    """
    Demonstrates peer-to-peer (P2P) memory access between multiple GPUs
    using cuda.core.

    This function shows how to:
    1. Detect and verify multiple GPUs with P2P capability
    2. Enable peer access between GPUs
    3. Perform direct GPU-to-GPU memory transfers
    4. Launch kernels that access memory resident on other GPUs
    5. Measure P2P bandwidth
    6. Validate results

    Parameters
    ----------
    num_elements : int
        Number of elements in arrays (default: 16M elements = 64MB)
    """
    print("\n" + "=" * 70)
    print("simpleP2P - CUDA Python Sample")
    print("=" * 70)
    print("\nStarting...")

    # Check for multiple GPUs
    print("\nChecking for multiple GPUs...")
    num_devices = system.get_num_devices()
    print(f"CUDA-capable device count: {num_devices}")

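    # P2P needs at least two visible devices. This sample follows the CUDA
    # samples exit-code convention used throughout this file: 0 = passed,
    # 1 = failed, 2 = waived (the configuration cannot run the test, which
    # is not a failure).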
    if num_devices < 2:
        print(
            "Two or more GPUs with Peer-to-Peer access capability are "
            "required, waiving this sample."
        )
        return 2

    # Get device handles
    devices = [Device(i) for i in range(num_devices)]

    # Check for P2P capability
    print("\nChecking GPU(s) for support of peer to peer memory access...")
    p2p_capable_gpus = [-1, -1]

    for i in range(num_devices):
        p2p_capable_gpus[0] = i
        for j in range(num_devices):
            if i == j:
                continue
            # Check peer access capability in both directions using cuda.core
            i_access_j = devices[i].can_access_peer(devices[j])
            j_access_i = devices[j].can_access_peer(devices[i])
            print(
                f"> Peer access from {devices[i].name} (GPU{i}) -> "
                f"{devices[j].name} (GPU{j}): {'Yes' if i_access_j else 'No'}"
            )
            print(
                f"> Peer access from {devices[j].name} (GPU{j}) -> "
                f"{devices[i].name} (GPU{i}): {'Yes' if j_access_i else 'No'}"
            )
            if i_access_j and j_access_i:
                p2p_capable_gpus[1] = j
                break
        if p2p_capable_gpus[1] != -1:
            break

    if p2p_capable_gpus[0] == -1 or p2p_capable_gpus[1] == -1:
        print("\nTwo or more GPUs with Peer-to-Peer access capability are required.")
        print(
            "Peer to Peer access is not available amongst GPUs in the system, "
            "waiving test."
        )
        return 2

    # Use the first pair of P2P-capable GPUs detected
    gpuid = [p2p_capable_gpus[0], p2p_capable_gpus[1]]
    dev0 = devices[gpuid[0]]
    dev1 = devices[gpuid[1]]
    print(f"\nUsing GPU{gpuid[0]} ({dev0.name}) and GPU{gpuid[1]} ({dev1.name})")

    # Allocate buffers with P2P access
    buf_size = num_elements * np.dtype(np.float32).itemsize
    print(
        f"\nAllocating buffers ({int(buf_size / 1024 / 1024)}MB on "
        f"GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)..."
    )

    # Allocate on GPU 0 and grant access to GPU 1
    dev0.set_current()
    mr0 = DeviceMemoryResource(dev0)
    mr0.peer_accessible_by = [gpuid[1]]  # Grant GPU 1 access to GPU 0's memory
    g0 = mr0.allocate(buf_size)

    # Allocate on GPU 1 and grant access to GPU 0
    dev1.set_current()
    mr1 = DeviceMemoryResource(dev1)
    mr1.peer_accessible_by = [gpuid[0]]  # Grant GPU 0 access to GPU 1's memory
    g1 = mr1.allocate(buf_size)

    print(f"  Peer access enabled: GPU{gpuid[0]} <-> GPU{gpuid[1]}")
    print(
        f"  Peer access status: MR0 accessible by {mr0.peer_accessible_by}, "
        f"MR1 accessible by {mr1.peer_accessible_by}"
    )

    # Allocate pinned host memory
    pinned_mr = PinnedMemoryResource()
    h0 = pinned_mr.allocate(buf_size)
    print("  Memory allocated successfully")

    # Create streams
    stream0 = dev0.create_stream()
    stream1 = dev1.create_stream()

    try:
        # P2P bandwidth test using CUDA events for accurate GPU-side timing
        print("\nMeasuring P2P bandwidth...")
        print("  Performing 100 ping-pong copies between GPUs...")

        event_options = EventOptions(enable_timing=True)
        sync_event0 = None
        sync_event1 = None

        # Record start event on stream0
        start_event = stream0.record(options=event_options)

        for i in range(100):
            # Ping-pong copies alternate between the two streams, with
            # event waits serializing each copy against the previous one
            if i % 2 == 0:
                # Wait for the previous stream1 copy to complete (if any)
                if sync_event1 is not None:
                    stream0.wait(sync_event1)
                # Copy g0 -> g1 on stream0
                g1.copy_from(g0, stream=stream0)
                # Record event on stream0 to signal completion of this copy
                sync_event0 = stream0.record(options=EventOptions(enable_timing=False))
            else:
                # Wait for the previous stream0 copy to complete
                if sync_event0 is not None:
                    stream1.wait(sync_event0)
                # Copy g1 -> g0 on stream1
                g0.copy_from(g1, stream=stream1)
                # Record event on stream1 to signal completion of this copy
                sync_event1 = stream1.record(options=EventOptions(enable_timing=False))

        # Wait for the last stream1 copy to complete
        if sync_event1 is not None:
            stream0.wait(sync_event1)

        # Record end event on stream0 after all copies have been enqueued
        end_event = stream0.record(options=event_options)
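        # end_event.sync() blocks the host until everything recorded before
        # the event has finished on the GPU; subtracting two timing-enabled
        # events then gives the elapsed GPU time in milliseconds. As a
        # sanity check on the numbers: with the default 64MB buffers the
        # 100 copies move 100 * 64MiB = 6.25GiB over the link, so a link
        # sustaining 50 GB/s should report roughly 125 ms here.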
        end_event.sync()

        # Elapsed time in milliseconds (Event subtraction returns ms)
        time_memcpy = end_event - start_event
        # Total bytes moved / elapsed seconds, reported in GiB/s
        bandwidth = (100.0 * buf_size) / (time_memcpy / 1000.0) / (1024.0**3)
        print(f"  P2P bandwidth: {bandwidth:.2f} GB/s")

        # Prepare host buffer and initialize data
        print(f"\nPreparing host buffer and memcpy to GPU{gpuid[0]}...")

        # Create a numpy view of the pinned buffer and initialize it
        h0_array = np.from_dlpack(h0).view(dtype=np.float32)
        h0_array[:] = np.arange(num_elements, dtype=np.float32) % 4096

        # Copy to GPU 0
        dev0.set_current()
        g0.copy_from(h0, stream=stream0)
        stream0.sync()
        print("  Data initialized and copied to GPU")

        # Compile the kernel separately for each GPU's architecture
        print("\nCompiling CUDA kernel...")
        dev0.set_current()
        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")
        prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options)
        mod0 = prog.compile("cubin")
        kernel0 = mod0.get_kernel("SimpleKernel")

        dev1.set_current()
        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")
        prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options)
        mod1 = prog.compile("cubin")
        kernel1 = mod1.get_kernel("SimpleKernel")
        print("  Kernels compiled successfully")

        # Launch configuration
        threads = 512
        blocks = (num_elements + threads - 1) // threads
        config = LaunchConfig(grid=blocks, block=threads)

        # Run kernel on GPU 1, reading from GPU 0, writing to GPU 1
        print(
            f"\nRun kernel on GPU{gpuid[1]}, taking source data from "
            f"GPU{gpuid[0]} and writing to GPU{gpuid[1]}..."
        )
        dev1.set_current()
        launch(stream1, config, kernel1, g0, g1, np.int32(num_elements))
        stream1.sync()
        print("  Kernel execution complete")

        # Run kernel on GPU 0, reading from GPU 1, writing to GPU 0
        print(
            f"\nRun kernel on GPU{gpuid[0]}, taking source data from "
            f"GPU{gpuid[1]} and writing to GPU{gpuid[0]}..."
        )
        dev0.set_current()
        launch(stream0, config, kernel0, g1, g0, np.int32(num_elements))
        stream0.sync()
        print("  Kernel execution complete")

        # Copy data back to host and verify
        print(f"\nCopy data back to host from GPU{gpuid[0]} and verify results...")
        g0.copy_to(h0, stream=stream0)
        stream0.sync()

        # Verify results
        print("\nChecking results...")
        print(f"  Comparing {num_elements:,} elements...")

        # Input data goes through two kernels, each multiplying by 2.0.
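        # Worked example: element 5 starts as 5.0 (5 % 4096), becomes 10.0
        # after the kernel on GPU1 and 20.0 after the kernel on GPU0, so
        # every element should equal 4x its initial value.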
        expected = (np.arange(num_elements, dtype=np.float32) % 4096) * 4.0

        # Use utility function for verification (handles both numpy and cupy arrays)
        if verify_array_result(h0_array, expected, rtol=1e-5, atol=1e-6, verbose=True):
            print("  [PASS] Validation PASSED")
            success = True
        else:
            print("  [FAIL] Validation FAILED")
            # Show the first few mismatches for debugging
            errors = np.where(~np.isclose(h0_array, expected, rtol=1e-5, atol=1e-6))[0]
            print(f"  Number of mismatches: {len(errors)}")
            for idx in errors[:10]:
                print(
                    f"    Error @ element {idx}: got {h0_array[idx]}, "
                    f"expected {expected[idx]}"
                )
            success = False

        # Disable peer access
        print("\nDisabling peer access...")
        mr0.peer_accessible_by = []  # Revoke GPU 1's access to GPU 0's memory
        mr1.peer_accessible_by = []  # Revoke GPU 0's access to GPU 1's memory
        print(
            f"  Peer access revoked: MR0 accessible by {mr0.peer_accessible_by}, "
            f"MR1 accessible by {mr1.peer_accessible_by}"
        )

        print("\n" + "=" * 70)
        if success:
            print("simpleP2P completed successfully!")
        else:
            print("simpleP2P FAILED!")
        print("=" * 70 + "\n")

        return 0 if success else 1

    finally:
        # Cleanup streams and buffers
        print("Shutting down...")
        g0.close()
        g1.close()
        h0.close()
        stream0.close()
        stream1.close()


def main():
    """Main entry point with argument parsing."""
    parser = argparse.ArgumentParser(
        description=(
            "Demonstrate peer-to-peer (P2P) memory access between "
            "multiple GPUs with CUDA"
        )
    )
    parser.add_argument(
        "--num_elements",
        type=int,
        default=1024 * 1024 * 16,  # 16M elements = 64MB
        help="Number of elements in arrays (default: 16777216 = 64MB)",
    )
    args = parser.parse_args()

    # Validate arguments (exit directly so the shell sees a nonzero status;
    # the return value of main() is not checked at the call site)
    if args.num_elements <= 0:
        print("Error: num_elements must be positive")
        sys.exit(1)

    try:
        exit_code = run(num_elements=args.num_elements)
        sys.exit(exit_code)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
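# Example invocations (the filename is illustrative; use whatever name this
# sample is saved under in your checkout):
#
#   python simple_p2p.py                          # 16M elements (64MB), the default
#   python simple_p2p.py --num_elements 4194304   # 4M elements (16MB)
#
# Exit codes: 0 = validation passed, 1 = validation failed or error,
# 2 = waived (fewer than two P2P-capable GPUs in the system).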