# Source provenance (repository-viewer residue, preserved as a comment):
# Dheemanth aeab82ff30
# CUDA 13.2 samples update (#432)
# - Added Python samples for CUDA Python 1.0 release
# - Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
# 2026-05-13 17:13:18 -05:00

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# distribution and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import sys
from pathlib import Path

# Third-party packages are imported inside a try block so a missing
# dependency produces an actionable install hint instead of a traceback.
try:
    import numpy as np
    from cuda.core import (
        Device,
        DeviceMemoryResource,
        EventOptions,
        LaunchConfig,
        PinnedMemoryResource,
        Program,
        ProgramOptions,
        launch,
        system,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)

# Add parent directory to path to import utilities
# (shared helpers live in a sibling "Utilities" directory, not a package)
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result  # noqa: E402
# CUDA kernel for simple P2P operation
SIMPLE_P2P_KERNEL = """
extern "C" __global__
void SimpleKernel(float *src, float *dst, int N) {
// Grid-stride loop pattern for canonical CUDA kernel
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = gridDim.x * blockDim.x;
for (size_t i = tid; i < N; i += stride) {
dst[i] = src[i] * 2.0f;
}
}
"""
def run(num_elements=1024 * 1024 * 16):
    """
    Demonstrates peer-to-peer (P2P) memory access between multiple GPUs using cuda.core.

    This function shows how to:
    1. Detect and verify multiple GPUs with P2P capability
    2. Enable peer access between GPUs
    3. Perform direct GPU-to-GPU memory transfers
    4. Launch kernels that access memory from other GPUs
    5. Measure P2P bandwidth
    6. Validate results

    Parameters
    ----------
    num_elements : int
        Number of elements in arrays (default: 16M elements = 64MB)

    Returns
    -------
    int
        0 on success, 1 on result-validation failure, 2 when the sample is
        waived (fewer than two GPUs, or no mutually P2P-capable GPU pair).
    """
    print("\n" + "=" * 70)
    print("simpleP2P - CUDA Python Sample")
    print("=" * 70)
    print("\nStarting...")

    # Check for multiple GPUs
    print("\nChecking for multiple GPUs...")
    num_devices = system.get_num_devices()
    print(f"CUDA-capable device count: {num_devices}")

    if num_devices < 2:
        print(
            "Two or more GPUs with Peer-to-Peer access capability are "
            "required, waiving this sample."
        )
        return 2

    # Get device properties
    devices = [Device(i) for i in range(num_devices)]

    # Check for P2P capability: scan for the first pair (i, j) whose peer
    # access works in BOTH directions.
    print("\nChecking GPU(s) for support of peer to peer memory access...")
    p2p_capable_gpus = [-1, -1]

    for i in range(num_devices):
        p2p_capable_gpus[0] = i
        for j in range(num_devices):
            if i == j:
                continue
            # Check peer access capability using cuda.core
            i_access_j = devices[i].can_access_peer(devices[j])
            j_access_i = devices[j].can_access_peer(devices[i])
            print(
                f"> Peer access from {devices[i].name} (GPU{i}) -> "
                f"{devices[j].name} (GPU{j}): {'Yes' if i_access_j else 'No'}"
            )
            print(
                f"> Peer access from {devices[j].name} (GPU{j}) -> "
                f"{devices[i].name} (GPU{i}): {'Yes' if j_access_i else 'No'}"
            )
            # Only a bidirectional-capable pair is accepted.
            if i_access_j and j_access_i:
                p2p_capable_gpus[1] = j
                break
        if p2p_capable_gpus[1] != -1:
            break

    # NOTE(review): p2p_capable_gpus[0] is overwritten on every outer
    # iteration, so after the loop only the [1] == -1 check is decisive.
    if p2p_capable_gpus[0] == -1 or p2p_capable_gpus[1] == -1:
        print("\nTwo or more GPUs with Peer-to-Peer access capability are required.")
        print(
            "Peer to Peer access is not available amongst GPUs in the system, "
            "waiving test."
        )
        return 2

    # Use first pair of P2P capable GPUs detected
    gpuid = [p2p_capable_gpus[0], p2p_capable_gpus[1]]
    dev0 = devices[gpuid[0]]
    dev1 = devices[gpuid[1]]
    print(f"\nUsing GPU{gpuid[0]} ({dev0.name}) and GPU{gpuid[1]} ({dev1.name})")

    # Allocate buffers with P2P access (float32 => 4 bytes per element)
    buf_size = num_elements * np.dtype(np.float32).itemsize
    print(
        f"\nAllocating buffers ({int(buf_size / 1024 / 1024)}MB on "
        f"GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)..."
    )

    # Allocate on GPU 0 and grant access to GPU 1
    dev0.set_current()
    mr0 = DeviceMemoryResource(dev0)
    mr0.peer_accessible_by = [gpuid[1]]  # Grant GPU 1 access to GPU 0's memory
    g0 = mr0.allocate(buf_size)

    # Allocate on GPU 1 and grant access to GPU 0
    dev1.set_current()
    mr1 = DeviceMemoryResource(dev1)
    mr1.peer_accessible_by = [gpuid[0]]  # Grant GPU 0 access to GPU 1's memory
    g1 = mr1.allocate(buf_size)

    print(f" Peer access enabled: GPU{gpuid[0]} <-> GPU{gpuid[1]}")
    print(
        f" Peer access status: MR0 accessible by {mr0.peer_accessible_by}, "
        f"MR1 accessible by {mr1.peer_accessible_by}"
    )

    # Allocate pinned host memory (required for fast async host<->device copies)
    pinned_mr = PinnedMemoryResource()
    h0 = pinned_mr.allocate(buf_size)
    print(" Memory allocated successfully")

    # Create streams (one per device)
    stream0 = dev0.create_stream()
    stream1 = dev1.create_stream()

    try:
        # P2P bandwidth test using CUDA events for accurate GPU-side timing
        print("\nMeasuring P2P bandwidth...")
        print(" Performing 100 ping-pong copies between GPUs...")

        event_options = EventOptions(enable_timing=True)
        sync_event0 = None
        sync_event1 = None

        # Record start event on stream0
        start_event = stream0.record(options=event_options)

        for i in range(100):
            # Ping-pong copy between GPUs with explicit event-based synchronization
            if i % 2 == 0:
                # Wait for previous stream1 copy to complete (if any)
                if sync_event1 is not None:
                    stream0.wait(sync_event1)
                # Copy g0 -> g1 on stream0
                g1.copy_from(g0, stream=stream0)
                # Record event on stream0 to signal completion of this copy
                sync_event0 = stream0.record(options=EventOptions(enable_timing=False))
            else:
                # Wait for previous stream0 copy to complete
                if sync_event0 is not None:
                    stream1.wait(sync_event0)
                # Copy g1 -> g0 on stream1
                g0.copy_from(g1, stream=stream1)
                # Record event on stream1 to signal completion of this copy
                sync_event1 = stream1.record(options=EventOptions(enable_timing=False))

        # Wait for last stream1 copy to complete
        if sync_event1 is not None:
            stream0.wait(sync_event1)

        # Record end event on stream0 after all copies have been enqueued
        end_event = stream0.record(options=event_options)
        end_event.sync()

        # Elapsed time in milliseconds (using subtraction operator)
        time_memcpy = end_event - start_event
        # GB/s = (100 copies * buf_size bytes) / elapsed seconds / 2**30
        bandwidth = (1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / (1024.0**3)
        print(f" P2P bandwidth: {bandwidth:.2f} GB/s")

        # Prepare host buffer and initialize data
        print(f"\nPreparing host buffer and memcpy to GPU{gpuid[0]}...")
        # Create numpy view and initialize (zero-copy view over pinned memory
        # via DLPack; pattern is element index modulo 4096)
        h0_array = np.from_dlpack(h0).view(dtype=np.float32)
        h0_array[:] = (np.arange(num_elements, dtype=np.float32) % 4096).astype(
            np.float32
        )

        # Copy to GPU 0
        dev0.set_current()
        g0.copy_from(h0, stream=stream0)
        stream0.sync()
        print(" Data initialized and copied to GPU")

        # Compile kernel for both GPUs (separately, so each gets a cubin for
        # its own SM architecture)
        print("\nCompiling CUDA kernel...")
        dev0.set_current()
        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")
        prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options)
        mod0 = prog.compile("cubin")
        kernel0 = mod0.get_kernel("SimpleKernel")

        dev1.set_current()
        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")
        prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options)
        mod1 = prog.compile("cubin")
        kernel1 = mod1.get_kernel("SimpleKernel")
        print(" Kernels compiled successfully")

        # Launch configuration (ceiling division so every element is covered)
        threads = 512
        blocks = (num_elements + threads - 1) // threads
        config = LaunchConfig(grid=blocks, block=threads)

        # Run kernel on GPU 1, reading from GPU 0, writing to GPU 1
        print(
            f"\nRun kernel on GPU{gpuid[1]}, taking source data from "
            f"GPU{gpuid[0]} and writing to GPU{gpuid[1]}..."
        )
        dev1.set_current()
        launch(stream1, config, kernel1, g0, g1, np.int32(num_elements))
        stream1.sync()
        print(" Kernel execution complete")

        # Run kernel on GPU 0, reading from GPU 1, writing to GPU 0
        print(
            f"\nRun kernel on GPU{gpuid[0]}, taking source data from "
            f"GPU{gpuid[1]} and writing to GPU{gpuid[0]}..."
        )
        dev0.set_current()
        launch(stream0, config, kernel0, g1, g0, np.int32(num_elements))
        stream0.sync()
        print(" Kernel execution complete")

        # Copy data back to host and verify
        print(f"\nCopy data back to host from GPU{gpuid[0]} and verify results...")
        g0.copy_to(h0, stream=stream0)
        stream0.sync()

        # Verify results
        print("\nChecking results...")
        print(f" Comparing {num_elements:,} elements...")

        # Input data goes through two kernels, each multiplying by 2.0.
        expected = (np.arange(num_elements, dtype=np.float32) % 4096) * 4.0

        # Use utility function for verification (handles both numpy and cupy arrays)
        if verify_array_result(h0_array, expected, rtol=1e-5, atol=1e-6, verbose=True):
            print(" [PASS] Validation PASSED")
            success = True
        else:
            print(" [FAIL] Validation FAILED")
            # Show first few errors for debugging
            errors = np.where(~np.isclose(h0_array, expected, rtol=1e-5, atol=1e-6))[0]
            print(f" Number of mismatches: {len(errors)}")
            for idx in errors[:10]:
                print(
                    f" Error @ element {idx}: got {h0_array[idx]}, "
                    f"expected {expected[idx]}"
                )
            success = False

        # Disable peer access
        print("\nDisabling peer access...")
        mr0.peer_accessible_by = []  # Revoke GPU 1's access to GPU 0's memory
        mr1.peer_accessible_by = []  # Revoke GPU 0's access to GPU 1's memory
        print(
            f" Peer access revoked: MR0 accessible by {mr0.peer_accessible_by}, "
            f"MR1 accessible by {mr1.peer_accessible_by}"
        )

        print("\n" + "=" * 70)
        if success:
            print("simpleP2P completed successfully!")
        else:
            print("simpleP2P FAILED!")
        print("=" * 70 + "\n")

        return 0 if success else 1
    finally:
        # Cleanup streams and buffers
        # NOTE(review): only the streams are closed explicitly; g0/g1/h0 and
        # the memory resources are presumably released when garbage-collected
        # — confirm against the cuda.core memory-resource documentation.
        print("Shutting down...")
        stream0.close()
        stream1.close()
def main():
    """Parse command-line arguments, run the sample, and exit the process.

    Always terminates via ``sys.exit``: exit code 0 on success, 1 on
    failure or invalid arguments, 2 when the sample is waived by run().
    """
    parser = argparse.ArgumentParser(
        description=(
            "Demonstrate peer-to-peer (P2P) memory access between "
            "multiple GPUs with CUDA"
        )
    )
    parser.add_argument(
        "--num_elements",
        type=int,
        default=1024 * 1024 * 16,  # 16M elements = 64MB
        help="Number of elements in arrays (default: 16777216 = 64MB)",
    )
    args = parser.parse_args()

    # Validate arguments
    if args.num_elements <= 0:
        print("Error: num_elements must be positive")
        # BUG FIX: this path previously did `return 1`, but the __main__
        # guard calls main() without using the return value, so the process
        # exited 0 despite the error. Exit explicitly like the other paths.
        sys.exit(1)

    try:
        exit_code = run(num_elements=args.num_elements)
        sys.exit(exit_code)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit calls
        # above are not swallowed by this handler.
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
if __name__ == "__main__":
    # Propagate any value main() returns as the process exit code instead of
    # discarding it (main() normally terminates itself via sys.exit, in which
    # case the SystemExit simply propagates through this call).
    sys.exit(main())