mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-05-14 14:06:53 +08:00
- Added Python samples for CUDA Python 1.0 release - Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
378 lines · 13 KiB · Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import numpy as np
|
|
from cuda.core import (
|
|
Device,
|
|
DeviceMemoryResource,
|
|
EventOptions,
|
|
LaunchConfig,
|
|
PinnedMemoryResource,
|
|
Program,
|
|
ProgramOptions,
|
|
launch,
|
|
system,
|
|
)
|
|
except ImportError as e:
|
|
print(f"Error: Required package not found: {e}")
|
|
print("Please install from requirements.txt:")
|
|
print(" pip install -r requirements.txt")
|
|
sys.exit(1)
|
|
|
|
# Add parent directory to path to import utilities
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
|
|
from cuda_samples_utils import verify_array_result # noqa: E402
|
|
|
|
# CUDA kernel for simple P2P operation
|
|
SIMPLE_P2P_KERNEL = """
|
|
extern "C" __global__
|
|
void SimpleKernel(float *src, float *dst, int N) {
|
|
// Grid-stride loop pattern for canonical CUDA kernel
|
|
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
size_t stride = gridDim.x * blockDim.x;
|
|
|
|
for (size_t i = tid; i < N; i += stride) {
|
|
dst[i] = src[i] * 2.0f;
|
|
}
|
|
}
|
|
"""
|
|
|
|
|
|
def run(num_elements=1024 * 1024 * 16):
|
|
"""
|
|
Demonstrates peer-to-peer (P2P) memory access between multiple GPUs using cuda.core.
|
|
|
|
This function shows how to:
|
|
1. Detect and verify multiple GPUs with P2P capability
|
|
2. Enable peer access between GPUs
|
|
3. Perform direct GPU-to-GPU memory transfers
|
|
4. Launch kernels that access memory from other GPUs
|
|
5. Measure P2P bandwidth
|
|
6. Validate results
|
|
|
|
Parameters
|
|
----------
|
|
num_elements : int
|
|
Number of elements in arrays (default: 16M elements = 64MB)
|
|
"""
|
|
|
|
print("\n" + "=" * 70)
|
|
print("simpleP2P - CUDA Python Sample")
|
|
print("=" * 70)
|
|
print("\nStarting...")
|
|
|
|
# Check for multiple GPUs
|
|
print("\nChecking for multiple GPUs...")
|
|
num_devices = system.get_num_devices()
|
|
print(f"CUDA-capable device count: {num_devices}")
|
|
|
|
if num_devices < 2:
|
|
print(
|
|
"Two or more GPUs with Peer-to-Peer access capability are "
|
|
"required, waiving this sample."
|
|
)
|
|
return 2
|
|
|
|
# Get device properties
|
|
devices = [Device(i) for i in range(num_devices)]
|
|
|
|
# Check for P2P capability
|
|
print("\nChecking GPU(s) for support of peer to peer memory access...")
|
|
|
|
p2p_capable_gpus = [-1, -1]
|
|
|
|
for i in range(num_devices):
|
|
p2p_capable_gpus[0] = i
|
|
for j in range(num_devices):
|
|
if i == j:
|
|
continue
|
|
|
|
# Check peer access capability using cuda.core
|
|
i_access_j = devices[i].can_access_peer(devices[j])
|
|
j_access_i = devices[j].can_access_peer(devices[i])
|
|
|
|
print(
|
|
f"> Peer access from {devices[i].name} (GPU{i}) -> "
|
|
f"{devices[j].name} (GPU{j}): {'Yes' if i_access_j else 'No'}"
|
|
)
|
|
print(
|
|
f"> Peer access from {devices[j].name} (GPU{j}) -> "
|
|
f"{devices[i].name} (GPU{i}): {'Yes' if j_access_i else 'No'}"
|
|
)
|
|
|
|
if i_access_j and j_access_i:
|
|
p2p_capable_gpus[1] = j
|
|
break
|
|
|
|
if p2p_capable_gpus[1] != -1:
|
|
break
|
|
|
|
if p2p_capable_gpus[0] == -1 or p2p_capable_gpus[1] == -1:
|
|
print("\nTwo or more GPUs with Peer-to-Peer access capability are required.")
|
|
print(
|
|
"Peer to Peer access is not available amongst GPUs in the system, "
|
|
"waiving test."
|
|
)
|
|
return 2
|
|
|
|
# Use first pair of P2P capable GPUs detected
|
|
gpuid = [p2p_capable_gpus[0], p2p_capable_gpus[1]]
|
|
dev0 = devices[gpuid[0]]
|
|
dev1 = devices[gpuid[1]]
|
|
|
|
print(f"\nUsing GPU{gpuid[0]} ({dev0.name}) and GPU{gpuid[1]} ({dev1.name})")
|
|
|
|
# Allocate buffers with P2P access
|
|
buf_size = num_elements * np.dtype(np.float32).itemsize
|
|
print(
|
|
f"\nAllocating buffers ({int(buf_size / 1024 / 1024)}MB on "
|
|
f"GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)..."
|
|
)
|
|
|
|
# Allocate on GPU 0 and grant access to GPU 1
|
|
dev0.set_current()
|
|
mr0 = DeviceMemoryResource(dev0)
|
|
mr0.peer_accessible_by = [gpuid[1]] # Grant GPU 1 access to GPU 0's memory
|
|
g0 = mr0.allocate(buf_size)
|
|
|
|
# Allocate on GPU 1 and grant access to GPU 0
|
|
dev1.set_current()
|
|
mr1 = DeviceMemoryResource(dev1)
|
|
mr1.peer_accessible_by = [gpuid[0]] # Grant GPU 0 access to GPU 1's memory
|
|
g1 = mr1.allocate(buf_size)
|
|
|
|
print(f" Peer access enabled: GPU{gpuid[0]} <-> GPU{gpuid[1]}")
|
|
print(
|
|
f" Peer access status: MR0 accessible by {mr0.peer_accessible_by}, "
|
|
f"MR1 accessible by {mr1.peer_accessible_by}"
|
|
)
|
|
|
|
# Allocate pinned host memory
|
|
pinned_mr = PinnedMemoryResource()
|
|
h0 = pinned_mr.allocate(buf_size)
|
|
|
|
print(" Memory allocated successfully")
|
|
|
|
# Create streams
|
|
stream0 = dev0.create_stream()
|
|
stream1 = dev1.create_stream()
|
|
|
|
try:
|
|
# P2P bandwidth test using CUDA events for accurate GPU-side timing
|
|
print("\nMeasuring P2P bandwidth...")
|
|
print(" Performing 100 ping-pong copies between GPUs...")
|
|
|
|
event_options = EventOptions(enable_timing=True)
|
|
sync_event0 = None
|
|
sync_event1 = None
|
|
|
|
# Record start event on stream0
|
|
start_event = stream0.record(options=event_options)
|
|
|
|
for i in range(100):
|
|
# Ping-pong copy between GPUs with explicit event-based synchronization
|
|
if i % 2 == 0:
|
|
# Wait for previous stream1 copy to complete (if any)
|
|
if sync_event1 is not None:
|
|
stream0.wait(sync_event1)
|
|
# Copy g0 -> g1 on stream0
|
|
g1.copy_from(g0, stream=stream0)
|
|
# Record event on stream0 to signal completion of this copy
|
|
sync_event0 = stream0.record(options=EventOptions(enable_timing=False))
|
|
else:
|
|
# Wait for previous stream0 copy to complete
|
|
if sync_event0 is not None:
|
|
stream1.wait(sync_event0)
|
|
# Copy g1 -> g0 on stream1
|
|
g0.copy_from(g1, stream=stream1)
|
|
# Record event on stream1 to signal completion of this copy
|
|
sync_event1 = stream1.record(options=EventOptions(enable_timing=False))
|
|
|
|
# Wait for last stream1 copy to complete
|
|
if sync_event1 is not None:
|
|
stream0.wait(sync_event1)
|
|
|
|
# Record end event on stream0 after all copies have been enqueued
|
|
end_event = stream0.record(options=event_options)
|
|
end_event.sync()
|
|
|
|
# Elapsed time in milliseconds (using subtraction operator)
|
|
time_memcpy = end_event - start_event
|
|
|
|
bandwidth = (1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / (1024.0**3)
|
|
print(f" P2P bandwidth: {bandwidth:.2f} GB/s")
|
|
|
|
# Prepare host buffer and initialize data
|
|
print(f"\nPreparing host buffer and memcpy to GPU{gpuid[0]}...")
|
|
|
|
# Create numpy view and initialize
|
|
h0_array = np.from_dlpack(h0).view(dtype=np.float32)
|
|
h0_array[:] = (np.arange(num_elements, dtype=np.float32) % 4096).astype(
|
|
np.float32
|
|
)
|
|
|
|
# Copy to GPU 0
|
|
dev0.set_current()
|
|
g0.copy_from(h0, stream=stream0)
|
|
stream0.sync()
|
|
|
|
print(" Data initialized and copied to GPU")
|
|
|
|
# Compile kernel for both GPUs
|
|
print("\nCompiling CUDA kernel...")
|
|
dev0.set_current()
|
|
program_options = ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")
|
|
prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options)
|
|
mod0 = prog.compile("cubin")
|
|
kernel0 = mod0.get_kernel("SimpleKernel")
|
|
|
|
dev1.set_current()
|
|
program_options = ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")
|
|
prog = Program(SIMPLE_P2P_KERNEL, code_type="c++", options=program_options)
|
|
mod1 = prog.compile("cubin")
|
|
kernel1 = mod1.get_kernel("SimpleKernel")
|
|
|
|
print(" Kernels compiled successfully")
|
|
|
|
# Launch configuration
|
|
threads = 512
|
|
blocks = (num_elements + threads - 1) // threads
|
|
config = LaunchConfig(grid=blocks, block=threads)
|
|
|
|
# Run kernel on GPU 1, reading from GPU 0, writing to GPU 1
|
|
print(
|
|
f"\nRun kernel on GPU{gpuid[1]}, taking source data from "
|
|
f"GPU{gpuid[0]} and writing to GPU{gpuid[1]}..."
|
|
)
|
|
dev1.set_current()
|
|
launch(stream1, config, kernel1, g0, g1, np.int32(num_elements))
|
|
stream1.sync()
|
|
print(" Kernel execution complete")
|
|
|
|
# Run kernel on GPU 0, reading from GPU 1, writing to GPU 0
|
|
print(
|
|
f"\nRun kernel on GPU{gpuid[0]}, taking source data from "
|
|
f"GPU{gpuid[1]} and writing to GPU{gpuid[0]}..."
|
|
)
|
|
dev0.set_current()
|
|
launch(stream0, config, kernel0, g1, g0, np.int32(num_elements))
|
|
stream0.sync()
|
|
print(" Kernel execution complete")
|
|
|
|
# Copy data back to host and verify
|
|
print(f"\nCopy data back to host from GPU{gpuid[0]} and verify results...")
|
|
g0.copy_to(h0, stream=stream0)
|
|
stream0.sync()
|
|
|
|
# Verify results
|
|
print("\nChecking results...")
|
|
print(f" Comparing {num_elements:,} elements...")
|
|
|
|
# Input data goes through two kernels, each multiplying by 2.0.
|
|
expected = (np.arange(num_elements, dtype=np.float32) % 4096) * 4.0
|
|
|
|
# Use utility function for verification (handles both numpy and cupy arrays)
|
|
if verify_array_result(h0_array, expected, rtol=1e-5, atol=1e-6, verbose=True):
|
|
print(" [PASS] Validation PASSED")
|
|
success = True
|
|
else:
|
|
print(" [FAIL] Validation FAILED")
|
|
# Show first few errors for debugging
|
|
errors = np.where(~np.isclose(h0_array, expected, rtol=1e-5, atol=1e-6))[0]
|
|
print(f" Number of mismatches: {len(errors)}")
|
|
for idx in errors[:10]:
|
|
print(
|
|
f" Error @ element {idx}: got {h0_array[idx]}, "
|
|
f"expected {expected[idx]}"
|
|
)
|
|
success = False
|
|
|
|
# Disable peer access
|
|
print("\nDisabling peer access...")
|
|
mr0.peer_accessible_by = [] # Revoke GPU 1's access to GPU 0's memory
|
|
mr1.peer_accessible_by = [] # Revoke GPU 0's access to GPU 1's memory
|
|
print(
|
|
f" Peer access revoked: MR0 accessible by {mr0.peer_accessible_by}, "
|
|
f"MR1 accessible by {mr1.peer_accessible_by}"
|
|
)
|
|
|
|
print("\n" + "=" * 70)
|
|
if success:
|
|
print("simpleP2P completed successfully!")
|
|
else:
|
|
print("simpleP2P FAILED!")
|
|
print("=" * 70 + "\n")
|
|
|
|
return 0 if success else 1
|
|
finally:
|
|
# Cleanup streams and buffers
|
|
print("Shutting down...")
|
|
stream0.close()
|
|
stream1.close()
|
|
|
|
|
|
def main() -> None:
    """Main entry point with argument parsing.

    Parses command-line arguments, validates them, and runs the sample.
    Always terminates the process via ``sys.exit`` so the shell observes
    the sample's exit code (0 = success, 1 = failure/error, 2 = waived).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Demonstrate peer-to-peer (P2P) memory access between "
            "multiple GPUs with CUDA"
        )
    )

    parser.add_argument(
        "--num_elements",
        type=int,
        default=1024 * 1024 * 16,  # 16M elements = 64MB
        help="Number of elements in arrays (default: 16777216 = 64MB)",
    )

    args = parser.parse_args()

    # Validate arguments
    if args.num_elements <= 0:
        print("Error: num_elements must be positive")
        # BUG FIX: this used to `return 1`, but the `if __name__ == "__main__"`
        # guard ignores main()'s return value, so the process exited with
        # status 0 despite the error. Exit explicitly instead.
        sys.exit(1)

    try:
        exit_code = run(num_elements=args.num_elements)
        # sys.exit raises SystemExit (a BaseException), so it is NOT caught
        # by the `except Exception` handler below.
        sys.exit(exit_code)
    except Exception as e:
        # Top-level boundary: report the error and full traceback, then
        # exit non-zero.
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the sample only when executed as a script (not when imported).
    main()
|