# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Multi-GPU Gradient Average using MPI and cuda.core (Host-staging Allreduce)
Question: How do I synchronize gradients across GPUs?
Answer:
Each GPU (each MPI rank) computes local gradients on device via CUDA.
Gradients are then averaged across ranks via an MPI Allreduce over host
(CPU) buffers, following the classic data-parallel training pattern.
This sample shows how to:
- Initialize MPI for multi-process GPU workloads
- Map MPI ranks to GPUs
- Use cuda.core for kernel compilation and execution
- Integrate cuda.core with CuPy using the stream protocol
- Perform gradient averaging with MPI Allreduce (using host staging)
- Use cuda.core Event for GPU timing measurements
- Verify correctness of distributed gradient synchronization
Key concepts: Allreduce, NCCL collectives (conceptually), distributed training
Note:
- All gradient computation and validation happen on GPUs.
- MPI Allreduce is executed on CPU (host) buffers via a simple
GPU -> CPU -> MPI -> CPU -> GPU staging pattern so that the sample
works on any MPI stack, without requiring CUDA-aware MPI.
- In production deep learning frameworks (e.g., PyTorch DDP), NCCL
usually implements the GPU Allreduce directly; the communication
pattern and semantics are the same as demonstrated here.
"""

import sys
from pathlib import Path

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

from cuda_samples_utils import verify_array_result  # noqa: E402

try:
    import cupy as cp
    from cuda.core import (
        Device,
        EventOptions,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
        system,
    )
    from mpi4py import MPI
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install: pip install mpi4py cupy-cuda12x cuda-python cuda-core")
    sys.exit(1)

# ============================================================================
# CUDA device selection and stream management
# ============================================================================


def init_device(rank: int):
    """
    Initialize CUDA device and stream for this MPI rank.

    For a simple single-node run, each rank is mapped to device id
    rank % num_gpus. This covers both the common case (world_size == num_gpus)
    and the case where multiple ranks share a GPU.

    Returns
    -------
    tuple[Device, Stream]
        CUDA device object and stream object.
    """
    num_gpus = system.get_num_devices()
    if num_gpus == 0:
        raise RuntimeError("No CUDA devices available")

    dev_id = rank % num_gpus  # simple mapping: rank -> GPU in round-robin
    try:
        device = Device(dev_id)
    except (RuntimeError, ValueError) as e:
        if rank == 0:
            print(f"Warning: Cannot assign GPU {dev_id}, using GPU 0. Error: {e}")
        device = Device(0)
    device.set_current()

    # Align CuPy with cuda.core's chosen device ID
    cp.cuda.Device(device.device_id).use()

    # Create cuda.core stream and make CuPy use it
    stream = device.create_stream()
    cp.cuda.ExternalStream(int(stream.handle)).use()
    return device, stream
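
# Note: after init_device(), cuda.core kernel launches and CuPy operations in a
# given rank run on the same underlying CUDA stream, because CuPy's current
# stream is pointed at the cuda.core stream handle via
# cp.cuda.ExternalStream(...).use() above.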

# ============================================================================
# CUDA kernel definition and compilation
# ============================================================================

# Tiny CUDA kernel to initialize local "gradients"
# Uses grid-stride loop to handle arrays larger than grid size
INIT_KERNEL = r"""
extern "C" __global__
void init_grad_kernel(float* grad, int n, int rank)
{
    // Grid-stride loop: each thread processes multiple elements
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = gridDim.x * blockDim.x;
    for (size_t i = tid; i < n; i += stride) {
        // Gradient value depends on MPI rank so we can verify reduction:
        //   grad_i = rank + 0.001 * i
        grad[i] = rank + 0.001f * i;
    }
}
"""

_kernel_cache = {}


def get_init_kernel(device: Device):
    """Compile (or retrieve cached) init_grad_kernel for this device."""
    key = device.pci_bus_id
    if key not in _kernel_cache:
        opts = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        prog = Program(INIT_KERNEL, code_type="c++", options=opts)
        mod = prog.compile("cubin")
        _kernel_cache[key] = mod.get_kernel("init_grad_kernel")
    return _kernel_cache[key]

# ============================================================================
# Local gradient computation on each GPU
# ============================================================================


def compute_local_gradients(
    num_elements: int, device: Device, stream: object, rank: int
) -> cp.ndarray:
    """
    Compute a local "gradient" vector on the current GPU.

    For demo purposes, we initialize:
        grad[i] = rank + 0.001 * i

    Parameters
    ----------
    num_elements : int
        Length of gradient vector.
    device : Device
        CUDA device object.
    stream : Stream
        CUDA stream object (created at device initialization).
    rank : int
        MPI rank ID.

    Returns
    -------
    cupy.ndarray
        Gradient vector on GPU.
    """
    # Create gradient array (CuPy uses the stream set at device initialization)
    grad = cp.empty(num_elements, dtype=cp.float32)

    # Use a CUDA kernel compiled with cuda.core to fill the array
    kernel = get_init_kernel(device)
    threads_per_block = 256
    blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block
    config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block)

    # Launch kernel using cuda.core stream
    launch(stream, config, kernel, grad.data.ptr, num_elements, rank)
    return grad
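
# Note: the launch above is asynchronous with respect to the host. main() records
# a cuda.core event after this call and synchronizes on it before reading the
# timing and before handing the gradients to the host-staging Allreduce.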

# ============================================================================
# MPI Allreduce to average gradients (host-staging)
# ============================================================================


def average_gradients(
    local_grad: cp.ndarray, comm: object, world_size: int
) -> cp.ndarray:
    """
    Average gradients across all MPI ranks using host-staging Allreduce.

    Steps:
    1. Copy local gradients from GPU to CPU (NumPy).
    2. Perform MPI_Allreduce on host buffers.
    3. Divide by world_size to obtain the average.
    4. Copy the averaged gradients back to GPU.

    This pattern is environment-agnostic and works on any MPI stack.
    """
    assert local_grad.dtype == cp.float32

    # GPU -> CPU
    local_host = local_grad.get()  # NumPy array on host
    avg_host = local_host.copy()

    # Allreduce on host buffers
    comm.Allreduce(local_host, avg_host, op=MPI.SUM)

    # Average
    avg_host /= world_size

    # CPU -> GPU
    avg_grad = cp.asarray(avg_host)
    return avg_grad
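
# For comparison: on a CUDA-aware MPI build, mpi4py can reduce GPU buffers
# directly (via the CUDA Array Interface), skipping the host staging above.
# A minimal sketch, assuming CUDA-aware MPI (not used by this sample):
#
#     avg_grad = cp.empty_like(local_grad)
#     comm.Allreduce(local_grad, avg_grad, op=MPI.SUM)
#     avg_grad /= world_size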

# ============================================================================
# Testing and verification
# ============================================================================


def main():
    """Demo: Multi-GPU gradient averaging with MPI (host-staging Allreduce)."""
    import argparse

    # Initialize MPI
    comm = MPI.COMM_WORLD
    world_size = comm.Get_size()
    rank = comm.Get_rank()

    parser = argparse.ArgumentParser(
        description=(
            "Multi-GPU Gradient Average with mpi4py + cuda.core "
            "(host-staging Allreduce)"
        )
    )
    parser.add_argument(
        "--size",
        type=int,
        default=1024,
        help="Number of gradient elements per GPU (default: 1024)",
    )
    args = parser.parse_args()
    num_elements = args.size

    # Initialize device and stream
    device = None
    stream = None
    try:
        device, stream = init_device(rank)
        if rank == 0:
            print(f"[Rank 0] World size = {world_size}")
        comm.Barrier()

        # Validate world size
        if world_size < 2:
            if rank == 0:
                print("=" * 70)
                print("ERROR: This sample requires at least 2 MPI processes!")
                print("=" * 70)
                print("\nPlease run with mpirun:")
                print(" mpirun -np 2 python multiGPUGradientAverage.py")
                print(" mpirun -np 4 python multiGPUGradientAverage.py --size 10000")
                print("\nFor multi-GPU systems:")
                print(" mpirun -np N python multiGPUGradientAverage.py")
                print(" (where N = number of GPUs)")
                print("=" * 70)
            sys.exit(1)

        # Validate input
        if num_elements <= 0:
            if rank == 0:
                print("Error: --size must be positive")
            sys.exit(1)

        if rank == 0:
            print("\n" + "=" * 70)
            print("Multi-GPU Gradient Average Demo")
            print("=" * 70)
            print(f"Number of MPI ranks (GPUs): {world_size}")
            print(f"Gradient vector length per GPU: {num_elements}")
            print(f"Device: {device.name}")
            print("Computation: gradients computed on GPU via cuda.core.")
            print(
                "Communication: gradients averaged via MPI_Allreduce on host "
                "(CPU) buffers."
            )
            print("=" * 70)

        # Step 1: Compute local gradients on each GPU
        # Use cuda.core Event for GPU timing measurements
        timing_options = EventOptions(enable_timing=True)
        start_event = stream.record(options=timing_options)
        local_grad = compute_local_gradients(num_elements, device, stream, rank)
        # Record end event and synchronize to ensure timing is complete
        end_event = stream.record(options=timing_options)
        end_event.sync()
        # Calculate elapsed time: Event subtraction returns milliseconds
        kernel_time = end_event - start_event

        # Step 2: Average gradients across all ranks (host-staging Allreduce)
        # Use CPU timing for MPI communication (host-staging includes GPU↔CPU transfers)
        import time

        comm_start = time.time()
        avg_grad = average_gradients(local_grad, comm, world_size)
        comm_time = (time.time() - comm_start) * 1000  # Convert to ms

        # Step 3: Sanity check on rank 0
        # For each element i:
        #   local_grad_r[i] = r + 0.001 * i,  r = 0..world_size-1
        # Sum over ranks:
        #   sum[i] = sum_r r + 0.001 * i * world_size
        # Average:
        #   avg[i] = (0 + ... + (world_size-1))/world_size + 0.001 * i
        #          = (world_size - 1)/2 + 0.001 * i
        #
        # We verify this formula.
        expected_base = (world_size - 1) / 2.0
        i0 = 0
        i1 = num_elements // 2
        i2 = num_elements - 1

        # Copy a few sample elements back to host for printing on rank 0
        if rank == 0:
            avg_host_samples = avg_grad[[i0, i1, i2]].get()
            print("\nSample averaged gradient values (rank 0):")
            print(f" avg_grad[{i0}] = {avg_host_samples[0]:.6f}")
            print(f" avg_grad[{i1}] = {avg_host_samples[1]:.6f}")
            print(f" avg_grad[{i2}] = {avg_host_samples[2]:.6f}")

            expected0 = expected_base + 0.001 * i0
            expected1 = expected_base + 0.001 * i1
            expected2 = expected_base + 0.001 * i2
            print("\nExpected values:")
            print(f" expected[{i0}] = {expected0:.6f}")
            print(f" expected[{i1}] = {expected1:.6f}")
            print(f" expected[{i2}] = {expected2:.6f}")

        # All ranks perform a full-array correctness check on GPU
        expected_full = expected_base + 0.001 * cp.arange(
            num_elements, dtype=cp.float32
        )

        # Use utility function to verify results
        if rank == 0:
            print("\nVerifying gradient averaging correctness...")
        ok = verify_array_result(
            avg_grad, expected_full, rtol=1e-5, atol=1e-8, verbose=(rank == 0)
        )

        # Ensure all ranks agree on correctness
        ok_all = comm.allreduce(ok, op=MPI.LAND)

        if rank == 0:
            if ok_all:
                print("[PASS] Gradient averaging is correct on all ranks.")
            else:
                print(
                    "[FAIL] Gradient averaging mismatch detected on one or more ranks."
                )
            print("\nPerformance:")
            print(f" Kernel time (GPU only): {kernel_time:.3f} ms")
            print(
                " MPI communication time (host-staging, end-to-end): "
                f"{comm_time:.3f} ms"
            )
            print(f" Total time: {kernel_time + comm_time:.3f} ms")
            print("\n" + "=" * 70)
            print("Demo complete.")
            print("=" * 70)

        return 0 if ok_all else 1
    finally:
        # Clean up stream resources
        if stream is not None:
            stream.close()
        cp.cuda.Stream.null.use()  # Reset CuPy's current stream to the null stream


if __name__ == "__main__":
    sys.exit(main())