# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Single-Pass Multi-Block Reduction with Cooperative Groups
Demonstrates single-kernel multi-stage reduction using grid-wide
synchronization. Traditional reduction needs multiple kernel launches,
but with grid.sync() from Cooperative Groups, we can complete all
stages in ONE kernel.
Key Features:
- Grid-wide synchronization with grid.sync()
- Two-stage reduction in a single kernel (no atomic operations)
- Requires compute capability 6.0+ and cooperative launch
- Typically achieves 400-700 GB/s on modern GPUs
How it differs from other samples:
- blockArraySum.py: Basic thread/block indexing + atomicAdd
- reduction.py: High-performance shared memory, two-kernel approach
- This sample: Single-kernel multi-stage with grid.sync()
Transfers use CuPy on the same CUDA stream as ``launch()`` (wrapped in a
``cupy.cuda.ExternalStream``), not ``cuda.bindings.driver`` memcpy. GPU timing
uses CUDA events.
"""
import argparse
import os
import sys
import time
try:
import cupy as cp
import numpy as np
from cuda.core import (
Device,
EventOptions,
LaunchConfig,
Program,
ProgramOptions,
launch,
)
except ImportError as e:
print(f"Error: Required package not found: {e}")
print("Please install dependencies:")
print(" pip install -r requirements.txt")
sys.exit(1)
# Import utilities
utilities_path = os.path.join(os.path.dirname(__file__), "..", "..", "Utilities")
sys.path.insert(0, utilities_path)
from cuda_samples_utils import verify_array_result # noqa: E402
def _validate_threads_arg(threads):
if threads is None:
return None
if threads <= 0 or threads > 1024:
return "threads must be between 1 and 1024"
if (threads & (threads - 1)) != 0:
return (
"threads must be a power of 2 "
"(required by the shared-memory tree reduction kernel)"
)
return None
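
# The module docstring notes this sample needs compute capability 6.0+ and a
# device that supports cooperative launch. A minimal pre-flight check could
# look like the sketch below; it assumes cuda.core exposes the
# `cooperative_launch` device property (verify against your installed version).
def _supports_cooperative_launch(device):
    """Best-effort check that `device` can run a cooperative kernel launch."""
    major, _minor = device.compute_capability
    if major < 6:
        return False
    # The attribute is 1 when cooperative launches are supported; the getattr
    # default keeps the check harmless on versions without the property.
    return bool(getattr(device.properties, "cooperative_launch", 0))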
# Single-pass multi-block reduction kernel with grid-wide sync
REDUCTION_KERNEL = """
/*
* Single-Kernel Multi-Stage Reduction using grid.sync()
*
* Strategy:
* Stage 1: Each block reduces its portion → partial sum
* grid.sync() ← KEY: All blocks synchronize
* Stage 2: Block 0 reduces all partial sums → final result
*
* Key feature: grid.sync() enables multi-stage within ONE kernel
*/
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
extern "C" __global__ void reduceSinglePassMultiBlockCG(
const float *__restrict__ g_idata,
float *__restrict__ g_odata,
unsigned int n)
{
cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int blockSize = blockDim.x;
// Stage 1: Grid-stride loop + block reduction
float sum = 0.0f;
    for (unsigned int i = grid.thread_rank(); i < n; i += grid.num_threads()) {
sum += g_idata[i];
}
sdata[tid] = sum;
cg::sync(cta);
// Block reduction (sequential addressing)
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
cg::sync(cta);
}
if (tid == 0) {
g_odata[blockIdx.x] = sdata[0];
}
// KEY: Grid-wide synchronization (all blocks wait here)
grid.sync();
// Stage 2: Block 0 reduces all partial sums → final result
// Use a stride loop so all gridDim.x partial sums are covered even
// when gridDim.x > blockDim.x.
if (blockIdx.x == 0) {
// mySum stays 0.0f when tid >= gridDim.x (loop never executes),
// implicitly zero-filling sdata for threads beyond the partial-sum count.
float mySum = 0.0f;
for (unsigned int i = tid; i < gridDim.x; i += blockSize) {
mySum += g_odata[i];
}
sdata[tid] = mySum;
cg::sync(cta);
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
cg::sync(cta);
}
if (tid == 0) {
g_odata[0] = sdata[0];
}
}
}
"""
def get_max_cooperative_blocks(device, kernel, threads_per_block, shared_mem_bytes):
"""
Calculate max blocks for cooperative launch (all must be resident).
This is a conservative estimate that ignores shared memory limits;
for precise tuning, use cudaOccupancyMaxActiveBlocksPerMultiprocessor.
"""
# Get device properties
prop = device.properties
    # Bound blocks per SM by the thread-count limit only. The `kernel` and
    # `shared_mem_bytes` arguments are accepted for signature symmetry but
    # unused here; cudaOccupancyMaxActiveBlocksPerMultiprocessor would also
    # account for register and shared-memory pressure.
num_sms = prop.multiprocessor_count
max_threads_per_sm = prop.max_threads_per_multiprocessor
max_blocks_per_sm = max_threads_per_sm // threads_per_block
# Total blocks = blocks per SM × number of SMs
max_blocks = max_blocks_per_sm * num_sms
# Also respect max_grid_dim_x
max_blocks = min(max_blocks, prop.max_grid_dim_x)
return max_blocks
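
# A tighter block count would come from a real occupancy query. Recent
# cuda.core releases expose one on compiled kernels; the method name
# `kernel.occupancy.max_active_blocks_per_multiprocessor` is an assumption
# here -- confirm it against your cuda.core version before relying on it.
def _occupancy_based_blocks(device, kernel, threads_per_block, shared_mem_bytes):
    """Occupancy-aware cooperative block count, with a conservative fallback."""
    try:
        per_sm = kernel.occupancy.max_active_blocks_per_multiprocessor(
            threads_per_block, shared_mem_bytes
        )
        return per_sm * device.properties.multiprocessor_count
    except AttributeError:
        # Older cuda.core: fall back to the conservative estimate above.
        return get_max_cooperative_blocks(
            device, kernel, threads_per_block, shared_mem_bytes
        )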
def run(
num_elements=1 << 25,
max_threads=None,
max_blocks=None,
test_iterations=100,
cuda_include_dir=None,
):
"""Run single-pass multi-block reduction benchmark."""
if cuda_include_dir is None:
raise ValueError("cuda_include_dir is required")
print("\n" + "=" * 70)
print("Single-Pass Multi-Block Reduction with Cooperative Groups")
print("=" * 70)
msg = "Multi-stage reduction in a single kernel using grid.sync()"
print(f"\nDemonstrates: {msg}")
# Initialize device
device = Device()
device.set_current()
major, minor = device.compute_capability
print("\nDevice Information:")
print(f" Name: {device.name}")
print(f" Compute Capability: sm_{major}.{minor}")
# Get device properties for configuration
prop = device.properties
# Determine threads per block
if max_threads is None:
max_threads = prop.max_threads_per_block
threads_per_block = min(max_threads, 1024)
# Define data type and itemsize
itemsize = np.dtype(np.float32).itemsize
print("\nReduction Configuration:")
print(f" Number of elements: {num_elements:,}")
print(f" Data size: {num_elements * itemsize / (1024**2):.2f} MB")
# Compile kernel
print("\nCompiling CUDA kernel...")
# Support colon-separated multiple include paths
include_paths = cuda_include_dir.split(":")
program_options = ProgramOptions(
std="c++17", arch=f"sm_{device.arch}", include_path=include_paths
)
prog = Program(REDUCTION_KERNEL, code_type="c++", options=program_options)
mod = prog.compile("cubin")
kernel = mod.get_kernel("reduceSinglePassMultiBlockCG")
print(" Kernel compiled successfully")
# Calculate blocks for cooperative launch
shared_mem_bytes = threads_per_block * itemsize
if max_blocks is None:
max_blocks = get_max_cooperative_blocks(
device, kernel, threads_per_block, shared_mem_bytes
)
# Calculate optimal blocks (all must be resident)
num_blocks = min(
max_blocks, (num_elements + threads_per_block - 1) // threads_per_block
)
print("\nLaunch Configuration:")
print(f" Threads per block: {threads_per_block}")
print(f" Number of blocks: {num_blocks}")
print(f" Total threads: {num_blocks * threads_per_block:,}")
print(f" Shared memory per block: {shared_mem_bytes} bytes")
print(" Launch mode: Cooperative (grid-wide sync enabled)")
# Generate random input data
print("\n> Generating random input data...")
rng = np.random.default_rng(42)
h_idata = (rng.random(num_elements) * 256).astype(np.float32)
stream = device.create_stream()
    # Wrap the cuda.core stream so CuPy work is ordered on the same CUDA stream.
    cp_stream = cp.cuda.ExternalStream(int(stream.handle))
try:
d_odata = cp.empty(num_blocks, dtype=np.float32)
with cp_stream:
d_idata = cp.asarray(h_idata, dtype=np.float32)
stream.sync()
# Compute CPU reference
print("> Computing reference result on CPU...")
cpu_start = time.perf_counter()
cpu_result = float(np.sum(h_idata))
cpu_time = time.perf_counter() - cpu_start
print(f" CPU time: {cpu_time:.6f} seconds")
# Warm-up
print("\n> Warming up GPU...")
launch_config = LaunchConfig(
grid=(num_blocks, 1, 1),
block=(threads_per_block, 1, 1),
shmem_size=shared_mem_bytes,
cooperative_launch=True,
)
n_u32 = np.uint32(num_elements)
ptr_in = d_idata.data.ptr
ptr_out = d_odata.data.ptr
try:
launch(stream, launch_config, kernel, ptr_in, ptr_out, n_u32)
except Exception as e:
print(f" Cooperative launch failed: {e}")
return 1
stream.sync()
print(" Warm-up successful")
# Benchmark (CUDA events — not host wall clock around the whole loop)
print(f"\n> Running benchmark ({test_iterations} iterations)...")
event_options = EventOptions(enable_timing=True)
start_event = stream.device.create_event(options=event_options)
end_event = stream.device.create_event(options=event_options)
# cuda.core event elapsed time (end - start) is in milliseconds (CUDA API).
gpu_times_ms = []
for _ in range(test_iterations):
try:
stream.record(start_event)
launch(stream, launch_config, kernel, ptr_in, ptr_out, n_u32)
stream.record(end_event)
end_event.sync()
gpu_times_ms.append(float(end_event - start_event))
except Exception as e:
print(f"Benchmark iteration failed: {e}")
return 1
avg_gpu_ms = float(np.mean(gpu_times_ms))
avg_gpu_s = avg_gpu_ms / 1000.0
stream.sync()
with cp_stream:
h_result = cp.asnumpy(d_odata[:1])
gpu_result = float(h_result[0])
# Performance metrics use seconds for throughput and speedup.
# CPU time is already in seconds.
        bytes_processed = num_elements * itemsize
throughput_gb_s = bytes_processed / avg_gpu_s / 1e9
print("\n> Performance Results:")
print(f" Average GPU time: {avg_gpu_ms:.6f} ms")
print(f" Throughput: {throughput_gb_s:.2f} GB/s")
print(f" Speedup vs CPU: {cpu_time / avg_gpu_s:.2f}x")
# Validate results
print("\n> Validating results...")
success = verify_array_result(
np.array([gpu_result]),
np.array([cpu_result]),
rtol=1e-5,
atol=1e-5,
)
# Summary
print("\n" + "=" * 70)
print("Summary")
print("=" * 70)
print(f"""
Single-kernel two-stage reduction:
Stage 1: {num_blocks} blocks → {num_blocks} partial sums
grid.sync() ← All blocks synchronize (KEY innovation)
Stage 2: Block 0 → 1 final result
Total: 1 kernel launch, {throughput_gb_s:.2f} GB/s
Comparison:
• Traditional: 2 kernel launches or kernel + CPU
• This sample: 1 kernel with grid.sync() between stages
• Benefit: Eliminates ~5-20μs launch overhead per stage
""")
print("=" * 70)
if success:
print("Single-Pass Multi-Block Reduction completed successfully!")
else:
print("Single-Pass Multi-Block Reduction FAILED!")
print("=" * 70 + "\n")
return 0 if success else 1
finally:
stream.close()
def main():
"""Main entry point with argument parsing."""
parser = argparse.ArgumentParser(
description="Single-Pass Multi-Block Reduction with Cooperative Groups"
)
parser.add_argument(
"--n",
type=int,
default=1 << 25,
help="Number of elements to reduce (default: 33554432 = 2^25)",
)
parser.add_argument(
"--threads",
type=int,
default=None,
help=(
"Threads per block, power of 2 in [1, 1024]; "
"default: device maximum (typically 1024)"
),
)
parser.add_argument(
"--maxblocks",
type=int,
default=None,
help=(
"Maximum number of blocks "
"(default: auto-calculated for cooperative launch)"
),
)
parser.add_argument(
"--iterations",
type=int,
default=100,
help="Number of benchmark iterations (default: 100)",
)
parser.add_argument(
"--cuda-include-dir",
type=str,
required=True,
help=(
"CUDA include directory for NVRTC "
"(can use colon-separated paths, e.g., /path1:/path2)"
),
)
args = parser.parse_args()
# Validate arguments
if args.n <= 0:
print("Error: n must be positive")
return 1
err_threads = _validate_threads_arg(args.threads)
if err_threads:
print(f"Error: {err_threads}")
return 1
if args.maxblocks is not None and args.maxblocks <= 0:
print("Error: maxblocks must be positive")
return 1
try:
exit_code = run(
num_elements=args.n,
max_threads=args.threads,
max_blocks=args.maxblocks,
test_iterations=args.iterations,
cuda_include_dir=args.cuda_include_dir,
)
sys.exit(exit_code)
except Exception as e:
print(f"\nError: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()
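
# Example invocation (illustrative paths; point --cuda-include-dir at the
# `include` directory of your CUDA toolkit installation):
#   python <this_script>.py --n 33554432 --iterations 100 \
#       --cuda-include-dir /usr/local/cuda/include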