# Mirror of https://github.com/NVIDIA/cuda-samples.git
# (synced 2026-05-14 14:06:53 +08:00)
# Commit note: Added Python samples for CUDA Python 1.0 release; renamed
# top-level `Samples` directory to `cpp` to accommodate Python samples.
# File stats: 473 lines, 15 KiB, Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Single-Pass Multi-Block Reduction with Cooperative Groups

Demonstrates single-kernel multi-stage reduction using grid-wide
synchronization. Traditional reduction needs multiple kernel launches,
but with grid.sync() from Cooperative Groups, we can complete all
stages in ONE kernel.

Key Features:
- Grid-wide synchronization with grid.sync()
- Two-stage reduction in a single kernel (no atomic operations)
- Requires compute capability 6.0+ and cooperative launch
- Achieves 400-700 GB/s on modern GPUs

How it differs from other samples:
- blockArraySum.py: Basic thread/block indexing + atomicAdd
- reduction.py: High-performance shared memory, two-kernel approach
- This sample: Single-kernel multi-stage with grid.sync()

Transfers use CuPy on the same CUDA stream as ``launch()``
(``Stream.from_external``), not ``cuda.bindings.driver`` memcpy.
GPU timing uses CUDA events.
"""

import argparse
|
||
import os
|
||
import sys
|
||
import time
|
||
|
||
try:
|
||
import cupy as cp
|
||
import numpy as np
|
||
from cuda.core import (
|
||
Device,
|
||
EventOptions,
|
||
LaunchConfig,
|
||
Program,
|
||
ProgramOptions,
|
||
launch,
|
||
)
|
||
except ImportError as e:
|
||
print(f"Error: Required package not found: {e}")
|
||
print("Please install dependencies:")
|
||
print(" pip install -r requirements.txt")
|
||
sys.exit(1)
|
||
|
||
# Import utilities
|
||
utilities_path = os.path.join(os.path.dirname(__file__), "..", "..", "Utilities")
|
||
sys.path.insert(0, utilities_path)
|
||
from cuda_samples_utils import verify_array_result # noqa: E402
|
||
|
||
|
||
def _validate_threads_arg(threads):
|
||
if threads is None:
|
||
return None
|
||
if threads <= 0 or threads > 1024:
|
||
return "threads must be between 1 and 1024"
|
||
if (threads & (threads - 1)) != 0:
|
||
return (
|
||
"threads must be a power of 2 "
|
||
"(required by the shared-memory tree reduction kernel)"
|
||
)
|
||
return None
|
||
|
||
|
||
# Single-pass multi-block reduction kernel with grid-wide sync
|
||
REDUCTION_KERNEL = """
|
||
/*
|
||
* Single-Kernel Multi-Stage Reduction using grid.sync()
|
||
*
|
||
* Strategy:
|
||
* Stage 1: Each block reduces its portion → partial sum
|
||
* grid.sync() ← KEY: All blocks synchronize
|
||
* Stage 2: Block 0 reduces all partial sums → final result
|
||
*
|
||
* Key feature: grid.sync() enables multi-stage within ONE kernel
|
||
*/
|
||
|
||
#include <cooperative_groups.h>
|
||
|
||
namespace cg = cooperative_groups;
|
||
extern "C" __global__ void reduceSinglePassMultiBlockCG(
|
||
const float *__restrict__ g_idata,
|
||
float *__restrict__ g_odata,
|
||
unsigned int n)
|
||
{
|
||
cg::thread_block cta = cg::this_thread_block();
|
||
cg::grid_group grid = cg::this_grid();
|
||
extern __shared__ float sdata[];
|
||
|
||
unsigned int tid = threadIdx.x;
|
||
unsigned int blockSize = blockDim.x;
|
||
|
||
// Stage 1: Grid-stride loop + block reduction
|
||
float sum = 0.0f;
|
||
for (unsigned int i = grid.thread_rank(); i < n; i += grid.size()) {
|
||
sum += g_idata[i];
|
||
}
|
||
|
||
sdata[tid] = sum;
|
||
cg::sync(cta);
|
||
|
||
// Block reduction (sequential addressing)
|
||
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
|
||
if (tid < s) {
|
||
sdata[tid] += sdata[tid + s];
|
||
}
|
||
cg::sync(cta);
|
||
}
|
||
|
||
if (tid == 0) {
|
||
g_odata[blockIdx.x] = sdata[0];
|
||
}
|
||
|
||
// KEY: Grid-wide synchronization (all blocks wait here)
|
||
grid.sync();
|
||
|
||
// Stage 2: Block 0 reduces all partial sums → final result
|
||
// Use a stride loop so all gridDim.x partial sums are covered even
|
||
// when gridDim.x > blockDim.x.
|
||
if (blockIdx.x == 0) {
|
||
// mySum stays 0.0f when tid >= gridDim.x (loop never executes),
|
||
// implicitly zero-filling sdata for threads beyond the partial-sum count.
|
||
float mySum = 0.0f;
|
||
for (unsigned int i = tid; i < gridDim.x; i += blockSize) {
|
||
mySum += g_odata[i];
|
||
}
|
||
sdata[tid] = mySum;
|
||
cg::sync(cta);
|
||
|
||
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
|
||
if (tid < s) {
|
||
sdata[tid] += sdata[tid + s];
|
||
}
|
||
cg::sync(cta);
|
||
}
|
||
|
||
if (tid == 0) {
|
||
g_odata[0] = sdata[0];
|
||
}
|
||
}
|
||
}
|
||
"""
|
||
|
||
|
||
def get_max_cooperative_blocks(device, kernel, threads_per_block, shared_mem_bytes):
|
||
"""
|
||
Calculate max blocks for cooperative launch (all must be resident).
|
||
|
||
This is a conservative estimate that ignores shared memory limits;
|
||
for precise tuning, use cudaOccupancyMaxActiveBlocksPerMultiprocessor.
|
||
"""
|
||
# Get device properties
|
||
prop = device.properties
|
||
|
||
# Calculate maximum blocks per SM
|
||
# Note: We use cudaOccupancyMaxActiveBlocksPerMultiprocessor functionality
|
||
# For simplicity in Python, we'll use a conservative estimate
|
||
num_sms = prop.multiprocessor_count
|
||
max_threads_per_sm = prop.max_threads_per_multiprocessor
|
||
max_blocks_per_sm = max_threads_per_sm // threads_per_block
|
||
|
||
# Total blocks = blocks per SM × number of SMs
|
||
max_blocks = max_blocks_per_sm * num_sms
|
||
|
||
# Also respect max_grid_dim_x
|
||
max_blocks = min(max_blocks, prop.max_grid_dim_x)
|
||
|
||
return max_blocks
|
||
|
||
|
||
def run(
|
||
num_elements=1 << 25,
|
||
max_threads=None,
|
||
max_blocks=None,
|
||
test_iterations=100,
|
||
cuda_include_dir=None,
|
||
):
|
||
"""Run single-pass multi-block reduction benchmark."""
|
||
|
||
if cuda_include_dir is None:
|
||
raise ValueError("cuda_include_dir is required")
|
||
|
||
print("\n" + "=" * 70)
|
||
print("Single-Pass Multi-Block Reduction with Cooperative Groups")
|
||
print("=" * 70)
|
||
msg = "Multi-stage reduction in a single kernel using grid.sync()"
|
||
print(f"\nDemonstrates: {msg}")
|
||
|
||
# Initialize device
|
||
device = Device()
|
||
device.set_current()
|
||
major, minor = device.compute_capability
|
||
|
||
print("\nDevice Information:")
|
||
print(f" Name: {device.name}")
|
||
print(f" Compute Capability: sm_{major}.{minor}")
|
||
|
||
# Get device properties for configuration
|
||
prop = device.properties
|
||
|
||
# Determine threads per block
|
||
if max_threads is None:
|
||
max_threads = prop.max_threads_per_block
|
||
threads_per_block = min(max_threads, 1024)
|
||
|
||
# Define data type and itemsize
|
||
itemsize = np.dtype(np.float32).itemsize
|
||
|
||
print("\nReduction Configuration:")
|
||
print(f" Number of elements: {num_elements:,}")
|
||
print(f" Data size: {num_elements * itemsize / (1024**2):.2f} MB")
|
||
|
||
# Compile kernel
|
||
print("\nCompiling CUDA kernel...")
|
||
# Support colon-separated multiple include paths
|
||
include_paths = cuda_include_dir.split(":")
|
||
program_options = ProgramOptions(
|
||
std="c++17", arch=f"sm_{device.arch}", include_path=include_paths
|
||
)
|
||
prog = Program(REDUCTION_KERNEL, code_type="c++", options=program_options)
|
||
mod = prog.compile("cubin")
|
||
kernel = mod.get_kernel("reduceSinglePassMultiBlockCG")
|
||
print(" Kernel compiled successfully")
|
||
|
||
# Calculate blocks for cooperative launch
|
||
shared_mem_bytes = threads_per_block * itemsize
|
||
|
||
if max_blocks is None:
|
||
max_blocks = get_max_cooperative_blocks(
|
||
device, kernel, threads_per_block, shared_mem_bytes
|
||
)
|
||
|
||
# Calculate optimal blocks (all must be resident)
|
||
num_blocks = min(
|
||
max_blocks, (num_elements + threads_per_block - 1) // threads_per_block
|
||
)
|
||
|
||
print("\nLaunch Configuration:")
|
||
print(f" Threads per block: {threads_per_block}")
|
||
print(f" Number of blocks: {num_blocks}")
|
||
print(f" Total threads: {num_blocks * threads_per_block:,}")
|
||
print(f" Shared memory per block: {shared_mem_bytes} bytes")
|
||
print(" Launch mode: Cooperative (grid-wide sync enabled)")
|
||
|
||
# Generate random input data
|
||
print("\n> Generating random input data...")
|
||
rng = np.random.default_rng(42)
|
||
h_idata = (rng.random(num_elements) * 256).astype(np.float32)
|
||
|
||
stream = device.create_stream()
|
||
cp_stream = cp.cuda.Stream.from_external(stream)
|
||
try:
|
||
d_odata = cp.empty(num_blocks, dtype=np.float32)
|
||
with cp_stream:
|
||
d_idata = cp.asarray(h_idata, dtype=np.float32)
|
||
stream.sync()
|
||
|
||
# Compute CPU reference
|
||
print("> Computing reference result on CPU...")
|
||
cpu_start = time.perf_counter()
|
||
cpu_result = float(np.sum(h_idata))
|
||
cpu_time = time.perf_counter() - cpu_start
|
||
print(f" CPU time: {cpu_time:.6f} seconds")
|
||
|
||
# Warm-up
|
||
print("\n> Warming up GPU...")
|
||
|
||
launch_config = LaunchConfig(
|
||
grid=(num_blocks, 1, 1),
|
||
block=(threads_per_block, 1, 1),
|
||
shmem_size=shared_mem_bytes,
|
||
cooperative_launch=True,
|
||
)
|
||
|
||
n_u32 = np.uint32(num_elements)
|
||
ptr_in = d_idata.data.ptr
|
||
ptr_out = d_odata.data.ptr
|
||
|
||
try:
|
||
launch(stream, launch_config, kernel, ptr_in, ptr_out, n_u32)
|
||
except Exception as e:
|
||
print(f" Cooperative launch failed: {e}")
|
||
return 1
|
||
|
||
stream.sync()
|
||
print(" Warm-up successful")
|
||
|
||
# Benchmark (CUDA events — not host wall clock around the whole loop)
|
||
print(f"\n> Running benchmark ({test_iterations} iterations)...")
|
||
event_options = EventOptions(enable_timing=True)
|
||
start_event = stream.device.create_event(options=event_options)
|
||
end_event = stream.device.create_event(options=event_options)
|
||
# cuda.core event elapsed time (end - start) is in milliseconds (CUDA API).
|
||
gpu_times_ms = []
|
||
for _ in range(test_iterations):
|
||
try:
|
||
stream.record(start_event)
|
||
launch(stream, launch_config, kernel, ptr_in, ptr_out, n_u32)
|
||
stream.record(end_event)
|
||
end_event.sync()
|
||
gpu_times_ms.append(float(end_event - start_event))
|
||
except Exception as e:
|
||
print(f"Benchmark iteration failed: {e}")
|
||
return 1
|
||
|
||
avg_gpu_ms = float(np.mean(gpu_times_ms))
|
||
avg_gpu_s = avg_gpu_ms / 1000.0
|
||
|
||
stream.sync()
|
||
with cp_stream:
|
||
h_result = cp.asnumpy(d_odata[:1])
|
||
gpu_result = float(h_result[0])
|
||
|
||
# Performance metrics use seconds for throughput and speedup.
|
||
# CPU time is already in seconds.
|
||
bytes_processed = num_elements * 4
|
||
throughput_gb_s = bytes_processed / avg_gpu_s / 1e9
|
||
|
||
print("\n> Performance Results:")
|
||
print(f" Average GPU time: {avg_gpu_ms:.6f} ms")
|
||
print(f" Throughput: {throughput_gb_s:.2f} GB/s")
|
||
print(f" Speedup vs CPU: {cpu_time / avg_gpu_s:.2f}x")
|
||
|
||
# Validate results
|
||
print("\n> Validating results...")
|
||
success = verify_array_result(
|
||
np.array([gpu_result]),
|
||
np.array([cpu_result]),
|
||
rtol=1e-5,
|
||
atol=1e-5,
|
||
)
|
||
|
||
# Summary
|
||
print("\n" + "=" * 70)
|
||
print("Summary")
|
||
print("=" * 70)
|
||
print(f"""
|
||
Single-kernel two-stage reduction:
|
||
Stage 1: {num_blocks} blocks → {num_blocks} partial sums
|
||
grid.sync() ← All blocks synchronize (KEY innovation)
|
||
Stage 2: Block 0 → 1 final result
|
||
Total: 1 kernel launch, {throughput_gb_s:.2f} GB/s
|
||
|
||
Comparison:
|
||
• Traditional: 2 kernel launches or kernel + CPU
|
||
• This sample: 1 kernel with grid.sync() between stages
|
||
• Benefit: Eliminates ~5-20μs launch overhead per stage
|
||
""")
|
||
|
||
print("=" * 70)
|
||
if success:
|
||
print("Single-Pass Multi-Block Reduction completed successfully!")
|
||
else:
|
||
print("Single-Pass Multi-Block Reduction FAILED!")
|
||
print("=" * 70 + "\n")
|
||
|
||
return 0 if success else 1
|
||
finally:
|
||
stream.close()
|
||
|
||
|
||
def main():
|
||
"""Main entry point with argument parsing."""
|
||
parser = argparse.ArgumentParser(
|
||
description="Single-Pass Multi-Block Reduction with Cooperative Groups"
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--n",
|
||
type=int,
|
||
default=1 << 25,
|
||
help="Number of elements to reduce (default: 33554432 = 2^25)",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--threads",
|
||
type=int,
|
||
default=None,
|
||
help=(
|
||
"Threads per block, power of 2 in [1, 1024]; "
|
||
"default: device maximum (typically 1024)"
|
||
),
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--maxblocks",
|
||
type=int,
|
||
default=None,
|
||
help=(
|
||
"Maximum number of blocks "
|
||
"(default: auto-calculated for cooperative launch)"
|
||
),
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--iterations",
|
||
type=int,
|
||
default=100,
|
||
help="Number of benchmark iterations (default: 100)",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--cuda-include-dir",
|
||
type=str,
|
||
required=True,
|
||
help=(
|
||
"CUDA include directory for NVRTC "
|
||
"(can use colon-separated paths, e.g., /path1:/path2)"
|
||
),
|
||
)
|
||
|
||
args = parser.parse_args()
|
||
|
||
# Validate arguments
|
||
if args.n <= 0:
|
||
print("Error: n must be positive")
|
||
return 1
|
||
|
||
err_threads = _validate_threads_arg(args.threads)
|
||
if err_threads:
|
||
print(f"Error: {err_threads}")
|
||
return 1
|
||
|
||
if args.maxblocks is not None and args.maxblocks <= 0:
|
||
print("Error: maxblocks must be positive")
|
||
return 1
|
||
|
||
try:
|
||
exit_code = run(
|
||
num_elements=args.n,
|
||
max_threads=args.threads,
|
||
max_blocks=args.maxblocks,
|
||
test_iterations=args.iterations,
|
||
cuda_include_dir=args.cuda_include_dir,
|
||
)
|
||
sys.exit(exit_code)
|
||
except Exception as e:
|
||
print(f"\nError: {e}")
|
||
import traceback
|
||
|
||
traceback.print_exc()
|
||
sys.exit(1)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|