mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-05-14 14:06:53 +08:00
- Added Python samples for CUDA Python 1.0 release - Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
486 lines
15 KiB
Python
486 lines
15 KiB
Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# distribution and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""
|
|
Fast Array Sum using Shared Memory - Two-Stage Reduction
|
|
|
|
Demonstrates efficient parallel reduction using shared memory and
|
|
two-stage approach to avoid atomic operation bottlenecks.
|
|
|
|
Key Features:
|
|
- Block-level reduction using shared memory
|
|
- Each thread loads 2 elements to reduce global memory traffic
|
|
- Sequential addressing tree reduction pattern
|
|
- No atomic operations - eliminates serialization bottleneck
|
|
- Device memory via CuPy; ``launch()`` takes pointers as ``ndarray.data.ptr``
|
|
- CuPy uses ``cp.cuda.Stream.from_external(stream)``.
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
try:
|
|
import cupy as cp
|
|
import numpy as np
|
|
from cuda.core import (
|
|
Device,
|
|
EventOptions,
|
|
LaunchConfig,
|
|
Program,
|
|
ProgramOptions,
|
|
launch,
|
|
)
|
|
except ImportError as e:
|
|
print(f"Error: Required package not found: {e}")
|
|
print("Please install dependencies:")
|
|
print(" pip install -r requirements.txt")
|
|
sys.exit(1)
|
|
|
|
# Import utilities
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "Utilities"))
|
|
from cuda_samples_utils import verify_array_result # noqa: E402
|
|
|
|
# Two-stage block reduction kernel
|
|
REDUCTION_KERNEL = """
|
|
/*
|
|
* Block-level reduction kernel using shared memory
|
|
*
|
|
* Strategy:
|
|
* - Each block processes blockSize * 2 elements
|
|
* - Uses shared memory for fast intra-block reduction
|
|
* - Outputs one partial sum per block (no atomics)
|
|
*
|
|
* Key optimizations:
|
|
* - Load 2 elements per thread (reduces global memory traffic by 50%)
|
|
* - Tree reduction with sequential addressing (avoids divergence)
|
|
* - Shared memory instead of atomic operations (eliminates bottleneck)
|
|
*
|
|
* Note: This sample provides separate implementations for each data type
|
|
* for clarity. Production code typically uses templates with SharedMemory<T>
|
|
* or reinterpret_cast to avoid duplication. See NVIDIA reduction guide for
|
|
* template-based approaches.
|
|
*/
|
|
|
|
extern "C" __global__ void blockReduceKernel_int(
|
|
const int *__restrict__ input,
|
|
int *__restrict__ blockSums,
|
|
unsigned int n)
|
|
{
|
|
extern __shared__ int sdata_int[];
|
|
|
|
unsigned int tid = threadIdx.x;
|
|
unsigned int blockSize = blockDim.x;
|
|
unsigned int gid = blockIdx.x * (blockSize * 2) + tid;
|
|
|
|
// Load 2 elements per thread
|
|
int sum = 0;
|
|
if (gid < n) sum += input[gid];
|
|
if (gid + blockSize < n) sum += input[gid + blockSize];
|
|
|
|
sdata_int[tid] = sum;
|
|
__syncthreads();
|
|
|
|
// Tree reduction with sequential addressing
|
|
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
|
|
if (tid < s) {
|
|
sdata_int[tid] += sdata_int[tid + s];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
|
|
// Write block result
|
|
if (tid == 0) {
|
|
blockSums[blockIdx.x] = sdata_int[0];
|
|
}
|
|
}
|
|
|
|
extern "C" __global__ void blockReduceKernel_float(
|
|
const float *__restrict__ input,
|
|
float *__restrict__ blockSums,
|
|
unsigned int n)
|
|
{
|
|
extern __shared__ float sdata_float[];
|
|
|
|
unsigned int tid = threadIdx.x;
|
|
unsigned int blockSize = blockDim.x;
|
|
unsigned int gid = blockIdx.x * (blockSize * 2) + tid;
|
|
|
|
// Load 2 elements per thread
|
|
float sum = 0.0f;
|
|
if (gid < n) sum += input[gid];
|
|
if (gid + blockSize < n) sum += input[gid + blockSize];
|
|
|
|
sdata_float[tid] = sum;
|
|
__syncthreads();
|
|
|
|
// Tree reduction with sequential addressing
|
|
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
|
|
if (tid < s) {
|
|
sdata_float[tid] += sdata_float[tid + s];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
|
|
// Write block result
|
|
if (tid == 0) {
|
|
blockSums[blockIdx.x] = sdata_float[0];
|
|
}
|
|
}
|
|
|
|
extern "C" __global__ void blockReduceKernel_double(
|
|
const double *__restrict__ input,
|
|
double *__restrict__ blockSums,
|
|
unsigned int n)
|
|
{
|
|
extern __shared__ double sdata_double[];
|
|
|
|
unsigned int tid = threadIdx.x;
|
|
unsigned int blockSize = blockDim.x;
|
|
unsigned int gid = blockIdx.x * (blockSize * 2) + tid;
|
|
|
|
// Load 2 elements per thread
|
|
double sum = 0.0;
|
|
if (gid < n) sum += input[gid];
|
|
if (gid + blockSize < n) sum += input[gid + blockSize];
|
|
|
|
sdata_double[tid] = sum;
|
|
__syncthreads();
|
|
|
|
// Tree reduction with sequential addressing
|
|
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
|
|
if (tid < s) {
|
|
sdata_double[tid] += sdata_double[tid + s];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
|
|
// Write block result
|
|
if (tid == 0) {
|
|
blockSums[blockIdx.x] = sdata_double[0];
|
|
}
|
|
}
|
|
"""
|
|
|
|
|
|
def reduce_cpu(data):
|
|
"""Compute sum using Kahan summation for numerical accuracy."""
|
|
if len(data) == 0:
|
|
return 0
|
|
|
|
sum_val = float(data[0])
|
|
c = 0.0
|
|
|
|
for i in range(1, len(data)):
|
|
y = float(data[i]) - c
|
|
t = sum_val + y
|
|
c = (t - sum_val) - y
|
|
sum_val = t
|
|
|
|
return sum_val
|
|
|
|
|
|
def _validate_threads_per_block(threads_per_block):
|
|
if threads_per_block <= 0 or threads_per_block > 1024:
|
|
return "threads per block must be between 1 and 1024"
|
|
if (threads_per_block & (threads_per_block - 1)) != 0:
|
|
return (
|
|
"threads per block must be a power of 2 "
|
|
"(required by the shared-memory tree reduction kernel)"
|
|
)
|
|
return None
|
|
|
|
|
|
def run(
|
|
num_elements=1 << 24, threads_per_block=256, test_iterations=100, datatype="float"
|
|
):
|
|
"""Run two-stage reduction benchmark."""
|
|
|
|
print("\n" + "=" * 70)
|
|
print("Fast Array Sum using Shared Memory - Two-Stage Reduction")
|
|
print("=" * 70)
|
|
print("\nDemonstrates: Efficient parallel reduction using shared memory")
|
|
|
|
# Map datatype
|
|
dtype_map = {"int": np.int32, "float": np.float32, "double": np.float64}
|
|
if datatype not in dtype_map:
|
|
print(f"Unknown datatype '{datatype}', using 'float'")
|
|
datatype = "float"
|
|
dtype = dtype_map[datatype]
|
|
itemsize = np.dtype(dtype).itemsize
|
|
|
|
# Initialize device
|
|
device = Device()
|
|
device.set_current()
|
|
major, minor = device.compute_capability
|
|
|
|
print("\nDevice Information:")
|
|
print(f" Name: {device.name}")
|
|
print(f" Compute Capability: sm_{major}.{minor}")
|
|
|
|
# Configuration
|
|
print("\nConfiguration:")
|
|
print(f" Array size: {num_elements:,} elements")
|
|
print(f" Data type: {datatype}")
|
|
print(f" Memory: {num_elements * itemsize / (1024**2):.2f} MB")
|
|
print(f" Threads per block: {threads_per_block}")
|
|
|
|
# Calculate number of blocks
|
|
# Each block processes threads_per_block * 2 elements
|
|
num_blocks = (num_elements + threads_per_block * 2 - 1) // (threads_per_block * 2)
|
|
|
|
print("\nTwo-Stage Reduction Strategy:")
|
|
print(" Stage 1: GPU block reduction")
|
|
print(f" - Number of blocks: {num_blocks}")
|
|
print(f" - Elements per block: {threads_per_block * 2}")
|
|
print(f" - Output: {num_blocks} partial sums")
|
|
print(" Stage 2: CPU final reduction")
|
|
print(f" - Combine {num_blocks} partial sums → 1 final result")
|
|
|
|
# Compile kernel
|
|
print("\nCompiling CUDA kernel...")
|
|
program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
|
|
prog = Program(REDUCTION_KERNEL, code_type="c++", options=program_options)
|
|
mod = prog.compile("cubin")
|
|
kernel_name = f"blockReduceKernel_{datatype}"
|
|
kernel = mod.get_kernel(kernel_name)
|
|
print(f" Kernel '{kernel_name}' compiled successfully")
|
|
|
|
# Generate input data
|
|
print("\n> Generating random input data...")
|
|
rng = np.random.default_rng(42)
|
|
if datatype == "int":
|
|
h_input = rng.integers(0, 256, size=num_elements, dtype=dtype)
|
|
else:
|
|
h_input = (rng.random(num_elements) * 256).astype(dtype)
|
|
|
|
# cuda.core stream for launch/events; CuPy copies use the same stream via
|
|
# Stream.from_external.
|
|
stream = device.create_stream()
|
|
cp_stream = cp.cuda.Stream.from_external(stream)
|
|
try:
|
|
d_blockSums = cp.empty(num_blocks, dtype=dtype)
|
|
with cp_stream:
|
|
d_input = cp.asarray(h_input, dtype=dtype)
|
|
stream.sync()
|
|
|
|
# Compute CPU reference
|
|
print("> Computing reference result on CPU...")
|
|
cpu_start = time.perf_counter()
|
|
cpu_result = reduce_cpu(h_input)
|
|
cpu_time = time.perf_counter() - cpu_start
|
|
print(f" CPU time: {cpu_time:.6f} seconds")
|
|
|
|
# Configure launch
|
|
shared_mem_bytes = threads_per_block * itemsize
|
|
config = LaunchConfig(
|
|
grid=num_blocks, block=threads_per_block, shmem_size=shared_mem_bytes
|
|
)
|
|
|
|
# Warm-up
|
|
print("\n> Warming up GPU...")
|
|
launch(
|
|
stream,
|
|
config,
|
|
kernel,
|
|
d_input.data.ptr,
|
|
d_blockSums.data.ptr,
|
|
np.uint32(num_elements),
|
|
)
|
|
stream.sync()
|
|
print(" Warm-up completed")
|
|
|
|
# Benchmark Stage 1 (GPU)
|
|
print("\n> Benchmarking Stage 1 (GPU block reduction)...")
|
|
print(f" Running {test_iterations} iterations...")
|
|
|
|
# cuda.core event elapsed time (end - start) is in milliseconds (CUDA API).
|
|
stage1_times_ms = []
|
|
event_options = EventOptions(enable_timing=True)
|
|
start_event = stream.device.create_event(options=event_options)
|
|
end_event = stream.device.create_event(options=event_options)
|
|
for _ in range(test_iterations):
|
|
stream.record(start_event)
|
|
launch(
|
|
stream,
|
|
config,
|
|
kernel,
|
|
d_input.data.ptr,
|
|
d_blockSums.data.ptr,
|
|
np.uint32(num_elements),
|
|
)
|
|
stream.record(end_event)
|
|
end_event.sync()
|
|
stage1_times_ms.append(float(end_event - start_event))
|
|
|
|
avg_stage1_ms = np.mean(stage1_times_ms)
|
|
avg_stage1_s = avg_stage1_ms / 1000.0
|
|
|
|
# Stage 2 (CPU)
|
|
print("\n> Running Stage 2 (CPU final reduction)...")
|
|
# Device → Host: after stream sync, partial sums are visible on host.
|
|
stream.sync()
|
|
with cp_stream:
|
|
h_blockSums = cp.asnumpy(d_blockSums)
|
|
stage2_start = time.perf_counter()
|
|
gpu_result = float(np.sum(h_blockSums))
|
|
stage2_time = time.perf_counter() - stage2_start
|
|
|
|
total_time = avg_stage1_s + stage2_time
|
|
|
|
# Performance metrics (use seconds for throughput; CPU times are in seconds)
|
|
bytes_processed = num_elements * itemsize
|
|
throughput = bytes_processed / avg_stage1_s / 1e9
|
|
|
|
print("\n" + "=" * 70)
|
|
print("Performance Results")
|
|
print("=" * 70)
|
|
print("\nStage 1 (GPU block reduction):")
|
|
print(f" Average time: {avg_stage1_ms:.6f} ms")
|
|
print(f" Throughput: {throughput:.2f} GB/s")
|
|
print("\nStage 2 (CPU final reduction):")
|
|
print(f" Time: {stage2_time * 1000:.6f} ms")
|
|
print(f" ({num_blocks} partial sums)")
|
|
print(f"\nTotal time: {total_time * 1000:.6f} ms")
|
|
print(f"Speedup vs CPU: {cpu_time / total_time:.2f}x")
|
|
|
|
# Validation
|
|
print("\n> Validating results...")
|
|
if datatype == "int":
|
|
print(f" GPU result: {int(gpu_result):,}")
|
|
print(f" CPU result: {int(cpu_result):,}")
|
|
rtol, atol = 0.0, 0.0
|
|
else:
|
|
precision = 8 if datatype == "float" else 12
|
|
print(f" GPU result: {gpu_result:.{precision}f}")
|
|
print(f" CPU result: {cpu_result:.{precision}f}")
|
|
if datatype == "float":
|
|
rtol, atol = 1e-5, 1e-8 * num_elements
|
|
else: # double
|
|
rtol, atol = 1e-8, 1e-12 * num_elements
|
|
|
|
success = verify_array_result(
|
|
np.array([gpu_result]),
|
|
np.array([cpu_result]),
|
|
rtol=rtol,
|
|
atol=atol,
|
|
verbose=True,
|
|
)
|
|
|
|
# Summary
|
|
print("\n" + "=" * 70)
|
|
print("Summary")
|
|
print("=" * 70)
|
|
print("Key optimizations:")
|
|
half_reads = num_elements // 2
|
|
print(
|
|
" - Load 2 elements per thread: "
|
|
f"{half_reads:,} global reads (50% savings)"
|
|
)
|
|
print(" - Shared memory for reduction: ~10-20x faster than global memory")
|
|
print(f" - Parallel block outputs: {num_blocks} independent writes")
|
|
print(f"Result: {throughput:.2f} GB/s throughput")
|
|
|
|
print("=" * 70)
|
|
if success:
|
|
print("Two-Stage Reduction completed successfully!")
|
|
else:
|
|
print("Two-Stage Reduction FAILED!")
|
|
print("=" * 70 + "\n")
|
|
|
|
return 0 if success else 1
|
|
finally:
|
|
stream.close()
|
|
|
|
|
|
def main():
|
|
"""Main entry point with argument parsing."""
|
|
parser = argparse.ArgumentParser(
|
|
description="Two-Stage Reduction with Shared Memory",
|
|
epilog="See README.md for usage examples and detailed documentation.",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--n",
|
|
type=int,
|
|
default=1 << 24,
|
|
help="Number of elements to reduce (default: 16777216 = 2^24)",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--threads",
|
|
type=int,
|
|
default=256,
|
|
help="Threads per block, power of 2 in [1, 1024] (default: 256)",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--type",
|
|
type=str,
|
|
default="float",
|
|
choices=["int", "float", "double"],
|
|
help="Data type for reduction (default: float)",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--iterations",
|
|
type=int,
|
|
default=100,
|
|
help="Number of benchmark iterations (default: 100)",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Validate arguments
|
|
if args.n <= 0:
|
|
print("Error: n must be positive")
|
|
return 1
|
|
|
|
err = _validate_threads_per_block(args.threads)
|
|
if err:
|
|
print(f"Error: {err}")
|
|
return 1
|
|
|
|
try:
|
|
exit_code = run(
|
|
num_elements=args.n,
|
|
threads_per_block=args.threads,
|
|
test_iterations=args.iterations,
|
|
datatype=args.type,
|
|
)
|
|
sys.exit(exit_code)
|
|
except Exception as e:
|
|
print(f"\nError: {e}")
|
|
import traceback
|
|
|
|
traceback.print_exc()
|
|
sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|