# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parallel Reduction using cuda.core and cuda.compute

Demonstrates efficient parallel summation of large arrays on GPU:
1. Custom CUDA kernel showing reduction tree pattern and synchronization
2. cuda.compute.reduce_into() for production-ready reduction

Key Concepts:
- Reduction tree pattern: Divide-and-conquer parallel algorithm
- Thread synchronization: Using __syncthreads() for coordination
- Sequential thread IDs: How to avoid warp divergence
- cuda.core Stream integration with CuPy via ExternalStream
"""

import math
import sys
from pathlib import Path

# Add Utilities to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import cupy as cp
    import numpy as np
    from cuda.compute import OpKind, reduce_into
    from cuda.core import (
        Device,
        Kernel,
        LaunchConfig,
        Program,
        ProgramOptions,
        Stream,
        launch,
    )

    from cuda_samples_utils import print_gpu_info, verify_array_result
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)

# =============================================================================
# CUDA Kernel: Parallel Reduction (optimized - no warp divergence)
# =============================================================================
REDUCTION_KERNEL: str = r"""
extern "C" __global__
void reduce_sum(const float* __restrict__ input,
                float* __restrict__ output,
                int n)
{
    /*
     * Parallel reduction using grid-stride loop (canonical pattern) and
     * sequential thread IDs for the reduction tree (avoids warp divergence).
     *
     * Grid-stride loop: each thread processes multiple elements
     *   for (i = tid; i < n; i += gridDim.x * blockDim.x)
     *
     * Reduction tree: sequential addressing keeps warps coherent.
     */
    extern __shared__ float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int grid_stride = (unsigned int)gridDim.x * blockDim.x;
    // Cast once so the loop compares unsigned-to-unsigned (n >= 0 on the host).
    unsigned int count = (unsigned int)n;

    float sum = 0.0f;
    for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += grid_stride) {
        sum += input[i];
    }
    sdata[tid] = sum;
    __syncthreads();

    // Reduction in shared memory (sequential addressing - no divergence).
    // NOTE: this halving loop is only correct when blockDim.x is a power
    // of two; the host-side wrapper enforces that.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();  // Wait for all threads before next iteration
    }

    // Thread 0 writes block result
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
"""


def compile_kernel(device: Device) -> Kernel:
    """Compile the reduction kernel for the given device.

    Targets the device's compute capability (e.g. ``sm_90``) and returns
    the ``reduce_sum`` kernel from the compiled cubin.
    """
    arch = f"sm_{device.arch}"
    options = ProgramOptions(arch=arch)
    program = Program(REDUCTION_KERNEL, code_type="c++", options=options)
    return program.compile(target_type="cubin").get_kernel("reduce_sum")


def reduction_stage_output_counts(n: int, block_size: int) -> list[int]:
    """Lengths of intermediate arrays for each multi-launch reduction stage.

    Each stage reduces ``n`` elements to ``ceil(n / block_size)`` partial
    sums; the list ends when a single value remains (empty for ``n <= 1``).
    """
    counts: list[int] = []
    while n > 1:
        num_blocks = math.ceil(n / block_size)
        counts.append(num_blocks)
        n = num_blocks
    return counts


def reduce_custom(
    stream: Stream,
    kernel: Kernel,
    d_input: cp.ndarray,
    block_size: int = 256,
    sync: bool = True,
    work_buffers: list[cp.ndarray] | None = None,
) -> float | cp.ndarray:
    """
    Perform parallel reduction using custom CUDA kernel.

    Uses multiple kernel launches to reduce array to single value.
    Each launch reduces by factor of block_size.

    When sync=True (default), syncs and returns the scalar result.
    When sync=False, returns the 1-element array without syncing; caller
    must sync before reading (avoids host overhead in benchmarks).

    work_buffers: optional list of device arrays, one per stage, with
    length at least each stage's output count (from
    ``reduction_stage_output_counts``). When provided, avoids per-call
    allocation (e.g. for benchmarking).

    Raises ValueError if block_size is not a positive power of two, or if
    work_buffers does not match the required stage layout.
    """
    # The in-kernel reduction tree halves its stride each iteration, so a
    # non-power-of-two block size would silently drop partial sums.
    if block_size <= 0 or block_size & (block_size - 1):
        msg = f"block_size must be a positive power of two, got {block_size}"
        raise ValueError(msg)

    n = len(d_input)

    # Empty input: nothing to launch; the sum of an empty array is 0.0.
    # (The original code would raise IndexError on current[0] here.)
    if n == 0:
        if sync:
            return 0.0
        return cp.zeros(1, dtype=cp.float32)

    current = d_input
    stage = 0
    if work_buffers is not None:
        expected_counts = reduction_stage_output_counts(n, block_size)
        if len(work_buffers) != len(expected_counts):
            msg = (
                f"work_buffers length {len(work_buffers)} != "
                f"{len(expected_counts)} stages"
            )
            raise ValueError(msg)

    while n > 1:
        num_blocks = math.ceil(n / block_size)
        if work_buffers is not None:
            d_output = work_buffers[stage]
            if d_output.size < num_blocks:
                msg = f"work_buffers[{stage}] size {d_output.size} < {num_blocks}"
                raise ValueError(msg)
            if d_output.size != num_blocks:
                d_output = d_output[:num_blocks]
        else:
            d_output = cp.empty(num_blocks, dtype=cp.float32)

        config = LaunchConfig(
            grid=(num_blocks, 1, 1),
            block=(block_size, 1, 1),
            shmem_size=block_size * 4,  # float = 4 bytes
        )
        launch(
            stream,
            config,
            kernel,
            current.data.ptr,
            d_output.data.ptr,
            np.int32(n),
        )

        current = d_output
        n = num_blocks
        stage += 1

    if sync:
        stream.sync()
        return float(current[0])
    return current


def benchmark_custom(
    stream: Stream,
    kernel: Kernel,
    d_input: cp.ndarray,
    num_runs: int = 10,
    block_size: int = 256,
) -> tuple[float, float]:
    """Benchmark custom reduction kernel using cuda.core events.

    Returns (result, mean_time_ms). Pre-allocates per-stage work buffers
    once so the timed loop measures kernel work, not allocation.
    """
    stage_counts = reduction_stage_output_counts(len(d_input), block_size)
    work_buffers = [cp.empty(c, dtype=cp.float32) for c in stage_counts]

    # Warmup run (with sync to get valid result)
    _ = reduce_custom(
        stream, kernel, d_input, block_size=block_size, work_buffers=work_buffers
    )

    event_opts = {"enable_timing": True}
    start_event = stream.device.create_event(options=event_opts)
    end_event = stream.device.create_event(options=event_opts)

    times: list[float] = []
    result = 0.0
    for _ in range(num_runs):
        stream.record(start_event)
        d_result = reduce_custom(
            stream,
            kernel,
            d_input,
            block_size=block_size,
            sync=False,
            work_buffers=work_buffers,
        )
        stream.record(end_event)
        end_event.sync()
        result = float(d_result[0])
        times.append(end_event - start_event)

    return result, float(np.mean(times))


def benchmark_cuda_compute(
    stream: Stream,
    d_input: cp.ndarray,
    num_runs: int = 10,
) -> tuple[float, float]:
    """Benchmark cuda.compute.reduce_into() using cuda.core events.

    Returns (result, mean_time_ms). The warmup call absorbs one-time JIT
    compilation so timed runs reflect steady-state performance.
    """
    h_init = np.array([0.0], dtype=np.float32)

    # Warmup (includes JIT compilation)
    d_warmup = cp.empty(1, dtype=cp.float32)
    reduce_into(
        d_in=d_input,
        d_out=d_warmup,
        op=OpKind.PLUS,
        num_items=len(d_input),
        h_init=h_init,
        stream=stream,
    )
    stream.sync()

    d_output = cp.empty(1, dtype=cp.float32)
    event_opts = {"enable_timing": True}
    start_event = stream.device.create_event(options=event_opts)
    end_event = stream.device.create_event(options=event_opts)

    times: list[float] = []
    result = 0.0
    for _ in range(num_runs):
        stream.record(start_event)
        reduce_into(
            d_in=d_input,
            d_out=d_output,
            op=OpKind.PLUS,
            num_items=len(d_input),
            h_init=h_init,
            stream=stream,
        )
        stream.record(end_event)
        end_event.sync()
        result = float(d_output[0])
        times.append(end_event - start_event)

    return result, float(np.mean(times))


def main() -> bool:
    """Main function demonstrating parallel reduction.

    Returns True when both the custom-kernel and cuda.compute results
    match the CPU reference sum within tolerance.
    """
    print("=" * 70)
    print("Parallel Reduction - Efficient GPU Array Summation")
    print("=" * 70)

    device = Device(0)
    device.set_current()
    stream = device.create_stream()
    # Wrap the cuda.core stream so CuPy allocations/copies run on it too.
    cp_stream = cp.cuda.ExternalStream(int(stream.handle))

    print()
    print_gpu_info(device)

    array_size = 1 << 20  # 1M elements
    h_input = np.random.rand(array_size).astype(np.float32)
    expected_sum = float(np.sum(h_input))

    print(f"\nArray size: {array_size:,} elements ({array_size * 4 / 1e6:.1f} MB)")
    print(f"Expected sum: {expected_sum:.6f}")

    print("\nCompiling custom CUDA kernel...")
    kernel = compile_kernel(device)

    try:
        with cp_stream:
            d_input = cp.asarray(h_input)

        # ======================================================================
        # Part 1: Custom Kernel
        # ======================================================================
        print("\n" + "=" * 70)
        print("PART 1: Custom Kernel (Educational)")
        print("=" * 70)

        result, time_ms = benchmark_custom(stream, kernel, d_input)
        print(f"\nReduction tree kernel: {result:>14.2f}")
        print(f"Expected:              {expected_sum:>14.2f}")
        print(f"Time:                  {time_ms:>14.3f} ms")

        # ======================================================================
        # Part 2: cuda.compute (Production)
        # ======================================================================
        print("\n" + "=" * 70)
        print("PART 2: cuda.compute.reduce_into() (Production)")
        print("=" * 70)

        result_cc, time_cc = benchmark_cuda_compute(stream, d_input)
        print(f"\ncuda.compute result: {result_cc:>14.2f}")
        print(f"Expected:            {expected_sum:>14.2f}")
        print(f"Time:                {time_cc:>14.3f} ms")

        # Verify both results using principled rtol/atol
        with cp_stream:
            d_expected = cp.array([expected_sum], dtype=cp.float32)
            custom_ok = verify_array_result(
                cp.array([result], dtype=cp.float32),
                d_expected,
                rtol=1e-5,
                atol=1e-8,
                verbose=False,
            )
            compute_ok = verify_array_result(
                cp.array([result_cc], dtype=cp.float32),
                d_expected,
                rtol=1e-5,
                atol=1e-8,
                verbose=False,
            )

        if custom_ok and compute_ok:
            print("\nTest PASSED!")
            return True
        else:
            print("\nTest FAILED - Error too large!")
            return False
    finally:
        stream.close()


if __name__ == "__main__":
    sys.exit(0 if main() else 1)