cuda-samples/python/2_CoreConcepts/reduction/reduction.py

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    distribution and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Fast Array Sum using Shared Memory - Two-Stage Reduction

Demonstrates efficient parallel reduction using shared memory and
two-stage approach to avoid atomic operation bottlenecks.

Key Features:
- Block-level reduction using shared memory
- Each thread loads 2 elements to reduce global memory traffic
- Sequential addressing tree reduction pattern
- No atomic operations - eliminates serialization bottleneck
- Device memory via CuPy; ``launch()`` takes pointers as ``ndarray.data.ptr``
- CuPy uses ``cp.cuda.Stream.from_external(stream)``.
"""

import argparse
import os
import sys
import time

try:
    import cupy as cp
    import numpy as np
    from cuda.core import (
        Device,
        EventOptions,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install dependencies:")
    print("  pip install -r requirements.txt")
    sys.exit(1)

# Import utilities
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "Utilities"))
from cuda_samples_utils import verify_array_result  # noqa: E402

# Two-stage block reduction kernel
REDUCTION_KERNEL = """
/*
 * Block-level reduction kernel using shared memory
 *
 * Strategy:
 *   - Each block processes blockSize * 2 elements
 *   - Uses shared memory for fast intra-block reduction
 *   - Outputs one partial sum per block (no atomics)
 *
 * Key optimizations:
 *   - Load 2 elements per thread (reduces global memory traffic by 50%)
 *   - Tree reduction with sequential addressing (avoids divergence)
 *   - Shared memory instead of atomic operations (eliminates bottleneck)
 *
 * Note: This sample provides separate implementations for each data type
 * for clarity. Production code typically uses templates with SharedMemory<T>
 * or reinterpret_cast to avoid duplication. See NVIDIA reduction guide for
 * template-based approaches.
 */

extern "C" __global__ void blockReduceKernel_int(
    const int *__restrict__ input,
    int *__restrict__ blockSums,
    unsigned int n)
{
    extern __shared__ int sdata_int[];

    unsigned int tid = threadIdx.x;
    unsigned int blockSize = blockDim.x;
    unsigned int gid = blockIdx.x * (blockSize * 2) + tid;

    // Load 2 elements per thread
    int sum = 0;
    if (gid < n) sum += input[gid];
    if (gid + blockSize < n) sum += input[gid + blockSize];

    sdata_int[tid] = sum;
    __syncthreads();

    // Tree reduction with sequential addressing
    for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata_int[tid] += sdata_int[tid + s];
        }
        __syncthreads();
    }

    // Write block result
    if (tid == 0) {
        blockSums[blockIdx.x] = sdata_int[0];
    }
}

extern "C" __global__ void blockReduceKernel_float(
    const float *__restrict__ input,
    float *__restrict__ blockSums,
    unsigned int n)
{
    extern __shared__ float sdata_float[];

    unsigned int tid = threadIdx.x;
    unsigned int blockSize = blockDim.x;
    unsigned int gid = blockIdx.x * (blockSize * 2) + tid;

    // Load 2 elements per thread
    float sum = 0.0f;
    if (gid < n) sum += input[gid];
    if (gid + blockSize < n) sum += input[gid + blockSize];

    sdata_float[tid] = sum;
    __syncthreads();

    // Tree reduction with sequential addressing
    for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata_float[tid] += sdata_float[tid + s];
        }
        __syncthreads();
    }

    // Write block result
    if (tid == 0) {
        blockSums[blockIdx.x] = sdata_float[0];
    }
}

extern "C" __global__ void blockReduceKernel_double(
    const double *__restrict__ input,
    double *__restrict__ blockSums,
    unsigned int n)
{
    extern __shared__ double sdata_double[];

    unsigned int tid = threadIdx.x;
    unsigned int blockSize = blockDim.x;
    unsigned int gid = blockIdx.x * (blockSize * 2) + tid;

    // Load 2 elements per thread
    double sum = 0.0;
    if (gid < n) sum += input[gid];
    if (gid + blockSize < n) sum += input[gid + blockSize];

    sdata_double[tid] = sum;
    __syncthreads();

    // Tree reduction with sequential addressing
    for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata_double[tid] += sdata_double[tid + s];
        }
        __syncthreads();
    }

    // Write block result
    if (tid == 0) {
        blockSums[blockIdx.x] = sdata_double[0];
    }
}
"""


def reduce_cpu(data):
    """Compute sum using Kahan summation for numerical accuracy."""
    if len(data) == 0:
        return 0

    sum_val = float(data[0])
    c = 0.0

    for i in range(1, len(data)):
        y = float(data[i]) - c
        t = sum_val + y
        c = (t - sum_val) - y
        sum_val = t

    return sum_val


def _validate_threads_per_block(threads_per_block):
    if threads_per_block <= 0 or threads_per_block > 1024:
        return "threads per block must be between 1 and 1024"
    if (threads_per_block & (threads_per_block - 1)) != 0:
        return (
            "threads per block must be a power of 2 "
            "(required by the shared-memory tree reduction kernel)"
        )
    return None


def run(
    num_elements=1 << 24, threads_per_block=256, test_iterations=100, datatype="float"
):
    """Run two-stage reduction benchmark."""

    print("\n" + "=" * 70)
    print("Fast Array Sum using Shared Memory - Two-Stage Reduction")
    print("=" * 70)
    print("\nDemonstrates: Efficient parallel reduction using shared memory")

    # Map datatype
    dtype_map = {"int": np.int32, "float": np.float32, "double": np.float64}
    if datatype not in dtype_map:
        print(f"Unknown datatype '{datatype}', using 'float'")
        datatype = "float"
    dtype = dtype_map[datatype]
    itemsize = np.dtype(dtype).itemsize

    # Initialize device
    device = Device()
    device.set_current()
    major, minor = device.compute_capability

    print("\nDevice Information:")
    print(f"  Name: {device.name}")
    print(f"  Compute Capability: sm_{major}.{minor}")

    # Configuration
    print("\nConfiguration:")
    print(f"  Array size: {num_elements:,} elements")
    print(f"  Data type: {datatype}")
    print(f"  Memory: {num_elements * itemsize / (1024**2):.2f} MB")
    print(f"  Threads per block: {threads_per_block}")

    # Calculate number of blocks
    # Each block processes threads_per_block * 2 elements
    num_blocks = (num_elements + threads_per_block * 2 - 1) // (threads_per_block * 2)

    print("\nTwo-Stage Reduction Strategy:")
    print("  Stage 1: GPU block reduction")
    print(f"    - Number of blocks: {num_blocks}")
    print(f"    - Elements per block: {threads_per_block * 2}")
    print(f"    - Output: {num_blocks} partial sums")
    print("  Stage 2: CPU final reduction")
    print(f"    - Combine {num_blocks} partial sums → 1 final result")

    # Compile kernel
    print("\nCompiling CUDA kernel...")
    program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    prog = Program(REDUCTION_KERNEL, code_type="c++", options=program_options)
    mod = prog.compile("cubin")
    kernel_name = f"blockReduceKernel_{datatype}"
    kernel = mod.get_kernel(kernel_name)
    print(f"  Kernel '{kernel_name}' compiled successfully")

    # Generate input data
    print("\n> Generating random input data...")
    rng = np.random.default_rng(42)
    if datatype == "int":
        h_input = rng.integers(0, 256, size=num_elements, dtype=dtype)
    else:
        h_input = (rng.random(num_elements) * 256).astype(dtype)

    # cuda.core stream for launch/events; CuPy copies use the same stream via
    # Stream.from_external.
    stream = device.create_stream()
    cp_stream = cp.cuda.Stream.from_external(stream)
    try:
        d_blockSums = cp.empty(num_blocks, dtype=dtype)
        with cp_stream:
            d_input = cp.asarray(h_input, dtype=dtype)
        stream.sync()

        # Compute CPU reference
        print("> Computing reference result on CPU...")
        cpu_start = time.perf_counter()
        cpu_result = reduce_cpu(h_input)
        cpu_time = time.perf_counter() - cpu_start
        print(f"  CPU time: {cpu_time:.6f} seconds")

        # Configure launch
        shared_mem_bytes = threads_per_block * itemsize
        config = LaunchConfig(
            grid=num_blocks, block=threads_per_block, shmem_size=shared_mem_bytes
        )

        # Warm-up
        print("\n> Warming up GPU...")
        launch(
            stream,
            config,
            kernel,
            d_input.data.ptr,
            d_blockSums.data.ptr,
            np.uint32(num_elements),
        )
        stream.sync()
        print("  Warm-up completed")

        # Benchmark Stage 1 (GPU)
        print("\n> Benchmarking Stage 1 (GPU block reduction)...")
        print(f"  Running {test_iterations} iterations...")

        # cuda.core event elapsed time (end - start) is in milliseconds (CUDA API).
        stage1_times_ms = []
        event_options = EventOptions(enable_timing=True)
        start_event = stream.device.create_event(options=event_options)
        end_event = stream.device.create_event(options=event_options)
        for _ in range(test_iterations):
            stream.record(start_event)
            launch(
                stream,
                config,
                kernel,
                d_input.data.ptr,
                d_blockSums.data.ptr,
                np.uint32(num_elements),
            )
            stream.record(end_event)
            end_event.sync()
            stage1_times_ms.append(float(end_event - start_event))

        avg_stage1_ms = np.mean(stage1_times_ms)
        avg_stage1_s = avg_stage1_ms / 1000.0

        # Stage 2 (CPU)
        print("\n> Running Stage 2 (CPU final reduction)...")
        # Device → Host: after stream sync, partial sums are visible on host.
        stream.sync()
        with cp_stream:
            h_blockSums = cp.asnumpy(d_blockSums)
        stage2_start = time.perf_counter()
        gpu_result = float(np.sum(h_blockSums))
        stage2_time = time.perf_counter() - stage2_start

        total_time = avg_stage1_s + stage2_time

        # Performance metrics (use seconds for throughput; CPU times are in seconds)
        bytes_processed = num_elements * itemsize
        throughput = bytes_processed / avg_stage1_s / 1e9

        print("\n" + "=" * 70)
        print("Performance Results")
        print("=" * 70)
        print("\nStage 1 (GPU block reduction):")
        print(f"  Average time: {avg_stage1_ms:.6f} ms")
        print(f"  Throughput: {throughput:.2f} GB/s")
        print("\nStage 2 (CPU final reduction):")
        print(f"  Time: {stage2_time * 1000:.6f} ms")
        print(f"  ({num_blocks} partial sums)")
        print(f"\nTotal time: {total_time * 1000:.6f} ms")
        print(f"Speedup vs CPU: {cpu_time / total_time:.2f}x")

        # Validation
        print("\n> Validating results...")
        if datatype == "int":
            print(f"  GPU result: {int(gpu_result):,}")
            print(f"  CPU result: {int(cpu_result):,}")
            rtol, atol = 0.0, 0.0
        else:
            precision = 8 if datatype == "float" else 12
            print(f"  GPU result: {gpu_result:.{precision}f}")
            print(f"  CPU result: {cpu_result:.{precision}f}")
            if datatype == "float":
                rtol, atol = 1e-5, 1e-8 * num_elements
            else:  # double
                rtol, atol = 1e-8, 1e-12 * num_elements

        success = verify_array_result(
            np.array([gpu_result]),
            np.array([cpu_result]),
            rtol=rtol,
            atol=atol,
            verbose=True,
        )

        # Summary
        print("\n" + "=" * 70)
        print("Summary")
        print("=" * 70)
        print("Key optimizations:")
        half_reads = num_elements // 2
        print(
            "  - Load 2 elements per thread: "
            f"{half_reads:,} global reads (50% savings)"
        )
        print("  - Shared memory for reduction: ~10-20x faster than global memory")
        print(f"  - Parallel block outputs: {num_blocks} independent writes")
        print(f"Result: {throughput:.2f} GB/s throughput")

        print("=" * 70)
        if success:
            print("Two-Stage Reduction completed successfully!")
        else:
            print("Two-Stage Reduction FAILED!")
        print("=" * 70 + "\n")

        return 0 if success else 1
    finally:
        stream.close()


def main():
    """Main entry point with argument parsing."""
    parser = argparse.ArgumentParser(
        description="Two-Stage Reduction with Shared Memory",
        epilog="See README.md for usage examples and detailed documentation.",
    )

    parser.add_argument(
        "--n",
        type=int,
        default=1 << 24,
        help="Number of elements to reduce (default: 16777216 = 2^24)",
    )

    parser.add_argument(
        "--threads",
        type=int,
        default=256,
        help="Threads per block, power of 2 in [1, 1024] (default: 256)",
    )

    parser.add_argument(
        "--type",
        type=str,
        default="float",
        choices=["int", "float", "double"],
        help="Data type for reduction (default: float)",
    )

    parser.add_argument(
        "--iterations",
        type=int,
        default=100,
        help="Number of benchmark iterations (default: 100)",
    )

    args = parser.parse_args()

    # Validate arguments
    if args.n <= 0:
        print("Error: n must be positive")
        return 1

    err = _validate_threads_per_block(args.threads)
    if err:
        print(f"Error: {err}")
        return 1

    try:
        exit_code = run(
            num_elements=args.n,
            threads_per_block=args.threads,
            test_iterations=args.iterations,
            datatype=args.type,
        )
        sys.exit(exit_code)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()