# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Block-wise Array Sum with Threaded Access

Demonstrates thread/block indexing, strided loops, and block-wise reduction.

Key Concepts:
    Global Thread ID = blockIdx.x * blockDim.x + threadIdx.x
    Stride = blockDim.x * gridDim.x
"""

import sys
from pathlib import Path

# Make the shared sample helpers importable: the `Utilities` directory sits
# two levels above the directory that contains this file.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result

# Import the GPU stack defensively so that a missing package produces an
# actionable message and a non-zero exit code instead of a raw traceback.
try:
    import cupy as cp
    import numpy as np
    from cuda.core import (
        Device,
        EventOptions,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Install with: pip install -r requirements.txt")
    sys.exit(1)


# CUDA C++ source for the three demonstration kernels. The string is compiled
# at runtime, so everything inside it (including the C++ comments) is part of
# the program handed to the compiler.
KERNELS_CODE: str = r"""
// Each thread processes one element.
// The global index is computed in size_t: blockIdx.x * blockDim.x is a
// 32-bit unsigned product and would wrap for very large grids if it were
// only widened after the multiplication.
extern "C" __global__
void simple_indexing(const float* input, float* output, size_t N) {
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        output[tid] = input[tid] * 2.0f;
    }
}

// Each thread processes multiple elements via strided access:
// thread `tid` handles tid, tid + stride, tid + 2 * stride, ...
extern "C" __global__
void strided_loop(const float* input, float* output, size_t N) {
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = tid; i < N; i += stride) {
        output[i] = input[i] * 2.0f;
    }
}

// Block-wise partial sum with shared memory reduction.
// Requires blockDim.x * sizeof(float) bytes of dynamic shared memory.
// Writes one value per block to partial_sums[blockIdx.x].
extern "C" __global__
void block_partial_sum(const float* input, float* partial_sums, size_t N) {
    extern __shared__ float sdata[];

    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int local_tid = threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    // Each thread accumulates multiple elements (strided)
    float sum = 0.0f;
    for (size_t i = tid; i < N; i += stride) {
        sum += input[i];
    }
    sdata[local_tid] = sum;
    __syncthreads();

    // Block-level tree reduction. `active` is the number of live slots in
    // sdata; each step folds the upper part onto the lower part. Rounding
    // `half` up keeps the middle slot when `active` is odd, so block sizes
    // that are not a power of two are reduced correctly as well. For
    // power-of-two block sizes the order of additions matches the classic
    // halving loop.
    for (unsigned int active = blockDim.x; active > 1; ) {
        unsigned int half = (active + 1) / 2;
        if (local_tid < active - half) {
            sdata[local_tid] += sdata[local_tid + half];
        }
        // `active` is uniform across the block, so every thread reaches
        // this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    if (local_tid == 0) {
        partial_sums[blockIdx.x] = sdata[0];
    }
}
"""


def _expected_partial_sums(
    num_elements: int, num_blocks: int, threads_per_block: int
) -> np.ndarray:
    """Return the per-block sums `block_partial_sum` yields for an all-ones input.

    Thread ``t`` of the grid visits indices ``t, t + stride, t + 2*stride, ...``
    below ``num_elements`` where ``stride = num_blocks * threads_per_block``,
    so it contributes ``ceil((num_elements - t) / stride)`` ones (zero when
    ``t >= num_elements``). A block's sum is the total over its threads.

    The result is a float32 array of shape ``(num_blocks,)``. It is exact as
    long as each block total stays below 2**24, the float32 integer limit.
    """
    stride = num_blocks * threads_per_block
    thread_ids = np.arange(stride, dtype=np.int64)
    remaining = np.maximum(num_elements - thread_ids, 0)
    visits_per_thread = (remaining + stride - 1) // stride
    per_block = visits_per_thread.reshape(num_blocks, threads_per_block).sum(axis=1)
    return per_block.astype(np.float32)


def run_sample(num_elements: int = 1024 * 1024, device_id: int = 0) -> bool:
    """
    Run block-wise sum demonstration.

    Compiles the three kernels in ``KERNELS_CODE``, runs each once and checks
    its output with ``verify_array_result``, then times 100 launches of the
    block-wise sum kernel and prints the average time and read bandwidth.

    Parameters
    ----------
    num_elements : int
        Number of array elements. Must be at least 1; it does not need to be
        a multiple of the grid size.
    device_id : int
        CUDA device ID

    Returns
    -------
    bool
        True if all tests passed

    Raises
    ------
    ValueError
        If ``num_elements`` is smaller than 1.
    """
    # Reject empty inputs before touching the GPU: a zero-sized problem would
    # ask for a grid of zero blocks in the first demo.
    if num_elements < 1:
        raise ValueError(
            f"num_elements must be a positive integer, got {num_elements}"
        )

    threads_per_block = 256
    num_blocks = 64

    device = Device(device_id)
    device.set_current()
    stream = device.create_stream()

    try:
        arch = f"sm_{device.arch}"
        print(f"Device: {device.name}")
        print(f"Compute Capability: {arch}")
        print(f"Array size: {num_elements:,} elements\n")

        # Make CuPy use our stream
        cp.cuda.ExternalStream(int(stream.handle)).use()

        # Compile kernels
        program = Program(
            KERNELS_CODE, code_type="c++", options=ProgramOptions(arch=arch)
        )
        module = program.compile(target_type="cubin")
        kernel_simple = module.get_kernel("simple_indexing")
        kernel_strided = module.get_kernel("strided_loop")
        kernel_sum = module.get_kernel("block_partial_sum")

        # Test data
        h_input = np.arange(num_elements, dtype=np.float32)
        d_input = cp.asarray(h_input)
        d_output = cp.zeros_like(d_input)
        expected = cp.asarray(h_input * 2.0)

        # The kernels declare N as size_t, so pass it as a 64-bit unsigned value.
        n_arg = cp.uint64(num_elements)

        # Demo 1: Simple indexing (1 thread = 1 element)
        # Round the block count up so the tail of the array is covered.
        full_blocks = (num_elements + threads_per_block - 1) // threads_per_block
        config = LaunchConfig(grid=full_blocks, block=threads_per_block)
        launch(
            stream,
            config,
            kernel_simple,
            d_input.data.ptr,
            d_output.data.ptr,
            n_arg,
        )
        stream.sync()
        print("Simple indexing: ", end="")
        test1 = verify_array_result(d_output, expected)

        # Demo 2: Strided loop (threads process multiple elements)
        # Clear the output so results from demo 1 cannot mask a failure.
        d_output.fill(0)
        config = LaunchConfig(grid=num_blocks, block=threads_per_block)
        launch(
            stream,
            config,
            kernel_strided,
            d_input.data.ptr,
            d_output.data.ptr,
            n_arg,
        )
        stream.sync()
        print("Strided loop:    ", end="")
        test2 = verify_array_result(d_output, expected)

        # Demo 3: Block-wise sum with shared memory
        d_ones = cp.ones(num_elements, dtype=cp.float32)
        d_partial = cp.zeros(num_blocks, dtype=cp.float32)
        # One float32 (4 bytes) of dynamic shared memory per thread.
        shared_mem = threads_per_block * 4

        config = LaunchConfig(
            grid=num_blocks, block=threads_per_block, shmem_size=shared_mem
        )
        launch(
            stream,
            config,
            kernel_sum,
            d_ones.data.ptr,
            d_partial.data.ptr,
            n_arg,
        )
        stream.sync()

        # Derive the expected per-block totals from the strided access
        # pattern itself. A flat num_elements / num_blocks is only right when
        # num_elements is a multiple of num_blocks * threads_per_block.
        expected_partial = cp.asarray(
            _expected_partial_sums(num_elements, num_blocks, threads_per_block)
        )
        print("Block-wise sum:  ", end="")
        test3 = verify_array_result(d_partial, expected_partial)

        # Performance timing
        event_opts = EventOptions(enable_timing=True)
        iterations = 100

        # Drain pending work so the start event does not include it.
        stream.sync()
        start = stream.record(options=event_opts)
        for _ in range(iterations):
            launch(
                stream,
                config,
                kernel_sum,
                d_ones.data.ptr,
                d_partial.data.ptr,
                n_arg,
            )
        end = stream.record(options=event_opts)
        end.sync()

        # Average per-launch time; the event difference is treated as
        # milliseconds here.
        time_ms = (end - start) / iterations
        # Bytes read (4 per float32) / (ms * 1e6) == GB/s.
        bandwidth = (num_elements * 4) / (time_ms * 1e6)
        print(f"\nKernel time: {time_ms:.3f} ms, Bandwidth: {bandwidth:.1f} GB/s")

        return test1 and test2 and test3

    finally:
        # Explicit resource cleanup: point CuPy back at its default stream
        # before closing the one it was borrowing.
        cp.cuda.Stream.null.use()
        stream.close()


def main() -> None:
    """Run the sample with default settings and exit with status 1 on failure."""
    if not run_sample():
        print("\nSome tests failed")
        sys.exit(1)

    print("\nDone")


# Run the demonstration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()