# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Prefix Sum (Scan)

Demonstrates parallel prefix sum algorithms using cuda.compute:
- Inclusive scan: output[i] = [init_value] + input[0] + ... + input[i]
- Exclusive scan: output[i] = init_value + input[0] + ... + input[i-1]

Uses cuda.compute APIs for optimized CUB-based scan operations.
Uses cuda.core APIs for device and stream management.
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import cupy as cp
    import numpy as np
    from cuda.compute import OpKind, exclusive_scan, inclusive_scan
    from cuda.core import Device, EventOptions
    from cuda_samples_utils import print_gpu_info, verify_array_result
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)


def main() -> bool:
    """Run the prefix-sum (scan) sample on device 0.

    The sample performs three steps on a single cuda.core stream:

    1. An inclusive PLUS-scan of a small int32 array, checked against
       ``np.cumsum``.
    2. An exclusive PLUS-scan of the same array with ``init_value`` 0,
       checked against the shifted cumulative sum.
    3. A timed inclusive scan over 10M int32 ones (one untimed run followed
       by 10 timed iterations, reported as the per-iteration average).

    Returns:
        True if both verified scans matched their NumPy references,
        False otherwise. The performance section is not verified.
    """
    print("=" * 60)
    print("Prefix Sum (Scan) - Using cuda.compute")
    print("=" * 60)

    device = Device(0)
    device.set_current()
    stream = device.create_stream()
    # Wrap the cuda.core stream handle so CuPy work issued inside
    # ``with cp_stream:`` targets the same stream the scans are launched on.
    cp_stream = cp.cuda.ExternalStream(int(stream.handle))

    ok = True
    try:
        print()
        print_gpu_info(device)

        h_input = np.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)
        init_value = np.array([0], dtype=np.int32)

        # =========================================================================
        # Inclusive Scan
        # =========================================================================
        print("\n" + "-" * 60)
        print("INCLUSIVE SCAN")
        print("-" * 60)
        print(
            "Formula: output[i] = [init_value] + input[0] + input[1] + ... + input[i]"
        )

        with cp_stream:
            d_input = cp.asarray(h_input)
            d_output = cp.empty_like(d_input)

        print(f"\nInput:  {h_input.tolist()}")

        inclusive_scan(
            d_in=d_input,
            d_out=d_output,
            op=OpKind.PLUS,
            init_value=None,
            num_items=len(h_input),
            stream=stream,
        )
        # The result is read back outside ``cp_stream`` below, so the scan
        # must have finished first.
        stream.sync()
        print(f"Output: {cp.asnumpy(d_output).tolist()}")

        with cp_stream:
            expected = cp.asarray(np.cumsum(h_input))
        # The reference upload was issued on ``stream``. Wait for it so the
        # comparison (whose stream usage is not visible from this file) can
        # never observe a partially copied reference array.
        stream.sync()
        ok &= verify_array_result(d_output, expected, rtol=0, atol=0)

        # =========================================================================
        # Exclusive Scan
        # =========================================================================
        print("\n" + "-" * 60)
        print("EXCLUSIVE SCAN")
        print("-" * 60)
        print("Formula: output[i] = init_value + input[0] + ... + input[i-1]")

        with cp_stream:
            d_output = cp.empty_like(d_input)

        print(f"\nInput:  {h_input.tolist()}")

        exclusive_scan(
            d_in=d_input,
            d_out=d_output,
            op=OpKind.PLUS,
            init_value=init_value,
            num_items=len(h_input),
            stream=stream,
        )
        stream.sync()
        print(f"Output: {cp.asnumpy(d_output).tolist()}")

        with cp_stream:
            # Exclusive result = init_value followed by the inclusive sums
            # shifted right by one element.
            expected = cp.asarray(np.concatenate([init_value, np.cumsum(h_input)[:-1]]))
        # Same reasoning as for the inclusive reference: finish the upload
        # before handing the arrays to the verifier.
        stream.sync()
        ok &= verify_array_result(d_output, expected, rtol=0, atol=0)

        # =========================================================================
        # Large Array Performance
        # =========================================================================
        print("\n" + "-" * 60)
        print("PERFORMANCE (10M elements)")
        print("-" * 60)

        N = 10_000_000
        with cp_stream:
            d_large_in = cp.ones(N, dtype=np.int32)
            d_large_out = cp.empty_like(d_large_in)

        def launch_large_scan() -> None:
            """Enqueue one inclusive PLUS-scan of the large array on ``stream``."""
            inclusive_scan(
                d_in=d_large_in,
                d_out=d_large_out,
                op=OpKind.PLUS,
                init_value=None,
                num_items=N,
                stream=stream,
            )

        # Untimed first run, completed before the timed loop starts, so any
        # one-time cost of the first call is kept out of the measurement
        # (presumably build/caching work inside cuda.compute - confirm).
        launch_large_scan()
        stream.sync()

        num_iterations = 10
        event_opts = EventOptions(enable_timing=True)
        timing_events = []
        try:
            # Collect events as they are created so every event that exists
            # is closed below, even if a later step raises.
            timing_events.append(device.create_event(options=event_opts))
            timing_events.append(device.create_event(options=event_opts))
            start_event, end_event = timing_events

            stream.record(start_event)
            for _ in range(num_iterations):
                launch_large_scan()
            stream.record(end_event)
            end_event.sync()
            # Event subtraction yields the elapsed time; the value is
            # reported in milliseconds, averaged over the timed iterations.
            elapsed_ms = (end_event - start_event) / num_iterations
        finally:
            for event in timing_events:
                event.close()

        print(f"Inclusive scan: {elapsed_ms:.3f} ms")
        print(f"Throughput: {N / elapsed_ms / 1e6:.1f} M elements/ms")

        # =========================================================================
        # Summary
        # =========================================================================
        print("\n" + "=" * 60)
        print("KEY CONCEPTS")
        print("=" * 60)
        print("• Inclusive: output[i] includes input[i]")
        print("• Exclusive: output[i] excludes input[i], starts with init_value")
        print("• cuda.compute provides CUB-based optimized implementations")
        print("• cuda.core Stream integrates with CuPy via ExternalStream")
        print("• Applications: stream compaction, radix sort, histograms")
        print("=" * 60)
        return ok
    finally:
        # Point CuPy back at its default stream before destroying the
        # stream that ``cp_stream`` wraps.
        cp.cuda.Stream.null.use()
        stream.close()


if __name__ == "__main__":
    # A failed verification is reported to the caller as exit status 1;
    # on success the script falls through and exits normally.
    if not main():
        sys.exit(1)