# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    distribution and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import contextlib
import sys
import time
from pathlib import Path

try:
    import cupy as cp
    import numpy as np
    from cuda.core import Device, EventOptions
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result


@contextlib.contextmanager
def timer(message):
    """CPU timing context manager."""
    start = time.time()
    yield
    end = time.time()
    print(f"{message}:  {(end - start):.6f} seconds")


@contextlib.contextmanager
def gpu_timer(message, stream):
    """GPU timing context manager using cuda.core CUDA events."""
    event_options = EventOptions(enable_timing=True)
    start_event = stream.record(options=event_options)
    yield
    end_event = stream.record(options=event_options)
    end_event.sync()

    elapsed_time_ms = end_event - start_event  # Returns milliseconds
    elapsed_time_s = elapsed_time_ms / 1000.0  # Convert to seconds
    print(f"{message}:  {elapsed_time_s:.6f} seconds")


def warmup():
    # Pre-runs a simple GPU operation to avoid first-run overhead in benchmarking.
    print("Warmup...")
    a_cp = cp.ones((16, 16))
    b_cp = cp.ones((16, 16))
    result_cp = cp.dot(a_cp, b_cp)
    return result_cp


def run(n):
    # Benchmarks NumPy vs. CuPy matrix multiplication for n x n random arrays.
    # Prints timing results.

    device = Device()  # Use device 0 explicitly
    device.set_current()
    major, minor = device.compute_capability
    print()
    print(f"Device Name: {device.name}, SM: {major}.{minor}")
    print()

    # Create explicit stream
    stream = device.create_stream()

    try:
        # Warm up GPU before measuring
        warmup()
        stream.sync()

        # Generate random matrices on CPU
        a_np = np.random.rand(n, n)
        b_np = np.random.rand(n, n)

        # NumPy dot product (CPU)
        with timer(f"NumPy dot of {n}*{n} arrays"):
            result_np = np.dot(a_np, b_np)

        # Transfer NumPy arrays to GPU (using events for timing)
        with gpu_timer("Transfer arrays to GPU", stream):
            a_cp = cp.asarray(a_np)
            b_cp = cp.asarray(b_np)

        # CuPy dot product (GPU) - using events for accurate GPU timing
        with gpu_timer(f"CuPy dot of {n}*{n} arrays", stream):
            result_cp = cp.dot(a_cp, b_cp)

        print()
        # Result validation
        if not verify_array_result(result_np, result_cp.get()):
            print(
                "Validation FAILED: NumPy and CuPy results do not match "
                "within tolerance"
            )
            sys.exit(1)

        print("Validation PASSED: NumPy and CuPy results match within tolerance")
    finally:
        stream.close()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--n_size", "-n", default=4096, type=int, help="Size of the matrix(n * n)."
    )
    args = parser.parse_args()
    run(args.n_size)
    print("Demo completed successfully!")


if __name__ == "__main__":
    main()