# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    distribution and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Multi-GPU Gradient Average using MPI and cuda.core (Host-staging Allreduce)

Question: How do I synchronize gradients across GPUs?

Answer:
Each GPU (each MPI rank) computes local gradients on device via CUDA.
Gradients are then averaged across ranks via an MPI Allreduce over host
(CPU) buffers, following the classic data-parallel training pattern.

This sample shows how to:
- Initialize MPI for multi-process GPU workloads
- Map MPI ranks to GPUs
- Use cuda.core for kernel compilation and execution
- Integrate cuda.core with CuPy using the stream protocol
- Perform gradient averaging with MPI Allreduce (using host staging)
- Use cuda.core Event for GPU timing measurements
- Verify correctness of distributed gradient synchronization

Key concepts: Allreduce, NCCL collectives (conceptually), distributed training

Note:
- All gradient computation and validation happen on GPUs.
- MPI Allreduce is executed on CPU (host) buffers via a simple
  GPU -> CPU -> MPI -> CPU -> GPU staging pattern so that the sample
  works on any MPI stack, without requiring CUDA-aware MPI.
- In production deep learning frameworks (e.g., PyTorch DDP), NCCL
  usually implements the GPU Allreduce directly; the communication
  pattern and semantics are the same as demonstrated here.
"""

import sys
from pathlib import Path

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result  # noqa: E402

try:
    import cupy as cp
    from cuda.core import (
        Device,
        EventOptions,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
        system,
    )
    from mpi4py import MPI
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install: pip install mpi4py cupy-cuda12x cuda-python cuda-core")
    sys.exit(1)


# ============================================================================
# CUDA device selection and stream management
# ============================================================================


def init_device(rank: int):
    """
    Initialize CUDA device and stream for this MPI rank.

    For a simple single-node run, we map rank % num_gpus to a device id.
    This covers both the common case (world_size == num_gpus) and the case
    where multiple ranks share a GPU.

    Returns
    -------
    tuple[Device, Stream]
        CUDA device object and stream object.
    """
    num_gpus = system.get_num_devices()
    if num_gpus == 0:
        raise RuntimeError("No CUDA devices available")

    dev_id = rank % num_gpus  # simple mapping: rank -> GPU in round-robin

    try:
        device = Device(dev_id)
    except (RuntimeError, ValueError) as e:
        if rank == 0:
            print(f"Warning: Cannot assign GPU {dev_id}, using GPU 0. Error: {e}")
        device = Device(0)

    device.set_current()
    # Align CuPy with cuda.core's chosen device ID
    cp.cuda.Device(device.device_id).use()

    # Create cuda.core stream and make CuPy use it
    stream = device.create_stream()
    cp.cuda.ExternalStream(int(stream.handle)).use()

    return device, stream


# ============================================================================
# CUDA kernel definition and compilation
# ============================================================================

# Tiny CUDA kernel to initialize local "gradients"
# Uses grid-stride loop to handle arrays larger than grid size
INIT_KERNEL = r"""
extern "C" __global__
void init_grad_kernel(float* grad, int n, int rank)
{
    // Grid-stride loop: each thread processes multiple elements
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = gridDim.x * blockDim.x;
    for (size_t i = tid; i < n; i += stride) {
        // Gradient value depends on MPI rank so we can verify reduction:
        // grad_i = rank + 0.001 * i
        grad[i] = rank + 0.001f * i;
    }
}
"""

_kernel_cache = {}


def get_init_kernel(device: Device):
    """Compile (or retrieve cached) init_grad_kernel for this device."""
    key = device.pci_bus_id
    if key not in _kernel_cache:
        opts = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        prog = Program(INIT_KERNEL, code_type="c++", options=opts)
        mod = prog.compile("cubin")
        _kernel_cache[key] = mod.get_kernel("init_grad_kernel")
    return _kernel_cache[key]


# ============================================================================
# Local gradient computation on each GPU
# ============================================================================


def compute_local_gradients(
    num_elements: int, device: Device, stream: object, rank: int
) -> cp.ndarray:
    """
    Compute a local "gradient" vector on the current GPU.

    For demo purposes, we initialize:
        grad[i] = rank + 0.001 * i

    Parameters
    ----------
    num_elements : int
        Length of gradient vector.
    device : Device
        CUDA device object.
    stream : Stream
        CUDA stream object (created at device initialization).
    rank : int
        MPI rank ID.

    Returns
    -------
    cupy.ndarray
        Gradient vector on GPU.
    """
    # Create gradient array (CuPy uses the stream set at device initialization)
    grad = cp.empty(num_elements, dtype=cp.float32)

    # Use a CUDA kernel compiled with cuda.core to fill the array
    kernel = get_init_kernel(device)

    threads_per_block = 256
    blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block
    config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block)

    # Launch kernel using cuda.core stream
    launch(stream, config, kernel, grad.data.ptr, num_elements, rank)

    return grad


# ============================================================================
# MPI Allreduce to average gradients (host-staging)
# ============================================================================


def average_gradients(
    local_grad: cp.ndarray, comm: object, world_size: int
) -> cp.ndarray:
    """
    Average gradients across all MPI ranks using host-staging Allreduce.

    Steps:
    1. Copy local gradients from GPU to CPU (NumPy).
    2. Perform MPI_Allreduce on host buffers.
    3. Divide by world_size to obtain the average.
    4. Copy the averaged gradients back to GPU.

    This pattern is environment-agnostic and works on any MPI stack.
    """
    assert local_grad.dtype == cp.float32

    # GPU -> CPU
    local_host = local_grad.get()  # NumPy array on host
    avg_host = local_host.copy()

    # Allreduce on host buffers
    comm.Allreduce(local_host, avg_host, op=MPI.SUM)

    # Average
    avg_host /= world_size

    # CPU -> GPU
    avg_grad = cp.asarray(avg_host)

    return avg_grad


# ============================================================================
# Testing and verification
# ============================================================================


def main():
    """Demo: Multi-GPU gradient averaging with MPI (host-staging Allreduce)."""
    import argparse

    # Initialize MPI
    comm = MPI.COMM_WORLD
    world_size = comm.Get_size()
    rank = comm.Get_rank()

    parser = argparse.ArgumentParser(
        description=(
            "Multi-GPU Gradient Average with mpi4py + cuda.core "
            "(host-staging Allreduce)"
        )
    )
    parser.add_argument(
        "--size",
        type=int,
        default=1024,
        help="Number of gradient elements per GPU (default: 1024)",
    )
    args = parser.parse_args()

    num_elements = args.size

    # Initialize device and stream
    device = None
    stream = None
    try:
        device, stream = init_device(rank)

        if rank == 0:
            print(f"[Rank 0] World size = {world_size}")
        comm.Barrier()

        # Validate world size
        if world_size < 2:
            if rank == 0:
                print("=" * 70)
                print("ERROR: This sample requires at least 2 MPI processes!")
                print("=" * 70)
                print("\nPlease run with mpirun:")
                print("  mpirun -np 2 python multiGPUGradientAverage.py")
                print("  mpirun -np 4 python multiGPUGradientAverage.py --size 10000")
                print("\nFor multi-GPU systems:")
                print("  mpirun -np N python multiGPUGradientAverage.py")
                print("  (where N = number of GPUs)")
                print("=" * 70)
            sys.exit(1)

        # Validate input
        if num_elements <= 0:
            if rank == 0:
                print("Error: --size must be positive")
            sys.exit(1)

        if rank == 0:
            print("\n" + "=" * 70)
            print("Multi-GPU Gradient Average Demo")
            print("=" * 70)
            print(f"Number of MPI ranks (GPUs): {world_size}")
            print(f"Gradient vector length per GPU: {num_elements}")
            print(f"Device: {device.name}")
            print("Computation: gradients computed on GPU via cuda.core.")
            print(
                "Communication: gradients averaged via MPI_Allreduce on host "
                "(CPU) buffers."
            )
            print("=" * 70)

        # Step 1: Compute local gradients on each GPU
        # Use cuda.core Event for GPU timing measurements
        timing_options = EventOptions(enable_timing=True)
        start_event = stream.record(options=timing_options)

        local_grad = compute_local_gradients(num_elements, device, stream, rank)

        # Record end event and synchronize to ensure timing is complete
        end_event = stream.record(options=timing_options)
        end_event.sync()

        # Calculate elapsed time: Event subtraction returns milliseconds
        kernel_time = end_event - start_event

        # Step 2: Average gradients across all ranks (host-staging Allreduce)
        # Use CPU timing for MPI communication (host-staging includes GPU↔CPU transfers)
        import time

        comm_start = time.time()
        avg_grad = average_gradients(local_grad, comm, world_size)
        comm_time = (time.time() - comm_start) * 1000  # Convert to ms

        # Step 3: Sanity check on rank 0
        # For each element i:
        #   local_grad_r[i] = r + 0.001 * i, r = 0..world_size-1
        # Sum over ranks:
        #   sum[i] = sum_r r + 0.001 * i * world_size
        # Average:
        #   avg[i] = (0 + ... + (world_size-1))/world_size + 0.001 * i
        #          = (world_size - 1)/2 + 0.001 * i
        #
        # We verify this formula.

        expected_base = (world_size - 1) / 2.0
        i0 = 0
        i1 = num_elements // 2
        i2 = num_elements - 1

        # Copy a few sample elements back to host for printing on rank 0
        if rank == 0:
            avg_host_samples = avg_grad[[i0, i1, i2]].get()
            print("\nSample averaged gradient values (rank 0):")
            print(f"  avg_grad[{i0}] = {avg_host_samples[0]:.6f}")
            print(f"  avg_grad[{i1}] = {avg_host_samples[1]:.6f}")
            print(f"  avg_grad[{i2}] = {avg_host_samples[2]:.6f}")

            expected0 = expected_base + 0.001 * i0
            expected1 = expected_base + 0.001 * i1
            expected2 = expected_base + 0.001 * i2
            print("\nExpected values:")
            print(f"  expected[{i0}] = {expected0:.6f}")
            print(f"  expected[{i1}] = {expected1:.6f}")
            print(f"  expected[{i2}] = {expected2:.6f}")

        # All ranks perform a full-array correctness check on GPU
        expected_full = expected_base + 0.001 * cp.arange(
            num_elements, dtype=cp.float32
        )

        # Use utility function to verify results
        if rank == 0:
            print("\nVerifying gradient averaging correctness...")
        ok = verify_array_result(
            avg_grad, expected_full, rtol=1e-5, atol=1e-8, verbose=(rank == 0)
        )

        # Ensure all ranks agree on correctness
        ok_all = comm.allreduce(ok, op=MPI.LAND)

        if rank == 0:
            if ok_all:
                print("[PASS] Gradient averaging is correct on all ranks.")
            else:
                print(
                    "[FAIL] Gradient averaging mismatch detected on one or more ranks."
                )

            print("\nPerformance:")
            print(f"  Kernel time (GPU only): {kernel_time:.3f} ms")
            print(
                "  MPI communication time (host-staging, end-to-end): "
                f"{comm_time:.3f} ms"
            )
            print(f"  Total time: {kernel_time + comm_time:.3f} ms")

            print("\n" + "=" * 70)
            print("Demo complete.")
            print("=" * 70)

        return 0 if ok_all else 1
    finally:
        # Clean up stream resources
        if stream is not None:
            stream.close()
            cp.cuda.Stream.null.use()  # Reset CuPy's current stream to the null stream


if __name__ == "__main__":
    sys.exit(main())