# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
JIT Compilation and Link-Time Optimization with cuda.core

Real-world GPU code is rarely a single source string. Libraries ship a
"main" kernel that is compiled once, then link in user-supplied device
functions at runtime to customize behavior without recompiling the whole
program.

cuda.core exposes this pattern through ``Program`` (NVRTC compilation)
and ``Linker`` (JIT linking of multiple object codes). Two modes are
shown here:

  * **PTX linking**: compile each translation unit with
    ``relocatable_device_code=True`` to PTX and link to a CUBIN.
    The two modules remain independently compiled: no cross-module
    inlining.

  * **LTO (Link-Time Optimization)**: compile each translation unit
    with ``link_time_optimization=True`` to LTO IR, then link with
    ``LinkerOptions(link_time_optimization=True)``. The linker reruns
    the optimizer across both modules and can inline the device function
    into the main kernel, typically matching a single-source build.

The same kernel math runs in both modes and is verified against a
NumPy reference.
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import cupy as cp
    import numpy as np
    from cuda.core import (
        Device,
        LaunchConfig,
        Linker,
        LinkerOptions,
        Program,
        ProgramOptions,
        launch,
    )
    from cuda_samples_utils import print_gpu_info  # noqa: E402
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)


# --------------------------------------------------------------------------
# Module A: the "library" main kernel. It forwards each element through a
# user-supplied device function (resolved at link time) and writes the result.
# --------------------------------------------------------------------------
MAIN_SRC = r"""
// Forward declare the user-supplied hook. Its definition lives in a separate
// translation unit and is resolved by the Linker at runtime.
extern "C" __device__ float user_transform(float x);

extern "C" __global__
void apply_transform(const float* __restrict__ in,
                     float* __restrict__ out,
                     size_t N)
{
    // Widen to size_t *before* multiplying: blockIdx.x * blockDim.x is a
    // 32-bit unsigned product and would wrap once the grid covers 2^32 or
    // more elements.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)gridDim.x * blockDim.x;
    // Each thread starts at its global index and advances by the total
    // number of launched threads, so any grid size covers all N elements.
    for (size_t i = tid; i < N; i += stride) {
        out[i] = user_transform(in[i]);
    }
}
"""

# --------------------------------------------------------------------------
# Module B: the user-supplied "plug-in" device function. A different
# implementation of ``user_transform`` here produces different results without
# rebuilding MAIN_SRC.
#
# The name and signature must match the ``extern "C"`` forward declaration in
# MAIN_SRC so both translation units refer to the same symbol at link time.
# ``host_reference`` mirrors this math on the host; keep the two in sync when
# editing the expression below, otherwise verification will fail.
# --------------------------------------------------------------------------
USER_SRC = r"""
extern "C" __device__
float user_transform(float x)
{
    // A deliberately non-trivial expression so LTO has something to inline /
    // optimize across the module boundary.
    float y = x * x + 3.0f * x - 1.0f;
    return y > 0.0f ? y : 0.0f;
}
"""


def host_reference(x: np.ndarray) -> np.ndarray:
    """Return the host-side reference for the device ``user_transform``.

    Evaluates ``x*x + 3*x - 1`` element-wise and replaces every entry that
    is not strictly greater than zero with 0, mirroring the ternary in the
    device code. The result is always returned as float32.
    """
    polynomial = x * x + 3.0 * x - 1.0
    is_positive = polynomial > 0.0
    clamped = np.where(is_positive, polynomial, 0.0)
    return clamped.astype(np.float32)


def link_ptx(device):
    """Build the two-module program through PTX, without LTO.

    MAIN_SRC and USER_SRC are each compiled to PTX with relocatable device
    code enabled, targeting ``sm_<device.arch>``, and the two resulting
    objects are linked into a cubin.

    Args:
        device: Device whose ``arch`` attribute selects the target architecture.

    Returns:
        The object produced by ``Linker.link("cubin")``.
    """
    sm_target = f"sm_{device.arch}"
    nvrtc_opts = ProgramOptions(
        std="c++17", arch=sm_target, relocatable_device_code=True
    )
    # Main kernel first, plug-in second: same order the objects are linked in.
    ptx_objects = [
        Program(source, "c++", options=nvrtc_opts).compile("ptx")
        for source in (MAIN_SRC, USER_SRC)
    ]
    ptx_linker = Linker(*ptx_objects, options=LinkerOptions(arch=sm_target))
    return ptx_linker.link("cubin")


def link_lto(device):
    """Build the two-module program through LTO IR with LTO enabled.

    MAIN_SRC and USER_SRC are each compiled to LTO IR with
    ``link_time_optimization=True``, targeting ``sm_<device.arch>``, and the
    two resulting objects are linked into a cubin by a linker that also has
    ``link_time_optimization=True`` set.

    Args:
        device: Device whose ``arch`` attribute selects the target architecture.

    Returns:
        The object produced by ``Linker.link("cubin")``.
    """
    sm_target = f"sm_{device.arch}"
    nvrtc_opts = ProgramOptions(
        std="c++17", arch=sm_target, link_time_optimization=True
    )
    # Main kernel first, plug-in second: same order the objects are linked in.
    ltoir_objects = [
        Program(source, "c++", options=nvrtc_opts).compile("ltoir")
        for source in (MAIN_SRC, USER_SRC)
    ]
    lto_linker = Linker(
        *ltoir_objects,
        options=LinkerOptions(arch=sm_target, link_time_optimization=True),
    )
    return lto_linker.link("cubin")


def run_one_mode(mode, module, stream, d_in, d_out, size, expected):
    """Launch ``apply_transform`` from *module* and verify its output.

    Args:
        mode: Short label used in the printed status lines (e.g. ``"ptx"``).
        module: Linked object providing ``get_kernel``.
        stream: Stream the kernel is launched on and then synchronized.
        d_in: Device input array; its ``data.ptr`` is passed to the kernel.
        d_out: Device output array the kernel writes; copied back to the host
            for comparison.
        size: Number of elements to process. Zero is allowed and results in
            a launch that does no work.
        expected: Host reference array the device result is compared with.

    Returns:
        True when the device output matches *expected* within
        ``rtol=1e-5`` / ``atol=1e-5``, False otherwise.
    """
    threads_per_block = 256
    # A launch needs at least one block even when size == 0; the kernel's
    # ``i < N`` loop bound then turns that single block into a no-op.
    num_blocks = max(1, (size + threads_per_block - 1) // threads_per_block)

    kernel = module.get_kernel("apply_transform")
    config = LaunchConfig(grid=num_blocks, block=threads_per_block)
    launch(
        stream,
        config,
        kernel,
        d_in.data.ptr,
        d_out.data.ptr,
        # Passed as an unsigned 64-bit value for the kernel's ``size_t N``.
        np.uint64(size),
    )
    # Wait for the kernel before reading the output back.
    stream.sync()
    actual = cp.asnumpy(d_out)
    if not np.allclose(actual, expected, rtol=1e-5, atol=1e-5):
        max_err = np.max(np.abs(actual - expected))
        print(f"  [{mode}] verification FAILED (max_err={max_err})")
        return False
    print(f"  [{mode}] result verified against NumPy reference")
    return True


def main() -> int:
    """Run the PTX-linked and LTO-linked kernels and verify both.

    Parses ``--elements`` and ``--device``, generates random float32 input,
    then links, launches and verifies the kernel once per linking mode.

    Returns:
        0 when both modes match the NumPy reference, 1 otherwise. Invalid
        command-line arguments exit through argparse (status 2) before any
        GPU resource is created.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="JIT + LTO linking of two device modules with cuda.core"
    )
    parser.add_argument(
        "--elements", type=int, default=1 << 16,
        help="Number of float32 elements (default: 65536)",
    )
    parser.add_argument("--device", type=int, default=0, help="CUDA device id")
    args = parser.parse_args()
    # Reject non-positive sizes up front with a usage error instead of a
    # traceback from deep inside array creation or the kernel launch.
    if args.elements < 1:
        parser.error(
            f"--elements must be a positive integer (got {args.elements})"
        )

    device = Device(args.device)
    device.set_current()
    print_gpu_info(device)

    stream = device.create_stream()
    try:
        # Make CuPy issue its work on the same stream the kernels are
        # launched on. Done inside the try so the stream is still closed if
        # this step fails.
        cp.cuda.ExternalStream(int(stream.handle)).use()

        N = args.elements
        # Fixed seed keeps the input, and therefore the reference, reproducible.
        rng = np.random.default_rng(seed=0)
        host_in = rng.standard_normal(N).astype(np.float32)
        expected = host_reference(host_in)

        d_in = cp.asarray(host_in)
        d_out = cp.empty(N, dtype=cp.float32)
        device.sync()

        print("\n[1] PTX linking (no LTO)")
        ptx_module = link_ptx(device)
        ok_ptx = run_one_mode("ptx", ptx_module, stream, d_in, d_out, N, expected)

        # Clear the output so the second mode cannot pass on stale results
        # left behind by the first one.
        d_out.fill(0)
        device.sync()

        print("\n[2] LTO linking (link-time optimization)")
        lto_module = link_lto(device)
        ok_lto = run_one_mode("lto", lto_module, stream, d_in, d_out, N, expected)

        print()
        if ok_ptx and ok_lto:
            print("Both PTX and LTO linked kernels produced matching results. Done")
            return 0
        return 1
    finally:
        # Point CuPy back at the null stream first so it never has a
        # destroyed stream selected as current, then release the stream.
        cp.cuda.Stream.null.use()
        stream.close()


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())