# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    distribution and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
TensorFlow Custom GPU Operator using cuda.core

Question: How do I add a custom GPU op to TensorFlow?
Answer: This sample shows rapid prototyping with cuda.core + tf.py_function.

This sample implements a custom ReLU operation (y = max(0, x)) to demonstrate:
- Writing CUDA kernels (forward + backward) with grid-stride loops
- Compiling with cuda.core
- Integrating with TensorFlow via tf.py_function
- Proper gradient registration

Dependencies:
- tensorflow: Deep learning framework
- cuda-core: GPU kernel compilation and launch
  (requires >=0.6.0 for LEGACY_DEFAULT_STREAM)
- cuda-python: CUDA driver API bindings
- cupy: Array operations and device pointer access

Note: This approach uses tf.py_function for rapid prototyping. For production
TensorFlow applications, use TensorFlow's C++ Custom Op API.
"""

import sys

try:
    # CuPy is required for array operations and device pointer access
    import cupy as cp
    import tensorflow as tf
    from cuda.core import (
        LEGACY_DEFAULT_STREAM,
        Device,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install: pip install tensorflow cupy cuda-python cuda-core")
    sys.exit(1)


# ============================================================================
# Step 1: Define CUDA Kernels
# ============================================================================
# Simple element-wise ReLU: y = max(0, x)

RELU_KERNEL = """
extern "C" __global__
void relu_forward_kernel(const float* x, float* y, int n)
{
    // Grid-stride loop: each thread processes multiple elements
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < n; i += stride) {
        y[i] = x[i] > 0.0f ? x[i] : 0.0f;
    }
}

extern "C" __global__
void relu_backward_kernel(const float* x, const float* grad_y, float* grad_x, int n)
{
    // Grid-stride loop: each thread processes multiple elements
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < n; i += stride) {
        grad_x[i] = x[i] > 0.0f ? grad_y[i] : 0.0f;
    }
}
"""


# ============================================================================
# Step 2: Kernel Compilation and Caching
# ============================================================================
# Compile kernel once per device and cache it to avoid recompilation overhead
# In real training loops, this avoids paying compilation cost on every forward.

_kernel_cache = {}


def _get_relu_kernels(device):
    """
    Get or compile the ReLU kernels for a given device.

    Parameters
    ----------
    device : Device
        CUDA device object

    Returns
    -------
    tuple
        (forward_kernel, backward_kernel) compiled CUDA kernels
    """
    # Cache key based on device to avoid recompiling for the same GPU
    key = device.pci_bus_id

    if key not in _kernel_cache:
        # Compile the kernel with appropriate architecture
        opts = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        prog = Program(RELU_KERNEL, code_type="c++", options=opts)
        mod = prog.compile("cubin")
        forward_kernel = mod.get_kernel("relu_forward_kernel")
        backward_kernel = mod.get_kernel("relu_backward_kernel")
        _kernel_cache[key] = (forward_kernel, backward_kernel)

    return _kernel_cache[key]


def _launch_relu_forward(x_np):
    """
    Internal function: Launch forward CUDA kernel.

    Takes numpy array, returns numpy array.
    Uses CuPy for array operations and device pointer access, cuda.core for
    device/stream management.

    Note: LEGACY_DEFAULT_STREAM doesn't require explicit cleanup, but kernel
    launch failures should be handled by the caller. CuPy arrays are
    automatically cleaned up when they go out of scope.
    """
    device = Device()

    # Ensure this device is current (TensorFlow usually does this already)
    device.set_current()

    # Get compiled kernel (cached)
    forward_kernel, _ = _get_relu_kernels(device)

    # Convert numpy to CuPy (CPU-to-GPU copy)
    # CuPy is used for array operations and getting device pointers
    x_cp = cp.asarray(x_np)
    y_cp = cp.empty_like(x_cp)

    # Configure kernel launch
    n = int(x_cp.size)
    threads_per_block = 256
    blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
    config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block)

    # Launch on the legacy default stream (stream 0) for TensorFlow interop
    launch(
        LEGACY_DEFAULT_STREAM, config, forward_kernel, x_cp.data.ptr, y_cp.data.ptr, n
    )

    # Return as numpy array (GPU-to-CPU copy via cp.asnumpy)
    return cp.asnumpy(y_cp)


def _launch_relu_backward(x_np, grad_y_np):
    """
    Internal function: Launch backward CUDA kernel.

    Takes numpy arrays, returns numpy array.
    Uses CuPy for array operations and device pointer access, cuda.core for
    device/stream management.

    Note: LEGACY_DEFAULT_STREAM doesn't require explicit cleanup, but kernel
    launch failures should be handled by the caller. CuPy arrays are
    automatically cleaned up when they go out of scope.
    """
    device = Device()

    # Ensure this device is current (TensorFlow usually does this already)
    device.set_current()

    # Get compiled kernel (cached)
    _, backward_kernel = _get_relu_kernels(device)

    # Convert numpy to CuPy (CPU-to-GPU copy)
    # CuPy is used for array operations and getting device pointers
    x_cp = cp.asarray(x_np)
    grad_y_cp = cp.asarray(grad_y_np)
    grad_x_cp = cp.empty_like(x_cp)

    # Configure kernel launch
    n = int(x_cp.size)
    threads_per_block = 256
    blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
    config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block)

    # Launch on the legacy default stream (stream 0) for TensorFlow interop
    launch(
        LEGACY_DEFAULT_STREAM,
        config,
        backward_kernel,
        x_cp.data.ptr,
        grad_y_cp.data.ptr,
        grad_x_cp.data.ptr,
        n,
    )

    # Return as numpy array (GPU-to-CPU copy via cp.asnumpy)
    return cp.asnumpy(grad_x_cp)


# ============================================================================
# Step 3: TensorFlow Integration via tf.py_function
# ============================================================================


@tf.custom_gradient
def custom_relu(x):
    """
    Custom ReLU operation using cuda.core.

    This function provides a TensorFlow-native interface to custom CUDA kernels
    compiled with cuda.core. The implementation uses tf.py_function internally
    to bridge TensorFlow and cuda.core.

    Parameters
    ----------
    x : tf.Tensor
        Input tensor (must be float32 on GPU)

    Returns
    -------
    tf.Tensor
        Output tensor with ReLU applied

    Examples
    --------
    >>> x = tf.random.normal([100], dtype=tf.float32)
    >>> y = custom_relu(x)
    >>> # Use in models
    >>> model = tf.keras.Sequential([
    ...     tf.keras.layers.Dense(128),
    ...     tf.keras.layers.Lambda(custom_relu),  # Custom ReLU
    ...     tf.keras.layers.Dense(10)
    ... ])
    """
    # Validate input
    if x.dtype != tf.float32:
        raise ValueError("custom_relu only supports float32 tensors")

    # Forward pass using tf.py_function
    # py_function allows us to call arbitrary Python code (including cuda.core)
    y = tf.py_function(func=_launch_relu_forward, inp=[x], Tout=tf.float32)

    # Restore shape information (py_function loses shape)
    y.set_shape(x.shape)

    # Define gradient function
    def grad_fn(grad_y):
        """Backward pass using custom CUDA kernel"""
        grad_x = tf.py_function(
            func=_launch_relu_backward, inp=[x, grad_y], Tout=tf.float32
        )
        grad_x.set_shape(x.shape)
        return grad_x

    return y, grad_fn


# ============================================================================
# Step 4: Testing and Verification
# ============================================================================


def main():
    """Test the custom ReLU operation."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Custom TensorFlow ReLU Operator using cuda.core"
    )
    parser.add_argument(
        "--size", type=int, default=10000, help="Number of elements (default: 10000)"
    )

    args = parser.parse_args()

    # Device info
    device = Device()
    device.set_current()
    major, minor = device.compute_capability

    print("\nDevice Information:")
    print(f"  Name: {device.name}")
    print(f"  Compute Capability: sm_{major}.{minor}")

    print("\n" + "=" * 70)
    print("Custom TensorFlow ReLU Operator Test")
    print("=" * 70)

    # ========================================================================
    # Test 1: Forward Pass Correctness
    # ========================================================================
    print("\n" + "-" * 70)
    print("Test 1: Forward Pass")
    print("-" * 70)

    # Run on the first visible GPU (respects CUDA_VISIBLE_DEVICES),
    # aligning with cuda.core Device().
    with tf.device("/GPU:0"):
        x = tf.random.normal([args.size], dtype=tf.float32)

        # Custom ReLU operation
        y_custom = custom_relu(x)

        # TensorFlow reference
        y_reference = tf.nn.relu(x)

        # Check correctness
        max_error = tf.reduce_max(tf.abs(y_custom - y_reference)).numpy()

        print(f"Max absolute error: {max_error:.2e}")

        if tf.reduce_all(tf.abs(y_custom - y_reference) < 1e-5):
            print("[PASS] Forward pass PASSED")
        else:
            print("[FAIL] Forward pass FAILED")
            return 1

    # ========================================================================
    # Test 2: Backward Pass (Gradient) Correctness
    # ========================================================================
    print("\n" + "-" * 70)
    print("Test 2: Backward Pass")
    print("-" * 70)

    with tf.device("/GPU:0"):
        x_custom = tf.random.normal([args.size], dtype=tf.float32)
        x_reference = tf.identity(x_custom)

        # Compute gradients with GradientTape
        with tf.GradientTape() as tape_custom:
            tape_custom.watch(x_custom)
            y_custom = custom_relu(x_custom)
        grad_custom = tape_custom.gradient(y_custom, x_custom)

        with tf.GradientTape() as tape_reference:
            tape_reference.watch(x_reference)
            y_reference = tf.nn.relu(x_reference)
        grad_reference = tape_reference.gradient(y_reference, x_reference)

        # Check gradients
        max_grad_error = tf.reduce_max(tf.abs(grad_custom - grad_reference)).numpy()

        print(f"Max gradient error: {max_grad_error:.2e}")

        if tf.reduce_all(tf.abs(grad_custom - grad_reference) < 1e-5):
            print("[PASS] Backward pass PASSED")
        else:
            print("[FAIL] Backward pass FAILED")
            return 1

    # ========================================================================
    # Test 3: Multi-dimensional Tensors
    # ========================================================================
    print("\n" + "-" * 70)
    print("Test 3: Multi-dimensional Tensors")
    print("-" * 70)

    with tf.device("/GPU:0"):
        # Test with 2D tensor
        x_2d = tf.random.normal([100, 100], dtype=tf.float32)
        y_2d_custom = custom_relu(x_2d)
        y_2d_reference = tf.nn.relu(x_2d)

        if tf.reduce_all(tf.abs(y_2d_custom - y_2d_reference) < 1e-5):
            print("[PASS] 2D tensor test PASSED")
        else:
            print("[FAIL] 2D tensor test FAILED")
            return 1

        # Test with 3D tensor
        x_3d = tf.random.normal([10, 20, 30], dtype=tf.float32)
        y_3d_custom = custom_relu(x_3d)
        y_3d_reference = tf.nn.relu(x_3d)

        if tf.reduce_all(tf.abs(y_3d_custom - y_3d_reference) < 1e-5):
            print("[PASS] 3D tensor test PASSED")
        else:
            print("[FAIL] 3D tensor test FAILED")
            return 1

    # ========================================================================
    # Summary
    # ========================================================================
    print("\n" + "=" * 70)
    print("All tests PASSED!")
    print("=" * 70)
    print("\nYour custom GPU operator is working correctly!")
    print("You can now use it in your TensorFlow models.")
    print("\nExample usage:")
    print("  x = tf.random.normal([100], dtype=tf.float32)")
    print("  y = custom_relu(x)  # Uses your custom CUDA kernel")
    print("  ")
    print("  # In a model:")
    print("  model = tf.keras.Sequential([")
    print("      tf.keras.layers.Dense(128),")
    print("      tf.keras.layers.Lambda(custom_relu),")
    print("      tf.keras.layers.Dense(10)")
    print("  ])")
    print("=" * 70 + "\n")

    return 0


if __name__ == "__main__":
    sys.exit(main())