# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Image Blur with Unified Memory using cuda.core

Demonstrates GPU image blurring using cuda.core APIs for kernel
compilation, launch, and unified memory allocation.
"""

import sys

try:
    import numpy as np
    from cuda.core import (
        Device,
        LaunchConfig,
        ManagedMemoryResource,
        ManagedMemoryResourceOptions,
        Program,
        ProgramOptions,
        launch,
    )
    from PIL import Image
except ImportError as e:
    # Fail early with actionable guidance rather than a bare traceback.
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)

# CUDA kernel source code - compiled at runtime by cuda.core.Program
BOX_BLUR_KERNEL_CODE = r"""
extern "C" __global__
void box_blur_3x3(const float* __restrict__ src,
                  float* __restrict__ dst,
                  int H, int W)
{
    /*
     * Simple 3x3 box blur CUDA kernel.
     *
     * Each thread computes one output pixel by averaging
     * the 3x3 neighborhood of input pixels (stencil pattern).
     */
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= W || y >= H) return;

    float sum = 0.0f;
    int count = 0;

    // 3x3 stencil: iterate over neighborhood
    for (int dy = -1; dy <= 1; dy++) {
        for (int dx = -1; dx <= 1; dx++) {
            int nx = x + dx;
            int ny = y + dy;
            // Boundary check (clamp to edge)
            if (nx >= 0 && nx < W && ny >= 0 && ny < H) {
                sum += src[ny * W + nx];
                count++;
            }
        }
    }

    dst[y * W + x] = sum / count;
}
"""


def make_test_image(h: int, w: int, dtype=np.uint8) -> np.ndarray:
    """Create a test grayscale image for demonstration.

    Overlays three patterns so the blur's smoothing is easy to see:
    horizontal stripes (value 255), vertical stripes (value 128), and
    a centered filled circle (value 200).

    Args:
        h: Image height in pixels.
        w: Image width in pixels.
        dtype: NumPy dtype of the returned array (default ``np.uint8``).

    Returns:
        A C-contiguous ``(h, w)`` array of the requested dtype.
    """
    img = np.zeros((h, w), dtype=dtype)

    # Create horizontal stripes
    for i in range(0, h, 50):
        img[i : i + 25, :] = 255

    # Create vertical stripes with different intensity
    for j in range(0, w, 50):
        img[:, j : j + 25] = 128

    # Add circular pattern for interesting blur effects
    center_y, center_x = h // 2, w // 2
    y, x = np.ogrid[:h, :w]
    circle_mask = (x - center_x) ** 2 + (y - center_y) ** 2 <= (min(h, w) // 6) ** 2
    img[circle_mask] = 200

    # Contiguity matters: the buffer is later copied into unified memory
    # with a flat row-major layout assumed by the kernel.
    return np.ascontiguousarray(img)


def blur_image_unified_memory(
    host_np: np.ndarray, device: Device, stream, kernel
) -> tuple[np.ndarray, object, object]:
    """
    Blur image on GPU using unified memory with cuda.core.

    This function demonstrates:
    1. Allocate managed memory using ManagedMemoryResource
    2. Create zero-copy numpy views using np.from_dlpack()
    3. Launch kernel via cuda.core.launch

    Args:
        host_np: NumPy array containing image data on CPU
        device: CUDA device to use
        stream: cuda.core Stream for async operations
        kernel: Compiled cuda.core Kernel object

    Returns:
        Tuple of (dst_np, src_buf, dst_buf). dst_np is a zero-copy
        view into unified memory. Caller must close src_buf and
        dst_buf when done with dst_np to avoid leaking managed memory.
    """
    H, W = host_np.shape
    n_bytes = H * W * np.dtype(np.float32).itemsize

    # Create managed memory resource for unified memory allocation
    options = ManagedMemoryResourceOptions(preferred_location=device.device_id)
    mr = ManagedMemoryResource(options)

    # Allocate unified memory buffers for source and destination images
    src_buf = mr.allocate(n_bytes, stream)
    dst_buf = mr.allocate(n_bytes, stream)

    try:
        # Synchronize to ensure allocations are complete before CPU access
        stream.sync()

        # Create numpy views of unified memory using DLPack protocol (zero-copy)
        src_np = np.from_dlpack(src_buf).view(np.float32).reshape(H, W)
        dst_np = np.from_dlpack(dst_buf).view(np.float32).reshape(H, W)

        # Write input data to unified memory (CPU can access directly);
        # normalize 8-bit pixels to [0, 1] floats for the kernel.
        src_np[:] = host_np.astype(np.float32) / 255.0

        # Configure kernel launch parameters: one thread per output pixel,
        # grid rounded up so partial edge tiles are covered.
        block_size = (16, 16)
        grid_size = (
            (W + block_size[0] - 1) // block_size[0],
            (H + block_size[1] - 1) // block_size[1],
        )

        # Create LaunchConfig for kernel execution
        config = LaunchConfig(grid=grid_size, block=block_size)

        # Launch kernel - buffers can be passed directly as kernel arguments
        launch(
            stream,
            config,
            kernel,
            src_buf,
            dst_buf,
            np.int32(H),
            np.int32(W),
        )

        # Synchronize to ensure kernel completion before reading results
        stream.sync()

        # Return zero-copy view; caller closes buffers when done
        return (dst_np, src_buf, dst_buf)
    except Exception:
        # Don't leak managed memory if anything above fails.
        src_buf.close()
        dst_buf.close()
        raise


def main():
    """
    Complete demonstration of GPU image blurring with cuda.core.

    This example shows:
    1. Device initialization with cuda.core.Device
    2. Kernel compilation with cuda.core.Program
    3. Unified memory with cuda.core.ManagedMemoryResource
    4. Kernel launch with cuda.core.launch and LaunchConfig
    """
    print("=" * 60)
    print("Image Blur with Unified Memory (cuda.core)")
    print("=" * 60)

    # Initialize CUDA device
    device = Device(0)
    device.set_current()
    print(f"\nDevice: {device.name}")
    print(f"Compute Capability: sm_{device.arch}")

    # Create stream for async operations
    stream = device.create_stream()

    try:
        # Compile kernel using cuda.core.Program for this device's arch
        print("\nCompiling CUDA kernel with cuda.core.Program...")
        arch = f"sm_{device.arch}"
        options = ProgramOptions(arch=arch)
        program = Program(BOX_BLUR_KERNEL_CODE, code_type="c++", options=options)
        compiled = program.compile(target_type="cubin")
        kernel = compiled.get_kernel("box_blur_3x3")
        print(f" Compiled for architecture: {arch}")

        # Image parameters
        H, W = 256, 256
        print(f"\nImage size: {H}x{W} grayscale")

        # Create test image
        print("Creating sample image...")
        host_np = make_test_image(H, W, dtype=np.uint8)

        # Blur image on GPU using cuda.core (returns zero-copy view + buffers)
        print("Blurring image on GPU...")
        blurred_result, src_buf, dst_buf = blur_image_unified_memory(
            host_np, device, stream, kernel
        )
        try:
            # Save images (use zero-copy view before releasing buffers)
            print("\nSaving results...")
            original_pil = Image.fromarray(host_np, mode="L")
            original_pil.save("original_image.png")
            print(" Saved: original_image.png")

            # Re-quantize the [0, 1] float result back to 8-bit grayscale.
            blurred_uint8 = (np.clip(blurred_result, 0, 1) * 255).astype(np.uint8)
            blurred_pil = Image.fromarray(blurred_uint8, mode="L")
            blurred_pil.save("blurred_image.png")
            print(" Saved: blurred_image.png")

            # Verify blur was applied
            print("\nVerifying result...")
            original_float = host_np.astype(np.float32) / 255.0
            max_diff = np.max(np.abs(blurred_result - original_float))
            # Print the diagnostic BEFORE the pass/fail decision so it is
            # still visible when the check fails and the process exits.
            print(f" Max difference from original: {max_diff:.4f}")
            blur_applied = max_diff > 0.01
            if blur_applied:
                print(" Test PASSED")
            else:
                print(" Test FAILED - blur not applied")
                sys.exit(1)
        finally:
            # Release unified memory regardless of save/verify outcome.
            src_buf.close()
            dst_buf.close()
    finally:
        stream.close()


if __name__ == "__main__":
    main()