# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Launch Configuration Tuning

Demonstrates how to find the optimal threads-per-block configuration for
CUDA kernels using cuda.core APIs. Benchmarks different block sizes to
answer: "What is the best threads-per-block for my kernel?"
"""

import sys

try:
    import numpy as np
    from cuda.core import (
        Device,
        EventOptions,
        LaunchConfig,
        ManagedMemoryResource,
        ManagedMemoryResourceOptions,
        Program,
        ProgramOptions,
        launch,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)

# =============================================================================
# CUDA Kernel Source Code
# =============================================================================

# Vector addition kernel - a simple memory-bound kernel (grid-stride loop)
VECTOR_ADD_KERNEL = r"""
extern "C" __global__
void vector_add(const float* __restrict__ a,
                const float* __restrict__ b,
                float* __restrict__ c,
                int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = idx; i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}
"""

# Reduction kernel - sensitive to block size due to shared memory (grid-stride load)
REDUCTION_KERNEL = r"""
extern "C" __global__
void reduce_sum(const float* __restrict__ input,
                float* __restrict__ partial_sums,
                int n)
{
    extern __shared__ float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int stride = blockDim.x * gridDim.x;

    // Accumulate into a register with a grid-stride loop, then stage the
    // per-thread sum in shared memory
    float sum = 0.0f;
    for (unsigned int i = blockIdx.x * blockDim.x + tid; i < n; i += stride) {
        sum += input[i];
    }
    sdata[tid] = sum;
    __syncthreads();

    // Tree reduction in shared memory (assumes blockDim.x is a power of two)
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Thread 0 writes this block's partial sum
    if (tid == 0) {
        partial_sums[blockIdx.x] = sdata[0];
    }
}
"""
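
# The shared-memory loop in reduce_sum halves the number of active threads
# each step, which is why blockDim.x must be a power of two (every block size
# tested below satisfies this). As a minimal host-side sketch of that same
# tree reduction (a hypothetical helper, shown purely for illustration and
# not used by the benchmarks):
def _tree_reduce_sketch(block):
    """Mirror the shared-memory tree reduction on a power-of-two-sized array."""
    data = np.asarray(block, dtype=np.float32).copy()
    s = data.size // 2
    while s > 0:
        data[:s] += data[s : 2 * s]  # mirrors sdata[tid] += sdata[tid + s]
        s //= 2
    return float(data[0])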

# =============================================================================
# Utility Functions
# =============================================================================


def compile_kernel(device, kernel_code, kernel_name):
    """Compile a CUDA kernel for the current device using cuda.core.Program."""
    arch = f"sm_{device.arch}"
    options = ProgramOptions(arch=arch)
    program = Program(kernel_code, code_type="c++", options=options)
    compiled = program.compile(target_type="cubin")
    return compiled.get_kernel(kernel_name)


def benchmark_kernel_1d(
    device,
    stream,
    kernel,
    args,
    n_elements,
    block_size,
    n_iterations=100,
    shared_mem_bytes=0,
):
    """
    Benchmark a 1D kernel with a given threads-per-block configuration.

    Uses CUDA events for accurate GPU-side timing and returns timing
    statistics as a dictionary.
    """
    grid_size = (n_elements + block_size - 1) // block_size
    config = LaunchConfig(
        grid=(grid_size,), block=(block_size,), shmem_size=shared_mem_bytes
    )

    # Warm-up run (absorbs one-time costs such as module loading)
    launch(stream, config, kernel, *args)
    stream.sync()

    # Timed runs bracketed by CUDA events recorded on the same stream
    event_opts = EventOptions(enable_timing=True)
    start_event = device.create_event(options=event_opts)
    end_event = device.create_event(options=event_opts)

    stream.record(start_event)
    for _ in range(n_iterations):
        launch(stream, config, kernel, *args)
    stream.record(end_event)
    end_event.sync()

    # Event subtraction yields the elapsed time in milliseconds
    elapsed_ms = (end_event - start_event) / n_iterations

    return {
        "block_size": block_size,
        "grid_size": grid_size,
        "mean_time_ms": elapsed_ms,
        "std_time_ms": 0.0,  # Events give one aggregate measurement, so no spread
    }


def print_gpu_info(device):
    """Print GPU information relevant to launch configuration."""
    print(f"\nDevice: {device.name}")
    cc = device.compute_capability
    print(f"Compute Capability: {cc.major}.{cc.minor}")


def allocate_managed_array(mr, stream, n_elements, dtype=np.float32):
    """Allocate device-preferred unified memory; return the buffer and a numpy view."""
    n_bytes = n_elements * np.dtype(dtype).itemsize
    buffer = mr.allocate(n_bytes, stream)
    stream.sync()
    # Zero-copy numpy view via DLPack (the view holds a reference to the buffer)
    np_view = np.from_dlpack(buffer).view(dtype).reshape(n_elements)
    return buffer, np_view
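
# For comparison, a kernel can also be timed with a host clock around a
# stream.sync(). A sketch of that alternative follows (a hypothetical helper,
# not used by the demos below); it includes launch overhead and sync latency
# in the measurement, which is why the benchmarks above use CUDA events:
def _benchmark_walltime_sketch(stream, config, kernel, args, n_iterations=100):
    """Time a kernel with the host clock; less precise than CUDA events."""
    import time

    launch(stream, config, kernel, *args)  # warm-up
    stream.sync()
    t0 = time.perf_counter()
    for _ in range(n_iterations):
        launch(stream, config, kernel, *args)
    stream.sync()  # wait for all launches to finish before stopping the clock
    return (time.perf_counter() - t0) * 1000.0 / n_iterations  # ms per launch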

# =============================================================================
# Benchmark Demonstrations
# =============================================================================


def demo_vector_add_tuning(device, stream, mr, kernel):
    """Demonstrate launch configuration tuning for vector addition."""
    print("\n" + "=" * 60)
    print("VECTOR ADDITION - Launch Configuration Tuning")
    print("=" * 60)

    N = 10_000_000  # 10 million elements
    print(f"\nProblem size: {N:,} elements")
    print("Kernel: vector_add (C = A + B)")

    # Allocate device-preferred unified memory via cuda.core
    d_a, np_a = allocate_managed_array(mr, stream, N)
    d_b, np_b = allocate_managed_array(mr, stream, N)
    d_c, np_c = allocate_managed_array(mr, stream, N)

    try:
        # Initialize data via the numpy views
        np_a[:] = np.random.rand(N).astype(np.float32)
        np_b[:] = np.random.rand(N).astype(np.float32)
        stream.sync()

        # Thread configurations to test (multiples of the warp size, 32)
        thread_configs = [32, 64, 128, 256, 512, 1024]
        print(f"\nTesting thread configurations: {thread_configs}")
        print("-" * 60)

        results = []
        for tpb in thread_configs:
            result = benchmark_kernel_1d(
                device,
                stream,
                kernel,
                (d_a, d_b, d_c, np.int32(N)),
                N,
                tpb,
                n_iterations=100,
            )
            results.append(result)
            print(
                f"Block Size: {tpb:4d} | Blocks: {result['grid_size']:6d} | "
                f"Time: {result['mean_time_ms']:.4f} ms"
            )

        # Find the best and worst configurations
        best = min(results, key=lambda x: x["mean_time_ms"])
        worst = max(results, key=lambda x: x["mean_time_ms"])

        print("-" * 60)
        print(
            f"\n✓ OPTIMAL: block_size={best['block_size']} "
            f"({best['mean_time_ms']:.4f} ms)"
        )
        print(
            f"✗ WORST:   block_size={worst['block_size']} "
            f"({worst['mean_time_ms']:.4f} ms)"
        )
        print(f"  Speedup: {worst['mean_time_ms'] / best['mean_time_ms']:.2f}x")

        # Verify the result against a CPU reference
        stream.sync()
        expected = np_a + np_b
        if np.allclose(np_c, expected):
            print("\n✓ Results verified correct!")
        else:
            print("\n✗ WARNING: Results do not match the CPU reference!")

        return results
    finally:
        d_a.close()
        d_b.close()
        d_c.close()


def demo_reduction_tuning(device, stream, mr, kernel):
    """Demonstrate launch configuration tuning for reduction (shared memory)."""
    print("\n" + "=" * 60)
    print("REDUCTION - Launch Configuration Tuning")
    print("=" * 60)

    N = 16_777_216  # 16M elements (power of 2)
    print(f"\nProblem size: {N:,} elements")
    print("Kernel: reduce_sum (parallel reduction)")
    print("Note: Reduction uses shared memory - more sensitive to block size!")

    # Allocate device-preferred unified memory via cuda.core
    d_input, np_input = allocate_managed_array(mr, stream, N)

    try:
        np_input[:] = np.random.rand(N).astype(np.float32)
        stream.sync()

        thread_configs = [32, 64, 128, 256, 512, 1024]
        print(f"\nTesting thread configurations: {thread_configs}")
        print("-" * 60)

        results = []
        for tpb in thread_configs:
            # Allocate the per-block partial sums array for this block size
            n_blocks = (N + tpb - 1) // tpb
            d_partial, _ = allocate_managed_array(mr, stream, n_blocks)
            try:
                # Dynamic shared memory size = block_size * sizeof(float)
                shared_mem_bytes = tpb * 4
                result = benchmark_kernel_1d(
                    device,
                    stream,
                    kernel,
                    (d_input, d_partial, np.int32(N)),
                    N,
                    tpb,
                    n_iterations=50,
                    shared_mem_bytes=shared_mem_bytes,
                )
                results.append(result)
                print(
                    f"Block Size: {tpb:4d} | Blocks: {result['grid_size']:6d} | "
                    f"Time: {result['mean_time_ms']:.4f} ms"
                )
            finally:
                d_partial.close()

        best = min(results, key=lambda x: x["mean_time_ms"])
        worst = max(results, key=lambda x: x["mean_time_ms"])

        print("-" * 60)
        print(
            f"\n✓ OPTIMAL: block_size={best['block_size']} "
            f"({best['mean_time_ms']:.4f} ms)"
        )
        print(
            f"  Speedup over worst: {worst['mean_time_ms'] / best['mean_time_ms']:.2f}x"
        )

        return results
    finally:
        d_input.close()
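
# vector_add reads A and B and writes C, so each launch moves roughly
# 3 * N * 4 bytes of float32 data. A rough effective-bandwidth estimate from
# the measured times (a hypothetical helper, shown for illustration; it
# assumes the result dicts produced by benchmark_kernel_1d above):
def _effective_bandwidth_gbs_sketch(n_elements, mean_time_ms):
    """Estimate effective bandwidth in GB/s for the vector_add kernel."""
    bytes_moved = 3 * n_elements * 4  # two reads + one write, sizeof(float)
    return bytes_moved / (mean_time_ms * 1e-3) / 1e9
    # e.g. _effective_bandwidth_gbs_sketch(10_000_000, best["mean_time_ms"])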

# =============================================================================
# Main
# =============================================================================


def main():
    """
    Complete demonstration of CUDA launch configuration tuning.

    This sample shows:
      1. Device initialization with cuda.core.Device
      2. Kernel compilation with cuda.core.Program
      3. Benchmarking different thread block configurations
      4. Finding optimal threads-per-block for various kernel types
    """
    print("=" * 60)
    print("Launch Configuration Tuning (cuda.core)")
    print("Finding the Best Block Size for Your Kernel")
    print("=" * 60)

    # Initialize the CUDA device
    device = Device(0)
    device.set_current()

    # Print GPU information
    print_gpu_info(device)

    # Create a stream and a device-preferred managed memory resource
    stream = device.create_stream()
    mr_options = ManagedMemoryResourceOptions(preferred_location=device.device_id)
    mr = ManagedMemoryResource(mr_options)

    try:
        # Compile kernels
        print("\nCompiling CUDA kernels with cuda.core.Program...")
        arch = f"sm_{device.arch}"
        print(f"  Target architecture: {arch}")

        vec_add_kernel = compile_kernel(device, VECTOR_ADD_KERNEL, "vector_add")
        print("  ✓ vector_add kernel compiled")

        reduction_kernel = compile_kernel(device, REDUCTION_KERNEL, "reduce_sum")
        print("  ✓ reduce_sum kernel compiled")

        # Run demonstrations
        demo_vector_add_tuning(device, stream, mr, vec_add_kernel)
        demo_reduction_tuning(device, stream, mr, reduction_kernel)

        print("\n" + "=" * 60)
        print("SAMPLE COMPLETE")
        print("=" * 60)
        print("\nKey Takeaway: The optimal thread configuration depends on your")
        print("specific kernel characteristics. Always benchmark to find the best!")
        print()
    finally:
        stream.close()


if __name__ == "__main__":
    main()