# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Launch Configuration Tuning
Demonstrates how to find the optimal threads-per-block configuration for CUDA
kernels using cuda.core APIs. Benchmarks different thread layouts to answer:
"What is the best threads-per-block for my kernel?"
"""
import sys
try:
import numpy as np
from cuda.core import (
Device,
EventOptions,
LaunchConfig,
ManagedMemoryResource,
ManagedMemoryResourceOptions,
Program,
ProgramOptions,
launch,
)
except ImportError as e:
print(f"Error: Required package not found: {e}")
print("Please install from requirements.txt:")
print(" pip install -r requirements.txt")
    sys.exit(1)

# =============================================================================
# CUDA Kernel Source Code
# =============================================================================
# Vector Addition Kernel - Simple memory-bound kernel (grid-stride loop)
VECTOR_ADD_KERNEL = r"""
extern "C" __global__
void vector_add(const float* __restrict__ a,
const float* __restrict__ b,
float* __restrict__ c,
int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < n; i += stride) {
c[i] = a[i] + b[i];
}
}
"""
# Reduction Kernel - Sensitive to block size due to shared memory (grid-stride load)
REDUCTION_KERNEL = r"""
extern "C" __global__
void reduce_sum(const float* __restrict__ input,
float* __restrict__ partial_sums,
int n) {
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int stride = blockDim.x * gridDim.x;
// Load data into shared memory (grid-stride loop)
float sum = 0.0f;
for (unsigned int i = blockIdx.x * blockDim.x + tid; i < n; i += stride) {
sum += input[i];
}
sdata[tid] = sum;
__syncthreads();
// Perform reduction in shared memory
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// Write result for this block
if (tid == 0) {
partial_sums[blockIdx.x] = sdata[0];
}
}
"""
# =============================================================================
# Utility Functions
# =============================================================================
def compile_kernel(device, kernel_code, kernel_name):
"""Compile a CUDA kernel using cuda.core.Program."""
arch = f"sm_{device.arch}"
options = ProgramOptions(arch=arch)
program = Program(kernel_code, code_type="c++", options=options)
compiled = program.compile(target_type="cubin")
    return compiled.get_kernel(kernel_name)

def benchmark_kernel_1d(
device,
stream,
kernel,
args,
n_elements,
block_size,
n_iterations=100,
shared_mem_bytes=0,
):
"""
Benchmark a 1D kernel with given threads-per-block configuration.
Uses CUDA events for accurate GPU timing.
Returns timing statistics as a dictionary.
"""
grid_size = (n_elements + block_size - 1) // block_size
config = LaunchConfig(
grid=(grid_size,), block=(block_size,), shmem_size=shared_mem_bytes
)
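    # shmem_size requests dynamic shared memory per block (in bytes); it backs
    # the `extern __shared__` array in reduce_sum, and vector_add passes 0
    # because it uses no shared memory.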
# Warm-up run
launch(stream, config, kernel, *args)
stream.sync()
# Timed runs with CUDA events
event_opts = EventOptions(enable_timing=True)
start_event = device.create_event(options=event_opts)
end_event = device.create_event(options=event_opts)
stream.record(start_event)
for _ in range(n_iterations):
launch(stream, config, kernel, *args)
stream.record(end_event)
end_event.sync()
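    # Subtracting two timing-enabled events gives the elapsed time in
    # milliseconds; dividing by the loop count yields a per-launch average.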
elapsed_ms = (end_event - start_event) / n_iterations
return {
"block_size": block_size,
"grid_size": grid_size,
"mean_time_ms": elapsed_ms,
"std_time_ms": 0.0, # Single measurement with events
}
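
# Optional helper (an illustrative sketch, not called by the demos below):
# turns a timing into effective memory bandwidth, a handy sanity check for
# memory-bound kernels; vector_add moves 12 bytes per element (two float
# reads plus one float write).
def effective_bandwidth_gbs(n_bytes_moved, elapsed_ms):
    """Return effective bandwidth in GB/s given bytes moved and time in ms."""
    return (n_bytes_moved / 1e9) / (elapsed_ms / 1e3)
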
def print_gpu_info(device):
"""Print GPU information relevant to launch configuration."""
print(f"\nDevice: {device.name}")
cc = device.compute_capability
print(f"Compute Capability: {cc.major}.{cc.minor}")
def allocate_managed_array(mr, stream, n_elements, dtype=np.float32):
"""Allocate device-preferred unified memory and return buffer with numpy view."""
n_bytes = n_elements * np.dtype(dtype).itemsize
buffer = mr.allocate(n_bytes, stream)
stream.sync()
# Zero-copy numpy view via DLPack (holds reference to buffer)
np_view = np.from_dlpack(buffer).view(dtype).reshape(n_elements)
    return buffer, np_view

# =============================================================================
# Benchmark Demonstrations
# =============================================================================
def demo_vector_add_tuning(device, stream, mr, kernel):
"""Demonstrate launch configuration tuning for vector addition."""
print("\n" + "=" * 60)
print("VECTOR ADDITION - Launch Configuration Tuning")
print("=" * 60)
N = 10_000_000 # 10 million elements
print(f"\nProblem size: {N:,} elements")
print("Kernel: vector_add (C = A + B)")
# Allocate device-preferred unified memory via cuda.core
d_a, np_a = allocate_managed_array(mr, stream, N)
d_b, np_b = allocate_managed_array(mr, stream, N)
d_c, np_c = allocate_managed_array(mr, stream, N)
try:
# Initialize data via numpy views
np_a[:] = np.random.rand(N).astype(np.float32)
np_b[:] = np.random.rand(N).astype(np.float32)
stream.sync()
# Thread configurations to test (multiples of warp size = 32)
thread_configs = [32, 64, 128, 256, 512, 1024]
print(f"\nTesting thread configurations: {thread_configs}")
print("-" * 60)
results = []
for tpb in thread_configs:
result = benchmark_kernel_1d(
device,
stream,
kernel,
(d_a, d_b, d_c, np.int32(N)),
N,
tpb,
n_iterations=100,
)
results.append(result)
print(
f"Block Size: {tpb:4d} | Blocks: {result['grid_size']:6d} | "
f"Time: {result['mean_time_ms']:.4f} ms"
)
# Find optimal and worst configurations
best = min(results, key=lambda x: x["mean_time_ms"])
worst = max(results, key=lambda x: x["mean_time_ms"])
print("-" * 60)
print(
f"\n✓ OPTIMAL: block_size={best['block_size']} "
f"({best['mean_time_ms']:.4f} ms)"
)
print(
f"✗ WORST: block_size={worst['block_size']} "
f"({worst['mean_time_ms']:.4f} ms)"
)
print(f" Speedup: {worst['mean_time_ms']/best['mean_time_ms']:.2f}x")
# Verify result
stream.sync()
expected = np_a + np_b
        if np.allclose(np_c, expected):
            print("\n✓ Results verified correct!")
        else:
            print("\n✗ Verification FAILED: GPU result differs from numpy!")
return results
finally:
d_a.close()
d_b.close()
        d_c.close()

def demo_reduction_tuning(device, stream, mr, kernel):
"""Demonstrate launch config tuning for reduction (shared memory)."""
print("\n" + "=" * 60)
print("REDUCTION - Launch Configuration Tuning")
print("=" * 60)
N = 16_777_216 # 16M elements (power of 2)
print(f"\nProblem size: {N:,} elements")
print("Kernel: reduce_sum (parallel reduction)")
print("Note: Reduction uses shared memory - more sensitive to block size!")
# Allocate device-preferred unified memory via cuda.core
d_input, np_input = allocate_managed_array(mr, stream, N)
try:
np_input[:] = np.random.rand(N).astype(np.float32)
stream.sync()
thread_configs = [32, 64, 128, 256, 512, 1024]
print(f"\nTesting thread configurations: {thread_configs}")
print("-" * 60)
results = []
for tpb in thread_configs:
# Allocate partial sums array
n_blocks = (N + tpb - 1) // tpb
d_partial, _ = allocate_managed_array(mr, stream, n_blocks)
try:
# Shared memory size = block_size * sizeof(float)
shared_mem_bytes = tpb * 4
result = benchmark_kernel_1d(
device,
stream,
kernel,
(d_input, d_partial, np.int32(N)),
N,
tpb,
n_iterations=50,
shared_mem_bytes=shared_mem_bytes,
)
results.append(result)
print(
f"Block Size: {tpb:4d} | Blocks: {result['grid_size']:6d} | "
f"Time: {result['mean_time_ms']:.4f} ms"
)
finally:
d_partial.close()
best = min(results, key=lambda x: x["mean_time_ms"])
worst = max(results, key=lambda x: x["mean_time_ms"])
print("-" * 60)
print(f"\n✓ OPTIMAL: block_size={best['block_size']}")
print(
f" Speedup over worst: {worst['mean_time_ms']/best['mean_time_ms']:.2f}x"
)
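        # Each timed launch leaves one partial sum per block in d_partial; a
        # complete reduction would finish with a second pass (another kernel
        # launch or a host-side sum of the partial-sums view). Only the first,
        # dominant pass is benchmarked here.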
return results
finally:
        d_input.close()

# =============================================================================
# Main
# =============================================================================
def main():
"""
Complete demonstration of CUDA launch configuration tuning.
This sample shows:
1. Device initialization with cuda.core.Device
2. Kernel compilation with cuda.core.Program
3. Benchmarking different thread block configurations
4. Finding optimal threads-per-block for various kernel types
"""
print("=" * 60)
print("Launch Configuration Tuning (cuda.core)")
print("Finding the Best Block Size for Your Kernel")
print("=" * 60)
# Initialize CUDA device
device = Device(0)
device.set_current()
# Print GPU information
print_gpu_info(device)
# Create stream and device-preferred memory resource
stream = device.create_stream()
mr_options = ManagedMemoryResourceOptions(preferred_location=device.device_id)
mr = ManagedMemoryResource(mr_options)
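    # Managed (unified) memory is reachable from host and device alike;
    # preferring the device keeps pages resident on the GPU during timed runs
    # while still allowing the zero-copy numpy views used for initialization
    # and verification.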
try:
# Compile kernels
print("\nCompiling CUDA kernels with cuda.core.Program...")
arch = f"sm_{device.arch}"
print(f" Target architecture: {arch}")
vec_add_kernel = compile_kernel(device, VECTOR_ADD_KERNEL, "vector_add")
print(" ✓ vector_add kernel compiled")
reduction_kernel = compile_kernel(device, REDUCTION_KERNEL, "reduce_sum")
print(" ✓ reduce_sum kernel compiled")
# Run demonstrations
demo_vector_add_tuning(device, stream, mr, vec_add_kernel)
demo_reduction_tuning(device, stream, mr, reduction_kernel)
print("\n" + "=" * 60)
print("SAMPLE COMPLETE")
print("=" * 60)
print("\nKey Takeaway: The optimal thread configuration depends on your")
print("specific kernel characteristics. Always benchmark to find the best!")
print()
finally:
        stream.close()

if __name__ == "__main__":
main()