Dheemanth aeab82ff30
CUDA 13.2 samples update (#432)
- Added Python samples for CUDA Python 1.0 release
- Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
2026-05-13 17:13:18 -05:00

270 lines
9.0 KiB
Python

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Image Blur with Unified Memory using cuda.core
Demonstrates GPU image blurring using cuda.core APIs for kernel compilation,
launch, and unified memory allocation.
"""
import sys
try:
import numpy as np
from cuda.core import (
Device,
LaunchConfig,
ManagedMemoryResource,
ManagedMemoryResourceOptions,
Program,
ProgramOptions,
launch,
)
from PIL import Image
except ImportError as e:
print(f"Error: Required package not found: {e}")
print("Please install from requirements.txt:")
print(" pip install -r requirements.txt")
sys.exit(1)
# CUDA kernel source code - compiled at runtime by cuda.core.Program.
# NOTE: the extern "C" symbol name below must stay in sync with the
# get_kernel("box_blur_3x3") lookup in main().
BOX_BLUR_KERNEL_CODE = r"""
extern "C" __global__
void box_blur_3x3(const float* __restrict__ src,
float* __restrict__ dst, int H, int W) {
/*
* Simple 3x3 box blur CUDA kernel.
*
* Each thread computes one output pixel by averaging
* the 3x3 neighborhood of input pixels (stencil pattern).
*/
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= W || y >= H) return;
float sum = 0.0f;
int count = 0;
// 3x3 stencil: iterate over neighborhood
for (int dy = -1; dy <= 1; dy++) {
for (int dx = -1; dx <= 1; dx++) {
int nx = x + dx;
int ny = y + dy;
// Boundary check (clamp to edge)
if (nx >= 0 && nx < W && ny >= 0 && ny < H) {
sum += src[ny * W + nx];
count++;
}
}
}
dst[y * W + x] = sum / count;
}
"""
def make_test_image(h: int, w: int, dtype=np.uint8) -> np.ndarray:
    """Build a synthetic grayscale test image with stripes and a disc.

    The pattern layers white horizontal bands (255), mid-gray vertical
    bands (128, painted over the horizontal ones), and a filled central
    circle (200) so a blur produces clearly visible soft edges.

    Args:
        h: Image height in pixels.
        w: Image width in pixels.
        dtype: NumPy dtype of the returned array (default ``np.uint8``).

    Returns:
        A C-contiguous ``(h, w)`` array containing the test pattern.
    """
    canvas = np.zeros((h, w), dtype=dtype)
    # Horizontal bands: within each 50-row period, the first 25 rows are white.
    canvas[np.arange(h) % 50 < 25, :] = 255
    # Vertical bands: within each 50-column period, the first 25 columns
    # are mid-gray (these overwrite the horizontal bands where they cross).
    canvas[:, np.arange(w) % 50 < 25] = 128
    # Filled disc in the middle for a curved edge.
    cy, cx = h // 2, w // 2
    yy, xx = np.ogrid[:h, :w]
    radius = min(h, w) // 6
    canvas[(xx - cx) ** 2 + (yy - cy) ** 2 <= radius**2] = 200
    return np.ascontiguousarray(canvas)
def blur_image_unified_memory(
    host_np: np.ndarray, device: Device, stream, kernel
) -> tuple[np.ndarray, object, object]:
    """
    Run the 3x3 box-blur kernel over *host_np* using CUDA unified memory.

    Workflow demonstrated here:
    1. Allocate managed (unified) memory via ManagedMemoryResource.
    2. Wrap the buffers in zero-copy NumPy views via np.from_dlpack().
    3. Launch the kernel through cuda.core.launch with a LaunchConfig.

    Args:
        host_np: 2-D NumPy array holding the input image on the CPU.
        device: CUDA device used as the buffers' preferred location.
        stream: cuda.core Stream used for allocation and the launch.
        kernel: Compiled cuda.core Kernel object (box_blur_3x3).

    Returns:
        Tuple ``(dst_np, src_buf, dst_buf)``. ``dst_np`` is a zero-copy
        view into unified memory; the caller must close both buffers
        once done with ``dst_np`` to avoid leaking managed memory.
    """
    height, width = host_np.shape
    nbytes = height * width * np.dtype(np.float32).itemsize

    # Unified-memory allocator preferring residency on this device.
    mr = ManagedMemoryResource(
        ManagedMemoryResourceOptions(preferred_location=device.device_id)
    )
    src_buf = mr.allocate(nbytes, stream)
    dst_buf = mr.allocate(nbytes, stream)
    try:
        # Allocations are stream-ordered; wait before the CPU touches them.
        stream.sync()

        # Zero-copy NumPy views over the managed buffers (DLPack protocol).
        src_view = np.from_dlpack(src_buf).view(np.float32).reshape(height, width)
        dst_view = np.from_dlpack(dst_buf).view(np.float32).reshape(height, width)

        # The CPU writes straight into unified memory; normalize to [0, 1].
        src_view[:] = host_np.astype(np.float32) / 255.0

        # One thread per pixel: 16x16 blocks, grid rounded up to cover
        # the image (ceiling division via negation).
        block = (16, 16)
        grid = (-(-width // block[0]), -(-height // block[1]))
        cfg = LaunchConfig(grid=grid, block=block)

        # Managed buffers are valid kernel arguments as-is.
        launch(
            stream,
            cfg,
            kernel,
            src_buf,
            dst_buf,
            np.int32(height),
            np.int32(width),
        )

        # Ensure the kernel finished before the caller reads dst_view.
        stream.sync()
        return (dst_view, src_buf, dst_buf)
    except Exception:
        # Don't leak managed memory if anything above failed.
        src_buf.close()
        dst_buf.close()
        raise
def main():
    """
    End-to-end demo: blur an image on the GPU with cuda.core.

    Steps shown:
    1. Device initialization with cuda.core.Device
    2. Runtime kernel compilation with cuda.core.Program
    3. Unified memory with cuda.core.ManagedMemoryResource
    4. Kernel launch with cuda.core.launch and LaunchConfig
    """
    banner = "=" * 60
    print(banner)
    print("Image Blur with Unified Memory (cuda.core)")
    print(banner)

    # Select and activate GPU 0.
    gpu = Device(0)
    gpu.set_current()
    print(f"\nDevice: {gpu.name}")
    print(f"Compute Capability: sm_{gpu.arch}")

    # Dedicated stream for the allocations and the kernel launch.
    stream = gpu.create_stream()
    try:
        # JIT-compile the kernel source for this device's architecture.
        print("\nCompiling CUDA kernel with cuda.core.Program...")
        arch = f"sm_{gpu.arch}"
        prog = Program(
            BOX_BLUR_KERNEL_CODE,
            code_type="c++",
            options=ProgramOptions(arch=arch),
        )
        kernel = prog.compile(target_type="cubin").get_kernel("box_blur_3x3")
        print(f" Compiled for architecture: {arch}")

        H, W = 256, 256
        print(f"\nImage size: {H}x{W} grayscale")

        print("Creating sample image...")
        image = make_test_image(H, W, dtype=np.uint8)

        # Returns a zero-copy view plus the two managed buffers backing it.
        print("Blurring image on GPU...")
        blurred, src_buf, dst_buf = blur_image_unified_memory(
            image, gpu, stream, kernel
        )
        try:
            # Persist input and output while the buffers are still alive.
            print("\nSaving results...")
            Image.fromarray(image, mode="L").save("original_image.png")
            print(" Saved: original_image.png")
            as_uint8 = (np.clip(blurred, 0, 1) * 255).astype(np.uint8)
            Image.fromarray(as_uint8, mode="L").save("blurred_image.png")
            print(" Saved: blurred_image.png")

            # Sanity check: the blur must have changed the pixel values.
            print("\nVerifying result...")
            reference = image.astype(np.float32) / 255.0
            max_diff = np.max(np.abs(blurred - reference))
            if max_diff > 0.01:
                print(" Test PASSED")
            else:
                print(" Test FAILED - blur not applied")
                sys.exit(1)
            print(f" Max difference from original: {max_diff:.4f}")
        finally:
            src_buf.close()
            dst_buf.close()
    finally:
        stream.close()
# Standard script entry guard: run the demo only when executed directly.
if __name__ == "__main__":
    main()