mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-05-14 14:06:53 +08:00
- Added Python samples for CUDA Python 1.0 release - Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
270 lines
9.0 KiB
Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""
|
|
Image Blur with Unified Memory using cuda.core
|
|
|
|
Demonstrates GPU image blurring using cuda.core APIs for kernel compilation,
|
|
launch, and unified memory allocation.
|
|
"""
|
|
|
|
import sys

# Import third-party dependencies up front so a missing package produces a
# clear, actionable install hint instead of a traceback later in the run.
try:
    import numpy as np
    from cuda.core import (
        Device,
        LaunchConfig,
        ManagedMemoryResource,
        ManagedMemoryResourceOptions,
        Program,
        ProgramOptions,
        launch,
    )
    from PIL import Image
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)
|
|
|
|
|
|
# CUDA kernel source code - compiled at runtime by cuda.core.Program
|
|
BOX_BLUR_KERNEL_CODE = r"""
|
|
extern "C" __global__
|
|
void box_blur_3x3(const float* __restrict__ src,
|
|
float* __restrict__ dst, int H, int W) {
|
|
/*
|
|
* Simple 3x3 box blur CUDA kernel.
|
|
*
|
|
* Each thread computes one output pixel by averaging
|
|
* the 3x3 neighborhood of input pixels (stencil pattern).
|
|
*/
|
|
|
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
if (x >= W || y >= H) return;
|
|
|
|
float sum = 0.0f;
|
|
int count = 0;
|
|
|
|
// 3x3 stencil: iterate over neighborhood
|
|
for (int dy = -1; dy <= 1; dy++) {
|
|
for (int dx = -1; dx <= 1; dx++) {
|
|
int nx = x + dx;
|
|
int ny = y + dy;
|
|
|
|
// Boundary check (clamp to edge)
|
|
if (nx >= 0 && nx < W && ny >= 0 && ny < H) {
|
|
sum += src[ny * W + nx];
|
|
count++;
|
|
}
|
|
}
|
|
}
|
|
|
|
dst[y * W + x] = sum / count;
|
|
}
|
|
"""
|
|
|
|
|
|
def make_test_image(h: int, w: int, dtype=np.uint8) -> np.ndarray:
    """Build a synthetic grayscale test image for the blur demo.

    The pattern is layered in order: horizontal bands at 255, vertical
    bands at 128 painted over them, and a centered disc at 200 on top.
    """
    img = np.zeros((h, w), dtype=dtype)

    # Horizontal stripes: the first 25 rows of every 50-row band are bright.
    img[np.arange(h) % 50 < 25, :] = 255

    # Vertical stripes: the first 25 columns of every 50-column band are
    # mid-gray, overwriting the horizontal stripes where they cross.
    img[:, np.arange(w) % 50 < 25] = 128

    # Central disc for interesting blur effects, drawn over everything.
    cy, cx = h // 2, w // 2
    yy, xx = np.ogrid[:h, :w]
    radius = min(h, w) // 6
    img[(xx - cx) ** 2 + (yy - cy) ** 2 <= radius**2] = 200

    return np.ascontiguousarray(img)
|
|
|
|
|
|
def blur_image_unified_memory(
    host_np: np.ndarray, device: Device, stream, kernel
) -> tuple[np.ndarray, object, object]:
    """
    Run the box-blur kernel over ``host_np`` using unified (managed) memory.

    Steps demonstrated:
      1. Allocate managed memory via ManagedMemoryResource.
      2. Wrap the allocations as zero-copy NumPy arrays with np.from_dlpack().
      3. Dispatch the kernel through cuda.core.launch.

    Args:
        host_np: NumPy array containing image data on CPU.
        device: CUDA device to use.
        stream: cuda.core Stream for async operations.
        kernel: Compiled cuda.core Kernel object.

    Returns:
        Tuple of (dst_np, src_buf, dst_buf). dst_np is a zero-copy view into
        unified memory. Caller must close src_buf and dst_buf when done with
        dst_np to avoid leaking managed memory.
    """
    height, width = host_np.shape
    nbytes = height * width * np.dtype(np.float32).itemsize

    # Managed-memory resource preferring residency on the target device.
    mem_resource = ManagedMemoryResource(
        ManagedMemoryResourceOptions(preferred_location=device.device_id)
    )

    # One unified-memory buffer each for the input and output images.
    src_buf = mem_resource.allocate(nbytes, stream)
    dst_buf = mem_resource.allocate(nbytes, stream)
    try:
        # Make sure the allocations have completed before the CPU touches them.
        stream.sync()

        # Zero-copy NumPy views over the managed buffers (DLPack protocol).
        src_view = np.from_dlpack(src_buf).view(np.float32).reshape(height, width)
        out_view = np.from_dlpack(dst_buf).view(np.float32).reshape(height, width)

        # The CPU writes unified memory directly: normalize input to [0, 1].
        src_view[:] = host_np.astype(np.float32) / 255.0

        # 16x16 thread blocks; enough blocks to cover the whole image.
        block_dims = (16, 16)
        grid_dims = (
            (width + block_dims[0] - 1) // block_dims[0],
            (height + block_dims[1] - 1) // block_dims[1],
        )

        # Buffers are accepted directly as kernel arguments.
        launch(
            stream,
            LaunchConfig(grid=grid_dims, block=block_dims),
            kernel,
            src_buf,
            dst_buf,
            np.int32(height),
            np.int32(width),
        )

        # Wait for the kernel so the result view is safe to read on the CPU.
        stream.sync()

        # Hand back the zero-copy view; the caller now owns buffer cleanup.
        return (out_view, src_buf, dst_buf)
    except Exception:
        src_buf.close()
        dst_buf.close()
        raise
|
|
|
|
|
|
def main():
    """
    Complete demonstration of GPU image blurring with cuda.core.

    Walks through:
      1. Device initialization with cuda.core.Device
      2. Kernel compilation with cuda.core.Program
      3. Unified memory with cuda.core.ManagedMemoryResource
      4. Kernel launch with cuda.core.launch and LaunchConfig
    """
    banner = "=" * 60
    print(banner)
    print("Image Blur with Unified Memory (cuda.core)")
    print(banner)

    # Select and activate the first CUDA device.
    dev = Device(0)
    dev.set_current()

    print(f"\nDevice: {dev.name}")
    print(f"Compute Capability: sm_{dev.arch}")

    # One stream carries all async work in this demo.
    stream = dev.create_stream()
    try:
        # JIT-compile the kernel for this device's architecture.
        print("\nCompiling CUDA kernel with cuda.core.Program...")
        arch = f"sm_{dev.arch}"
        program = Program(
            BOX_BLUR_KERNEL_CODE,
            code_type="c++",
            options=ProgramOptions(arch=arch),
        )
        module = program.compile(target_type="cubin")
        blur_kernel = module.get_kernel("box_blur_3x3")
        print(f" Compiled for architecture: {arch}")

        # Fixed-size grayscale test image.
        height, width = 256, 256
        print(f"\nImage size: {height}x{width} grayscale")

        print("Creating sample image...")
        source_img = make_test_image(height, width, dtype=np.uint8)

        # Blur on the GPU; we receive a zero-copy view plus the two
        # buffers backing it, which we are responsible for closing.
        print("Blurring image on GPU...")
        blur_view, in_buf, out_buf = blur_image_unified_memory(
            source_img, dev, stream, blur_kernel
        )
        try:
            # Persist both images before the unified buffers are released.
            print("\nSaving results...")
            Image.fromarray(source_img, mode="L").save("original_image.png")
            print(" Saved: original_image.png")

            blurred_uint8 = (np.clip(blur_view, 0, 1) * 255).astype(np.uint8)
            Image.fromarray(blurred_uint8, mode="L").save("blurred_image.png")
            print(" Saved: blurred_image.png")

            # Sanity check: output must differ measurably from the input.
            print("\nVerifying result...")
            as_float = source_img.astype(np.float32) / 255.0
            max_diff = np.max(np.abs(blur_view - as_float))

            if max_diff > 0.01:
                print(" Test PASSED")
            else:
                print(" Test FAILED - blur not applied")
                sys.exit(1)

            print(f" Max difference from original: {max_diff:.4f}")
        finally:
            in_buf.close()
            out_buf.close()
    finally:
        stream.close()
|
|
|
|
|
|
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|