mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-05-14 14:06:53 +08:00
- Added Python samples for CUDA Python 1.0 release - Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
260 lines
8.0 KiB
Python
260 lines
8.0 KiB
Python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
"""
|
|
Block-wise Array Sum with Threaded Access
|
|
|
|
Demonstrates thread/block indexing, strided loops, and block-wise reduction.
|
|
|
|
Key Concepts:
|
|
Global Thread ID = blockIdx.x * blockDim.x + threadIdx.x
|
|
Stride = blockDim.x * gridDim.x
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
|
|
from cuda_samples_utils import verify_array_result
|
|
|
|
try:
|
|
import cupy as cp
|
|
import numpy as np
|
|
from cuda.core import (
|
|
Device,
|
|
EventOptions,
|
|
LaunchConfig,
|
|
Program,
|
|
ProgramOptions,
|
|
launch,
|
|
)
|
|
except ImportError as e:
|
|
print(f"Error: Required package not found: {e}")
|
|
print("Install with: pip install -r requirements.txt")
|
|
sys.exit(1)
|
|
|
|
|
|
# CUDA C++ source for the three demonstration kernels. It is compiled at
# runtime and the kernels are looked up by their extern "C" names.
#
# Every global index is widened to size_t *before* the multiplication:
# blockIdx.x and blockDim.x are 32-bit unsigned values, so multiplying them
# first and widening afterwards would wrap for grids with more than 2^32
# threads. The stride computation uses the same cast.
KERNELS_CODE: str = r"""
// Each thread processes one element
extern "C" __global__
void simple_indexing(const float* input, float* output, size_t N) {
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        output[tid] = input[tid] * 2.0f;
    }
}

// Each thread processes multiple elements via strided access
extern "C" __global__
void strided_loop(const float* input, float* output, size_t N) {
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = tid; i < N; i += stride) {
        output[i] = input[i] * 2.0f;
    }
}

// Block-wise partial sum with shared memory reduction.
// The caller must provide blockDim.x * sizeof(float) bytes of dynamic
// shared memory, and blockDim.x must be a power of two: the halving loop
// below would otherwise skip elements.
extern "C" __global__
void block_partial_sum(const float* input, float* partial_sums, size_t N) {
    extern __shared__ float sdata[];

    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int local_tid = threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    // Each thread accumulates multiple elements (strided)
    float sum = 0.0f;
    for (size_t i = tid; i < N; i += stride) {
        sum += input[i];
    }
    sdata[local_tid] = sum;
    __syncthreads();

    // Block-level tree reduction (unsigned counter matches local_tid)
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (local_tid < s) {
            sdata[local_tid] += sdata[local_tid + s];
        }
        __syncthreads();
    }

    // Thread 0 of each block publishes that block's total
    if (local_tid == 0) {
        partial_sums[blockIdx.x] = sdata[0];
    }
}
"""
|
|
|
|
|
|
def _expected_block_sums(
    num_elements: int, num_blocks: int, threads_per_block: int
) -> np.ndarray:
    """Return the per-block totals of block_partial_sum for an all-ones input.

    The kernel walks the input with a stride of
    ``num_blocks * threads_per_block``. Every complete pass hands exactly one
    element to each thread, i.e. ``threads_per_block`` elements to each block.
    The final partial pass only reaches the first ``remainder`` global
    threads, so block ``b`` receives the part of ``remainder`` that overlaps
    its thread range ``[b * threads_per_block, (b + 1) * threads_per_block)``.
    """
    stride = num_blocks * threads_per_block
    full_passes, remainder = divmod(num_elements, stride)
    block_starts = np.arange(num_blocks, dtype=np.int64) * threads_per_block
    tail = np.clip(remainder - block_starts, 0, threads_per_block)
    return (full_passes * threads_per_block + tail).astype(np.float32)


def run_sample(num_elements: int = 1024 * 1024, device_id: int = 0) -> bool:
    """
    Run block-wise sum demonstration.

    Compiles the kernels in ``KERNELS_CODE``, runs the simple-indexing,
    strided-loop and block-partial-sum kernels, checks each result with
    ``verify_array_result`` and finally times the partial-sum kernel.

    Parameters
    ----------
    num_elements : int
        Number of array elements (must be positive; any positive size is
        supported, it does not have to be a multiple of the grid size)
    device_id : int
        CUDA device ID

    Returns
    -------
    bool
        True if all tests passed

    Raises
    ------
    ValueError
        If ``num_elements`` is not positive.
    """
    # Validate before touching the GPU: a zero-sized launch grid is invalid.
    if num_elements <= 0:
        raise ValueError(f"num_elements must be positive, got {num_elements}")

    # threads_per_block must stay a power of two for the tree reduction.
    threads_per_block = 256
    num_blocks = 64

    device = Device(device_id)
    device.set_current()
    stream = device.create_stream()

    arch = f"sm_{device.arch}"
    print(f"Device: {device.name}")
    print(f"Compute Capability: {arch}")
    print(f"Array size: {num_elements:,} elements\n")

    try:
        # Make CuPy use our stream
        cp.cuda.ExternalStream(int(stream.handle)).use()

        # Compile kernels
        program = Program(
            KERNELS_CODE, code_type="c++", options=ProgramOptions(arch=arch)
        )
        module = program.compile(target_type="cubin")
        kernel_simple = module.get_kernel("simple_indexing")
        kernel_strided = module.get_kernel("strided_loop")
        kernel_sum = module.get_kernel("block_partial_sum")

        # Test data
        h_input = np.arange(num_elements, dtype=np.float32)
        d_input = cp.asarray(h_input)
        d_output = cp.zeros_like(d_input)
        expected = cp.asarray(h_input * 2.0)

        # Demo 1: Simple indexing (1 thread = 1 element)
        # Ceiling division so the last, partially filled block is launched.
        full_blocks = (num_elements + threads_per_block - 1) // threads_per_block
        config = LaunchConfig(grid=full_blocks, block=threads_per_block)
        launch(
            stream,
            config,
            kernel_simple,
            d_input.data.ptr,
            d_output.data.ptr,
            cp.uint64(num_elements),
        )
        stream.sync()
        print("Simple indexing: ", end="")
        test1 = verify_array_result(d_output, expected)

        # Demo 2: Strided loop (threads process multiple elements)
        # Clear the output so results from demo 1 cannot mask a failure.
        d_output.fill(0)
        config = LaunchConfig(grid=num_blocks, block=threads_per_block)
        launch(
            stream,
            config,
            kernel_strided,
            d_input.data.ptr,
            d_output.data.ptr,
            cp.uint64(num_elements),
        )
        stream.sync()
        print("Strided loop: ", end="")
        test2 = verify_array_result(d_output, expected)

        # Demo 3: Block-wise sum with shared memory
        d_ones = cp.ones(num_elements, dtype=cp.float32)
        d_partial = cp.zeros(num_blocks, dtype=cp.float32)
        shared_mem = threads_per_block * 4  # one 4-byte float per thread

        sum_config = LaunchConfig(
            grid=num_blocks, block=threads_per_block, shmem_size=shared_mem
        )
        launch(
            stream,
            sum_config,
            kernel_sum,
            d_ones.data.ptr,
            d_partial.data.ptr,
            cp.uint64(num_elements),
        )
        stream.sync()

        # The share each block receives depends on how num_elements relates
        # to the full stride (num_blocks * threads_per_block), not just to
        # num_blocks, so compute the exact per-block expectation.
        expected_partial = cp.asarray(
            _expected_block_sums(num_elements, num_blocks, threads_per_block)
        )
        print("Block-wise sum: ", end="")
        test3 = verify_array_result(d_partial, expected_partial)

        # Performance timing
        event_opts = EventOptions(enable_timing=True)
        iterations = 100

        stream.sync()
        start = stream.record(options=event_opts)
        for _ in range(iterations):
            launch(
                stream,
                sum_config,
                kernel_sum,
                d_ones.data.ptr,
                d_partial.data.ptr,
                cp.uint64(num_elements),
            )
        end = stream.record(options=event_opts)
        end.sync()

        # Average per launch; bytes / (ms * 1e6) equals GB/s.
        time_ms = (end - start) / iterations
        bandwidth = (num_elements * 4) / (time_ms * 1e6)
        print(f"\nKernel time: {time_ms:.3f} ms, Bandwidth: {bandwidth:.1f} GB/s")

        return test1 and test2 and test3

    finally:
        # Explicit resource cleanup: point CuPy back at its default stream
        # before closing the stream it was borrowing.
        cp.cuda.Stream.null.use()
        stream.close()
|
|
|
|
|
|
def main() -> None:
    """Run the sample with its defaults and exit non-zero on any failure."""
    if not run_sample():
        print("\nSome tests failed")
        sys.exit(1)

    print("\nDone")
|
|
|
|
|
|
# Run the sample only when this file is executed as a script, so importing
# the module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()
|