# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
|
|
Kernel Nsys Profiling Sample - CUDA C++ Kernel Profiling with cuda.core
|
|
|
|
This sample demonstrates how to profile custom CUDA C++ kernels compiled and
|
|
launched with cuda.core using NVIDIA Nsight Systems.
|
|
|
|
The sample implements three common GPU operations as custom CUDA C++ kernels:
|
|
- Vector addition: c = a + b
|
|
- SAXPY: y = alpha * x + y
|
|
- Vector transform: sqrt(x*x + 1) + sin(x)
|
|
|
|
Use Nsight Systems to analyze:
|
|
- Custom kernel execution times
|
|
- Kernel launch patterns and overhead
|
|
- GPU utilization and memory access patterns
|
|
- NVTX markers for structured profiling
|
|
|
|
Workflow:
|
|
- Phase 1: Create GPU arrays
|
|
- Phase 2: Compile and execute cuda.core custom kernels (profiling focus)
|
|
- Phase 3: Verify correctness with CuPy reference implementation
|
|
- Phase 4: Validate results
|
|
"""
import argparse
import sys
from pathlib import Path

try:
    import cupy as cp
    import numpy as np
    import nvtx
    from cuda.core import Device, LaunchConfig, launch
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print("  pip install -r requirements.txt")
    sys.exit(1)

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result  # noqa: E402
# CUDA C++ kernel definitions
# For larger projects, keeping the kernels in a separate file is also valid.
KERNELS_CODE = """
template<typename T>
__global__ void vector_add(const T* a, const T* b, T* c, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        c[i] = a[i] + b[i];
    }
}

template<typename T>
__global__ void saxpy(const T alpha, const T* x, T* y, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        y[i] = alpha * x[i] + y[i];
    }
}

template<typename T>
__global__ void vector_transform(const T* a, T* b, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        T val = a[i];
        b[i] = sqrt(val * val + T(1.0)) + sin(val);
    }
}
"""


def get_cuda_core_kernels(device):
    """
    Compile cuda.core kernels and return them.

    Args:
        device: cuda.core.Device object

    Returns:
        dict: Dictionary of compiled kernels
    """
    from cuda.core import Program, ProgramOptions

    # Compile all kernels at once
    program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    prog = Program(KERNELS_CODE, code_type="c++", options=program_options)
    # Templated kernels are only instantiated (and given linkable names) when
    # listed explicitly via name_expressions
    mod = prog.compile(
        "cubin",
        name_expressions=(
            "vector_add<float>",
            "saxpy<float>",
            "vector_transform<float>",
        ),
    )

    # Extract individual kernels
    return {
        "vector_add": mod.get_kernel("vector_add<float>"),
        "saxpy": mod.get_kernel("saxpy<float>"),
        "vector_transform": mod.get_kernel("vector_transform<float>"),
    }


def run(size):
    """Main execution function"""

    # =================================================================
    # Device Initialization using cuda.core
    # =================================================================
    with nvtx.annotate("Device Initialization", color="green"):
        try:
            # Create device object (defaults to device 0)
            dev = Device()
            dev.set_current()

            print()
            print(f"Device: {dev.name}")
            print(f"Compute Capability: sm_{dev.arch}")
            print()

            # Synchronize device
            dev.sync()

        except Exception as e:
            print("ERROR: CUDA initialization failed!")
            print(f"Error: {e}")
            sys.exit(1)

    print("Profiling cuda.core Custom Kernels")
    print(f"Array size: {size:,}\n")

    # Constant for SAXPY operation
    alpha = 2.5

    # Initialize a seeded random generator for reproducible inputs
    rng = cp.random.default_rng(42)

    # =================================================================
    # Phase 1: Create GPU Arrays with CuPy
    # =================================================================
    with nvtx.annotate("Create GPU Arrays", color="yellow"):
        a_gpu = rng.standard_normal(size, dtype=cp.float32)
        b_gpu = rng.standard_normal(size, dtype=cp.float32)
        dev.sync()

    print("Phase 1: Created arrays on GPU with CuPy")
    print(f"  Array shape: {a_gpu.shape}")
    print(f"  Array dtype: {a_gpu.dtype}")
    print(
        f"  Array a - Mean: {float(cp.mean(a_gpu)):.4f}, "
        f"Std: {float(cp.std(a_gpu)):.4f}"
    )
    print(
        f"  Array b - Mean: {float(cp.mean(b_gpu)):.4f}, "
        f"Std: {float(cp.std(b_gpu)):.4f}\n"
    )

    # =================================================================
    # Phase 2: cuda.core Custom Kernels on GPU
    # =================================================================
    with nvtx.annotate("cuda.core Custom Kernels", color="purple"):
        print("Phase 2: cuda.core custom CUDA C++ kernels on GPU")

        # Create a stream for cuda.core operations
        stream = dev.create_stream()
        try:
            with nvtx.annotate("Compile Kernels", color="cyan"):
                kernels_dict = get_cuda_core_kernels(dev)
                stream.sync()
                print("Compiled custom CUDA C++ kernels")

            # Prepare launch configuration
            # Grid-stride loops in the kernels handle any grid size robustly
            block = 256
            grid = (size + block - 1) // block
            config = LaunchConfig(grid=grid, block=block)
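            # For the default size this sizing covers every element exactly:
            # with N = 50,000 and block = 256, grid = ceil(50000 / 256) = 196
            # blocks = 50,176 threads, so each thread's grid-stride loop runs
            # at most once.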

            # Execute cuda.core vector_add kernel
            with nvtx.annotate("Vector Add (cuda.core)", color="cyan"):
                c_cuda = cp.empty_like(a_gpu)
                launch(
                    stream,
                    config,
                    kernels_dict["vector_add"],
                    a_gpu.data.ptr,
                    b_gpu.data.ptr,
                    c_cuda.data.ptr,
                    cp.uint64(size),
                )
                stream.sync()

            # Execute cuda.core SAXPY kernel
            with nvtx.annotate("SAXPY (cuda.core)", color="cyan"):
                y_cuda = b_gpu.copy()
                launch(
                    stream,
                    config,
                    kernels_dict["saxpy"],
                    np.float32(alpha),
                    a_gpu.data.ptr,
                    y_cuda.data.ptr,
                    cp.uint64(size),
                )
                stream.sync()

            # Execute cuda.core vector_transform kernel
            with nvtx.annotate("Vector Transform (cuda.core)", color="cyan"):
                transform_cuda = cp.empty_like(a_gpu)
                launch(
                    stream,
                    config,
                    kernels_dict["vector_transform"],
                    a_gpu.data.ptr,
                    transform_cuda.data.ptr,
                    cp.uint64(size),
                )
                stream.sync()

            print("Vector Addition (custom kernel)")
            print("SAXPY (custom kernel)")
            print("Vector Transform (custom kernel)\n")
        finally:
            stream.close()

    # =================================================================
    # Phase 3: Generate Reference Results with CuPy (for verification)
    # =================================================================
    with nvtx.annotate("Generate Reference Results", color="blue"):
        print("Phase 3: Generate reference results for verification")

        with nvtx.annotate("Vector Add (Reference)", color="cyan"):
            c_cupy = a_gpu + b_gpu
            dev.sync()

        with nvtx.annotate("SAXPY (Reference)", color="cyan"):
            y_cupy = alpha * a_gpu + b_gpu
            dev.sync()

        with nvtx.annotate("Vector Transform (Reference)", color="cyan"):
            transform_cupy = cp.sqrt(a_gpu * a_gpu + 1.0) + cp.sin(a_gpu)
            dev.sync()

        print("Reference results generated\n")

    # =================================================================
    # Phase 4: Verify Kernel Correctness
    # =================================================================
    with nvtx.annotate("Verification", color="green"):
        print("Phase 4: Verify kernel correctness")

        # Verify custom kernels against reference results.
        # Use relaxed tolerances for single-precision float comparisons:
        # small differences can occur due to instruction ordering and
        # compiler optimizations.
        print("  Validating cuda.core kernels:")

        print("    Vector Add: ", end="")
        vec_add_match = verify_array_result(c_cuda, c_cupy, rtol=1e-5, atol=1e-6)

        print("    SAXPY: ", end="")
        saxpy_match = verify_array_result(y_cuda, y_cupy, rtol=1e-5, atol=1e-6)

        print("    Transform: ", end="")
        transform_match = verify_array_result(
            transform_cuda, transform_cupy, rtol=1e-5, atol=1e-6
        )

        all_pass = vec_add_match and saxpy_match and transform_match

        if not all_pass:
            print("\n  ERROR: Kernel verification failed!")
            return 1
        print()

    # Final synchronization
    dev.sync()
    print("Sample complete: PASSED!")
    return 0


def main():
    parser = argparse.ArgumentParser(
        description="Kernel Nsys Profiling - Profile custom CUDA C++ "
        "kernels with cuda.core"
    )
    parser.add_argument(
        "-n",
        "--array-size",
        type=int,
        default=50000,
        metavar="N",
        help="Array size (default: 50,000)",
    )

    args = parser.parse_args()
    # Propagate run()'s status as the process exit code
    sys.exit(run(size=args.array_size))


if __name__ == "__main__":
    main()