# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Kernel Nsys Profiling Sample - CUDA C++ Kernel Profiling with cuda.core

This sample demonstrates how to profile custom CUDA C++ kernels compiled
and launched with cuda.core using NVIDIA Nsight Systems.

The sample implements three common GPU operations as custom CUDA C++ kernels:
- Vector addition: c = a + b
- SAXPY: y = alpha * x + y
- Vector transform: sqrt(x*x + 1) + sin(x)

Use Nsight Systems to analyze:
- Custom kernel execution times
- Kernel launch patterns and overhead
- GPU utilization and memory access patterns
- NVTX markers for structured profiling

Workflow:
- Phase 1: Create GPU arrays
- Phase 2: Compile and execute cuda.core custom kernels (profiling focus)
- Phase 3: Verify correctness with CuPy reference implementation
- Phase 4: Validate results
"""

import argparse
import sys
from pathlib import Path

# Fail fast with an actionable message when the GPU stack is not installed.
try:
    import cupy as cp
    import numpy as np
    import nvtx
    from cuda.core import Device, LaunchConfig, launch
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result  # noqa: E402

# CUDA C++ kernel definitions
# For larger projects, separating kernels into a separate file is also valid.
# NOTE(review): the template parameter lists (`<typename T>`) and the
# `<float>` name-expression instantiations below were restored — the
# angle-bracket spans were missing, and NVRTC cannot compile a templated
# kernel nor resolve its mangled name without them. `<float>` matches the
# float32 CuPy arrays and `np.float32` alpha passed at launch time.
KERNELS_CODE = """
template <typename T>
__global__ void vector_add(const T* a, const T* b, T* c, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        c[i] = a[i] + b[i];
    }
}

template <typename T>
__global__ void saxpy(const T alpha, const T* x, T* y, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        y[i] = alpha * x[i] + y[i];
    }
}

template <typename T>
__global__ void vector_transform(const T* a, T* b, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
        T val = a[i];
        b[i] = sqrt(val * val + T(1.0)) + sin(val);
    }
}
"""


def get_cuda_core_kernels(device):
    """
    Compile cuda.core kernels and return them.

    Args:
        device: cuda.core.Device object

    Returns:
        dict: Dictionary of compiled kernels, keyed by kernel name
              ("vector_add", "saxpy", "vector_transform")
    """
    from cuda.core import Program, ProgramOptions

    # Compile all kernels at once, targeting the current device's arch.
    program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    prog = Program(KERNELS_CODE, code_type="c++", options=program_options)
    # Templated kernels require explicit instantiations via name expressions;
    # the launches below use float32 data, so instantiate for float.
    mod = prog.compile(
        "cubin",
        name_expressions=(
            "vector_add<float>",
            "saxpy<float>",
            "vector_transform<float>",
        ),
    )

    # Extract individual kernels
    return {
        "vector_add": mod.get_kernel("vector_add<float>"),
        "saxpy": mod.get_kernel("saxpy<float>"),
        "vector_transform": mod.get_kernel("vector_transform<float>"),
    }


def run(size):
    """Main execution function.

    Runs the four profiling phases on arrays of ``size`` float32 elements.

    Args:
        size: Number of elements in each input array.

    Returns:
        int: 0 on success, 1 if kernel verification failed.
    """
    # =================================================================
    # Device Initialization using cuda.core
    # =================================================================
    with nvtx.annotate("Device Initialization", color="green"):
        try:
            # Create device object (defaults to device 0)
            dev = Device()
            dev.set_current()

            print()
            print(f"Device: {dev.name}")
            print(f"Compute Capability: sm_{dev.arch}")
            print()

            # Synchronize device
            dev.sync()
        except Exception as e:
            print("ERROR: CUDA initialization failed!")
            print(f"Error: {e}")
            sys.exit(1)

    print("Profiling cuda.core Custom Kernels")
    print(f"Array size: {size:,}\n")

    # Constant for SAXPY operation
    alpha = 2.5

    # Initialize random seed (fixed for reproducible verification)
    rng = cp.random.default_rng(42)

    # =================================================================
    # Phase 1: Create GPU Arrays with CuPy
    # =================================================================
    with nvtx.annotate("Create GPU Arrays", color="yellow"):
        a_gpu = rng.standard_normal(size, dtype=cp.float32)
        b_gpu = rng.standard_normal(size, dtype=cp.float32)
        dev.sync()
        print("Phase 1: Created arrays on GPU with CuPy")
        print(f" Array shape: {a_gpu.shape}")
        print(f" Array dtype: {a_gpu.dtype}")
        print(
            f" Array a - Mean: {float(cp.mean(a_gpu)):.4f}, "
            f"Std: {float(cp.std(a_gpu)):.4f}"
        )
        print(
            f" Array b - Mean: {float(cp.mean(b_gpu)):.4f}, "
            f"Std: {float(cp.std(b_gpu)):.4f}\n"
        )

    # =================================================================
    # Phase 2: cuda.core Custom Kernels on GPU
    # =================================================================
    with nvtx.annotate("cuda.core Custom Kernels", color="purple"):
        print("Phase 2: cuda.core custom CUDA C++ kernels on GPU")

        # Create a stream for cuda.core operations
        stream = dev.create_stream()
        try:
            with nvtx.annotate("Compile Kernels", color="cyan"):
                kernels_dict = get_cuda_core_kernels(dev)
                stream.sync()
                print("Compiled custom CUDA C++ kernels")

            # Prepare launch configuration
            # Grid-stride loops in kernels handle any grid size robustly
            block = 256
            grid = (size + block - 1) // block
            config = LaunchConfig(grid=grid, block=block)

            # Execute cuda.core vector_add kernel
            with nvtx.annotate("Vector Add (cuda.core)", color="cyan"):
                c_cuda = cp.empty_like(a_gpu)
                launch(
                    stream,
                    config,
                    kernels_dict["vector_add"],
                    a_gpu.data.ptr,
                    b_gpu.data.ptr,
                    c_cuda.data.ptr,
                    cp.uint64(size),
                )
                stream.sync()

            # Execute cuda.core SAXPY kernel (in-place on a copy of b)
            with nvtx.annotate("SAXPY (cuda.core)", color="cyan"):
                y_cuda = b_gpu.copy()
                launch(
                    stream,
                    config,
                    kernels_dict["saxpy"],
                    np.float32(alpha),
                    a_gpu.data.ptr,
                    y_cuda.data.ptr,
                    cp.uint64(size),
                )
                stream.sync()

            # Execute cuda.core vector_transform kernel
            with nvtx.annotate("Vector Transform (cuda.core)", color="cyan"):
                transform_cuda = cp.empty_like(a_gpu)
                launch(
                    stream,
                    config,
                    kernels_dict["vector_transform"],
                    a_gpu.data.ptr,
                    transform_cuda.data.ptr,
                    cp.uint64(size),
                )
                stream.sync()

            print("Vector Addition (custom kernel)")
            print("SAXPY (custom kernel)")
            print("Vector Transform (custom kernel)\n")
        finally:
            # Always release the stream, even if a launch fails.
            stream.close()

    # =================================================================
    # Phase 3: Generate Reference Results with CuPy (for verification)
    # =================================================================
    with nvtx.annotate("Generate Reference Results", color="blue"):
        print("Phase 3: Generate reference results for verification")
        with nvtx.annotate("Vector Add (Reference)", color="cyan"):
            c_cupy = a_gpu + b_gpu
            dev.sync()
        with nvtx.annotate("SAXPY (Reference)", color="cyan"):
            y_cupy = alpha * a_gpu + b_gpu
            dev.sync()
        with nvtx.annotate("Vector Transform (Reference)", color="cyan"):
            transform_cupy = cp.sqrt(a_gpu * a_gpu + 1.0) + cp.sin(a_gpu)
            dev.sync()
        print("Reference results generated\n")

    # =================================================================
    # Phase 4: Verify Kernel Correctness
    # =================================================================
    with nvtx.annotate("Verification", color="green"):
        print("Phase 4: Verify kernel correctness")
        # Verify custom kernels against reference results
        # Use relaxed tolerances for single-precision float comparisons
        # Small differences can occur due to instruction ordering and
        # compiler optimizations
        print(" Validating cuda.core kernels:")
        print(" Vector Add: ", end="")
        vec_add_match = verify_array_result(c_cuda, c_cupy, rtol=1e-5, atol=1e-6)
        print(" SAXPY: ", end="")
        saxpy_match = verify_array_result(y_cuda, y_cupy, rtol=1e-5, atol=1e-6)
        print(" Transform: ", end="")
        transform_match = verify_array_result(
            transform_cuda, transform_cupy, rtol=1e-5, atol=1e-6
        )

        all_pass = vec_add_match and saxpy_match and transform_match
        if not all_pass:
            print("\n ERROR: Kernel verification failed!")
            return 1
        print()

    # Final synchronization
    dev.sync()
    print("The sample is complete PASSED!")
    return 0


def main():
    """Parse command-line arguments and run the sample."""
    parser = argparse.ArgumentParser(
        description="Kernel Nsys Profiling - Profile custom CUDA C++ "
        "kernels with cuda.core"
    )
    parser.add_argument(
        "-n",
        "--array-size",
        type=int,
        default=50000,
        metavar="N",
        help="Array size (default: 50,000)",
    )
    args = parser.parse_args()
    # Propagate run()'s status: previously the return value was discarded,
    # so a verification failure still exited 0. Success (0) is unchanged.
    sys.exit(run(size=args.array_size))


if __name__ == "__main__":
    main()