# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Vector Addition using CUDA Core API

This sample demonstrates element-wise vector addition: C = A + B
using cuda.core for runtime compilation and kernel launch.
"""

import argparse
import sys
import traceback
from pathlib import Path

# Add parent directory to path to import utilities
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
from cuda_samples_utils import verify_array_result  # noqa: E402

try:
    import cupy as cp
    from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)


# CUDA kernel source code.
# The kernel is a function template, so it has to be instantiated by name
# (e.g. "vectorAdd<float>") when the program is compiled.
VECTOR_ADD_KERNEL = """
/**
 * CUDA Kernel for vector addition
 * Computes the vector addition of A and B into C.
 */
template <typename T>
__global__ void vectorAdd(const T *A, const T *B, T *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}
"""


def vector_add_cuda_core(num_elements=50000, device_id=0, verify=True):
    """
    Perform vector addition using cuda.core API.

    Compiles ``VECTOR_ADD_KERNEL`` at runtime for the selected device,
    fills two random float32 vectors with CuPy, launches the kernel on a
    newly created stream and optionally compares the output against
    ``a + b`` computed by CuPy.

    Parameters
    ----------
    num_elements : int
        Number of elements in each vector. Must be positive.
    device_id : int
        CUDA device ID to use
    verify : bool
        Whether to verify the result

    Returns
    -------
    bool
        True if successful, False otherwise. Any exception raised while
        compiling or running is printed with its traceback and reported
        as False rather than propagated.
    """
    # A non-positive size would yield a zero-block launch configuration;
    # reject it up front with the same message main() uses.
    if num_elements <= 0:
        print("Error: Number of elements must be positive")
        return False

    try:
        # Initialize device
        print("[Vector addition using CUDA Core API]")
        device = Device(device_id)
        device.set_current()

        print(f"Device: {device.name}")
        print(f"Compute Capability: sm_{device.arch}")

        stream = device.create_stream()

        # Compile kernel. The element type of the instantiation must match
        # the dtype of the arrays allocated below (float <-> cp.float32).
        kernel_name = "vectorAdd<float>"
        print("Compiling kernel 'vectorAdd'...")
        program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        program = Program(VECTOR_ADD_KERNEL, code_type="c++", options=program_options)
        module = program.compile("cubin", name_expressions=(kernel_name,))
        kernel = module.get_kernel(kernel_name)
        print("Kernel compiled successfully")

        # Allocate and initialize vectors
        print(f"[Vector addition of {num_elements} elements]")
        dtype = cp.float32
        a = cp.random.rand(num_elements).astype(dtype)
        b = cp.random.rand(num_elements).astype(dtype)
        c = cp.empty(num_elements, dtype=dtype)

        # Synchronize before kernel launch: the inputs above were produced
        # by CuPy, not on `stream`, so wait for the device to finish them.
        device.sync()

        # Configure and launch kernel (ceil-divide so every element is covered)
        threads_per_block = 256
        blocks_per_grid = (num_elements + threads_per_block - 1) // threads_per_block
        print(
            f"CUDA kernel launch with {blocks_per_grid} blocks "
            f"of {threads_per_block} threads"
        )

        config = LaunchConfig(grid=blocks_per_grid, block=threads_per_block)

        # Launch kernel; the element count is passed as a 32-bit int to
        # match the kernel's `int numElements` parameter.
        launch(
            stream,
            config,
            kernel,
            a.data.ptr,
            b.data.ptr,
            c.data.ptr,
            cp.int32(num_elements),
        )
        stream.sync()

        # Verify result
        if verify:
            print("Verifying result...")
            expected = a + b
            if not verify_array_result(c, expected):
                return False

        return True

    except Exception as e:
        # Top-level boundary for the sample: report the failure and let the
        # caller turn it into an exit code.
        print(f"Error: {e}")
        traceback.print_exc()
        return False


def main():
    """
    Main entry point for the vector addition sample.

    Parses the command line, validates the element count and runs
    ``vector_add_cuda_core``.

    Returns
    -------
    int
        Process exit code: 0 on success, 1 on failure or invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Vector Addition using CUDA Core API")
    parser.add_argument(
        "--elements",
        type=int,
        default=50000,
        help="Number of elements in vectors (default: 50000)",
    )
    parser.add_argument(
        "--device", type=int, default=0, help="CUDA device ID (default: 0)"
    )
    parser.add_argument(
        "--no-verify", action="store_true", help="Skip result verification"
    )

    args = parser.parse_args()

    if args.elements <= 0:
        print("Error: Number of elements must be positive")
        return 1

    success = vector_add_cuda_core(
        num_elements=args.elements, device_id=args.device, verify=not args.no_verify
    )

    if not success:
        return 1

    print("\nDone")
    return 0


if __name__ == "__main__":
    sys.exit(main())