# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Simple Print - Printing from CUDA Kernels

This sample demonstrates how to print output from CUDA kernels using printf().

It shows:
1. Device management with cuda.core.Device
2. Compiling CUDA C++ code that uses printf()
3. Launching kernels with 2D grids and 3D blocks
4. Seeing kernel output printed to stdout
5. Using Numba CUDA for Pythonic kernel authoring

This sample demonstrates both approaches:
- CUDA C++ kernels compiled via cuda.core.Program (more control, C++ features)
- Numba CUDA kernels (more Pythonic, easier to write)

This is the Python equivalent of the C++ simplePrintf sample.
"""

import sys
import traceback

# NOTE(review): some cuda.core releases expose these names under
# cuda.core.experimental instead of cuda.core — confirm against the
# version pinned in requirements.txt.
try:
    from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)

# Numba is optional: the CUDA C++ half of the sample still runs without it.
try:
    from numba import cuda as numba_cuda

    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    print("Warning: numba not found. Numba CUDA example will be skipped.")
    print("To install: pip install numba")


# CUDA C++ kernel with printf.
# This kernel prints the block index, thread index, and a value from each
# thread. Note the doubled backslashes: the Python string must deliver
# literal \t and \n escape sequences to the CUDA C++ compiler.
PRINTF_KERNEL = """
extern "C" __global__ void printKernel(int val) {
    // Calculate linear block index from 2D grid
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;

    // Calculate linear thread index from 3D block
    int threadId = threadIdx.z * blockDim.x * blockDim.y
                 + threadIdx.y * blockDim.x
                 + threadIdx.x;

    // Print from each thread
    printf("[%d, %d]:\\t\\tValue is: %d\\n", blockId, threadId, val);
}
"""


# Numba CUDA kernel - Pythonic equivalent using numba.cuda.grid().
# Defined only when numba is importable; callers must check NUMBA_AVAILABLE.
if NUMBA_AVAILABLE:

    @numba_cuda.jit
    def numba_print_kernel(val):
        """
        Numba CUDA kernel showing the *recommended* grid() indexing style,
        while also relating it to the classic CUDA C++ blockId/threadId.

        - Primary view: global 3D coordinates from numba.cuda.grid(3)
          (modern, Pythonic way to index work for a 3D thread layout).
        - Secondary view: linear blockId / threadId matching the CUDA C++
          printf sample, to help CUDA C++ users connect the two models.
        """
        # Modern / recommended view: global 3D thread coordinates
        x, y, z = numba_cuda.grid(3)

        # Classic CUDA-style indices, same formulas as the C++ sample
        block_id = (
            numba_cuda.blockIdx.y * numba_cuda.gridDim.x + numba_cuda.blockIdx.x
        )
        thread_id = (
            numba_cuda.threadIdx.z * numba_cuda.blockDim.x * numba_cuda.blockDim.y
            + numba_cuda.threadIdx.y * numba_cuda.blockDim.x
            + numba_cuda.threadIdx.x
        )

        # Print both views side-by-side
        # Note: Numba print() adds spaces between comma-separated args
        print(
            "global[", x, ",", y, ",", z,
            "] -> [", block_id, ",", thread_id,
            "]:\t\tValue is:", val,
        )


def run_cuda_cpp_kernel(device, test_value=10):
    """
    Demonstrate printing from CUDA C++ kernel compiled with cuda.core.

    This approach gives you full access to CUDA C++ features and allows
    for more complex kernel implementations.

    Args:
        device: Current cuda.core Device (already set as current by main()).
        test_value: Integer passed to the kernel and echoed by every thread.

    Returns:
        0 on success, 1 if the kernel launch/sync raised.
    """
    print("=" * 70)
    print("METHOD 1: CUDA C++ Kernel (via cuda.core.Program)")
    print("=" * 70)
    print("Advantage: Full C++ features, better for complex kernels")
    print()

    # Compile the kernel to a cubin for this device's architecture.
    print("Compiling CUDA C++ kernel...")
    program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    prog = Program(PRINTF_KERNEL, code_type="c++", options=program_options)
    mod = prog.compile("cubin", name_expressions=("printKernel",))
    kernel = mod.get_kernel("printKernel")
    print("Kernel compiled successfully.\n")

    # Create stream for kernel execution
    stream = device.create_stream()

    # Configure kernel launch
    # Using 2D grid (2x2) and 3D blocks (2x2x2)
    grid_x, grid_y = 2, 2
    block_x, block_y, block_z = 2, 2, 2

    print("Kernel configuration:")
    print(f" Grid: ({grid_x}, {grid_y})")
    print(f" Block: ({block_x}, {block_y}, {block_z})")
    print(f" Total threads: {grid_x * grid_y * block_x * block_y * block_z}")
    print()

    # Launch configuration with 2D grid and 3D block
    config = LaunchConfig(grid=(grid_x, grid_y), block=(block_x, block_y, block_z))

    print(f"Launching kernel with value={test_value}. Output:\n")

    try:
        # Launch kernel
        launch(stream, config, kernel, test_value)

        # Synchronize to ensure printf output is flushed
        stream.sync()

        print("\nCUDA C++ kernel execution complete.")
    except Exception as e:
        print(f"\nError during kernel execution: {e}")
        traceback.print_exc()
        return 1
    finally:
        # Cleanup
        stream.close()

    return 0


def run_numba_kernel(device, test_value=10):
    """
    Demonstrate printing from a Numba CUDA kernel.

    This example uses numba.cuda.grid(3) as the primary indexing mechanism
    (recommended modern style), and also prints the equivalent blockId /
    threadId used in the CUDA C++ printf sample for side-by-side comparison.

    Uses cuda.core APIs for stream management, demonstrating interoperability
    between Numba CUDA kernels and cuda.core infrastructure.

    Args:
        device: Current cuda.core Device; supplies the stream.
        test_value: Integer passed to the kernel and echoed by every thread.

    Returns:
        0 on success, 1 if the kernel launch/sync raised.
    """
    print("\n")
    print("=" * 70)
    print("METHOD 2: Numba CUDA Kernel (Pythonic / modern indexing)")
    print("=" * 70)
    print("Advantage: Uses numba.cuda.grid(3) for global indexing,")
    print(" while still showing classic CUDA C++ IDs for reference.")
    print(" Uses cuda.core for stream management (interoperability).")
    print()

    # Same launch configuration as the C++ version
    grid_x, grid_y = 2, 2
    block_x, block_y, block_z = 2, 2, 2

    print("Kernel configuration:")
    print(f" Grid: ({grid_x}, {grid_y})")
    print(f" Block: ({block_x}, {block_y}, {block_z})")
    print(f" Total threads: {grid_x * grid_y * block_x * block_y * block_z}")
    print()

    # Use cuda.core stream (same as C++ example) instead of numba.cuda.stream()
    stream = device.create_stream()

    print(f"Launching Numba kernel (grid(3) + classic IDs) with value={test_value}:")
    print("Uses numba.cuda.grid(3) to get global (x, y, z),")
    print("and prints the corresponding blockId/threadId like the C++ sample.")
    print("Stream managed by cuda.core for consistency with C++ example.\n")

    try:
        # Launch Numba kernel on cuda.core stream.
        # NOTE(review): this relies on Numba accepting a foreign stream object
        # in the launch configuration (CUDA stream interop protocol) — confirm
        # the installed numba version supports it.
        numba_print_kernel[(grid_x, grid_y), (block_x, block_y, block_z), stream](
            test_value
        )

        # Synchronize cuda.core stream (same as C++ example)
        stream.sync()

        print("\nNumba CUDA kernel execution complete.")
    except Exception as e:
        print(f"\nError during Numba kernel execution: {e}")
        traceback.print_exc()
        return 1
    finally:
        # Cleanup
        stream.close()

    return 0


def main():
    """Main function demonstrating printing from CUDA kernels using both approaches"""
    print("Simple Print - Printing from CUDA Kernels")
    print("Demonstrating both CUDA C++ and Numba CUDA approaches")
    print()

    # Initialize device
    device = Device()
    device.set_current()

    # Get device properties
    print(f"Device: {device.name}")
    print(f"Compute Capability: sm_{device.arch}")
    print()

    # Value to pass to both kernels
    test_value = 10

    # Run CUDA C++ kernel; stop early on failure so the error is visible.
    result = run_cuda_cpp_kernel(device, test_value)
    if result != 0:
        return result

    # Run Numba kernel if available
    if NUMBA_AVAILABLE:
        result = run_numba_kernel(device, test_value)
        if result != 0:
            return result
    else:
        print("\n" + "=" * 70)
        print("Numba CUDA example skipped (numba not installed)")
        print("To run the Numba example: pip install numba")
        print("=" * 70)

    print("\n" + "=" * 70)
    print("Done! Both kernel approaches demonstrated successfully.")
    print("=" * 70)

    return 0


if __name__ == "__main__":
    sys.exit(main())