# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
CUDA Graphs with cuda.core

CUDA graphs let you record a DAG of operations once, then replay the entire
graph with a single driver call. For workflows that issue many small kernels
this can significantly reduce CPU-side launch overhead.

This sample runs a three-stage elementwise pipeline (add -> multiply -> subtract)
in two modes:

1. Individually launched kernels on a stream.
2. A single CUDA graph that captures the same three launches and is replayed
   with ``graph.launch(stream)``.

We then measure the wall-clock time of each mode across many iterations to
illustrate the graph replay advantage for short kernels, and demonstrate that
a graph can be relaunched against new data (the pointers are baked in, but the
contents of those buffers are not).
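
The capture/replay pattern this sample uses (see ``build_graph`` and
``run_pipeline_graph`` below) boils down to the following minimal sketch; it
uses the same cuda.core GraphBuilder calls as the full pipeline, with
``gb``, ``kernel`` and ``kernel_args`` as placeholder names::

    gb = stream.create_graph_builder()
    gb.begin_building()
    launch(gb, config, kernel, *kernel_args)  # recorded into the graph, not run yet
    gb.end_building()
    graph = gb.complete()        # build the executable graph
    graph.upload(stream)         # optional: stage it on the device ahead of time
    graph.launch(stream)         # replay the captured work with a single call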
""" import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities")) try: import cupy as cp import numpy as np from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch from cuda_samples_utils import print_gpu_info # noqa: E402 except ImportError as e: print(f"Error: Required package not found: {e}") print("Please install from requirements.txt:") print(" pip install -r requirements.txt") sys.exit(1) PIPELINE_KERNELS = r""" extern "C" __global__ void vec_add(const float* A, const float* B, float* C, size_t N) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = (size_t)gridDim.x * blockDim.x; for (size_t i = tid; i < N; i += stride) C[i] = A[i] + B[i]; } extern "C" __global__ void vec_mul(const float* A, const float* B, float* C, size_t N) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = (size_t)gridDim.x * blockDim.x; for (size_t i = tid; i < N; i += stride) C[i] = A[i] * B[i]; } extern "C" __global__ void vec_sub(const float* A, const float* B, float* C, size_t N) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = (size_t)gridDim.x * blockDim.x; for (size_t i = tid; i < N; i += stride) C[i] = A[i] - B[i]; } """ def run_pipeline_individual(stream, kernels, config, buffers, size, n_iters): """Run the 3-stage pipeline `n_iters` times with one launch per stage.""" add_k, mul_k, sub_k = kernels a, b, c, r1, r2, r3 = buffers stream.sync() t0 = time.perf_counter() for _ in range(n_iters): launch( stream, config, add_k, a.data.ptr, b.data.ptr, r1.data.ptr, np.uint64(size) ) launch( stream, config, mul_k, r1.data.ptr, c.data.ptr, r2.data.ptr, np.uint64(size) ) launch( stream, config, sub_k, r2.data.ptr, a.data.ptr, r3.data.ptr, np.uint64(size) ) stream.sync() return time.perf_counter() - t0 def build_graph(stream, kernels, config, buffers, size): """Capture the 3-stage pipeline into a CUDA graph and return it.""" add_k, mul_k, sub_k = kernels a, b, c, r1, r2, r3 = buffers graph_builder = stream.create_graph_builder() graph_builder.begin_building() launch( graph_builder, config, add_k, a.data.ptr, b.data.ptr, r1.data.ptr, np.uint64(size), ) launch( graph_builder, config, mul_k, r1.data.ptr, c.data.ptr, r2.data.ptr, np.uint64(size), ) launch( graph_builder, config, sub_k, r2.data.ptr, a.data.ptr, r3.data.ptr, np.uint64(size), ) graph_builder.end_building() graph = graph_builder.complete() graph.upload(stream) return graph_builder, graph def run_pipeline_graph(stream, graph, n_iters): """Launch the compiled graph `n_iters` times.""" stream.sync() t0 = time.perf_counter() for _ in range(n_iters): graph.launch(stream) stream.sync() return time.perf_counter() - t0 def main() -> int: import argparse parser = argparse.ArgumentParser(description="CUDA Graphs demo with cuda.core") parser.add_argument( "--elements", type=int, default=1 << 12, help="Elements per vector (default: 4096 - small to emphasize launch overhead)", ) parser.add_argument( "--iters", type=int, default=1000, help="Number of pipeline iterations to time (default: 1000)", ) parser.add_argument("--device", type=int, default=0, help="CUDA device id") args = parser.parse_args() device = Device(args.device) device.set_current() print_gpu_info(device) stream = device.create_stream() # Tell CuPy to order its allocations on our stream so buffer initialization # below is serialized with the kernels we launch. 
    cp.cuda.ExternalStream(int(stream.handle)).use()

    graph_builder = graph = None
    try:
        program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        program = Program(PIPELINE_KERNELS, code_type="c++", options=program_options)
        module = program.compile("cubin")
        add_k = module.get_kernel("vec_add")
        mul_k = module.get_kernel("vec_mul")
        sub_k = module.get_kernel("vec_sub")
        kernels = (add_k, mul_k, sub_k)

        N = args.elements
        rng = cp.random.default_rng(seed=0)
        a = rng.random(N, dtype=cp.float32)
        b = rng.random(N, dtype=cp.float32)
        c = rng.random(N, dtype=cp.float32)
        r1 = cp.empty_like(a)
        r2 = cp.empty_like(a)
        r3 = cp.empty_like(a)
        buffers = (a, b, c, r1, r2, r3)
        expected = (a + b) * c - a

        config = LaunchConfig(grid=(N + 255) // 256, block=256)
        device.sync()

        # Warm up compilation/caches, then measure individual launches.
        run_pipeline_individual(stream, kernels, config, buffers, N, n_iters=5)
        t_individual = run_pipeline_individual(
            stream, kernels, config, buffers, N, n_iters=args.iters
        )
        assert cp.allclose(r3, expected, rtol=1e-5, atol=1e-5), (
            "Individual pipeline produced incorrect results"
        )
        print(
            f"\nIndividual launches: {args.iters} iters in {t_individual:.4f}s"
            f" ({t_individual * 1e6 / args.iters:.2f} us/iter)"
        )

        # Capture the same pipeline as a graph and measure the replay.
        print("\nBuilding CUDA graph...")
        graph_builder, graph = build_graph(stream, kernels, config, buffers, N)
        run_pipeline_graph(stream, graph, n_iters=5)  # warm up
        t_graph = run_pipeline_graph(stream, graph, n_iters=args.iters)
        assert cp.allclose(r3, expected, rtol=1e-5, atol=1e-5), (
            "Graph pipeline produced incorrect results"
        )
        print(
            f"Graph replay: {args.iters} iters in {t_graph:.4f}s"
            f" ({t_graph * 1e6 / args.iters:.2f} us/iter)"
        )
        if t_graph > 0:
            print(f"Graph speedup: {t_individual / t_graph:.2f}x")

        # Demonstrate that the graph replays against current buffer contents.
        a[:] = cp.ones(N, dtype=cp.float32)
        b[:] = cp.full(N, 2.0, dtype=cp.float32)
        c[:] = cp.full(N, 3.0, dtype=cp.float32)
        device.sync()

        # r3 = (a + b) * c - a = (1 + 2) * 3 - 1 = 8
        graph.launch(stream)
        stream.sync()
        assert cp.allclose(r3, 8.0), "Graph replay with new data produced wrong result"
        print(
            "\nGraph replay on updated data verified (same graph, new buffer contents)"
        )

        print("\nDone")
        return 0
    finally:
        if graph is not None:
            graph.close()
        if graph_builder is not None:
            graph_builder.close()
        stream.close()
        cp.cuda.Stream.null.use()


if __name__ == "__main__":
    sys.exit(main())
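
# Example invocations (illustrative; replace `cuda_graphs.py` with this file's
# actual name - the flags are the ones defined by the argument parser above):
#
#   python cuda_graphs.py                                  # defaults: 4096 elements, 1000 iterations
#   python cuda_graphs.py --elements 1048576 --iters 200   # larger vectors, fewer timed iterations
#   python cuda_graphs.py --device 1                       # run on a different GPU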