# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Matrix Multiplication with Shared Memory (GEMM)

Demonstrates efficient matrix multiplication using:
- nvmath.linalg.advanced.Matmul for high-performance GEMM via cuBLASLt
- Custom CUDA kernel with tiling, shared memory, and loop unrolling

Uses cuda.core APIs with CuPy arrays via ExternalStream.
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import cupy as cp
    import numpy as np
    import nvmath.linalg.advanced as nvmath_advanced
    from cuda.core import (
        Device,
        EventOptions,
        LaunchConfig,
        Program,
        ProgramOptions,
        launch,
    )
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Install with: pip install -r requirements.txt")
    sys.exit(1)

# Edge length of the square tile handled by one thread block. The same value
# sizes the launch block/grid on the host and the shared-memory tiles in the
# kernel, so it is injected into the CUDA source below instead of being
# hard-coded in two places.
TILE_SIZE: int = 16

# CUDA C++ source of the tiled GEMM kernel: C (MxN) = A (MxK) * B (KxN), all
# row-major float32. Each thread produces one element of C; each block stages
# one TILE_SIZE x TILE_SIZE tile of A and of B in shared memory per step.
MATMUL_KERNEL: str = (
    f"#define TILE_SIZE {TILE_SIZE}\n"
    + r"""
#if (TILE_SIZE % 4) != 0
#error "TILE_SIZE must be a multiple of 4 (the inner loop is unrolled by 4)"
#endif

extern "C" __global__
void matmul_shared(const float* A, const float* B, float* C,
                   int M, int N, int K)
{
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int row = by * TILE_SIZE + ty;
    int col = bx * TILE_SIZE + tx;

    float sum = 0.0f;
    int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;

    for (int t = 0; t < numTiles; t++) {
        int aCol = t * TILE_SIZE + tx;
        int bRow = t * TILE_SIZE + ty;

        // Out-of-range elements are loaded as 0 so partial edge tiles
        // contribute nothing to the dot product. Linear offsets are widened
        // to 64 bits so they cannot overflow for very large matrices.
        As[ty][tx] = (row < M && aCol < K)
                         ? A[(long long)row * K + aCol] : 0.0f;
        Bs[ty][tx] = (bRow < K && col < N)
                         ? B[(long long)bRow * N + col] : 0.0f;

        // Both tiles must be fully populated before any thread reads them.
        __syncthreads();

        #pragma unroll
        for (int k = 0; k < TILE_SIZE; k += 4) {
            sum += As[ty][k]     * Bs[k][tx];
            sum += As[ty][k + 1] * Bs[k + 1][tx];
            sum += As[ty][k + 2] * Bs[k + 2][tx];
            sum += As[ty][k + 3] * Bs[k + 3][tx];
        }

        // Do not overwrite the tiles while other threads still read them.
        __syncthreads();
    }

    if (row < M && col < N) {
        C[(long long)row * N + col] = sum;
    }
}
"""
)


def run_matmul_benchmark(
    m: int = 1024,
    n: int = 1024,
    k: int = 1024,
    device_id: int = 0,
    num_iterations: int = 10,
) -> bool:
    """Run matrix multiplication benchmark comparing nvmath vs custom kernel.

    Multiplies a random float32 ``A`` (m x k) by ``B`` (k x n) with
    ``nvmath.linalg.advanced.Matmul`` and with the custom ``matmul_shared``
    kernel, prints the average time per iteration and the derived GFLOPS for
    each, then checks both results against ``A @ B`` computed by CuPy.

    Args:
        m: Number of rows of A and C. Must be >= 1.
        n: Number of columns of B and C. Must be >= 1.
        k: Number of columns of A / rows of B. Must be >= 1.
        device_id: Ordinal of the CUDA device to run on.
        num_iterations: Number of timed executions per implementation
            (one additional untimed warm-up run is always performed).
            Must be >= 1.

    Returns:
        True if both the nvmath and the custom-kernel results match the
        reference within ``rtol=1e-4, atol=1e-4``; False otherwise.

    Raises:
        ValueError: If a matrix dimension or ``num_iterations`` is < 1.
    """
    # Reject inputs that would produce an empty launch grid or a division by
    # zero / negative average time, before touching the GPU.
    if min(m, n, k) < 1:
        raise ValueError(
            f"Matrix dimensions must be positive, got m={m}, n={n}, k={k}"
        )
    if num_iterations < 1:
        raise ValueError(
            f"num_iterations must be at least 1, got {num_iterations}"
        )

    print("=" * 60)
    print("Matrix Multiplication with Shared Memory (GEMM)")
    print("=" * 60)

    # Initialize device and stream
    device = Device(device_id)
    device.set_current()
    stream = device.create_stream()

    # Everything after stream creation sits inside try/finally so the stream
    # is closed and CuPy is switched back to the null stream even when kernel
    # compilation or array allocation fails.
    try:
        print(f"\nDevice: {device.name}")
        print(f"Compute Capability: sm_{device.arch}")

        # Make CuPy use our cuda.core stream
        cp.cuda.ExternalStream(int(stream.handle)).use()

        # Compile custom kernel
        arch = f"sm_{device.arch}"
        program = Program(
            MATMUL_KERNEL, code_type="c++", options=ProgramOptions(arch=arch)
        )
        kernel = program.compile(target_type="cubin").get_kernel("matmul_shared")
        print("Custom kernel compiled ✓")

        # Setup
        print(f"\nMatrix: A({m}x{k}) × B({k}x{n}) = C({m}x{n})")
        # One multiply and one add per (row, col, k) triple.
        total_ops = 2 * m * n * k
        event_opts = EventOptions(enable_timing=True)

        # Allocate matrices
        rng = cp.random.default_rng(42)
        d_A = rng.random((m, k), dtype=cp.float32)
        d_B = rng.random((k, n), dtype=cp.float32)
        d_C_custom = cp.zeros((m, n), dtype=cp.float32)

        success = True

        # ---------------------------------------------------------------------
        # nvmath GEMM (cuBLASLt)
        # ---------------------------------------------------------------------
        print("\n" + "-" * 60)
        print("NVMATH (cuBLASLt) - plan once, execute many")
        print("-" * 60)

        with nvmath_advanced.Matmul(d_A, d_B, stream=int(stream.handle)) as mm:
            mm.plan()

            # Untimed warm-up run.
            d_C_nvmath = mm.execute()
            stream.sync()

            start = stream.record(options=event_opts)
            for _ in range(num_iterations):
                d_C_nvmath = mm.execute()
            end = stream.record(options=event_opts)
            end.sync()

        # The elapsed time between the two events is treated as milliseconds.
        nvmath_ms = (end - start) / num_iterations
        nvmath_gflops = (total_ops / 1e9) / (nvmath_ms / 1e3)
        print(f"Time: {nvmath_ms:.3f} ms | {nvmath_gflops:.2f} GFLOPS")

        # ---------------------------------------------------------------------
        # Custom kernel (tiled + shared memory + unroll)
        # ---------------------------------------------------------------------
        print("\n" + "-" * 60)
        print("CUSTOM KERNEL (tiled + shared memory + unroll)")
        print("-" * 60)

        block = (TILE_SIZE, TILE_SIZE)
        # Ceiling division: partial tiles at the right/bottom edges still get
        # a block; the kernel bounds-checks inside them.
        grid = ((n + TILE_SIZE - 1) // TILE_SIZE, (m + TILE_SIZE - 1) // TILE_SIZE)
        config = LaunchConfig(grid=grid, block=block)

        # Built once so the timed loop below measures launches only, not
        # repeated argument boxing.
        kernel_args = (
            d_A.data.ptr,
            d_B.data.ptr,
            d_C_custom.data.ptr,
            np.int32(m),
            np.int32(n),
            np.int32(k),
        )

        # Untimed warm-up run.
        launch(stream, config, kernel, *kernel_args)
        stream.sync()

        start = stream.record(options=event_opts)
        for _ in range(num_iterations):
            launch(stream, config, kernel, *kernel_args)
        end = stream.record(options=event_opts)
        end.sync()

        custom_ms = (end - start) / num_iterations
        custom_gflops = (total_ops / 1e9) / (custom_ms / 1e3)
        print(f"Time: {custom_ms:.3f} ms | {custom_gflops:.2f} GFLOPS")

        # ---------------------------------------------------------------------
        # Verification
        # ---------------------------------------------------------------------
        print("\n" + "-" * 60)
        print("VERIFICATION")
        print("-" * 60)

        d_C_ref = d_A @ d_B
        # Host-side verification: cp.allclose triggers NVRTC failure on sm_120
        # (ldexp_cexp undefined). Use asnumpy + np.allclose instead.
        ref_host = cp.asnumpy(d_C_ref)

        for name, d_C in [("nvmath", d_C_nvmath), ("custom", d_C_custom)]:
            print(f"{name}: ", end="")
            passed = np.allclose(cp.asnumpy(d_C), ref_host, rtol=1e-4, atol=1e-4)
            print("Test PASSED" if passed else "Test FAILED")
            success = success and passed

        return success

    finally:
        # Restore CuPy's default stream before destroying the one it was using.
        cp.cuda.Stream.null.use()
        stream.close()


def main() -> bool:
    """Entry point. Returns True if benchmark passed."""
    return run_matmul_benchmark()


if __name__ == "__main__":
    success = main()
    if not success:
        sys.exit(1)