/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * CUDA Tile C++ matrix multiplication autotuner. * * This sample is the host-side driver for the tiled matrix multiplication * kernel in matmul.cu. It compiles that kernel repeatedly with different * TILE_BLOCK_M, TILE_BLOCK_N, TILE_BLOCK_K, LOAD_LATENCY, and STORE_LATENCY * values configured in a search-space file, derives the launch grid from the * selected tile size, and reports the fastest configuration for the requested * problem size. * * Backend flow: * - NVRTC compiles matmul.cu to TileIR, then invokes tileiras to produce a * cubin image. * - NVCC compiles matmul.cu as a standalone source file and reuses the * generated Tile cubin artifact. * - Both paths load the resulting cubin with the CUDA Driver API and launch * the same matmul_tile entry point. */ #include #include #include #include #include #include #include #include #include #include #include "backend_common.h" #include "backend_nvcc.h" #include "backend_nvrtc.h" #include "matmul_benchmark.h" #include // global SM value (compute capability) static int smValue = 0; static constexpr const char *kMatmulKernelName = "matmul_tile"; CompilerBackend parseCompilerBackendValue(const char *value) { if (std::strcmp(value, "nvrtc") == 0) { return CompilerBackend::NVRTC; } if (std::strcmp(value, "nvcc") == 0) { return CompilerBackend::NVCC; } fprintf(stderr, "Error: unsupported backend '%s'\n", value); fprintf(stderr, "Expected 'nvrtc' or 'nvcc'.\n"); exit(EXIT_FAILURE); } void printCompilerOptions() { printf("Backend options:\n"); printf(" --backend=nvrtc|nvcc Select backend (default: NVRTC)\n"); printf("\n"); } CompilerBackend parseCompilerBackendArgs(int argc, char** argv, std::vector& benchmark_argv) { CompilerBackend compiler_backend = CompilerBackend::NVRTC; benchmark_argv.clear(); benchmark_argv.push_back(argv[0]); for (int i = 1; i < argc; i++) { if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0) { printCompilerOptions(); benchmark_argv.push_back(argv[i]); } else if (std::strncmp(argv[i], "--backend=", 10) == 0) { compiler_backend = parseCompilerBackendValue(argv[i] + 10); } else if (std::strcmp(argv[i], "--backend") == 0) { if (i + 1 >= argc) { fprintf(stderr, "Error: %s requires an argument\n", argv[i]); exit(EXIT_FAILURE); } compiler_backend = parseCompilerBackendValue(argv[++i]); } else { benchmark_argv.push_back(argv[i]); } } return compiler_backend; } void setSMValue() { CUdevice device; int major = 0, minor = 0; // initialize the CUDA Driver API checkCudaErrors(cuInit(0)); // get the first device (device 0) checkCudaErrors(cuDeviceGet(&device, 0)); checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); printf("GPU Compute Capability: %d.%d\n", major, minor); smValue = major * 10 + minor; } CompiledKernel compileFile(const char *filename, int block_m, int block_n, int block_k, CompilerBackend compiler_backend, const std::vector& extra_flags = {}) { if (compiler_backend == CompilerBackend::NVCC) { return compileFileWithNVCC(filename, smValue, block_m, block_n, block_k, extra_flags); } return compileFileWithNVRTC(filename, smValue, block_m, block_n, block_k, extra_flags); } void loadAndExecuteKernel(const CompiledKernel& compiled_kernel, CUdeviceptr d_A, CUdeviceptr d_B, CUdeviceptr d_C, int M, int N, int K, unsigned int gridDimX, unsigned int gridDimY, unsigned int sMem) { CUmodule module; CUfunction kernel_addr; void* args[] = { (void*)&d_C, (void*)&d_A, (void*)&d_B, (void*)&M, (void*)&N, (void*)&K }; checkCudaErrors(cuModuleLoadData(&module, compiled_kernel.image.data())); checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, kMatmulKernelName)); checkCudaErrors(cuLaunchKernel(kernel_addr, gridDimX, gridDimY, 1, // grid dim 1, 1, 1, // block dim sMem, 0, // shared mem, stream args, // arguments NULL)); checkCudaErrors(cuCtxSynchronize()); // cleanup checkCudaErrors(cuModuleUnload(module)); } void autotuner(int M, int N, int K, const char *kernel_file, const SearchSpace& search_space, CompilerBackend compiler_backend) { printf("\n=== Matrix: C[%dx%d] = A[%dx%d] x B[%dx%d] (FP16->FP32) ===\n", M, N, M, K, K, N); printf(" FLOPs: %.2f GFLOP\n", 2.0 * M * N * K / 1e9); // allocate and initialize (FP16 for A and B) std::vector<__half> h_A(M * K), h_B(K * N); std::vector h_C; srand(42); for (int i = 0; i < M * K; i++) h_A[i] = __float2half((float)rand() / RAND_MAX - 0.5f); for (int i = 0; i < K * N; i++) h_B[i] = __float2half((float)rand() / RAND_MAX - 0.5f); // compute the CPU reference unless validation is disabled if (use_validation()) { h_C.resize(M * N); matmul_cpu(h_C.data(), h_A.data(), h_B.data(), M, N, K); } // allocate device memory CUdeviceptr d_A, d_B, d_C; checkCudaErrors(cuMemAlloc(&d_A, M * K * sizeof(__half))); checkCudaErrors(cuMemAlloc(&d_B, K * N * sizeof(__half))); checkCudaErrors(cuMemAlloc(&d_C, M * N * sizeof(float))); checkCudaErrors(cuMemcpyHtoD(d_A, h_A.data(), M * K * sizeof(__half))); checkCudaErrors(cuMemcpyHtoD(d_B, h_B.data(), K * N * sizeof(__half))); // helper lambda to clear output auto clear_output = [&]() { std::vector zeros(M * N, 0.0f); checkCudaErrors(cuMemcpyHtoD(d_C, zeros.data(), M * N * sizeof(float))); }; struct AutotuneResult { int block_m; int block_n; int block_k; int grid_x; int grid_y; int load_latency; int store_latency; BenchmarkResult result; }; std::vector autotune_results; size_t config_count = 0; size_t total_configs = search_space.tile_options.size() * search_space.load_latency_options.size() * search_space.store_latency_options.size(); for (const auto& tile : search_space.tile_options) { int grid_x = ceilDiv(M, tile.block_m); int grid_y = ceilDiv(N, tile.block_n); for (int load_lat : search_space.load_latency_options) { for (int store_lat : search_space.store_latency_options) { config_count++; printf(" [%zu/%zu] ", config_count, total_configs); std::vector compile_flags = { "-DLOAD_LATENCY=" + std::to_string(load_lat), "-DSTORE_LATENCY=" + std::to_string(store_lat) }; CompiledKernel compiled_kernel = compileFile(kernel_file, tile.block_m, tile.block_n, tile.block_k, compiler_backend, compile_flags); clear_output(); std::string config_name = "bm=" + std::to_string(tile.block_m) + ",bn=" + std::to_string(tile.block_n) + ",bk=" + std::to_string(tile.block_k) + ",gx=" + std::to_string(grid_x) + ",gy=" + std::to_string(grid_y) + ",ld=" + std::to_string(load_lat) + ",st=" + std::to_string(store_lat); auto result = run_benchmark(config_name.c_str(), [&]() { loadAndExecuteKernel(compiled_kernel, d_A, d_B, d_C, M, N, K, grid_x, grid_y, 0); }, [&]() { std::vector h_result(M * N); checkCudaErrors(cuMemcpyDtoH(h_result.data(), d_C, M * N * sizeof(float))); return verify_matmul_result(config_name.c_str(), h_result.data(), h_C.data(), M, N); }, M, N, K); print_result(result); autotune_results.push_back({tile.block_m, tile.block_n, tile.block_k, grid_x, grid_y, load_lat, store_lat, result}); } } } // find the best configuration by GFLOPS auto best = std::max_element(autotune_results.begin(), autotune_results.end(), [](const AutotuneResult& a, const AutotuneResult& b) { return a.result.gflops < b.result.gflops; }); printf("\n *** BEST CONFIGURATION ***\n"); printf(" BLOCK_M=%d, BLOCK_N=%d, BLOCK_K=%d\n", best->block_m, best->block_n, best->block_k); printf(" LOAD_LATENCY=%d, STORE_LATENCY=%d, grid_x=%d, grid_y=%d\n", best->load_latency, best->store_latency, best->grid_x, best->grid_y); printf(" Performance: %.1f GFLOPS, %.3f ms, %.1f GB/s\n", best->result.gflops, best->result.time_ms, best->result.bandwidth_gb_s); checkCudaErrors(cuMemFree(d_A)); checkCudaErrors(cuMemFree(d_B)); checkCudaErrors(cuMemFree(d_C)); } int main(int argc, char** argv) { std::vector benchmark_argv; CompilerBackend compiler_backend = parseCompilerBackendArgs(argc, argv, benchmark_argv); parse_benchmark_args(static_cast(benchmark_argv.size()), benchmark_argv.data()); print_device_info(); // initialize CUDA and get compute capability setSMValue(); CUcontext context; CUdevice cuDevice = 0; checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGet(&cuDevice, 0)); checkCudaErrors(cuCtxCreate(&context, NULL, 0, cuDevice)); printf("\nMatrix Multiplication Autotuner (FP16 inputs, FP32 accumulate)\n"); printf("==============================================================\n"); printf("Backend: %s\n", compilerBackendName(compiler_backend)); char *kernel_file = findSampleFile("matmul.cu", argv[0]); if (kernel_file == NULL) { fprintf(stderr, "Error: unable to locate matmul.cu\n"); return 1; } char *search_space_file = findSampleFile(kSearchSpaceFileName, argv[0]); if (search_space_file == NULL) { fprintf(stderr, "Error: unable to locate %s\n", kSearchSpaceFileName); free(kernel_file); return 1; } SearchSpace search_space = loadSearchSpace(search_space_file); printf("Search space: %s\n", search_space_file); printf("Tuning for M=1024, N=4096, K=1024\n"); autotuner(1024, 4096, 1024, kernel_file, search_space, compiler_backend); free(kernel_file); free(search_space_file); return 0; }