/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include "backend_common.h" #include #include #include #include #include #include #include #define NVRTC_SAFE_CALL(Name, x) \ do { \ nvrtcResult result = x; \ if (result != NVRTC_SUCCESS) { \ std::cerr << "\nerror: " << Name << " failed with error " << \ nvrtcGetErrorString(result); \ exit(EXIT_FAILURE); \ } \ } while(0) inline std::vector compileTileIRToCubin(const char *tileIR, size_t tileIRSize, int sm_value) { // Generate unique temporary file names using PID, timestamp, and a local counter. std::string tileir_file = makeTempPath("tileir", ".bc"); std::string cubin_file = makeTempPath("cubin", ".cubin"); // Write TileIR to a temporary file because tileiras consumes files rather than memory buffers. FILE* fp = fopen(tileir_file.c_str(), "wb"); if (!fp) { fprintf(stderr, "Error: failed to open %s for writing\n", tileir_file.c_str()); exit(EXIT_FAILURE); } fwrite(tileIR, 1, tileIRSize, fp); fclose(fp); /* * Ideally we would use the Driver API to compile TileIR to cubin with the * latest driver installed. However, to avoid the hassle of upgrading the * driver, we use tileiras for now, which is handily available in CUDA * Toolkit. * * The Driver API path would look like: * (requires #include ) * * CUmodule module; * CUjit_option options[] = { * CU_JIT_GENERATE_LINE_INFO * }; * void* optionValues[] = { * (void*)(uintptr_t)1, // line info * }; * unsigned int numOptions = sizeof(options) / sizeof(options[0]); * checkCudaErrors(cuModuleLoadDataEx(&module, tileIR, numOptions, * options, optionValues)); */ // Compile TileIR to cubin using tileiras from the configured CUDA Toolkit. // This happens before benchmarking so the timed path matches the NVCC backend. std::string cmd = joinShellCommand({ TILEIRAS_PATH, "-arch=sm_" + std::to_string(sm_value), tileir_file, "-o", cubin_file }); int ret = system(cmd.c_str()); if (ret != 0) { fprintf(stderr, "Error: tileiras compilation failed with code %d\n", ret); fprintf(stderr, "Command: %s\n", cmd.c_str()); remove(tileir_file.c_str()); remove(cubin_file.c_str()); exit(EXIT_FAILURE); } // Read the generated cubin into memory so later benchmark iterations do not depend on temp files. std::vector cubin = readBinaryFile(cubin_file); // Remove temporary files after the cubin has been captured. remove(tileir_file.c_str()); remove(cubin_file.c_str()); return cubin; } inline CompiledKernel compileFileWithNVRTC(const char *filename, int sm_value, int block_m, int block_n, int block_k, const std::vector& extra_flags) { // Check include path for cuda_fp16.h const char *ptr = CUDA_INCLUDE_PATH; if (ptr[0] == '\0') { printf("\n ERROR: unable to locate CUDA include directory containing cuda_fp16.h\n"); exit(EXIT_FAILURE); } std::vector option_storage = { "-I", ptr, "-std=c++20", "-enable-tile", "-lineinfo", "-arch=compute_" + std::to_string(sm_value) }; appendTileBlockMacroOptions(option_storage, block_m, block_n, block_k); for (const auto& flag : extra_flags) { option_storage.push_back(flag); } std::vector argv_vec; for (const auto& option : option_storage) { argv_vec.push_back(option.c_str()); } const char **argv = argv_vec.data(); int argc = static_cast(argv_vec.size()); std::cerr << "\nCompiling file with NVRTC\n"; std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate); if (!inputFile.is_open()) { std::cerr << "\nerror: unable to open " << filename << " for reading!\n"; exit(EXIT_FAILURE); } std::streampos pos = inputFile.tellg(); size_t inputSize = pos; char * memBlock = new char [inputSize + 1]; inputFile.seekg (0, std::ios::beg); inputFile.read (memBlock, inputSize); inputFile.close(); memBlock[inputSize] = '\x0'; // Compile the source string to PTX and Tile IR. nvrtcProgram prog; NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, memBlock, "testprog", 0, NULL, NULL)); nvrtcResult res = nvrtcCompileProgram(prog, argc, argv); // Dump the NVRTC compilation log size_t logSize; NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize)); char* log = (char*)malloc(sizeof(char) * logSize + 1); NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log)); log[logSize] = '\x0'; std::cerr << "\n compilation log ---\n"; std::cerr << log; std::cerr << "\n end log ---\n\n"; free(log); NVRTC_SAFE_CALL("nvrtcCompileProgram", res); // Fetch Tile IR and compile it to cubin before benchmarking. size_t tileIRSize; NVRTC_SAFE_CALL("nvrtcGetTileIRSize", nvrtcGetTileIRSize(prog, &tileIRSize)); std::vector tileIR(tileIRSize); NVRTC_SAFE_CALL("nvrtcGetTileIR", nvrtcGetTileIR(prog, tileIR.data())); CompiledKernel kernel; kernel.image = compileTileIRToCubin(tileIR.data(), tileIR.size(), sm_value); NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog)); delete[] memBlock; return kernel; }