mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-06-04 00:06:52 +08:00
187 lines
7.6 KiB
C++
187 lines
7.6 KiB
C++
/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
#pragma once
|
|
|
|
#include "backend_common.h"
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <nvrtc.h>
|
|
|
|
// Evaluates the NVRTC expression `x` exactly once. If it does not yield
// NVRTC_SUCCESS, writes `Name` and the text from nvrtcGetErrorString() to
// std::cerr and terminates the process with EXIT_FAILURE.
//
// The temporary deliberately has a macro-specific name: with a plain `result`
// temporary, a call such as NVRTC_SAFE_CALL("...", result) would expand to
// `nvrtcResult result = result;` and read an uninitialized value. Both
// arguments are parenthesized so arbitrary expressions can be passed.
#define NVRTC_SAFE_CALL(Name, x)                                               \
    do {                                                                       \
        const nvrtcResult nvrtc_safe_call_status_ = (x);                       \
        if (nvrtc_safe_call_status_ != NVRTC_SUCCESS) {                        \
            std::cerr << "\nerror: " << (Name) << " failed with error "        \
                      << nvrtcGetErrorString(nvrtc_safe_call_status_);         \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
|
|
|
|
inline std::vector<char> compileTileIRToCubin(const char *tileIR,
|
|
size_t tileIRSize,
|
|
int sm_value) {
|
|
// Generate unique temporary file names using PID, timestamp, and a local counter.
|
|
std::string tileir_file = makeTempPath("tileir", ".bc");
|
|
std::string cubin_file = makeTempPath("cubin", ".cubin");
|
|
|
|
// Write TileIR to a temporary file because tileiras consumes files rather than memory buffers.
|
|
FILE* fp = fopen(tileir_file.c_str(), "wb");
|
|
if (!fp) {
|
|
fprintf(stderr, "Error: failed to open %s for writing\n", tileir_file.c_str());
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
fwrite(tileIR, 1, tileIRSize, fp);
|
|
fclose(fp);
|
|
|
|
/*
|
|
* Ideally we would use the Driver API to compile TileIR to cubin with the
|
|
* latest driver installed. However, to avoid the hassle of upgrading the
|
|
* driver, we use tileiras for now, which is handily available in CUDA
|
|
* Toolkit.
|
|
*
|
|
* The Driver API path would look like:
|
|
* (requires #include <helper_cuda_drvapi.h>)
|
|
*
|
|
* CUmodule module;
|
|
* CUjit_option options[] = {
|
|
* CU_JIT_GENERATE_LINE_INFO
|
|
* };
|
|
* void* optionValues[] = {
|
|
* (void*)(uintptr_t)1, // line info
|
|
* };
|
|
* unsigned int numOptions = sizeof(options) / sizeof(options[0]);
|
|
* checkCudaErrors(cuModuleLoadDataEx(&module, tileIR, numOptions,
|
|
* options, optionValues));
|
|
*/
|
|
|
|
// Compile TileIR to cubin using tileiras from the configured CUDA Toolkit.
|
|
// This happens before benchmarking so the timed path matches the NVCC backend.
|
|
std::string cmd = joinShellCommand({
|
|
TILEIRAS_PATH,
|
|
"-arch=sm_" + std::to_string(sm_value),
|
|
tileir_file,
|
|
"-o",
|
|
cubin_file
|
|
});
|
|
int ret = system(cmd.c_str());
|
|
if (ret != 0) {
|
|
fprintf(stderr, "Error: tileiras compilation failed with code %d\n", ret);
|
|
fprintf(stderr, "Command: %s\n", cmd.c_str());
|
|
remove(tileir_file.c_str());
|
|
remove(cubin_file.c_str());
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
// Read the generated cubin into memory so later benchmark iterations do not depend on temp files.
|
|
std::vector<char> cubin = readBinaryFile(cubin_file);
|
|
|
|
// Remove temporary files after the cubin has been captured.
|
|
remove(tileir_file.c_str());
|
|
remove(cubin_file.c_str());
|
|
return cubin;
|
|
}
|
|
|
|
/**
 * Compiles a source file with NVRTC, converts the resulting Tile IR to a
 * cubin via compileTileIRToCubin(), and returns it as a CompiledKernel.
 *
 * @param filename     Path of the source file to compile.
 * @param sm_value     Number appended to "-arch=compute_" for NVRTC; also
 *                     forwarded to compileTileIRToCubin().
 * @param block_m      Forwarded to appendTileBlockMacroOptions().
 * @param block_n      Forwarded to appendTileBlockMacroOptions().
 * @param block_k      Forwarded to appendTileBlockMacroOptions().
 * @param extra_flags  Additional NVRTC options appended after the built-in ones.
 * @return A CompiledKernel whose `image` member holds the cubin bytes.
 *
 * The NVRTC compilation log is always written to std::cerr. Any failure
 * (missing include path, unreadable file, NVRTC error) prints a message to
 * stderr and exits with EXIT_FAILURE.
 */
inline CompiledKernel compileFileWithNVRTC(const char *filename,
                                           int sm_value,
                                           int block_m, int block_n, int block_k,
                                           const std::vector<std::string>& extra_flags) {
    // Check include path for cuda_fp16.h
    const char *include_dir = CUDA_INCLUDE_PATH;
    if (include_dir[0] == '\0') {
        // Reported on stderr like every other error in this file.
        fprintf(stderr, "\n ERROR: unable to locate CUDA include directory containing cuda_fp16.h\n");
        exit(EXIT_FAILURE);
    }

    // option_storage owns the option strings; argv_vec only borrows their
    // c_str() pointers, so option_storage must not be modified once argv_vec
    // has been built.
    std::vector<std::string> option_storage = {
        "-I",
        include_dir,
        "-std=c++20",
        "-enable-tile",
        "-lineinfo",
        "-arch=compute_" + std::to_string(sm_value)
    };
    appendTileBlockMacroOptions(option_storage, block_m, block_n, block_k);
    option_storage.insert(option_storage.end(), extra_flags.begin(), extra_flags.end());

    std::vector<const char*> argv_vec;
    argv_vec.reserve(option_storage.size());
    for (const auto& option : option_storage) {
        argv_vec.push_back(option.c_str());
    }
    const char **argv = argv_vec.data();
    const int argc = static_cast<int>(argv_vec.size());

    std::cerr << "\nCompiling file with NVRTC\n";

    // Open positioned at the end so tellg() yields the file size.
    std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);
    if (!inputFile.is_open()) {
        std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
        exit(EXIT_FAILURE);
    }

    // tellg() reports failure as -1; converting that straight to size_t would
    // request an enormous allocation, so it is rejected explicitly.
    const std::streamoff endOffset = static_cast<std::streamoff>(inputFile.tellg());
    if (endOffset < 0) {
        std::cerr << "\nerror: unable to determine the size of " << filename << "!\n";
        exit(EXIT_FAILURE);
    }
    const size_t inputSize = static_cast<size_t>(endOffset);

    // One extra zero byte so the buffer is a NUL-terminated string for NVRTC.
    std::vector<char> source(inputSize + 1, '\0');
    inputFile.seekg(0, std::ios::beg);
    inputFile.read(source.data(), static_cast<std::streamsize>(inputSize));
    if (!inputFile) {
        std::cerr << "\nerror: unable to read " << filename << "!\n";
        exit(EXIT_FAILURE);
    }
    inputFile.close();

    // Compile the source string to PTX and Tile IR.
    nvrtcProgram prog;
    NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, source.data(),
                                                             "testprog", 0, nullptr, nullptr));
    // The status is checked only after the log has been printed, so that
    // compiler diagnostics are visible when compilation fails.
    const nvrtcResult res = nvrtcCompileProgram(prog, argc, argv);

    // Dump the NVRTC compilation log
    size_t logSize = 0;
    NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize));
    // Extra zero byte guarantees termination regardless of what NVRTC writes.
    std::vector<char> log(logSize + 1, '\0');
    NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log.data()));
    std::cerr << "\n compilation log ---\n";
    std::cerr << log.data();
    std::cerr << "\n end log ---\n\n";
    NVRTC_SAFE_CALL("nvrtcCompileProgram", res);

    // Fetch Tile IR and compile it to cubin before benchmarking.
    size_t tileIRSize = 0;
    NVRTC_SAFE_CALL("nvrtcGetTileIRSize", nvrtcGetTileIRSize(prog, &tileIRSize));
    std::vector<char> tileIR(tileIRSize);
    NVRTC_SAFE_CALL("nvrtcGetTileIR", nvrtcGetTileIR(prog, tileIR.data()));

    CompiledKernel kernel;
    kernel.image = compileTileIRToCubin(tileIR.data(), tileIR.size(), sm_value);
    NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));
    return kernel;
}
|