cuda-samples/Samples/7_libNVVM/cuda-c-linking/cuda-c-linking.cpp

334 lines
12 KiB
C++
Raw Permalink Normal View History

2023-06-30 03:33:40 +08:00
// Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cuda.h>
#include <llvm/ADT/StringExtras.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/FileSystem.h>
#include <llvm/Support/Path.h>
#include <llvm/Support/Program.h>
#include <llvm/Support/raw_ostream.h>
#include <nvvm.h>
#include <cassert>
#include <memory>
#include <string>
#include "DDSWriter.h"
static_assert(sizeof(void *) == 8, "Only 64bit targets are supported.");
using namespace llvm;
static cl::opt<bool> SaveCubin("save-cubin",
cl::desc("Write linker cubin to disk"),
cl::init(false));
static cl::opt<bool> SaveIR("save-ir", cl::desc("Write LLVM IR to disk"),
cl::init(false));
static cl::opt<bool> SavePTX("save-ptx", cl::desc("Write PTX to disk"),
cl::init(false));
// Width and height of the output image.
const unsigned width = 1024;
const unsigned height = 512;
// If 'err' is non-zero, emit an error message and exit.
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
static void __checkCudaErrors(CUresult err, const char *filename, int line) {
assert(filename);
if (CUDA_SUCCESS != err) {
const char *ename = NULL;
const CUresult res = cuGetErrorName(err, &ename);
fprintf(stderr,
"CUDA API Error %04d: \"%s\" from file <%s>, "
"line %i.\n",
err, ((CUDA_SUCCESS == res) ? ename : "Unknown"), filename, line);
exit(err);
}
}
// Verify that the NVVM result code is success, or terminate otherwise.
void checkNVVMCall(nvvmResult res) {
if (res != NVVM_SUCCESS) {
errs() << "libnvvm call failed\n";
exit(res);
}
}
/// generateModule - Generate and LLVM IR module that calls an
/// externally-defined function
std::unique_ptr<Module> generateModule(LLVMContext &context) {
// Create the module and setup the layout and triple.
auto mod = std::make_unique<Module>("nvvm-module", context);
mod->setDataLayout(
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-"
"f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:"
"64");
mod->setTargetTriple("nvptx64-nvidia-cuda");
// Get pointers to some commonly-used types.
Type *voidTy = Type::getVoidTy(context);
Type *floatTy = Type::getFloatTy(context);
Type *i32Ty = Type::getInt32Ty(context);
Type *floatGenericPtrTy = PointerType::get(floatTy, /* address space */ 0);
// void @mandelbrot(float*)
Type *mandelbrotParamTys[] = {floatGenericPtrTy};
FunctionType *mandelbrotTy =
FunctionType::get(voidTy, mandelbrotParamTys, false);
FunctionCallee mandelbrotFunc =
mod->getOrInsertFunction("mandelbrot", mandelbrotTy);
// Kernel argument types.
Type *paramTys[] = {floatGenericPtrTy};
// Kernel function type.
FunctionType *funcTy = FunctionType::get(voidTy, paramTys, false);
// Kernel function.
Function *func =
Function::Create(funcTy, GlobalValue::ExternalLinkage, "kernel", *mod);
func->arg_begin()->setName("ptr");
// 'entry' basic block in kernel function.
BasicBlock *entry = BasicBlock::Create(context, "entry", func);
// Build the entry block.
IRBuilder<> builder(entry);
builder.CreateCall(mandelbrotFunc, func->arg_begin());
builder.CreateRetVoid();
// Create kernel metadata.
Metadata *mdVals[] = {ValueAsMetadata::get(func),
MDString::get(context, "kernel"),
ConstantAsMetadata::get(ConstantInt::getTrue(context))};
MDNode *kernelMD = MDNode::get(context, mdVals);
NamedMDNode *nvvmAnnot = mod->getOrInsertNamedMetadata("nvvm.annotations");
nvvmAnnot->addOperand(kernelMD);
// Set the NVVM IR version to 2.0.
auto *two = ConstantInt::get(Type::getInt32Ty(context), 2);
auto *zero = ConstantInt::get(Type::getInt32Ty(context), 0);
auto *versionMD = MDNode::get(
context, {ConstantAsMetadata::get(two), ConstantAsMetadata::get(zero)});
NamedMDNode *nvvmIRVersion = mod->getOrInsertNamedMetadata("nvvmir.version");
nvvmIRVersion->addOperand(versionMD);
return mod;
}
// Use libNVVM to compile an NVVM IR module to PTX.
std::string generatePtx(const std::string &module, int devMajor, int devMinor,
const char *moduleName) {
assert(moduleName);
// libNVVM initialization.
nvvmProgram compileUnit;
checkNVVMCall(nvvmCreateProgram(&compileUnit));
// Create a libNVVM compilation unit from the NVVM IR.
checkNVVMCall(nvvmAddModuleToProgram(compileUnit, module.c_str(),
module.size(), moduleName));
std::string computeArg = "-arch=compute_";
computeArg += utostr(devMajor);
computeArg += utostr(devMinor);
// Compile the NVVM IR into PTX.
const char *options[] = {computeArg.c_str()};
nvvmResult res = nvvmCompileProgram(compileUnit, 1, options);
if (res != NVVM_SUCCESS) {
errs() << "nvvmCompileProgram failed!\n";
size_t logSize;
nvvmGetProgramLogSize(compileUnit, &logSize);
char *msg = new char[logSize];
nvvmGetProgramLog(compileUnit, msg);
errs() << msg << "\n";
delete[] msg;
exit(EXIT_FAILURE);
}
// Get the result PTX size and source.
size_t ptxSize = 0;
checkNVVMCall(nvvmGetCompiledResultSize(compileUnit, &ptxSize));
char *ptx = new char[ptxSize];
checkNVVMCall(nvvmGetCompiledResult(compileUnit, ptx));
// Clean-up libNVVM.
checkNVVMCall(nvvmDestroyProgram(&compileUnit));
return std::string(ptx);
}
int main(int argc, char **argv) {
cl::ParseCommandLineOptions(argc, argv, "cuda-c-linking");
// Locate the pre-built library.
std::string libpath0 = sys::fs::getMainExecutable(argv[0], (void *)main);
SmallString<256> libpath(libpath0);
const char *mathlibFile = "libmathfuncs64.a";
sys::path::remove_filename(libpath);
sys::path::append(libpath, mathlibFile);
if (!sys::fs::exists(libpath.c_str())) {
errs() << "Unable to locate math library, expected at " << libpath << '\n';
return EXIT_FAILURE;
}
outs() << "Using math library: " << libpath.str() << "\n";
// Initialize CUDA and obtain device 0.
checkCudaErrors(cuInit(0));
int nDevices;
checkCudaErrors(cuDeviceGetCount(&nDevices));
if (nDevices == 0) {
errs() << "Failed to locate any CUDA compute devices.\n";
exit(EXIT_FAILURE);
}
CUdevice device;
checkCudaErrors(cuDeviceGet(&device, 0));
char name[128];
checkCudaErrors(cuDeviceGetName(name, 128, device));
outs() << "Using CUDA Device [0]: " << name << "\n";
int devMajor = 0, devMinor = 0;
checkCudaErrors(cuDeviceGetAttribute(
&devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
checkCudaErrors(cuDeviceGetAttribute(
&devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
outs() << "Device Compute Capability: " << devMajor << "." << devMinor
<< "\n";
if (devMajor < 5) {
errs() << "ERROR: Device 0 is not sm_50 or later.\n";
return 1;
}
// Generate the IR module
LLVMContext ctx;
std::string moduleStr;
auto module = generateModule(ctx);
if (SaveIR) {
std::error_code err;
raw_fd_ostream out("cuda-c-linking.kernel.ll", err);
out << *(module.get());
}
// Write the module to a string.
{
llvm::raw_string_ostream str(moduleStr);
str << *module.get();
}
// Generate PTX.
std::string ptx = generatePtx(moduleStr, devMajor, devMinor,
module->getModuleIdentifier().c_str());
if (SavePTX) {
std::error_code err;
raw_fd_ostream out("cuda-c-linking.kernel.ptx", err);
out << ptx;
}
// Create the CUDA context.
CUcontext context;
checkCudaErrors(cuCtxCreate(&context, 0, device));
// Create a JIT linker and generate the result CUBIN.
CUlinkState linker;
char linkerInfo[1024]{};
char linkerErrors[1024]{};
CUjit_option linkerOptions[] = {
CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
CU_JIT_LOG_VERBOSE};
void *linkerOptionValues[] = {linkerInfo, reinterpret_cast<void *>(1024),
linkerErrors, reinterpret_cast<void *>(1024),
reinterpret_cast<void *>(1)};
checkCudaErrors(cuLinkCreate(5, linkerOptions, linkerOptionValues, &linker));
checkCudaErrors(cuLinkAddData(linker, CU_JIT_INPUT_PTX, (void *)ptx.c_str(),
ptx.size(), "<compiled-ptx>", 0, NULL, NULL));
checkCudaErrors(cuLinkAddFile(linker, CU_JIT_INPUT_LIBRARY, libpath.c_str(),
0, NULL, NULL));
void *cubin;
size_t cubinSize;
checkCudaErrors(cuLinkComplete(linker, &cubin, &cubinSize));
outs() << "Linker Log:\n" << linkerInfo << "\n" << linkerErrors << "\n";
if (SaveCubin) {
std::error_code err;
raw_fd_ostream out("cuda-c-linking.linked.cubin", err, sys::fs::OF_None);
out.write(reinterpret_cast<char *>(cubin), cubinSize);
}
// Create a module and load the cubin into it.
CUmodule cudaModule;
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, cubin, 0, 0, 0));
// Now that the CUBIN is loaded, we can release the linker.
checkCudaErrors(cuLinkDestroy(linker));
// Get kernel function.
CUfunction function;
checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel"));
// Device data.
CUdeviceptr devBuffer;
checkCudaErrors(cuMemAlloc(&devBuffer, sizeof(float) * width * height * 4));
float *data = new float[width * height * 4];
// Each thread will generate one pixel, and we'll subdivide the problem into
// 16x16 chunks.
const unsigned blockSizeX = 16;
const unsigned blockSizeY = 16;
const unsigned blockSizeZ = 1;
const unsigned gridSizeX = width / 16;
const unsigned gridSizeY = height / 16;
const unsigned gridSizeZ = 1;
// Execute the kernel.
outs() << "Launching kernel\n";
void *params[] = {&devBuffer};
checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
blockSizeX, blockSizeY, blockSizeZ, 0, NULL,
params, NULL));
// Retrieve the result data from the device.
checkCudaErrors(
cuMemcpyDtoH(&data[0], devBuffer, sizeof(float) * width * height * 4));
writeDDS("mandelbrot.dds", data, width, height);
outs() << "Output saved to mandelbrot.dds\n";
// Cleanup.
delete[] data;
checkCudaErrors(cuMemFree(devBuffer));
checkCudaErrors(cuModuleUnload(cudaModule));
checkCudaErrors(cuCtxDestroy(context));
return 0;
}