#include #include #include #include #include #define NUM_THREADS 128 #define NUM_BLOCKS 32 #define NVRTC_SAFE_CALL(x) \ do { \ nvrtcResult result = x; \ if (result != NVRTC_SUCCESS) { \ std::cerr << "\nerror: " #x " failed with error " \ << nvrtcGetErrorString(result) << '\n'; \ exit(1); \ } \ } while(0) #define CUDA_SAFE_CALL(x) \ do { \ CUresult result = x; \ if (result != CUDA_SUCCESS) { \ const char *msg; \ cuGetErrorName(result, &msg); \ std::cerr << "\nerror: " #x " failed with error " \ << msg << '\n'; \ exit(1); \ } \ } while(0) #define NVJITLINK_SAFE_CALL(h,x) \ do { \ nvJitLinkResult result = x; \ if (result != NVJITLINK_SUCCESS) { \ std::cerr << "\nerror: " #x " failed with error " \ << result << '\n'; \ size_t lsize; \ result = nvJitLinkGetErrorLogSize(h, &lsize); \ if (result == NVJITLINK_SUCCESS && lsize > 0) { \ char *log = (char*)malloc(lsize); \ result = nvJitLinkGetErrorLog(h, log); \ if (result == NVJITLINK_SUCCESS) { \ std::cerr << "error log: " << log << '\n'; \ free(log); \ } \ } \ exit(1); \ } \ } while(0) const char *lto_saxpy = " \n\ extern __device__ float compute(float a, float x, float y); \n\ \n\ extern \"C\" __global__ \n\ void saxpy(float a, float *x, float *y, float *out, size_t n) \n\ { \n\ size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\ if (tid < n) { \n\ out[tid] = compute(a, x[tid], y[tid]); \n\ } \n\ } \n"; const char *lto_compute = " \n\ __device__ float compute(float a, float x, float y) { \n\ return a * x + y; \n\ } \n"; // compile code into LTOIR, returning the IR and its size static void getLTOIR (const char *code, const char *name, char **ltoIR, size_t *ltoIRSize) { // Create an instance of nvrtcProgram with the code string. nvrtcProgram prog; NVRTC_SAFE_CALL( nvrtcCreateProgram(&prog, // prog code, // buffer name, // name 0, // numHeaders NULL, // headers NULL)); // includeNames // specify that LTO IR should be generated for LTO operation const char *opts[] = {"-dlto", "--relocatable-device-code=true"}; nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog 2, // numOptions opts); // options // Obtain compilation log from the program. size_t logSize; NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); char *log = new char[logSize]; NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); std::cout << log << '\n'; delete[] log; if (compileResult != NVRTC_SUCCESS) { exit(1); } // Obtain generated LTO IR from the program. NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize)); *ltoIR = new char[*ltoIRSize]; NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR)); // Destroy the program. NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); } int main(int argc, char *argv[]) { char *ltoIR1; char *ltoIR2; size_t ltoIR1Size; size_t ltoIR2Size; // getLTOIR uses nvrtc to get the LTOIR. // We could also use nvcc offline with -dlto -fatbin // to generate the IR, but using nvrtc keeps the build simpler. getLTOIR(lto_saxpy, "lto_saxpy.cu", <oIR1, <oIR1Size); getLTOIR(lto_compute, "lto_compute.cu", <oIR2, <oIR2Size); CUdevice cuDevice; CUcontext context; CUmodule module; CUfunction kernel; CUDA_SAFE_CALL(cuInit(0)); CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0)); CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice)); // Dynamically determine the arch to link for int major = 0; int minor = 0; CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); int arch = major*10 + minor; char smbuf[16]; memset(smbuf,0,16); sprintf(smbuf, "-arch=sm_%d", arch); // Load the generated LTO IR and link them together nvJitLinkHandle handle; const char *lopts[] = {"-lto", smbuf}; NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts)); NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, (void *)ltoIR1, ltoIR1Size, "lto_saxpy")); NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, (void *)ltoIR2, ltoIR2Size, "lto_compute")); // The call to nvJitLinkComplete causes linker to link together the two // LTO IR modules, do optimization on the linked LTO IR, // and generate cubin from it. NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle)); // check error log size_t logSize; NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize)); if (logSize > 0) { char *log = (char*)malloc(logSize+1); NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log)); std::cout << "Error log: " << log << std::endl; free(log); } // get linked cubin size_t cubinSize; NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize)); void *cubin = malloc(cubinSize); NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin)); NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle)); delete[] ltoIR1; delete[] ltoIR2; // cubin is linked, so now load it CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin)); CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy")); // Generate input for execution, and create output buffers. size_t n = NUM_THREADS * NUM_BLOCKS; size_t bufferSize = n * sizeof(float); float a = 5.1f; float *hX = new float[n], *hY = new float[n], *hOut = new float[n]; for (size_t i = 0; i < n; ++i) { hX[i] = static_cast(i); hY[i] = static_cast(i * 2); } CUdeviceptr dX, dY, dOut; CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize)); CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize)); CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize)); CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize)); CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize)); // Execute SAXPY. void *args[] = { &a, &dX, &dY, &dOut, &n }; CUDA_SAFE_CALL( cuLaunchKernel(kernel, NUM_BLOCKS, 1, 1, // grid dim NUM_THREADS, 1, 1, // block dim 0, NULL, // shared mem and stream args, 0)); // arguments CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output. CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize)); for (size_t i = 0; i < n; ++i) { std::cout << a << " * " << hX[i] << " + " << hY[i] << " = " << hOut[i] << '\n'; } // check last value to verify if (hOut[n-1] == 29074.5) { std::cout << "PASSED!\n"; } else { std::cout << "values not expected?\n"; } // Release resources. CUDA_SAFE_CALL(cuMemFree(dX)); CUDA_SAFE_CALL(cuMemFree(dY)); CUDA_SAFE_CALL(cuMemFree(dOut)); CUDA_SAFE_CALL(cuModuleUnload(module)); CUDA_SAFE_CALL(cuCtxDestroy(context)); free(cubin); delete[] hX; delete[] hY; delete[] hOut; return 0; }