diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp b/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp deleted file mode 100644 index 6300b35b..00000000 --- a/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp +++ /dev/null @@ -1,251 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include - -#define NUM_THREADS 128 -#define NUM_BLOCKS 32 - -#define NVRTC_SAFE_CALL(x) \ - do { \ - nvrtcResult result = x; \ - if (result != NVRTC_SUCCESS) { \ - std::cerr << "\nerror: " #x " failed with error " \ - << nvrtcGetErrorString(result) << '\n'; \ - exit(1); \ - } \ - } while(0) -#define CUDA_SAFE_CALL(x) \ - do { \ - CUresult result = x; \ - if (result != CUDA_SUCCESS) { \ - const char *msg; \ - cuGetErrorName(result, &msg); \ - std::cerr << "\nerror: " #x " failed with error " \ - << msg << '\n'; \ - exit(1); \ - } \ - } while(0) -#define NVJITLINK_SAFE_CALL(h,x) \ - do { \ - nvJitLinkResult result = x; \ - if (result != NVJITLINK_SUCCESS) { \ - std::cerr << "\nerror: " #x " failed with error " \ - << result << '\n'; \ - size_t lsize; \ - result = nvJitLinkGetErrorLogSize(h, &lsize); \ - if (result == NVJITLINK_SUCCESS && lsize > 0) { \ - char *log = (char*)malloc(lsize); \ - result = nvJitLinkGetErrorLog(h, log); \ - if (result == NVJITLINK_SUCCESS) { \ - std::cerr << "error log: " << log << '\n'; \ - free(log); \ - } \ - } \ - exit(1); \ - } \ - } while(0) - -const char *lto_saxpy = " \n\ -extern __device__ float compute(float a, float x, float y); \n\ - \n\ -extern \"C\" __global__ \n\ -void saxpy(float a, float *x, float *y, float *out, size_t n) \n\ -{ \n\ - size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\ - if (tid < n) { \n\ - out[tid] = compute(a, x[tid], y[tid]); \n\ - } \n\ -} \n"; - -const char *lto_compute = " \n\ -__device__ float compute(float a, float x, float y) { \n\ - return a * x + y; \n\ -} \n"; - -// compile code into LTOIR, returning the IR and its size -static void getLTOIR (const char *code, const char *name, - char **ltoIR, size_t *ltoIRSize) -{ - // Create an instance of nvrtcProgram with the code string. - nvrtcProgram prog; - NVRTC_SAFE_CALL( - nvrtcCreateProgram(&prog, // prog - code, // buffer - name, // name - 0, // numHeaders - NULL, // headers - NULL)); // includeNames - - // specify that LTO IR should be generated for LTO operation - const char *opts[] = {"-dlto", - "--relocatable-device-code=true"}; - nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog - 2, // numOptions - opts); // options - // Obtain compilation log from the program. - size_t logSize; - NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); - char *log = new char[logSize]; - NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); - std::cout << log << '\n'; - delete[] log; - if (compileResult != NVRTC_SUCCESS) { - exit(1); - } - // Obtain generated LTO IR from the program. - NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize)); - *ltoIR = new char[*ltoIRSize]; - NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR)); - // Destroy the program. - NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); -} - -int main(int argc, char *argv[]) -{ - char *ltoIR1; - char *ltoIR2; - size_t ltoIR1Size; - size_t ltoIR2Size; - // getLTOIR uses nvrtc to get the LTOIR. - // We could also use nvcc offline with -dlto -fatbin - // to generate the IR, but using nvrtc keeps the build simpler. - getLTOIR(lto_saxpy, "lto_saxpy.cu", <oIR1, <oIR1Size); - getLTOIR(lto_compute, "lto_compute.cu", <oIR2, <oIR2Size); - - CUdevice cuDevice; - CUcontext context; - CUmodule module; - CUfunction kernel; - CUDA_SAFE_CALL(cuInit(0)); - CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0)); - CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice)); - - // Dynamically determine the arch to link for - int major = 0; - int minor = 0; - CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - int arch = major*10 + minor; - char smbuf[16]; - memset(smbuf,0,16); - sprintf(smbuf, "-arch=sm_%d", arch); - - // Load the generated LTO IR and link them together - nvJitLinkHandle handle; - const char *lopts[] = {"-lto", smbuf}; - NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts)); - - NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, - (void *)ltoIR1, ltoIR1Size, "lto_saxpy")); - NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, - (void *)ltoIR2, ltoIR2Size, "lto_compute")); - - // The call to nvJitLinkComplete causes linker to link together the two - // LTO IR modules, do optimization on the linked LTO IR, - // and generate cubin from it. - NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle)); - - // check error log - size_t logSize; - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize)); - if (logSize > 0) { - char *log = (char*)malloc(logSize+1); - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log)); - std::cout << "Error log: " << log << std::endl; - free(log); - } - - // get linked cubin - size_t cubinSize; - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize)); - void *cubin = malloc(cubinSize); - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin)); - - NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle)); - delete[] ltoIR1; - delete[] ltoIR2; - - // cubin is linked, so now load it - CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin)); - CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy")); - - // Generate input for execution, and create output buffers. - size_t n = NUM_THREADS * NUM_BLOCKS; - size_t bufferSize = n * sizeof(float); - float a = 5.1f; - float *hX = new float[n], *hY = new float[n], *hOut = new float[n]; - for (size_t i = 0; i < n; ++i) { - hX[i] = static_cast(i); - hY[i] = static_cast(i * 2); - } - CUdeviceptr dX, dY, dOut; - CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize)); - CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize)); - CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize)); - CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize)); - CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize)); - // Execute SAXPY. - void *args[] = { &a, &dX, &dY, &dOut, &n }; - CUDA_SAFE_CALL( - cuLaunchKernel(kernel, - NUM_BLOCKS, 1, 1, // grid dim - NUM_THREADS, 1, 1, // block dim - 0, NULL, // shared mem and stream - args, 0)); // arguments - CUDA_SAFE_CALL(cuCtxSynchronize()); - // Retrieve and print output. - CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize)); - - for (size_t i = 0; i < n; ++i) { - std::cout << a << " * " << hX[i] << " + " << hY[i] - << " = " << hOut[i] << '\n'; - } - // check last value to verify - if (hOut[n-1] == 29074.5) { - std::cout << "PASSED!\n"; - } else { - std::cout << "values not expected?\n"; - } - // Release resources. - CUDA_SAFE_CALL(cuMemFree(dX)); - CUDA_SAFE_CALL(cuMemFree(dY)); - CUDA_SAFE_CALL(cuMemFree(dOut)); - CUDA_SAFE_CALL(cuModuleUnload(module)); - CUDA_SAFE_CALL(cuCtxDestroy(context)); - free(cubin); - delete[] hX; - delete[] hY; - delete[] hOut; - return 0; -}