diff --git a/Samples/4_CUDA_Libraries/jitLto/.vscode/c_cpp_properties.json b/Samples/4_CUDA_Libraries/jitLto/.vscode/c_cpp_properties.json new file mode 100644 index 00000000..f0066b0f --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/../../../Common" + ], + "defines": [], + "compilerPath": "/usr/local/cuda/bin/nvcc", + "cStandard": "gnu17", + "cppStandard": "gnu++14", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.makefile-tools" + } + ], + "version": 4 +} diff --git a/Samples/4_CUDA_Libraries/jitLto/.vscode/extensions.json b/Samples/4_CUDA_Libraries/jitLto/.vscode/extensions.json new file mode 100644 index 00000000..c7eb54dc --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "nvidia.nsight-vscode-edition", + "ms-vscode.cpptools", + "ms-vscode.makefile-tools" + ] +} diff --git a/Samples/4_CUDA_Libraries/jitLto/.vscode/launch.json b/Samples/4_CUDA_Libraries/jitLto/.vscode/launch.json new file mode 100644 index 00000000..6a2037ec --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/.vscode/launch.json @@ -0,0 +1,10 @@ +{ + "configurations": [ + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "program": "${workspaceFolder}/jitlto" + } + ] +} diff --git a/Samples/4_CUDA_Libraries/jitLto/.vscode/tasks.json b/Samples/4_CUDA_Libraries/jitLto/.vscode/tasks.json new file mode 100644 index 00000000..4509aeb1 --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/.vscode/tasks.json @@ -0,0 +1,15 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "sample", + "type": "shell", + "command": "make dbg=1", + "problemMatcher": ["$nvcc"], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git a/Samples/4_CUDA_Libraries/jitLto/Makefile b/Samples/4_CUDA_Libraries/jitLto/Makefile new file mode 100644 index 00000000..9b6a0d17 --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/Makefile @@ -0,0 +1,404 @@ +################################################################################ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += -D_QNX_SOURCE + NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - jitlto is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - jitlto is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - jitlto is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../../Common +LIBRARIES := + +################################################################################ + +# libNVRTC specific libraries +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA +endif + +ifeq ($(TARGET_OS),darwin) + ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA +else + ifeq ($(TARGET_ARCH),x86_64) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif + endif + + ifeq ($(TARGET_ARCH),ppc64le) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs + endif + + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) + ifeq ("$(CUDALIB)","") + $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) + SAMPLE_ENABLED := 0 + else + CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) + LIBRARIES += -L$(CUDALIB) -lcuda + endif +endif + +ALL_CCFLAGS += --threads 0 --std=c++11 + +INCLUDES += -I$(CUDA_PATH)/include + +LIBRARIES += -lnvrtc -lnvJitLink + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: jitlto + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +jitlto.o:jitlto.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +jitlto: jitlto.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./jitlto + +testrun: build + +clean: + rm -f jitlto jitlto.o + rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/jitlto + +clobber: clean diff --git a/Samples/4_CUDA_Libraries/jitLto/README.md b/Samples/4_CUDA_Libraries/jitLto/README.md new file mode 100644 index 00000000..93931706 --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/README.md @@ -0,0 +1,74 @@ +# jitlto - Saxpy with libnvJitLink + +## Description + +This sample does a simple saxpy multiply and add using nvrtc and nvJitLink with LTO (Link Time Optimization). It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for saxpy. + +## Key Concepts + +CUDA Runtime API, Runtime Compilation + +## Supported SM Architectures + +[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html) +cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH, cuLaunchKernel + +## Dependencies needed to build/run +[NVRTC](../../../README.md#nvrtc), [NVJITLINK](../../../README.md#nvjitlink) + +## Prerequisites + +Download and install the [CUDA Toolkit 12.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp b/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp new file mode 100644 index 00000000..4a595132 --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include + +#define NUM_THREADS 128 +#define NUM_BLOCKS 32 + +#define NVRTC_SAFE_CALL(x) \ + do { \ + nvrtcResult result = x; \ + if (result != NVRTC_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " \ + << nvrtcGetErrorString(result) << '\n'; \ + exit(1); \ + } \ + } while(0) +#define CUDA_SAFE_CALL(x) \ + do { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS) { \ + const char *msg; \ + cuGetErrorName(result, &msg); \ + std::cerr << "\nerror: " #x " failed with error " \ + << msg << '\n'; \ + exit(1); \ + } \ + } while(0) +#define NVJITLINK_SAFE_CALL(h,x) \ + do { \ + nvJitLinkResult result = x; \ + if (result != NVJITLINK_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " \ + << result << '\n'; \ + size_t lsize; \ + result = nvJitLinkGetErrorLogSize(h, &lsize); \ + if (result == NVJITLINK_SUCCESS && lsize > 0) { \ + char *log = (char*)malloc(lsize); \ + result = nvJitLinkGetErrorLog(h, log); \ + if (result == NVJITLINK_SUCCESS) { \ + std::cerr << "error log: " << log << '\n'; \ + free(log); \ + } \ + } \ + exit(1); \ + } \ + } while(0) + +const char *lto_saxpy = " \n\ +extern __device__ float compute(float a, float x, float y); \n\ + \n\ +extern \"C\" __global__ \n\ +void saxpy(float a, float *x, float *y, float *out, size_t n) \n\ +{ \n\ + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\ + if (tid < n) { \n\ + out[tid] = compute(a, x[tid], y[tid]); \n\ + } \n\ +} \n"; + +const char *lto_compute = " \n\ +__device__ float compute(float a, float x, float y) { \n\ + return a * x + y; \n\ +} \n"; + +// compile code into LTOIR, returning the IR and its size +static void getLTOIR (const char *code, const char *name, + char **ltoIR, size_t *ltoIRSize) +{ + // Create an instance of nvrtcProgram with the code string. + nvrtcProgram prog; + NVRTC_SAFE_CALL( + nvrtcCreateProgram(&prog, // prog + code, // buffer + name, // name + 0, // numHeaders + NULL, // headers + NULL)); // includeNames + + // specify that LTO IR should be generated for LTO operation + const char *opts[] = {"-dlto", + "--relocatable-device-code=true"}; + nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog + 2, // numOptions + opts); // options + // Obtain compilation log from the program. + size_t logSize; + NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); + char *log = new char[logSize]; + NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); + std::cout << log << '\n'; + delete[] log; + if (compileResult != NVRTC_SUCCESS) { + exit(1); + } + // Obtain generated LTO IR from the program. + NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize)); + *ltoIR = new char[*ltoIRSize]; + NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR)); + // Destroy the program. + NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); +} + +int main(int argc, char *argv[]) +{ + char *ltoIR1; + char *ltoIR2; + size_t ltoIR1Size; + size_t ltoIR2Size; + // getLTOIR uses nvrtc to get the LTOIR. + // We could also use nvcc offline with -dlto -fatbin + // to generate the IR, but using nvrtc keeps the build simpler. + getLTOIR(lto_saxpy, "lto_saxpy.cu", <oIR1, <oIR1Size); + getLTOIR(lto_compute, "lto_compute.cu", <oIR2, <oIR2Size); + + CUdevice cuDevice; + CUcontext context; + CUmodule module; + CUfunction kernel; + CUDA_SAFE_CALL(cuInit(0)); + CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0)); + CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice)); + + // Dynamically determine the arch to link for + int major = 0; + int minor = 0; + CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + int arch = major*10 + minor; + char smbuf[16]; + memset(smbuf,0,16); + sprintf(smbuf, "-arch=sm_%d", arch); + + // Load the generated LTO IR and link them together + nvJitLinkHandle handle; + const char *lopts[] = {"-lto", smbuf}; + NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts)); + + NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, + (void *)ltoIR1, ltoIR1Size, "lto_saxpy")); + NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, + (void *)ltoIR2, ltoIR2Size, "lto_compute")); + + // The call to nvJitLinkComplete causes linker to link together the two + // LTO IR modules, do optimization on the linked LTO IR, + // and generate cubin from it. + NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle)); + + // check error log + size_t logSize; + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize)); + if (logSize > 0) { + char *log = (char*)malloc(logSize+1); + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log)); + std::cout << "Error log: " << log << std::endl; + free(log); + } + + // get linked cubin + size_t cubinSize; + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize)); + void *cubin = malloc(cubinSize); + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin)); + + NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle)); + delete[] ltoIR1; + delete[] ltoIR2; + + // cubin is linked, so now load it + CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin)); + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy")); + + // Generate input for execution, and create output buffers. + size_t n = NUM_THREADS * NUM_BLOCKS; + size_t bufferSize = n * sizeof(float); + float a = 5.1f; + float *hX = new float[n], *hY = new float[n], *hOut = new float[n]; + for (size_t i = 0; i < n; ++i) { + hX[i] = static_cast(i); + hY[i] = static_cast(i * 2); + } + CUdeviceptr dX, dY, dOut; + CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize)); + CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize)); + CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize)); + CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize)); + CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize)); + // Execute SAXPY. + void *args[] = { &a, &dX, &dY, &dOut, &n }; + CUDA_SAFE_CALL( + cuLaunchKernel(kernel, + NUM_BLOCKS, 1, 1, // grid dim + NUM_THREADS, 1, 1, // block dim + 0, NULL, // shared mem and stream + args, 0)); // arguments + CUDA_SAFE_CALL(cuCtxSynchronize()); + // Retrieve and print output. + CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize)); + + for (size_t i = 0; i < n; ++i) { + std::cout << a << " * " << hX[i] << " + " << hY[i] + << " = " << hOut[i] << '\n'; + } + // check last value to verify + if (hOut[n-1] == 29074.5) { + std::cout << "PASSED!\n"; + } else { + std::cout << "values not expected?\n"; + } + // Release resources. + CUDA_SAFE_CALL(cuMemFree(dX)); + CUDA_SAFE_CALL(cuMemFree(dY)); + CUDA_SAFE_CALL(cuMemFree(dOut)); + CUDA_SAFE_CALL(cuModuleUnload(module)); + CUDA_SAFE_CALL(cuCtxDestroy(context)); + free(cubin); + delete[] hX; + delete[] hY; + delete[] hOut; + return 0; +} diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2017.sln b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2017.sln new file mode 100644 index 00000000..1f0647de --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "jitlto", "jitlto_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2017.vcxproj b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2017.vcxproj new file mode 100644 index 00000000..6cb1f992 --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + jitlto_vs2017 + jitlto + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + cuda.lib;nvrtc.lib;nvJitLink.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/jitlto.exe + + + + -Xcompiler "/wd 4819" --threads 0 + ./;../../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2019.sln b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2019.sln new file mode 100644 index 00000000..4c318aa0 --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "jitlto", "jitlto_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2019.vcxproj b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2019.vcxproj new file mode 100644 index 00000000..e085457c --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + jitlto_vs2019 + jitlto + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + cuda.lib;nvrtc.lib;nvJitLink.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/jitlto.exe + + + + -Xcompiler "/wd 4819" --threads 0 + ./;../../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2022.sln b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2022.sln new file mode 100644 index 00000000..5fb1e9dd --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2022.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2022 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "jitlto", "jitlto_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2022.vcxproj b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2022.vcxproj new file mode 100644 index 00000000..6e231842 --- /dev/null +++ b/Samples/4_CUDA_Libraries/jitLto/jitlto_vs2022.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + jitlto_vs2022 + jitlto + + + + + Application + MultiByte + v143 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + + + Console + cuda.lib;nvrtc.lib;nvJitLink.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/jitlto.exe + + + + -Xcompiler "/wd 4819" --threads 0 + ./;../../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + +