Adding JIT LTO Sample

Rob Nertney 2023-02-07 19:06:38 +00:00
parent 2b689228b7
commit 3d553b2ea1
13 changed files with 1140 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
{
  "configurations": [
    {
      "name": "Linux",
      "includePath": [
        "${workspaceFolder}/**",
        "${workspaceFolder}/../../../Common"
      ],
      "defines": [],
      "compilerPath": "/usr/local/cuda/bin/nvcc",
      "cStandard": "gnu17",
      "cppStandard": "gnu++14",
      "intelliSenseMode": "linux-gcc-x64",
      "configurationProvider": "ms-vscode.makefile-tools"
    }
  ],
  "version": 4
}

View File

@@ -0,0 +1,7 @@
{
  "recommendations": [
    "nvidia.nsight-vscode-edition",
    "ms-vscode.cpptools",
    "ms-vscode.makefile-tools"
  ]
}

View File

@@ -0,0 +1,10 @@
{
  "configurations": [
    {
      "name": "CUDA C++: Launch",
      "type": "cuda-gdb",
      "request": "launch",
      "program": "${workspaceFolder}/jitlto"
    }
  ]
}

View File

@@ -0,0 +1,15 @@
{
  "version": "2.0.0",
  "tasks": [
    {
      "label": "sample",
      "type": "shell",
      "command": "make dbg=1",
      "problemMatcher": ["$nvcc"],
      "group": {
        "kind": "build",
        "isDefault": true
      }
    }
  ]
}

View File

@@ -0,0 +1,404 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
$(info >>> WARNING - jitlto is not supported on ARMv7 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on sbsa
ifeq ($(TARGET_ARCH),sbsa)
$(info >>> WARNING - jitlto is not supported on sbsa - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - jitlto is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# libNVRTC specific libraries
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA
endif
ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
ifdef TARGET_OVERRIDE
CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
endif
endif
ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif
ifeq ($(HOST_ARCH),ppc64le)
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
endif
CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
INCLUDES += -I$(CUDA_PATH)/include
LIBRARIES += -lnvrtc -lnvJitLink
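# If the sample is waived above, prefix each command with an echo so that the
# build and run rules below become no-ops.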
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build

build: jitlto

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

jitlto.o: jitlto.cpp
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

jitlto: jitlto.o
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

run: build
	$(EXEC) ./jitlto

testrun: build

clean:
	rm -f jitlto jitlto.o
	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/jitlto

clobber: clean

View File

@@ -0,0 +1,74 @@
# jitlto - Saxpy with libnvJitLink
## Description
This sample performs a simple SAXPY (a * x + y) computation, compiling the kernel at run time with NVRTC and linking it with nvJitLink using LTO (Link Time Optimization). It has been written for clarity of exposition, to illustrate various CUDA programming principles, rather than to provide the most performant generic SAXPY kernel.
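At a high level the sample compiles each CUDA source string to LTO IR with NVRTC (using `-dlto`), links the pieces at run time with nvJitLink, and loads the resulting cubin through the CUDA Driver API. The sketch below condenses the link-and-load step from `jitlto.cpp` in this commit; error checking is omitted for brevity, and `linkAndLoad` is a helper name used here only for illustration.
```
#include <cuda.h>
#include <nvJitLink.h>
#include <cstdlib>

// Link two already-compiled LTO IR blobs into a cubin and load it as a CUmodule.
// Condensed from jitlto.cpp; assumes cuInit() and a current CUDA context, and
// omits all error checking for brevity.
static CUmodule linkAndLoad(void *ltoIR1, size_t size1,
                            void *ltoIR2, size_t size2,
                            const char *smArch /* e.g. "-arch=sm_80" */)
{
    nvJitLinkHandle handle;
    const char *opts[] = {"-lto", smArch};
    nvJitLinkCreate(&handle, 2, opts);

    nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoIR1, size1, "lto_saxpy");
    nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoIR2, size2, "lto_compute");

    // Link the modules, run LTO across them, and generate a cubin.
    nvJitLinkComplete(handle);

    size_t cubinSize;
    nvJitLinkGetLinkedCubinSize(handle, &cubinSize);
    void *cubin = malloc(cubinSize);
    nvJitLinkGetLinkedCubin(handle, cubin);
    nvJitLinkDestroy(&handle);

    CUmodule module;
    cuModuleLoadData(&module, cubin);  // the image is copied, so it can be freed now
    free(cubin);
    return module;
}
```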
## Key Concepts
CUDA Driver API, Runtime Compilation
## Supported SM Architectures
[SM 5.0](https://developer.nvidia.com/cuda-gpus) [SM 5.2](https://developer.nvidia.com/cuda-gpus) [SM 5.3](https://developer.nvidia.com/cuda-gpus) [SM 6.0](https://developer.nvidia.com/cuda-gpus) [SM 6.1](https://developer.nvidia.com/cuda-gpus) [SM 7.0](https://developer.nvidia.com/cuda-gpus) [SM 7.2](https://developer.nvidia.com/cuda-gpus) [SM 7.5](https://developer.nvidia.com/cuda-gpus) [SM 8.0](https://developer.nvidia.com/cuda-gpus) [SM 8.6](https://developer.nvidia.com/cuda-gpus) [SM 8.7](https://developer.nvidia.com/cuda-gpus) [SM 9.0](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, aarch64
## CUDA APIs involved
### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
cuModuleLoadData, cuModuleGetFunction, cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH, cuLaunchKernel
## Dependencies needed to build/run
[NVRTC](../../../README.md#nvrtc), [NVJITLINK](../../../README.md#nvjitlink)
## Prerequisites
Download and install the [CUDA Toolkit 12.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory. To build/examine all the samples at once, use the complete solution files; to build/examine a single sample, use that sample's individual solution files.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths be properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples' makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@@ -0,0 +1,224 @@
#include <cuda.h>
#include <nvJitLink.h>
#include <nvrtc.h>
#include <iostream>
#include <cstring>
#define NUM_THREADS 128
#define NUM_BLOCKS 32
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
exit(1); \
} \
} while(0)
#define CUDA_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
exit(1); \
} \
} while(0)
#define NVJITLINK_SAFE_CALL(h,x) \
do { \
nvJitLinkResult result = x; \
if (result != NVJITLINK_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< result << '\n'; \
size_t lsize; \
result = nvJitLinkGetErrorLogSize(h, &lsize); \
if (result == NVJITLINK_SUCCESS && lsize > 0) { \
char *log = (char*)malloc(lsize); \
result = nvJitLinkGetErrorLog(h, log); \
if (result == NVJITLINK_SUCCESS) { \
std::cerr << "error log: " << log << '\n'; \
free(log); \
} \
} \
exit(1); \
} \
} while(0)
const char *lto_saxpy = " \n\
extern __device__ float compute(float a, float x, float y); \n\
\n\
extern \"C\" __global__ \n\
void saxpy(float a, float *x, float *y, float *out, size_t n) \n\
{ \n\
size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\
if (tid < n) { \n\
out[tid] = compute(a, x[tid], y[tid]); \n\
} \n\
} \n";
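// compute() is deliberately kept in a separate translation unit (lto_compute below);
// linking the two LTO IR modules with LTO at run time lets the optimizer see across
// the module boundary (for example, to inline the call inside saxpy()).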
const char *lto_compute = " \n\
__device__ float compute(float a, float x, float y) { \n\
return a * x + y; \n\
} \n";
// compile code into LTOIR, returning the IR and its size
static void getLTOIR (const char *code, const char *name,
char **ltoIR, size_t *ltoIRSize)
{
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(
nvrtcCreateProgram(&prog, // prog
code, // buffer
name, // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
// specify that LTO IR should be generated for LTO operation
const char *opts[] = {"-dlto",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
2, // numOptions
opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
// Obtain generated LTO IR from the program.
NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize));
*ltoIR = new char[*ltoIRSize];
NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
}
int main(int argc, char *argv[])
{
char *ltoIR1;
char *ltoIR2;
size_t ltoIR1Size;
size_t ltoIR2Size;
// getLTOIR uses nvrtc to get the LTOIR.
// We could also use nvcc offline with -dlto -fatbin
// to generate the IR, but using nvrtc keeps the build simpler.
getLTOIR(lto_saxpy, "lto_saxpy.cu", &ltoIR1, &ltoIR1Size);
getLTOIR(lto_compute, "lto_compute.cu", &ltoIR2, &ltoIR2Size);
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
// Dynamically determine the arch to link for
int major = 0;
int minor = 0;
CUDA_SAFE_CALL(cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
int arch = major*10 + minor;
char smbuf[16];
memset(smbuf,0,16);
sprintf(smbuf, "-arch=sm_%d", arch);
// Load the generated LTO IR and link them together
nvJitLinkHandle handle;
const char *lopts[] = {"-lto", smbuf};
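// "-lto" performs link-time optimization across the LTO IR modules added below;
// smbuf selects the SASS architecture of the device queried above (e.g. -arch=sm_80).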
NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts));
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
(void *)ltoIR1, ltoIR1Size, "lto_saxpy"));
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
(void *)ltoIR2, ltoIR2Size, "lto_compute"));
// The call to nvJitLinkComplete causes linker to link together the two
// LTO IR modules, do optimization on the linked LTO IR,
// and generate cubin from it.
NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
// check error log
size_t logSize;
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize));
if (logSize > 0) {
char *log = (char*)malloc(logSize+1);
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log));
std::cout << "Error log: " << log << std::endl;
free(log);
}
// get linked cubin
size_t cubinSize;
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
void *cubin = malloc(cubinSize);
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
delete[] ltoIR1;
delete[] ltoIR2;
// cubin is linked, so now load it
CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin));
CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy"));
// Generate input for execution, and create output buffers.
size_t n = NUM_THREADS * NUM_BLOCKS;
size_t bufferSize = n * sizeof(float);
float a = 5.1f;
float *hX = new float[n], *hY = new float[n], *hOut = new float[n];
for (size_t i = 0; i < n; ++i) {
hX[i] = static_cast<float>(i);
hY[i] = static_cast<float>(i * 2);
}
CUdeviceptr dX, dY, dOut;
CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize));
// Execute SAXPY.
void *args[] = { &a, &dX, &dY, &dOut, &n };
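// cuLaunchKernel receives the kernel parameters as an array of pointers to each argument.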
CUDA_SAFE_CALL(
cuLaunchKernel(kernel,
NUM_BLOCKS, 1, 1, // grid dim
NUM_THREADS, 1, 1, // block dim
0, NULL, // shared mem and stream
args, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize());
// Retrieve and print output.
CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));
for (size_t i = 0; i < n; ++i) {
std::cout << a << " * " << hX[i] << " + " << hY[i]
<< " = " << hOut[i] << '\n';
}
// check last value to verify
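// (n = NUM_THREADS * NUM_BLOCKS = 4096, so hOut[n-1] = 5.1f * 4095 + 2 * 4095 = 29074.5)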
if (hOut[n-1] == 29074.5) {
std::cout << "PASSED!\n";
} else {
std::cout << "values not expected?\n";
}
// Release resources.
CUDA_SAFE_CALL(cuMemFree(dX));
CUDA_SAFE_CALL(cuMemFree(dY));
CUDA_SAFE_CALL(cuMemFree(dOut));
CUDA_SAFE_CALL(cuModuleUnload(module));
CUDA_SAFE_CALL(cuCtxDestroy(context));
free(cubin);
delete[] hX;
delete[] hY;
delete[] hOut;
return 0;
}

View File

@@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "jitlto", "jitlto_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -0,0 +1,112 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>jitlto_vs2017</RootNamespace>
<ProjectName>jitlto</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 12.0.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;nvJitLink.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/jitlto.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="jitlto.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 12.0.targets" />
</ImportGroup>
</Project>

View File

@@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "jitlto", "jitlto_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -0,0 +1,108 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>jitlto_vs2019</RootNamespace>
<ProjectName>jitlto</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 12.0.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;nvJitLink.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/jitlto.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="jitlto.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 12.0.targets" />
</ImportGroup>
</Project>

View File

@@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "jitlto", "jitlto_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -0,0 +1,108 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>jitlto_vs2022</RootNamespace>
<ProjectName>jitlto</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 12.0.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;nvJitLink.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/jitlto.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="jitlto.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 12.0.targets" />
</ImportGroup>
</Project>