Add and update samples with CUDA 10.1 support
|
@ -282,6 +282,42 @@ static const char *_cudaGetErrorEnum(curandStatus_t error) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef NVJPEGAPI
|
||||||
|
// nvJPEG API errors
|
||||||
|
static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
|
||||||
|
switch (error) {
|
||||||
|
case NVJPEG_STATUS_SUCCESS:
|
||||||
|
return "NVJPEG_STATUS_SUCCESS";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_NOT_INITIALIZED:
|
||||||
|
return "NVJPEG_STATUS_NOT_INITIALIZED";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_INVALID_PARAMETER:
|
||||||
|
return "NVJPEG_STATUS_INVALID_PARAMETER";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_BAD_JPEG:
|
||||||
|
return "NVJPEG_STATUS_BAD_JPEG";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
|
||||||
|
return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_ALLOCATOR_FAILURE:
|
||||||
|
return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_EXECUTION_FAILED:
|
||||||
|
return "NVJPEG_STATUS_EXECUTION_FAILED";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_ARCH_MISMATCH:
|
||||||
|
return "NVJPEG_STATUS_ARCH_MISMATCH";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_INTERNAL_ERROR:
|
||||||
|
return "NVJPEG_STATUS_INTERNAL_ERROR";
|
||||||
|
}
|
||||||
|
|
||||||
|
return "<unknown>";
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NV_NPPIDEFS_H
|
#ifdef NV_NPPIDEFS_H
|
||||||
// NPP API errors
|
// NPP API errors
|
||||||
static const char *_cudaGetErrorEnum(NppStatus error) {
|
static const char *_cudaGetErrorEnum(NppStatus error) {
|
||||||
|
|
178
Common/helper_multiprocess.cpp
Normal file
|
@ -0,0 +1,178 @@
|
||||||
|
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "helper_multiprocess.h"
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
info->size = sz;
|
||||||
|
info->shmHandle = CreateFileMapping(INVALID_HANDLE_VALUE,
|
||||||
|
NULL,
|
||||||
|
PAGE_READWRITE,
|
||||||
|
0,
|
||||||
|
(DWORD)sz,
|
||||||
|
name);
|
||||||
|
if (info->shmHandle == 0) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
int status = 0;
|
||||||
|
|
||||||
|
info->size = sz;
|
||||||
|
|
||||||
|
info->shmFd = shm_open(name, O_RDWR | O_CREAT, 0777);
|
||||||
|
if (info->shmFd < 0) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
status = ftruncate(info->shmFd, sz);
|
||||||
|
if (status != 0) {
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
info->size = sz;
|
||||||
|
|
||||||
|
info->shmHandle = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, name);
|
||||||
|
if (info->shmHandle == 0) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
info->size = sz;
|
||||||
|
|
||||||
|
info->shmFd = shm_open(name, O_RDWR, 0777);
|
||||||
|
if (info->shmFd < 0) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void sharedMemoryClose(sharedMemoryInfo *info)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
if (info->addr) {
|
||||||
|
UnmapViewOfFile(info->addr);
|
||||||
|
}
|
||||||
|
if (info->shmHandle) {
|
||||||
|
CloseHandle(info->shmHandle);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (info->addr) {
|
||||||
|
munmap(info->addr, info->size);
|
||||||
|
}
|
||||||
|
if (info->shmFd) {
|
||||||
|
close(info->shmFd);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int spawnProcess(Process *process, const char *app, char * const *args)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
STARTUPINFO si = {0};
|
||||||
|
BOOL status;
|
||||||
|
size_t arglen = 0;
|
||||||
|
size_t argIdx = 0;
|
||||||
|
std::string arg_string;
|
||||||
|
memset(process, 0, sizeof(*process));
|
||||||
|
|
||||||
|
while (*args) {
|
||||||
|
arg_string.append(*args).append(1, ' ');
|
||||||
|
args++;
|
||||||
|
}
|
||||||
|
|
||||||
|
status = CreateProcess(app, LPSTR(arg_string.c_str()), NULL, NULL, FALSE, 0, NULL, NULL, &si, process);
|
||||||
|
|
||||||
|
return status ? 0 : GetLastError();
|
||||||
|
#else
|
||||||
|
*process = fork();
|
||||||
|
if (*process == 0) {
|
||||||
|
if (0 > execvp(app, args)) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (*process < 0) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int waitProcess(Process *process)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
DWORD exitCode;
|
||||||
|
WaitForSingleObject(process->hProcess, INFINITE);
|
||||||
|
GetExitCodeProcess(process->hProcess, &exitCode);
|
||||||
|
CloseHandle(process->hProcess);
|
||||||
|
CloseHandle(process->hThread);
|
||||||
|
return (int)exitCode;
|
||||||
|
#else
|
||||||
|
int status = 0;
|
||||||
|
do {
|
||||||
|
if (0 > waitpid(*process, &status, 0)) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
} while (!WIFEXITED(status));
|
||||||
|
return WEXITSTATUS(status);
|
||||||
|
#endif
|
||||||
|
}
|
71
Common/helper_multiprocess.h
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef HELPER_MULTIPROCESS_H
|
||||||
|
#define HELPER_MULTIPROCESS_H
|
||||||
|
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
#ifndef WIN32_LEAN_AND_MEAN
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct sharedMemoryInfo_st {
|
||||||
|
void *addr;
|
||||||
|
size_t size;
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
HANDLE shmHandle;
|
||||||
|
#else
|
||||||
|
int shmFd;
|
||||||
|
#endif
|
||||||
|
} sharedMemoryInfo;
|
||||||
|
|
||||||
|
int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info);
|
||||||
|
|
||||||
|
int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info);
|
||||||
|
|
||||||
|
void sharedMemoryClose(sharedMemoryInfo *info);
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
typedef PROCESS_INFORMATION Process;
|
||||||
|
#else
|
||||||
|
typedef pid_t Process;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int spawnProcess(Process *process, const char *app, char * const *args);
|
||||||
|
|
||||||
|
int waitProcess(Process *process);
|
||||||
|
|
||||||
|
#endif // HELPER_MULTIPROCESS_H
|
|
@ -234,6 +234,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
SAMPLE_ENABLED := 1
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
# This sample is not supported on QNX
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
$(info >>> WARNING - UnifiedMemoryPerf is not supported on QNX - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
ALL_LDFLAGS :=
|
ALL_LDFLAGS :=
|
||||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
@ -246,7 +252,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -52,6 +52,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Systems Integration, Unified Memory, CUDA Streams and Events, Pinned System
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -105,6 +105,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -105,6 +105,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -105,6 +105,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
304
Samples/bandwidthTest/Makefile
Normal file
|
@ -0,0 +1,304 @@
|
||||||
|
################################################################################
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions
|
||||||
|
# are met:
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived
|
||||||
|
# from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
#
|
||||||
|
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Location of the CUDA Toolkit
|
||||||
|
CUDA_PATH ?= /usr/local/cuda
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# start deprecated interface #
|
||||||
|
##############################
|
||||||
|
ifeq ($(x86_64),1)
|
||||||
|
$(info WARNING - x86_64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||||
|
TARGET_ARCH ?= x86_64
|
||||||
|
endif
|
||||||
|
ifeq ($(ARMv7),1)
|
||||||
|
$(info WARNING - ARMv7 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||||
|
TARGET_ARCH ?= armv7l
|
||||||
|
endif
|
||||||
|
ifeq ($(aarch64),1)
|
||||||
|
$(info WARNING - aarch64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||||
|
TARGET_ARCH ?= aarch64
|
||||||
|
endif
|
||||||
|
ifeq ($(ppc64le),1)
|
||||||
|
$(info WARNING - ppc64le variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||||
|
TARGET_ARCH ?= ppc64le
|
||||||
|
endif
|
||||||
|
ifneq ($(GCC),)
|
||||||
|
$(info WARNING - GCC variable has been deprecated)
|
||||||
|
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||||
|
HOST_COMPILER ?= $(GCC)
|
||||||
|
endif
|
||||||
|
ifneq ($(abi),)
|
||||||
|
$(error ERROR - abi variable has been removed)
|
||||||
|
endif
|
||||||
|
############################
|
||||||
|
# end deprecated interface #
|
||||||
|
############################
|
||||||
|
|
||||||
|
# architecture
|
||||||
|
HOST_ARCH := $(shell uname -m)
|
||||||
|
TARGET_ARCH ?= $(HOST_ARCH)
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
|
||||||
|
TARGET_SIZE := 64
|
||||||
|
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||||
|
TARGET_SIZE := 32
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||||
|
endif
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
|
||||||
|
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||||
|
TARGET_ARCH = armv7l
|
||||||
|
endif
|
||||||
|
|
||||||
|
# operating system
|
||||||
|
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||||
|
TARGET_OS ?= $(HOST_OS)
|
||||||
|
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||||
|
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# host compiler
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||||
|
HOST_COMPILER ?= clang++
|
||||||
|
endif
|
||||||
|
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||||
|
ifeq ($(TARGET_OS),linux)
|
||||||
|
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
ifeq ($(TARGET_OS), linux)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
|
||||||
|
else ifeq ($(TARGET_OS), android)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
HOST_COMPILER ?= g++
|
||||||
|
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||||
|
|
||||||
|
# internal flags
|
||||||
|
NVCCFLAGS := -m${TARGET_SIZE}
|
||||||
|
CCFLAGS :=
|
||||||
|
LDFLAGS :=
|
||||||
|
|
||||||
|
# build flags
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||||
|
CCFLAGS += -arch $(HOST_ARCH)
|
||||||
|
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||||
|
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||||
|
CCFLAGS += -mfloat-abi=hard
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
LDFLAGS += -pie
|
||||||
|
CCFLAGS += -fpie -fpic -fexceptions
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||||
|
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
CCFLAGS += -DWIN_INTERFACE_CUSTOM
|
||||||
|
LDFLAGS += -lsocket
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Install directory of different arch
|
||||||
|
CUDA_INSTALL_TARGET_DIR :=
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debug build flags
|
||||||
|
ifeq ($(dbg),1)
|
||||||
|
NVCCFLAGS += -g -G
|
||||||
|
BUILD_TYPE := debug
|
||||||
|
else
|
||||||
|
BUILD_TYPE := release
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS :=
|
||||||
|
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
ALL_LDFLAGS :=
|
||||||
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||||
|
|
||||||
|
# Common includes and paths for CUDA
|
||||||
|
INCLUDES := -I../../Common
|
||||||
|
LIBRARIES :=
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SMS),)
|
||||||
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(GENCODE_FLAGS),)
|
||||||
|
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||||
|
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||||
|
|
||||||
|
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||||
|
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||||
|
ifneq ($(HIGHEST_SM),)
|
||||||
|
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
EXEC ?= @echo "[@]"
|
||||||
|
endif
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Target rules
|
||||||
|
all: build
|
||||||
|
|
||||||
|
build: bandwidthTest
|
||||||
|
|
||||||
|
check.deps:
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
@echo "Sample will be waived due to the above missing dependencies"
|
||||||
|
else
|
||||||
|
@echo "Sample is ready - all dependencies have been met"
|
||||||
|
endif
|
||||||
|
|
||||||
|
bandwidthTest.o:bandwidthTest.cu
|
||||||
|
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||||
|
|
||||||
|
bandwidthTest: bandwidthTest.o
|
||||||
|
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||||
|
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
|
||||||
|
run: build
|
||||||
|
$(EXEC) ./bandwidthTest
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f bandwidthTest bandwidthTest.o
|
||||||
|
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/bandwidthTest
|
||||||
|
|
||||||
|
clobber: clean
|
79
Samples/bandwidthTest/NsightEclipse.xml
Normal file
|
@ -0,0 +1,79 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||||
|
<entry>
|
||||||
|
<name>bandwidthTest</name>
|
||||||
|
<cuda_api_list>
|
||||||
|
<toolkit>cudaSetDevice</toolkit>
|
||||||
|
<toolkit>cudaHostAlloc</toolkit>
|
||||||
|
<toolkit>cudaFree</toolkit>
|
||||||
|
<toolkit>cudaMallocHost</toolkit>
|
||||||
|
<toolkit>cudaFreeHost</toolkit>
|
||||||
|
<toolkit>cudaMemcpy</toolkit>
|
||||||
|
<toolkit>cudaMemcpyAsync</toolkit>
|
||||||
|
<toolkit>cudaEventCreate</toolkit>
|
||||||
|
<toolkit>cudaEventRecord</toolkit>
|
||||||
|
<toolkit>cudaEventDestroy</toolkit>
|
||||||
|
<toolkit>cudaDeviceSynchronize</toolkit>
|
||||||
|
<toolkit>cudaEventElapsedTime</toolkit>
|
||||||
|
</cuda_api_list>
|
||||||
|
<description><![CDATA[This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory.]]></description>
|
||||||
|
<devicecompilation>whole</devicecompilation>
|
||||||
|
<includepaths>
|
||||||
|
<path>./</path>
|
||||||
|
<path>../</path>
|
||||||
|
<path>../../common/inc</path>
|
||||||
|
</includepaths>
|
||||||
|
<keyconcepts>
|
||||||
|
<concept level="basic">CUDA Streams and Events</concept>
|
||||||
|
<concept level="basic">Performance Strategies</concept>
|
||||||
|
</keyconcepts>
|
||||||
|
<keywords>
|
||||||
|
<keyword>GPGPU</keyword>
|
||||||
|
<keyword>bandwidth</keyword>
|
||||||
|
</keywords>
|
||||||
|
<libraries>
|
||||||
|
</libraries>
|
||||||
|
<librarypaths>
|
||||||
|
</librarypaths>
|
||||||
|
<nsight_eclipse>true</nsight_eclipse>
|
||||||
|
<primary_file>bandwidthTest.cu</primary_file>
|
||||||
|
<scopes>
|
||||||
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
|
<scope>1:Performance Strategies</scope>
|
||||||
|
</scopes>
|
||||||
|
<sm-arch>sm30</sm-arch>
|
||||||
|
<sm-arch>sm35</sm-arch>
|
||||||
|
<sm-arch>sm37</sm-arch>
|
||||||
|
<sm-arch>sm50</sm-arch>
|
||||||
|
<sm-arch>sm52</sm-arch>
|
||||||
|
<sm-arch>sm60</sm-arch>
|
||||||
|
<sm-arch>sm61</sm-arch>
|
||||||
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
|
<sm-arch>sm75</sm-arch>
|
||||||
|
<supported_envs>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<platform>windows7</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>macosx</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>arm</arch>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>ppc64le</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
</supported_envs>
|
||||||
|
<supported_sm_architectures>
|
||||||
|
<include>all</include>
|
||||||
|
</supported_sm_architectures>
|
||||||
|
<title>Bandwidth Test</title>
|
||||||
|
<type>exe</type>
|
||||||
|
</entry>
|
94
Samples/bandwidthTest/README.md
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
# bandwidthTest - Bandwidth Test
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory.
|
||||||
|
|
||||||
|
## Key Concepts
|
||||||
|
|
||||||
|
CUDA Streams and Events, Performance Strategies
|
||||||
|
|
||||||
|
## Supported SM Architectures
|
||||||
|
|
||||||
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
|
## Supported OSes
|
||||||
|
|
||||||
|
Linux, Windows, MacOSX
|
||||||
|
|
||||||
|
## Supported CPU Architecture
|
||||||
|
|
||||||
|
x86_64, ppc64le, armv7l
|
||||||
|
|
||||||
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
|
||||||
|
cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy, cudaMemcpyAsync, cudaEventCreate, cudaEventRecord, cudaEventDestroy, cudaDeviceSynchronize, cudaEventElapsedTime
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
|
## Build and Run
|
||||||
|
|
||||||
|
### Windows
|
||||||
|
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
|
||||||
|
```
|
||||||
|
*_vs<version>.sln - for Visual Studio <version>
|
||||||
|
```
|
||||||
|
Each individual sample has its own set of solution files in its directory:
|
||||||
|
|
||||||
|
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
|
||||||
|
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
|
||||||
|
|
||||||
|
### Linux
|
||||||
|
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
|
||||||
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
|
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
|
||||||
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
|
||||||
|
```
|
||||||
|
$ make SMS="50 60"
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=g++
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mac
|
||||||
|
The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
|
||||||
|
```
|
||||||
|
$ make SMS="A B ..."
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=clang
|
||||||
|
```
|
||||||
|
|
||||||
|
## References (for more details)
|
||||||
|
|
969
Samples/bandwidthTest/bandwidthTest.cu
Normal file
|
@ -0,0 +1,969 @@
|
||||||
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is a simple test program to measure the memcopy bandwidth of the GPU.
|
||||||
|
* It can measure device to device copy bandwidth, host to device copy bandwidth
|
||||||
|
* for pageable and pinned memory, and device to host copy bandwidth for
|
||||||
|
* pageable and pinned memory.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* ./bandwidthTest [option]...
|
||||||
|
*/
|
||||||
|
|
||||||
|
// CUDA runtime
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
// includes
|
||||||
|
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
|
||||||
|
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
|
||||||
|
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
static const char *sSDKsample = "CUDA Bandwidth Test";
|
||||||
|
|
||||||
|
// defines, project
|
||||||
|
#define MEMCOPY_ITERATIONS 100
|
||||||
|
#define DEFAULT_SIZE (32 * (1e6)) // 32 M
|
||||||
|
#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M
|
||||||
|
#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M
|
||||||
|
|
||||||
|
// shmoo mode defines
|
||||||
|
#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M
|
||||||
|
#define SHMOO_MEMSIZE_START (1e3) // 1 KB
|
||||||
|
#define SHMOO_INCREMENT_1KB (1e3) // 1 KB
|
||||||
|
#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB
|
||||||
|
#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB
|
||||||
|
#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB
|
||||||
|
#define SHMOO_INCREMENT_1MB (1e6) // 1 MB
|
||||||
|
#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB
|
||||||
|
#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB
|
||||||
|
#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB
|
||||||
|
#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB
|
||||||
|
#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB
|
||||||
|
#define SHMOO_LIMIT_1MB (1e6) // 1 MB
|
||||||
|
#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB
|
||||||
|
#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB
|
||||||
|
|
||||||
|
// CPU cache flush
|
||||||
|
#define FLUSH_SIZE (256 * 1024 * 1024)
|
||||||
|
char *flush_buf;
|
||||||
|
|
||||||
|
// enums, project
|
||||||
|
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
|
||||||
|
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
|
||||||
|
enum printMode { USER_READABLE, CSV };
|
||||||
|
enum memoryMode { PINNED, PAGEABLE };
|
||||||
|
|
||||||
|
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device",
|
||||||
|
"Device to Device", NULL};
|
||||||
|
|
||||||
|
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};
|
||||||
|
|
||||||
|
// if true, use CPU based timing for everything
|
||||||
|
static bool bDontUseGPUTiming;
|
||||||
|
|
||||||
|
int *pArgc = NULL;
|
||||||
|
char **pArgv = NULL;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// declaration, forward
|
||||||
|
int runTest(const int argc, const char **argv);
|
||||||
|
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
|
||||||
|
testMode mode, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice, bool wc);
|
||||||
|
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc);
|
||||||
|
void testBandwidthRange(unsigned int start, unsigned int end,
|
||||||
|
unsigned int increment, memcpyKind kind,
|
||||||
|
printMode printmode, memoryMode memMode,
|
||||||
|
int startDevice, int endDevice, bool wc);
|
||||||
|
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc);
|
||||||
|
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
|
||||||
|
bool wc);
|
||||||
|
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
|
||||||
|
bool wc);
|
||||||
|
float testDeviceToDeviceTransfer(unsigned int memSize);
|
||||||
|
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
|
||||||
|
unsigned int count, memcpyKind kind,
|
||||||
|
memoryMode memMode, int iNumDevs, bool wc);
|
||||||
|
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
|
||||||
|
unsigned int count, memcpyKind kind, memoryMode memMode,
|
||||||
|
int iNumDevs, bool wc);
|
||||||
|
void printHelp(void);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Program main
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
pArgc = &argc;
|
||||||
|
pArgv = argv;
|
||||||
|
|
||||||
|
flush_buf = (char *)malloc(FLUSH_SIZE);
|
||||||
|
|
||||||
|
// set logfile name and start logs
|
||||||
|
printf("[%s] - Starting...\n", sSDKsample);
|
||||||
|
|
||||||
|
int iRetVal = runTest(argc, (const char **)argv);
|
||||||
|
|
||||||
|
if (iRetVal < 0) {
|
||||||
|
checkCudaErrors(cudaSetDevice(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
// finish
|
||||||
|
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
|
||||||
|
|
||||||
|
printf(
|
||||||
|
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||||
|
"Results may vary when GPU Boost is enabled.\n");
|
||||||
|
|
||||||
|
free(flush_buf);
|
||||||
|
|
||||||
|
exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Parse args, run the appropriate tests
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
int runTest(const int argc, const char **argv) {
|
||||||
|
int start = DEFAULT_SIZE;
|
||||||
|
int end = DEFAULT_SIZE;
|
||||||
|
int startDevice = 0;
|
||||||
|
int endDevice = 0;
|
||||||
|
int increment = DEFAULT_INCREMENT;
|
||||||
|
testMode mode = QUICK_MODE;
|
||||||
|
bool htod = false;
|
||||||
|
bool dtoh = false;
|
||||||
|
bool dtod = false;
|
||||||
|
bool wc = false;
|
||||||
|
char *modeStr;
|
||||||
|
char *device = NULL;
|
||||||
|
printMode printmode = USER_READABLE;
|
||||||
|
char *memModeStr = NULL;
|
||||||
|
memoryMode memMode = PINNED;
|
||||||
|
|
||||||
|
// process command line args
|
||||||
|
if (checkCmdLineFlag(argc, argv, "help")) {
|
||||||
|
printHelp();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "csv")) {
|
||||||
|
printmode = CSV;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
|
||||||
|
if (strcmp(memModeStr, "pageable") == 0) {
|
||||||
|
memMode = PAGEABLE;
|
||||||
|
} else if (strcmp(memModeStr, "pinned") == 0) {
|
||||||
|
memMode = PINNED;
|
||||||
|
} else {
|
||||||
|
printf("Invalid memory mode - valid modes are pageable or pinned\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -1000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// default - pinned memory
|
||||||
|
memMode = PINNED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (getCmdLineArgumentString(argc, argv, "device", &device)) {
|
||||||
|
int deviceCount;
|
||||||
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
||||||
|
|
||||||
|
if (error_id != cudaSuccess) {
|
||||||
|
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
|
||||||
|
cudaGetErrorString(error_id));
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (deviceCount == 0) {
|
||||||
|
printf("!!!!!No devices found!!!!!\n");
|
||||||
|
return -2000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (strcmp(device, "all") == 0) {
|
||||||
|
printf(
|
||||||
|
"\n!!!!!Cumulative Bandwidth to be computed from all the devices "
|
||||||
|
"!!!!!!\n\n");
|
||||||
|
startDevice = 0;
|
||||||
|
endDevice = deviceCount - 1;
|
||||||
|
} else {
|
||||||
|
startDevice = endDevice = atoi(device);
|
||||||
|
|
||||||
|
if (startDevice >= deviceCount || startDevice < 0) {
|
||||||
|
printf(
|
||||||
|
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
|
||||||
|
"used !!!!!\n",
|
||||||
|
startDevice, 0);
|
||||||
|
startDevice = endDevice = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Running on...\n\n");
|
||||||
|
|
||||||
|
for (int currentDevice = startDevice; currentDevice <= endDevice;
|
||||||
|
currentDevice++) {
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
|
||||||
|
|
||||||
|
if (error_id == cudaSuccess) {
|
||||||
|
printf(" Device %d: %s\n", currentDevice, deviceProp.name);
|
||||||
|
|
||||||
|
if (deviceProp.computeMode == cudaComputeModeProhibited) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"Error: device is running in <Compute Mode Prohibited>, no "
|
||||||
|
"threads can use ::cudaSetDevice().\n");
|
||||||
|
checkCudaErrors(cudaSetDevice(currentDevice));
|
||||||
|
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
|
||||||
|
cudaGetErrorString(error_id));
|
||||||
|
checkCudaErrors(cudaSetDevice(currentDevice));
|
||||||
|
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
|
||||||
|
// figure out the mode
|
||||||
|
if (strcmp(modeStr, "quick") == 0) {
|
||||||
|
printf(" Quick Mode\n\n");
|
||||||
|
mode = QUICK_MODE;
|
||||||
|
} else if (strcmp(modeStr, "shmoo") == 0) {
|
||||||
|
printf(" Shmoo Mode\n\n");
|
||||||
|
mode = SHMOO_MODE;
|
||||||
|
} else if (strcmp(modeStr, "range") == 0) {
|
||||||
|
printf(" Range Mode\n\n");
|
||||||
|
mode = RANGE_MODE;
|
||||||
|
} else {
|
||||||
|
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -3000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// default mode - quick
|
||||||
|
printf(" Quick Mode\n\n");
|
||||||
|
mode = QUICK_MODE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "htod")) {
|
||||||
|
htod = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "dtoh")) {
|
||||||
|
dtoh = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "dtod")) {
|
||||||
|
dtod = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if CUDART_VERSION >= 2020
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "wc")) {
|
||||||
|
wc = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "cputiming")) {
|
||||||
|
bDontUseGPUTiming = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!htod && !dtoh && !dtod) {
|
||||||
|
// default: All
|
||||||
|
htod = true;
|
||||||
|
dtoh = true;
|
||||||
|
dtod = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (RANGE_MODE == mode) {
|
||||||
|
if (checkCmdLineFlag(argc, (const char **)argv, "start")) {
|
||||||
|
start = getCmdLineArgumentInt(argc, argv, "start");
|
||||||
|
|
||||||
|
if (start <= 0) {
|
||||||
|
printf("Illegal argument - start must be greater than zero\n");
|
||||||
|
return -4000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("Must specify a starting size in range mode\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -5000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, (const char **)argv, "end")) {
|
||||||
|
end = getCmdLineArgumentInt(argc, argv, "end");
|
||||||
|
|
||||||
|
if (end <= 0) {
|
||||||
|
printf("Illegal argument - end must be greater than zero\n");
|
||||||
|
return -6000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (start > end) {
|
||||||
|
printf("Illegal argument - start is greater than end\n");
|
||||||
|
return -7000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("Must specify an end size in range mode.\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -8000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "increment")) {
|
||||||
|
increment = getCmdLineArgumentInt(argc, argv, "increment");
|
||||||
|
|
||||||
|
if (increment <= 0) {
|
||||||
|
printf("Illegal argument - increment must be greater than zero\n");
|
||||||
|
return -9000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("Must specify an increment in user mode\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -10000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (htod) {
|
||||||
|
testBandwidth((unsigned int)start, (unsigned int)end,
|
||||||
|
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
|
||||||
|
memMode, startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dtoh) {
|
||||||
|
testBandwidth((unsigned int)start, (unsigned int)end,
|
||||||
|
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
|
||||||
|
memMode, startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dtod) {
|
||||||
|
testBandwidth((unsigned int)start, (unsigned int)end,
|
||||||
|
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
|
||||||
|
memMode, startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that we reset all CUDA Devices in question
|
||||||
|
for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
|
||||||
|
cudaSetDevice(nDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Run a bandwidth test
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
|
||||||
|
testMode mode, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc) {
|
||||||
|
switch (mode) {
|
||||||
|
case QUICK_MODE:
|
||||||
|
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
|
||||||
|
endDevice, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RANGE_MODE:
|
||||||
|
testBandwidthRange(start, end, increment, kind, printmode, memMode,
|
||||||
|
startDevice, endDevice, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SHMOO_MODE:
|
||||||
|
testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Run a quick mode bandwidth test
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc) {
|
||||||
|
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode,
|
||||||
|
startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////
|
||||||
|
// Run a range mode bandwidth test
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidthRange(unsigned int start, unsigned int end,
|
||||||
|
unsigned int increment, memcpyKind kind,
|
||||||
|
printMode printmode, memoryMode memMode,
|
||||||
|
int startDevice, int endDevice, bool wc) {
|
||||||
|
// count the number of copies we're going to run
|
||||||
|
unsigned int count = 1 + ((end - start) / increment);
|
||||||
|
|
||||||
|
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
|
||||||
|
double *bandwidths = (double *)malloc(count * sizeof(double));
|
||||||
|
|
||||||
|
// Before calculating the cumulative bandwidth, initialize bandwidths array to
|
||||||
|
// NULL
|
||||||
|
for (unsigned int i = 0; i < count; i++) {
|
||||||
|
bandwidths[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the device asked by the user
|
||||||
|
for (int currentDevice = startDevice; currentDevice <= endDevice;
|
||||||
|
currentDevice++) {
|
||||||
|
cudaSetDevice(currentDevice);
|
||||||
|
|
||||||
|
// run each of the copies
|
||||||
|
for (unsigned int i = 0; i < count; i++) {
|
||||||
|
memSizes[i] = start + i * increment;
|
||||||
|
|
||||||
|
switch (kind) {
|
||||||
|
case DEVICE_TO_HOST:
|
||||||
|
bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case HOST_TO_DEVICE:
|
||||||
|
bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DEVICE_TO_DEVICE:
|
||||||
|
bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // Complete the bandwidth computation on all the devices
|
||||||
|
|
||||||
|
// print results
|
||||||
|
if (printmode == CSV) {
|
||||||
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
} else {
|
||||||
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
free(memSizes);
|
||||||
|
free(bandwidths);
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Intense shmoo mode - covers a large range of values with varying increments
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc) {
|
||||||
|
// count the number of copies to make
|
||||||
|
unsigned int count =
|
||||||
|
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
|
||||||
|
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
|
||||||
|
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
|
||||||
|
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
|
||||||
|
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
|
||||||
|
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
|
||||||
|
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
|
||||||
|
|
||||||
|
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
|
||||||
|
double *bandwidths = (double *)malloc(count * sizeof(double));
|
||||||
|
|
||||||
|
// Before calculating the cumulative bandwidth, initialize bandwidths array to
|
||||||
|
// NULL
|
||||||
|
for (unsigned int i = 0; i < count; i++) {
|
||||||
|
bandwidths[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the device asked by the user
|
||||||
|
for (int currentDevice = startDevice; currentDevice <= endDevice;
|
||||||
|
currentDevice++) {
|
||||||
|
cudaSetDevice(currentDevice);
|
||||||
|
// Run the shmoo
|
||||||
|
int iteration = 0;
|
||||||
|
unsigned int memSize = 0;
|
||||||
|
|
||||||
|
while (memSize <= SHMOO_MEMSIZE_MAX) {
|
||||||
|
if (memSize < SHMOO_LIMIT_20KB) {
|
||||||
|
memSize += SHMOO_INCREMENT_1KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_50KB) {
|
||||||
|
memSize += SHMOO_INCREMENT_2KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_100KB) {
|
||||||
|
memSize += SHMOO_INCREMENT_10KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_1MB) {
|
||||||
|
memSize += SHMOO_INCREMENT_100KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_16MB) {
|
||||||
|
memSize += SHMOO_INCREMENT_1MB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_32MB) {
|
||||||
|
memSize += SHMOO_INCREMENT_2MB;
|
||||||
|
} else {
|
||||||
|
memSize += SHMOO_INCREMENT_4MB;
|
||||||
|
}
|
||||||
|
|
||||||
|
memSizes[iteration] = memSize;
|
||||||
|
|
||||||
|
switch (kind) {
|
||||||
|
case DEVICE_TO_HOST:
|
||||||
|
bandwidths[iteration] +=
|
||||||
|
testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case HOST_TO_DEVICE:
|
||||||
|
bandwidths[iteration] +=
|
||||||
|
testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DEVICE_TO_DEVICE:
|
||||||
|
bandwidths[iteration] +=
|
||||||
|
testDeviceToDeviceTransfer(memSizes[iteration]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
iteration++;
|
||||||
|
printf(".");
|
||||||
|
fflush(0);
|
||||||
|
}
|
||||||
|
} // Complete the bandwidth computation on all the devices
|
||||||
|
|
||||||
|
// print results
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
if (CSV == printmode) {
|
||||||
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
} else {
|
||||||
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
free(memSizes);
|
||||||
|
free(bandwidths);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// test the bandwidth of a device to host memcopy of a specific size
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
//! Measure device-to-host memcpy bandwidth (GB/s) for a transfer of
//! memSize bytes.
//!
//! @param memSize  transfer size in bytes
//! @param memMode  PINNED: host buffers from cudaHostAlloc, MEMCOPY_ITERATIONS
//!                 async copies timed with CUDA events (or the CPU timer when
//!                 bDontUseGPUTiming is set).
//!                 PAGEABLE: malloc'd host buffers, blocking cudaMemcpy calls
//!                 timed with the CPU timer.
//! @param wc       request write-combined pinned memory (PINNED mode only)
//! @return aggregate bandwidth in GB/s over all iterations
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  unsigned char *h_idata = NULL;
  unsigned char *h_odata = NULL;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  if (PINNED == memMode) {
    // pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
    checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // pageable memory mode - use malloc
    h_idata = (unsigned char *)malloc(memSize);
    h_odata = (unsigned char *)malloc(memSize);

    if (h_idata == 0 || h_odata == 0) {
      // fixed typo ("avaialable"); now consistent with the H2D test's message
      fprintf(stderr, "Not enough memory available on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  // initialize the memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  // initialize the device memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // copy data from GPU to Host
  if (PINNED == memMode) {
    if (bDontUseGPUTiming) sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
                                      cudaMemcpyDeviceToHost, 0));
    }
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
    if (bDontUseGPUTiming) {
      sdkStopTimer(&timer);
      elapsedTimeInMs = sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
    }
  } else {
    elapsedTimeInMs = 0;
    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      sdkStartTimer(&timer);
      checkCudaErrors(
          cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
      sdkStopTimer(&timer);
      elapsedTimeInMs += sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
      // scrub CPU caches between iterations so the pageable-copy timing is
      // not flattered by warm cache lines
      memset(flush_buf, i, FLUSH_SIZE);
    }
  }

  // calculate bandwidth in GB/s
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&timer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_idata));
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_idata);
    free(h_odata);
  }

  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! test the bandwidth of a host to device memcopy of a specific size
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
//! Measure host-to-device memcpy bandwidth (GB/s) for a memSize-byte buffer.
//! PINNED mode times MEMCOPY_ITERATIONS asynchronous copies with CUDA events
//! (or the CPU timer when bDontUseGPUTiming is set); PAGEABLE mode times
//! blocking copies with the CPU timer.
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *cpuTimer = NULL;
  sdkCreateTimer(&cpuTimer);

  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  float elapsedMs = 0.0f;

  // Host-side source buffer for the transfers.
  unsigned char *h_odata = NULL;

  if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
    // OS-pinned allocation, optionally write-combined.
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    // Older runtimes: plain pinned allocation.
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // Pageable memory comes straight from malloc.
    h_odata = (unsigned char *)malloc(memSize);

    if (h_odata == 0) {
      fprintf(stderr, "Not enough memory available on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
  unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);

  if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // Fill the source buffer with a repeating byte pattern.
  for (unsigned int idx = 0; idx < memSize / sizeof(unsigned char); idx++) {
    h_odata[idx] = (unsigned char)(idx & 0xff);
  }

  // Fill the cache-clear buffers with complementary byte patterns.
  for (unsigned int idx = 0; idx < CACHE_CLEAR_SIZE / sizeof(unsigned char);
       idx++) {
    h_cacheClear1[idx] = (unsigned char)(idx & 0xff);
    h_cacheClear2[idx] = (unsigned char)(0xff - (idx & 0xff));
  }

  // Device-side destination buffer.
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  if (PINNED == memMode) {
    // Event-timed batch of asynchronous copies (CPU timer as fallback).
    if (bDontUseGPUTiming) sdkStartTimer(&cpuTimer);

    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int iter = 0; iter < MEMCOPY_ITERATIONS; iter++) {
      checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
                                      cudaMemcpyHostToDevice, 0));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedMs, start, stop));

    if (bDontUseGPUTiming) {
      sdkStopTimer(&cpuTimer);
      elapsedMs = sdkGetTimerValue(&cpuTimer);
      sdkResetTimer(&cpuTimer);
    }
  } else {
    // CPU-timed blocking copies, one at a time.
    elapsedMs = 0;

    for (unsigned int iter = 0; iter < MEMCOPY_ITERATIONS; iter++) {
      sdkStartTimer(&cpuTimer);
      checkCudaErrors(
          cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
      sdkStopTimer(&cpuTimer);
      elapsedMs += sdkGetTimerValue(&cpuTimer);
      sdkResetTimer(&cpuTimer);
      memset(flush_buf, iter, FLUSH_SIZE);
    }
  }

  // Convert elapsed time to GB/s over all iterations.
  double seconds = elapsedMs / 1e3;
  float bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / seconds;

  // Release timing resources and buffers.
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&cpuTimer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_odata);
  }

  free(h_cacheClear1);
  free(h_cacheClear2);
  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! test the bandwidth of a device to device memcopy of a specific size
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
//! Measure device-to-device memcpy bandwidth (GB/s) for memSize bytes.
//! Each iteration both reads and writes memSize bytes of device memory, so
//! the reported bandwidth counts 2x the transfer size.
//!
//! @param memSize  transfer size in bytes
//! @return aggregate bandwidth in GB/s over all iterations
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  unsigned char *h_idata = (unsigned char *)malloc(memSize);

  if (h_idata == 0) {
    // fixed typo ("avaialable"); now consistent with the H2D test's message
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // initialize the host memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory (source and destination)
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
  unsigned char *d_odata;
  checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));

  // initialize memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // run the memcopy
  sdkStartTimer(&timer);
  checkCudaErrors(cudaEventRecord(start, 0));

  for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
    checkCudaErrors(
        cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
  }

  checkCudaErrors(cudaEventRecord(stop, 0));

  // Since device to device memory copies are non-blocking,
  // cudaDeviceSynchronize() is required in order to get
  // proper timing.
  checkCudaErrors(cudaDeviceSynchronize());

  // get the total elapsed time in ms
  sdkStopTimer(&timer);
  checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

  if (bDontUseGPUTiming) {
    elapsedTimeInMs = sdkGetTimerValue(&timer);
  }

  // calculate bandwidth in GB/s (factor 2: every byte is read and written)
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  sdkDeleteTimer(&timer);
  free(h_idata);
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));

  return bandwidthInGBs;
}
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
// print results in an easily read format
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
/////////////////////////////////////////////////////////
// Print a human-readable table of transfer sizes and bandwidths.
//
// memSizes/bandwidths: parallel arrays of `count` measurements
// kind/memMode: labels for the table header
// iNumDevs: number of devices the measurement spans
// wc: whether write-combined pinned memory was used (header note only)
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
                          unsigned int count, memcpyKind kind,
                          memoryMode memMode, int iNumDevs, bool wc) {
  printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
  printf(" %s Memory Transfers\n", sMemoryMode[memMode]);

  if (wc) {
    printf(" Write-Combined Memory Writes are Enabled");
  }

  printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");

  // Print every row with one uniform format. The previous code looped to
  // (count - 1) and printed the last row separately, which underflowed the
  // unsigned `count` when count == 0 and read memSizes[-1]; this form emits
  // byte-identical output for count >= 1 and is safe for count == 0.
  for (unsigned int i = 0; i < count; i++) {
    // extra tab keeps the bandwidth column aligned for short sizes
    printf(" %u\t\t\t%s%.1f\n", memSizes[i],
           (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
  }

  printf("\n");
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// print results in a database format
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////
// Print one CSV-style line per measurement, tagged with a short config
// string describing transfer direction (D2D/D2H/H2D) and, for host
// transfers, the memory mode (-Paged/-Pinned[-WriteCombined]).
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
                     unsigned int count, memcpyKind kind, memoryMode memMode,
                     int iNumDevs, bool wc) {
  std::string sConfig;

  // log config information
  if (kind == DEVICE_TO_DEVICE) {
    sConfig += "D2D";
  } else {
    if (kind == DEVICE_TO_HOST) {
      sConfig += "D2H";
    } else if (kind == HOST_TO_DEVICE) {
      sConfig += "H2D";
    }

    if (memMode == PAGEABLE) {
      sConfig += "-Paged";
    } else if (memMode == PINNED) {
      sConfig += "-Pinned";

      if (wc) {
        sConfig += "-WriteCombined";
      }
    }
  }

  unsigned int i;
  double dSeconds = 0.0;

  for (i = 0; i < count; i++) {
    // bandwidths[] is in GB/s (1e9 bytes/s — see the transfer tests), so
    // recover the elapsed time with the same 1e9 scale. The previous
    // (1 << 20) divisor mixed a MiB scale into a GB/s figure and inflated
    // the reported time by roughly 953x.
    dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
    printf(
        "bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
        "bytes, NumDevsUsed = %d\n",
        sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
  }
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Print help screen
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////
// Print the command-line usage summary for bandwidthTest.
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
  // Usage line, summary, and a worked example (adjacent literals are
  // concatenated; the emitted text is unchanged).
  printf(
      "Usage: bandwidthTest [OPTION]...\n"
      "Test the bandwidth for device to host, host to device, and device to "
      "device transfers\n"
      "\n"
      "Example: measure the bandwidth of device to host pinned memory copies "
      "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"
      "./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
      "--increment=1024 --dtoh\n");

  // General options.
  printf(
      "\n"
      "Options:\n"
      "--help\tDisplay this help menu\n"
      "--csv\tPrint results as a CSV\n"
      "--device=[deviceno]\tSpecify the device device to be used\n"
      "  all - compute cumulative bandwidth on all the devices\n"
      "  0,1,2,...,n - Specify any particular device to be used\n"
      "--memory=[MEMMODE]\tSpecify which memory mode to use\n"
      "  pageable - pageable memory\n"
      "  pinned   - non-pageable system memory\n"
      "--mode=[MODE]\tSpecify the mode to use\n"
      "  quick - performs a quick measurement\n"
      "  range - measures a user-specified range of values\n"
      "  shmoo - performs an intense shmoo of a large range of values\n"
      "--htod\tMeasure host to device transfers\n"
      "--dtoh\tMeasure device to host transfers\n"
      "--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
  printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
  printf("--cputiming\tForce CPU-based timing always\n");

  // Options that only apply to --mode=range.
  printf(
      "Range mode options\n"
      "--start=[SIZE]\tStarting transfer size in bytes\n"
      "--end=[SIZE]\tEnding transfer size in bytes\n"
      "--increment=[SIZE]\tIncrement size in bytes\n");
}
|
20
Samples/bandwidthTest/bandwidthTest_vs2012.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2012
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2012</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v110</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/bandwidthTest/bandwidthTest_vs2013.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 13.00
|
||||||
|
# Visual Studio 2013
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2013</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v120</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/bandwidthTest/bandwidthTest_vs2015.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 14.00
|
||||||
|
# Visual Studio 2015
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2015</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v140</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/bandwidthTest/bandwidthTest_vs2017.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2017
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
108
Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2017</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v141</PlatformToolset>
|
||||||
|
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
@ -264,7 +268,7 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
LIBRARIES += -lcublas_static -lcusparse_static -lculibos
|
LIBRARIES += -lcublas_static -lcublasLt_static -lcusparse_static -lculibos
|
||||||
|
|
||||||
ifeq ($(SAMPLE_ENABLED),0)
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
EXEC ?= @echo "[@]"
|
EXEC ?= @echo "[@]"
|
||||||
|
|
|
@ -31,6 +31,7 @@
|
||||||
</keywords>
|
</keywords>
|
||||||
<libraries>
|
<libraries>
|
||||||
<library>cublas_static</library>
|
<library>cublas_static</library>
|
||||||
|
<library>cublasLt_static</library>
|
||||||
<library>cusparse_static</library>
|
<library>cusparse_static</library>
|
||||||
<library>culibos</library>
|
<library>culibos</library>
|
||||||
</libraries>
|
</libraries>
|
||||||
|
@ -55,6 +56,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Linear Algebra, CUBLAS Library, CUSPARSE Library
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch,
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -323,7 +323,7 @@ int main(int argc, char **argv) {
|
||||||
checkCudaErrors(cudaStreamCreate(&streamForGraph));
|
checkCudaErrors(cudaStreamCreate(&streamForGraph));
|
||||||
checkCudaErrors(cublasSetStream(cublasHandle, stream1));
|
checkCudaErrors(cublasSetStream(cublasHandle, stream1));
|
||||||
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
|
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
|
||||||
checkCudaErrors(cudaStreamBeginCapture(stream1));
|
checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal));
|
||||||
|
|
||||||
r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
|
r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
|
||||||
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
|
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -264,7 +264,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 60 61 70 75
|
SMS ?= 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -42,6 +42,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiBlock Cooperative Group
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ x86_64, ppc64le
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -264,7 +264,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 60 61 70 75
|
SMS ?= 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -49,6 +49,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiDevice Cooperative Grou
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -415,7 +415,7 @@ void getIdenticalGPUs(int num_of_gpus, std::set<int> &identicalGPUs) {
|
||||||
identicalGPUs.erase(it);
|
identicalGPUs.erase(it);
|
||||||
}
|
}
|
||||||
if (!deviceProp.cooperativeMultiDeviceLaunch ||
|
if (!deviceProp.cooperativeMultiDeviceLaunch ||
|
||||||
!deviceProp.concurrentManagedAccess) {
|
!deviceProp.managedMemory) {
|
||||||
identicalGPUs.erase(it);
|
identicalGPUs.erase(it);
|
||||||
}
|
}
|
||||||
it++;
|
it++;
|
||||||
|
@ -450,8 +450,7 @@ int main(int argc, char **argv) {
|
||||||
if (identicalGPUs.size() <= 1) {
|
if (identicalGPUs.size() <= 1) {
|
||||||
printf(
|
printf(
|
||||||
"No Two or more GPUs with same architecture capable of "
|
"No Two or more GPUs with same architecture capable of "
|
||||||
"cooperativeMultiDeviceLaunch & concurrentManagedAccess found. "
|
"cooperativeMultiDeviceLaunch & managedMemory found. \nWaiving the sample\n");
|
||||||
"\nWaiving the sample\n");
|
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -246,12 +246,6 @@ ifeq ($(TARGET_ARCH),armv7l)
|
||||||
SAMPLE_ENABLED := 0
|
SAMPLE_ENABLED := 0
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# This sample is not supported on aarch64
|
|
||||||
ifeq ($(TARGET_ARCH),aarch64)
|
|
||||||
$(info >>> WARNING - cudaTensorCoreGemm is not supported on aarch64 - waiving sample <<<)
|
|
||||||
SAMPLE_ENABLED := 0
|
|
||||||
endif
|
|
||||||
|
|
||||||
ALL_LDFLAGS :=
|
ALL_LDFLAGS :=
|
||||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
@ -264,7 +258,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 70 75
|
SMS ?= 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -43,12 +43,16 @@ In addition to that, it demonstrates the use of the new CUDA function attribute
|
||||||
<scope>1:CUDA Basic Topics</scope>
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
</scopes>
|
</scopes>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
<arch>x86_64</arch>
|
<arch>x86_64</arch>
|
||||||
<platform>linux</platform>
|
<platform>linux</platform>
|
||||||
</env>
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>aarch64</arch>
|
||||||
|
</env>
|
||||||
<env>
|
<env>
|
||||||
<platform>windows7</platform>
|
<platform>windows7</platform>
|
||||||
</env>
|
</env>
|
||||||
|
|
|
@ -14,7 +14,7 @@ Matrix Multiply, WMMA, Tensor Cores
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ Linux, Windows
|
||||||
|
|
||||||
## Supported CPU Architecture
|
## Supported CPU Architecture
|
||||||
|
|
||||||
x86_64, ppc64le
|
x86_64, ppc64le, aarch64
|
||||||
|
|
||||||
## CUDA APIs involved
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
@ -52,9 +52,9 @@ $ cd <sample_dir>
|
||||||
$ make
|
$ make
|
||||||
```
|
```
|
||||||
The samples makefiles can take advantage of certain options:
|
The samples makefiles can take advantage of certain options:
|
||||||
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le.
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
|
||||||
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/>
|
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
|
||||||
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
* **dbg=1** - build with debug symbols
|
* **dbg=1** - build with debug symbols
|
||||||
```
|
```
|
||||||
|
|
|
@ -180,16 +180,16 @@
|
||||||
|
|
||||||
using namespace nvcuda;
|
using namespace nvcuda;
|
||||||
|
|
||||||
__host__ void init_host_matrices(float *a, float *b, float *c) {
|
__host__ void init_host_matrices(half *a, half *b, float *c) {
|
||||||
for (int i = 0; i < M_GLOBAL; i++) {
|
for (int i = 0; i < M_GLOBAL; i++) {
|
||||||
for (int j = 0; j < K_GLOBAL; j++) {
|
for (int j = 0; j < K_GLOBAL; j++) {
|
||||||
a[i * K_GLOBAL + j] = static_cast<float>(rand() % 3);
|
a[i * K_GLOBAL + j] = (half)(rand() % 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < N_GLOBAL; i++) {
|
for (int i = 0; i < N_GLOBAL; i++) {
|
||||||
for (int j = 0; j < K_GLOBAL; j++) {
|
for (int j = 0; j < K_GLOBAL; j++) {
|
||||||
b[i * K_GLOBAL + j] = static_cast<float>(rand() % 3);
|
b[i * K_GLOBAL + j] = (half)(rand() % 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,26 +198,6 @@ __host__ void init_host_matrices(float *a, float *b, float *c) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void init_device_matrices(const float *A_h, const float *B_h,
|
|
||||||
const float *C_h, half *A, half *B,
|
|
||||||
float *C, float *D) {
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * K_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
A[i] = __float2half(A_h[i]);
|
|
||||||
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N_GLOBAL * K_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
B[i] = __float2half(B_h[i]);
|
|
||||||
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
C[i] = C_h[i];
|
|
||||||
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
D[i] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__ void compute_gemm(const half *A, const half *B, const float *C,
|
__global__ void compute_gemm(const half *A, const half *B, const float *C,
|
||||||
float *D, float alpha, float beta) {
|
float *D, float alpha, float beta) {
|
||||||
extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];
|
extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];
|
||||||
|
@ -486,7 +466,7 @@ __global__ void simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha,
|
__host__ void matMultiplyOnHost(half *A, half *B, float *C, float alpha,
|
||||||
float beta, int numARows, int numAColumns,
|
float beta, int numARows, int numAColumns,
|
||||||
int numBRows, int numBColumns, int numCRows,
|
int numBRows, int numBColumns, int numCRows,
|
||||||
int numCColumns) {
|
int numCColumns) {
|
||||||
|
@ -495,7 +475,7 @@ __host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha,
|
||||||
float temp = 0.0;
|
float temp = 0.0;
|
||||||
|
|
||||||
for (int k = 0; k < numAColumns; k++) {
|
for (int k = 0; k < numAColumns; k++) {
|
||||||
temp += A[i * numAColumns + k] * B[j * numBRows + k];
|
temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k];
|
||||||
}
|
}
|
||||||
|
|
||||||
C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
|
C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
|
||||||
|
@ -514,7 +494,7 @@ int main(int argc, char **argv) {
|
||||||
// Tensor cores require a GPU of Volta (SM7X) architecture or higher.
|
// Tensor cores require a GPU of Volta (SM7X) architecture or higher.
|
||||||
if (deviceProp.major < 7) {
|
if (deviceProp.major < 7) {
|
||||||
printf(
|
printf(
|
||||||
"cudaTensorCoreGemm requires requires SM 7.0 or higher to use Tensor "
|
"cudaTensorCoreGemm requires SM 7.0 or higher to use Tensor "
|
||||||
"Cores. Exiting...\n");
|
"Cores. Exiting...\n");
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
}
|
}
|
||||||
|
@ -523,25 +503,20 @@ int main(int argc, char **argv) {
|
||||||
printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
|
printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
|
||||||
printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
|
printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
|
||||||
|
|
||||||
float *A_h = NULL;
|
half *A_h = NULL;
|
||||||
float *B_h = NULL;
|
half *B_h = NULL;
|
||||||
float *C_h = NULL;
|
float *C_h = NULL;
|
||||||
#if CPU_DEBUG
|
#if CPU_DEBUG
|
||||||
float *result_hD = NULL;
|
float *result_hD = NULL;
|
||||||
float *result_host = NULL;
|
float *result_host = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&A_h),
|
A_h = (half *)malloc(sizeof(half) * M_GLOBAL * K_GLOBAL);
|
||||||
sizeof(float) * M_GLOBAL * K_GLOBAL));
|
B_h = (half *)malloc(sizeof(half) * K_GLOBAL * N_GLOBAL);
|
||||||
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&B_h),
|
C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
|
||||||
sizeof(float) * K_GLOBAL * N_GLOBAL));
|
|
||||||
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&C_h),
|
|
||||||
sizeof(float) * M_GLOBAL * N_GLOBAL));
|
|
||||||
#if CPU_DEBUG
|
#if CPU_DEBUG
|
||||||
checkCudaErrors(cudaMallocManaged((void **)&result_hD,
|
result_hD = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
|
||||||
sizeof(float) * M_GLOBAL * N_GLOBAL));
|
result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
|
||||||
checkCudaErrors(cudaMallocManaged((void **)&result_host,
|
|
||||||
sizeof(float) * M_GLOBAL * N_GLOBAL));
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
half *A = NULL;
|
half *A = NULL;
|
||||||
|
@ -567,11 +542,13 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
printf("Preparing data for GPU...\n");
|
printf("Preparing data for GPU...\n");
|
||||||
|
|
||||||
checkKernelErrors(
|
checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL,
|
||||||
(init_device_matrices<<<deviceProp.multiProcessorCount,
|
cudaMemcpyHostToDevice));
|
||||||
THREADS_PER_BLOCK>>>(A_h, B_h, C_h, A, B, C, D)));
|
checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL,
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
checkCudaErrors(cudaDeviceSynchronize());
|
checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL,
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
|
checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL));
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
// Compute the right amount of shared memory to request.
|
// Compute the right amount of shared memory to request.
|
||||||
|
@ -650,6 +627,8 @@ int main(int argc, char **argv) {
|
||||||
printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i],
|
printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i],
|
||||||
result_host[i]);
|
result_host[i]);
|
||||||
}
|
}
|
||||||
|
free(result_hD);
|
||||||
|
free(result_host);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
float milliseconds = 0;
|
float milliseconds = 0;
|
||||||
|
@ -662,9 +641,9 @@ int main(int argc, char **argv) {
|
||||||
(milliseconds / 1000.)) /
|
(milliseconds / 1000.)) /
|
||||||
1e12);
|
1e12);
|
||||||
|
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A_h)));
|
free(A_h);
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B_h)));
|
free(B_h);
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C_h)));
|
free(C_h);
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
|
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
|
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));
|
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Runtime API, Device Query
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
318
Samples/immaTensorCoreGemm/Makefile
Normal file
|
@ -0,0 +1,318 @@
|
||||||
|
################################################################################
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions
|
||||||
|
# are met:
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived
|
||||||
|
# from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
#
|
||||||
|
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Location of the CUDA Toolkit
|
||||||
|
CUDA_PATH ?= /usr/local/cuda
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# start deprecated interface #
|
||||||
|
##############################
|
||||||
|
ifeq ($(x86_64),1)
|
||||||
|
$(info WARNING - x86_64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||||
|
TARGET_ARCH ?= x86_64
|
||||||
|
endif
|
||||||
|
ifeq ($(ARMv7),1)
|
||||||
|
$(info WARNING - ARMv7 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||||
|
TARGET_ARCH ?= armv7l
|
||||||
|
endif
|
||||||
|
ifeq ($(aarch64),1)
|
||||||
|
$(info WARNING - aarch64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||||
|
TARGET_ARCH ?= aarch64
|
||||||
|
endif
|
||||||
|
ifeq ($(ppc64le),1)
|
||||||
|
$(info WARNING - ppc64le variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||||
|
TARGET_ARCH ?= ppc64le
|
||||||
|
endif
|
||||||
|
ifneq ($(GCC),)
|
||||||
|
$(info WARNING - GCC variable has been deprecated)
|
||||||
|
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||||
|
HOST_COMPILER ?= $(GCC)
|
||||||
|
endif
|
||||||
|
ifneq ($(abi),)
|
||||||
|
$(error ERROR - abi variable has been removed)
|
||||||
|
endif
|
||||||
|
############################
|
||||||
|
# end deprecated interface #
|
||||||
|
############################
|
||||||
|
|
||||||
|
# architecture
|
||||||
|
HOST_ARCH := $(shell uname -m)
|
||||||
|
TARGET_ARCH ?= $(HOST_ARCH)
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
|
||||||
|
TARGET_SIZE := 64
|
||||||
|
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||||
|
TARGET_SIZE := 32
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||||
|
endif
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
|
||||||
|
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||||
|
TARGET_ARCH = armv7l
|
||||||
|
endif
|
||||||
|
|
||||||
|
# operating system
|
||||||
|
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||||
|
TARGET_OS ?= $(HOST_OS)
|
||||||
|
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||||
|
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# host compiler
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||||
|
HOST_COMPILER ?= clang++
|
||||||
|
endif
|
||||||
|
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||||
|
ifeq ($(TARGET_OS),linux)
|
||||||
|
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
ifeq ($(TARGET_OS), linux)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
|
||||||
|
else ifeq ($(TARGET_OS), android)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
HOST_COMPILER ?= g++
|
||||||
|
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||||
|
|
||||||
|
# internal flags
|
||||||
|
NVCCFLAGS := -m${TARGET_SIZE}
|
||||||
|
CCFLAGS :=
|
||||||
|
LDFLAGS :=
|
||||||
|
|
||||||
|
# build flags
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||||
|
CCFLAGS += -arch $(HOST_ARCH)
|
||||||
|
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||||
|
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||||
|
CCFLAGS += -mfloat-abi=hard
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
LDFLAGS += -pie
|
||||||
|
CCFLAGS += -fpie -fpic -fexceptions
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||||
|
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
CCFLAGS += -DWIN_INTERFACE_CUSTOM
|
||||||
|
LDFLAGS += -lsocket
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Install directory of different arch
|
||||||
|
CUDA_INSTALL_TARGET_DIR :=
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debug build flags
|
||||||
|
ifeq ($(dbg),1)
|
||||||
|
NVCCFLAGS += -g -G
|
||||||
|
BUILD_TYPE := debug
|
||||||
|
else
|
||||||
|
BUILD_TYPE := release
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS :=
|
||||||
|
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
# This sample is not supported on Mac OSX
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
$(info >>> WARNING - immaTensorCoreGemm is not supported on Mac OSX - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
# This sample is not supported on ARMv7
|
||||||
|
ifeq ($(TARGET_ARCH),armv7l)
|
||||||
|
$(info >>> WARNING - immaTensorCoreGemm is not supported on ARMv7 - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_LDFLAGS :=
|
||||||
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||||
|
|
||||||
|
# Common includes and paths for CUDA
|
||||||
|
INCLUDES := -I../../Common
|
||||||
|
LIBRARIES :=
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 72 75
|
||||||
|
else
|
||||||
|
SMS ?= 75
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SMS),)
|
||||||
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(GENCODE_FLAGS),)
|
||||||
|
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||||
|
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||||
|
|
||||||
|
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||||
|
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||||
|
ifneq ($(HIGHEST_SM),)
|
||||||
|
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS += -maxrregcount=255
|
||||||
|
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
EXEC ?= @echo "[@]"
|
||||||
|
endif
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Target rules
|
||||||
|
all: build
|
||||||
|
|
||||||
|
build: immaTensorCoreGemm
|
||||||
|
|
||||||
|
check.deps:
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
@echo "Sample will be waived due to the above missing dependencies"
|
||||||
|
else
|
||||||
|
@echo "Sample is ready - all dependencies have been met"
|
||||||
|
endif
|
||||||
|
|
||||||
|
immaTensorCoreGemm.o:immaTensorCoreGemm.cu
|
||||||
|
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||||
|
|
||||||
|
immaTensorCoreGemm: immaTensorCoreGemm.o
|
||||||
|
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||||
|
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
|
||||||
|
run: build
|
||||||
|
$(EXEC) ./immaTensorCoreGemm
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f immaTensorCoreGemm immaTensorCoreGemm.o
|
||||||
|
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/immaTensorCoreGemm
|
||||||
|
|
||||||
|
clobber: clean
|
64
Samples/immaTensorCoreGemm/NsightEclipse.xml
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||||
|
<entry>
|
||||||
|
<name>immaTensorCoreGemm</name>
|
||||||
|
<cflags>
|
||||||
|
<flag>-maxrregcount=255</flag>
|
||||||
|
</cflags>
|
||||||
|
<cuda_api_list>
|
||||||
|
<toolkit>cudaMallocManaged</toolkit>
|
||||||
|
<toolkit>cudaDeviceSynchronize</toolkit>
|
||||||
|
<toolkit>cudaFuncSetAttribute</toolkit>
|
||||||
|
<toolkit>cudaEventCreate</toolkit>
|
||||||
|
<toolkit>cudaEventRecord</toolkit>
|
||||||
|
<toolkit>cudaEventSynchronize</toolkit>
|
||||||
|
<toolkit>cudaEventElapsedTime</toolkit>
|
||||||
|
<toolkit>cudaFree</toolkit>
|
||||||
|
</cuda_api_list>
|
||||||
|
<description><![CDATA[CUDA sample demonstrating a integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integer introduced in CUDA 10. This sample demonstrates the use of the CUDA WMMA API employing the Tensor Cores introduced in the Volta chip family for faster matrix operations. In addition to that, it demonstrates the use of the new CUDA function attribute cudaFuncAttributeMaxDynamicSharedMemorySize that allows the application to reserve an extended amount of shared memory than it is available by default.]]></description>
|
||||||
|
<devicecompilation>whole</devicecompilation>
|
||||||
|
<includepaths>
|
||||||
|
<path>./</path>
|
||||||
|
<path>../</path>
|
||||||
|
<path>../../common/inc</path>
|
||||||
|
</includepaths>
|
||||||
|
<keyconcepts>
|
||||||
|
<concept level="basic">Matrix Multiply</concept>
|
||||||
|
<concept level="advanced">WMMA</concept>
|
||||||
|
<concept level="advanced">Tensor Cores</concept>
|
||||||
|
</keyconcepts>
|
||||||
|
<keywords>
|
||||||
|
</keywords>
|
||||||
|
<libraries>
|
||||||
|
</libraries>
|
||||||
|
<librarypaths>
|
||||||
|
</librarypaths>
|
||||||
|
<nsight_eclipse>true</nsight_eclipse>
|
||||||
|
<primary_file>immaTensorCoreGemm.cu</primary_file>
|
||||||
|
<scopes>
|
||||||
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
|
</scopes>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
|
<sm-arch>sm75</sm-arch>
|
||||||
|
<supported_envs>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>aarch64</arch>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<platform>windows7</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>ppc64le</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
</supported_envs>
|
||||||
|
<supported_sm_architectures>
|
||||||
|
<from>7.2</from>
|
||||||
|
</supported_sm_architectures>
|
||||||
|
<title>Tensor Core GEMM Integer MMA</title>
|
||||||
|
<type>exe</type>
|
||||||
|
</entry>
|
70
Samples/immaTensorCoreGemm/README.md
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
# immaTensorCoreGemm - Tensor Core GEMM Integer MMA
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
CUDA sample demonstrating a integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integer introduced in CUDA 10. This sample demonstrates the use of the CUDA WMMA API employing the Tensor Cores introduced in the Volta chip family for faster matrix operations. In addition to that, it demonstrates the use of the new CUDA function attribute cudaFuncAttributeMaxDynamicSharedMemorySize that allows the application to reserve an extended amount of shared memory than it is available by default.
|
||||||
|
|
||||||
|
## Key Concepts
|
||||||
|
|
||||||
|
Matrix Multiply, WMMA, Tensor Cores
|
||||||
|
|
||||||
|
## Supported SM Architectures
|
||||||
|
|
||||||
|
[SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
|
## Supported OSes
|
||||||
|
|
||||||
|
Linux, Windows
|
||||||
|
|
||||||
|
## Supported CPU Architecture
|
||||||
|
|
||||||
|
x86_64, ppc64le, aarch64
|
||||||
|
|
||||||
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
|
||||||
|
cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEventRecord, cudaEventSynchronize, cudaEventElapsedTime, cudaFree
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
|
## Build and Run
|
||||||
|
|
||||||
|
### Windows
|
||||||
|
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
|
||||||
|
```
|
||||||
|
*_vs<version>.sln - for Visual Studio <version>
|
||||||
|
```
|
||||||
|
Each individual sample has its own set of solution files in its directory:
|
||||||
|
|
||||||
|
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
|
||||||
|
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
|
||||||
|
|
||||||
|
### Linux
|
||||||
|
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
|
||||||
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
|
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
|
||||||
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
|
||||||
|
```
|
||||||
|
$ make SMS="50 60"
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=g++
|
||||||
|
```
|
||||||
|
|
||||||
|
## References (for more details)
|
||||||
|
|
655
Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu
Normal file
|
@ -0,0 +1,655 @@
|
||||||
|
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// CUDA sample demonstrating a integer GEMM computation using the Warp Matrix
|
||||||
|
// Multiply and Accumulate API.
|
||||||
|
|
||||||
|
// In this program, the compute_gemm kernel computes the result of a matrix
|
||||||
|
// multiplication and addition: D = alpha * A * B + beta * C. The dimensions of
|
||||||
|
// both C and D matrices are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x
|
||||||
|
// K_GLOBAL (row-major), the B matrix is K_GLOBAL x N_GLOBAL (column-major). In
|
||||||
|
// that kernel, each CTA computes one 128 x 128 tile of the resulting matrix per
|
||||||
|
// iteration. When the tile is computed, the CTA stores it to the global memory
|
||||||
|
// and begins a new iteration, selecting a new 128 x 128 tile to compute.
|
||||||
|
// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes
|
||||||
|
// eight 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array. Warps
|
||||||
|
// compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by
|
||||||
|
// moving through the K_GLOBAL dimension of the A and B matrices and
|
||||||
|
// accumulating the intermediate result in the local thread state.
|
||||||
|
|
||||||
|
// There are a number of simple optimizations used in the algorithm:
|
||||||
|
// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to
|
||||||
|
// shared memory. After that is done, each warp loads the C matrix fragments
|
||||||
|
// from shared memory, thus avoiding a random global memory access.
|
||||||
|
// - On each internal iteration, the CTA copies a portion of the A and B
|
||||||
|
// matrices from
|
||||||
|
// global memory to shared memory. After that, all warps in the CTA reuse the
|
||||||
|
// A and B data from shared memory, thus reducing the number of data copies
|
||||||
|
// from global memory.
|
||||||
|
// - The portions of the A and B matrices are stored in shared memory with an
|
||||||
|
// additional
|
||||||
|
// padding (skew) to reduce the number of shared memory access bank conflicts.
|
||||||
|
// (See a detailed explanation near the SKEW_HALF macro definition.)
|
||||||
|
// - When the CTA finishes computing the tiles of the resulting matrix, each
|
||||||
|
// warp stores
|
||||||
|
// its subtiles to shared memory. The CTA then copies the shared memory
|
||||||
|
// contents to global memory, again avoiding redundant random global memory
|
||||||
|
// accesses.
|
||||||
|
// - Note that the CTA tile size is chosen to maximize the GPU register
|
||||||
|
// utilization,
|
||||||
|
// but carefully enough to avoid local memory use.
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include <mma.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
// helper functions and utilities to work with CUDA
|
||||||
|
#include <helper_cuda.h>
|
||||||
|
#include <helper_functions.h>
|
||||||
|
|
||||||
|
// Externally configurable parameters.
|
||||||
|
|
||||||
|
#ifndef CPU_DEBUG
|
||||||
|
// Set this to 1 to verify the correctness of the GPU-computed matrix.
|
||||||
|
#define CPU_DEBUG 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef SHARED_MEMORY_LIMIT_64K
|
||||||
|
// Set this to 0 to use more than 64 Kb of shared memory to cache data, to
|
||||||
|
// improve the performance of the computations on GPU.
|
||||||
|
// Note that you need a GPU that can have more than 64 Kb of shared memory
|
||||||
|
// per multiprocessor.
|
||||||
|
#define SHARED_MEMORY_LIMIT_64K 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// GPU configuration.
|
||||||
|
|
||||||
|
#define WARP_SIZE 32
|
||||||
|
|
||||||
|
// MMA matrix tile dimensions.
|
||||||
|
|
||||||
|
#define M 16
|
||||||
|
#define N 16
|
||||||
|
#define K 16
|
||||||
|
|
||||||
|
#define WMMA_M 16
|
||||||
|
#define WMMA_N 16
|
||||||
|
#define WMMA_K 16
|
||||||
|
|
||||||
|
// GEMM configuration.
|
||||||
|
|
||||||
|
#define M_TILES 256
|
||||||
|
#define N_TILES 256
|
||||||
|
#define K_TILES 256
|
||||||
|
|
||||||
|
#define M_GLOBAL (M * M_TILES)
|
||||||
|
#define N_GLOBAL (N * N_TILES)
|
||||||
|
#define K_GLOBAL (K * K_TILES)
|
||||||
|
|
||||||
|
#define C_LAYOUT wmma::mem_row_major
|
||||||
|
|
||||||
|
// Implementation constants.
|
||||||
|
|
||||||
|
#define WARPS_PER_BLOCK 8
|
||||||
|
#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK)
|
||||||
|
|
||||||
|
#if SHARED_MEMORY_LIMIT_64K
|
||||||
|
// With only 64 Kb shared memory available, we can fit two 8-tile chunks of
|
||||||
|
// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each
|
||||||
|
// (i.e. two 8x8 arrays of tiles of 16x16 uint8_t-typed elements per CTA).
|
||||||
|
// But we cannot account the 8 Kb total skew overhead, without which the
|
||||||
|
// performance would be severely impacted. So we choose to reduce the chunk size
|
||||||
|
// in half, i.e. the amount of A and B matrix data we cache in shared memory.
|
||||||
|
// Accordingly, this doubles the number of outer iterations across the global K
|
||||||
|
// dimension, which only slightly impacts the performance.
|
||||||
|
#define CHUNK_K 8
|
||||||
|
#else
|
||||||
|
#define CHUNK_K 16
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(uint8_t))
|
||||||
|
#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4))
|
||||||
|
#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES)
|
||||||
|
#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP)
|
||||||
|
|
||||||
|
// Warp tiling: each CTA is organized as BLOCK_ROW_WARPS x BLOCK_COL_WARPS
// warps, and each warp computes a WARP_COL_TILES x WARP_ROW_TILES grid of
// WMMA tiles of the output matrix.
#define BLOCK_ROW_WARPS 2
#define BLOCK_COL_WARPS 4

#define WARP_ROW_TILES 4
#define WARP_COL_TILES 2

// Total WMMA tiles computed by one CTA along each output dimension.
#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS)
#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS)

// Leading dimension (in elements) of the C/D matrices in global memory.
#define GLOBAL_MEM_STRIDE N_GLOBAL

// Leading dimension (in ints) of the C/D tiles staged in shared memory, and
// the per-warp offset into that staging area.
#define SHMEM_STRIDE (N * BLOCK_ROW_TILES)
#define SHMEM_OFFSET (N * WARP_ROW_TILES)

// The macro below is used to shift rows of the A matrix and columns of the B
// matrix in shared memory to minimize possible bank conflicts. Before
// performing the nvcuda::wmma::mma_sync operation, the warp must load the
// matrix data using the nvcuda::wmma::load_matrix_sync operation. Although the
// memory access pattern is not specified for that function, each lane in the
// warp can read one or multiple matrix elements from different matrix rows or
// columns. For shared memory, such access can result in bank conflicts if
// different rows / columns of the matrix map to the same bank. By shifting each
// row and column by a few bytes, we make sure that they map to different banks,
// thus reducing the number of possible bank conflicts. The number of 16
// one-byte "uint8_t" elements is chosen as the minimum possible shift because
// we must keep each row and column 128-bit aligned, as required by
// nvcuda::wmma::load_matrix_sync.
#define SKEW_UINT8 16

// Executes `expr` (typically a kernel launch), then checks cudaGetLastError()
// to catch launch-configuration errors; aborts with file line and the failing
// expression text on any error.
#define checkKernelErrors(expr)                             \
  do {                                                      \
    expr;                                                   \
                                                            \
    cudaError_t __err = cudaGetLastError();                 \
    if (__err != cudaSuccess) {                             \
      printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, \
             cudaGetErrorString(__err));                    \
      abort();                                              \
    }                                                       \
  } while (0)

using namespace nvcuda;
// Fill the host-side input matrices with small random values.
// a: M_GLOBAL x K_GLOBAL (uint8_t), b: N_GLOBAL x K_GLOBAL (uint8_t),
// c: M_GLOBAL x N_GLOBAL (int). All entries are drawn from rand() % 3,
// i.e. the range [0, 2], in the same order as a row-major nested traversal.
__host__ void init_host_matrices(uint8_t *a, uint8_t *b, int *c) {
  // A matrix: one linear pass covers the same elements in the same order as
  // the equivalent row-by-row loop.
  for (int idx = 0; idx < M_GLOBAL * K_GLOBAL; idx++) {
    a[idx] = (uint8_t)(rand() % 3);
  }

  // B matrix.
  for (int idx = 0; idx < N_GLOBAL * K_GLOBAL; idx++) {
    b[idx] = (uint8_t)(rand() % 3);
  }

  // C matrix (integer accumulator input).
  for (int idx = 0; idx < M_GLOBAL * N_GLOBAL; idx++) {
    c[idx] = rand() % 3;
  }
}
// High-performance integer (uint8) tensor-core GEMM: D = alpha*A*B + beta*C.
// Persistent-CTA design: each CTA loops over 128x128 output tiles until none
// remain. Expects dynamic shared memory sized to hold both the staged A/B
// chunks and the per-CTA C/D tile (see SHMEM_SZ computed in main), and a
// launch with THREADS_PER_BLOCK threads per CTA.
__global__ void compute_gemm_imma(const uint8_t *A, const uint8_t *B,
                                  const int *C, int *D, int alpha, int beta) {
  // Dynamic shared memory viewed as rows of (CHUNK_K * K + SKEW_UINT8) bytes;
  // the SKEW_UINT8 padding staggers rows across banks.
  extern __shared__ uint8_t shmem[][CHUNK_K * K + SKEW_UINT8];

  // Warp and lane identification.
  const unsigned int warpId = threadIdx.x / WARP_SIZE;
  const unsigned int laneId = threadIdx.x % WARP_SIZE;

  // Offset in shared memory from which the B matrix is stored.
  const size_t shmem_idx_b_off = BLOCK_COL_TILES * M;

  // This pointer is used to access the C and D matrix tiles this warp computes.
  int *shmem_warp_tile_ptr = (int *)&shmem[0][0] +
                             (warpId / 2) * SHMEM_STRIDE * K * 2 +
                             (warpId % 2) * SHMEM_OFFSET;

  // This pointer is used to stream the C and D matrices block-wide tile to and
  // from shared memory.
  int *shmem_warp_stream_ptr = (int *)&shmem[0][0] + warpId * SHMEM_STRIDE * K;

  // Adjust the beta scaler, as it'll be multiplied by alpha at the end of
  // each tile computation. Technically this is not generally correct (may
  // result in a loss of precision). Zero still needs to be specially handled
  // though.
  beta /= alpha;

  // Each CTA slides along the 128 x 128 tiles from the top left corner of the
  // matrix to the right and down, and selects the next tile to compute. Once
  // there's no such tile, all warps in this CTA exit.
  for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
    const unsigned int block_tile_i =
        ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
    const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;

    // Stop when there are no more D matrix tiles to compute in this CTA.
    if (block_tile_i >= M_TILES) {
      break;
    }

    // This warp's pointer to the C matrix data to copy memory from to shared
    // memory.
    const size_t gmem_idx =
        (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N;
    const int *src_gmem_warp_stream_ptr = &C[gmem_idx];

    // Stream multiple C tiles to shared memory (one 16-byte int4 per lane per
    // row).
#pragma unroll
    for (int i = 0; i < K; i++) {
      typedef int4 copy_t;

      *((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) =
          *((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) +
            laneId);
    }

    __syncthreads();

    // These fragments will accumulate the result of A and B matrix fragment
    // multiplications along the K_GLOBAL dimension.
    wmma::fragment<wmma::accumulator, M, N, K, int> c[WARP_COL_TILES]
                                                     [WARP_ROW_TILES];

    // Load the C matrix tiles into fragments from shared memory.
#pragma unroll
    for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
      for (int j = 0; j < WARP_ROW_TILES; j++) {
        const int *tile_ptr =
            shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;

        wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT);
      }
    }

    __syncthreads();

    // Scale the C matrix (by the adjusted beta = original beta / alpha).
#pragma unroll
    for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
      for (int j = 0; j < WARP_ROW_TILES; j++) {
#pragma unroll
        for (int t = 0; t < c[i][j].num_elements; t++) {
          c[i][j].x[t] *= beta;
        }
      }
    }

    // Select what warp copies what matrix to shared memory.
    // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix.
    const uint8_t *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] +
                                              M * K_GLOBAL * (warpId % 4) * 2)
                                           : (&B[block_tile_j * N * K_GLOBAL] +
                                              N * K_GLOBAL * (warpId % 4) * 2);

    // Go through the global K dimension by a fixed step at a time.
#pragma unroll
    for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) {
      // Copy slices of the A and B matrices to shared memory.
      // The first half of the warps in the CTA copy the A matrix, the rest copy
      // the B matrix.
      size_t shmem_idx =
          warpId < (WARPS_PER_BLOCK / 2)
              ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2)
              : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off);

      // First half of the warp copies the first row / column of the matrix,
      // the second half of the warp copies the next.
      int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K +
                                (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) +
                       (laneId % CHUNK_COPY_LINE_LANES);

      // Shift the second half of the warp to the next row / column in the
      // shared memory.
      shmem_idx += laneId / CHUNK_COPY_LINE_LANES;

#pragma unroll
      for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2;
           i++) {
        // Copy 16 bytes at once in each lane.
        *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) =
            *lane_ptr;

        // Advance the global memory pointer and the shared memory index.
        lane_ptr = (int4 *)((uint8_t *)lane_ptr +
                            K_GLOBAL * CHUNK_COPY_LINES_PER_WARP);
        shmem_idx += CHUNK_COPY_LINES_PER_WARP;
      }

      __syncthreads();

      // Compute a grid of C matrix tiles in each warp.
#pragma unroll
      for (int k_step = 0; k_step < CHUNK_K; k_step++) {
        wmma::fragment<wmma::matrix_a, M, N, K, uint8_t, wmma::row_major>
            a[WARP_COL_TILES];
        wmma::fragment<wmma::matrix_b, M, N, K, uint8_t, wmma::col_major>
            b[WARP_ROW_TILES];

#pragma unroll
        for (int i = 0; i < WARP_COL_TILES; i++) {
          size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M);
          const uint8_t *tile_ptr = &shmem[shmem_idx_a][k_step * K];

          wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_UINT8);

#pragma unroll
          for (int j = 0; j < WARP_ROW_TILES; j++) {
            if (i == 0) {
              // Load the B matrix fragment once, because it is going to be
              // reused against the other A matrix fragments.
              size_t shmem_idx_b = shmem_idx_b_off +
                                   (WARP_ROW_TILES * N) * (warpId % 2) +
                                   (j * N);
              const uint8_t *tile_ptr = &shmem[shmem_idx_b][k_step * K];

              wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_UINT8);
            }

            wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]);
          }
        }
      }

      __syncthreads();
    }

    // Store the D fragments to shared memory.
#pragma unroll
    for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
      for (int j = 0; j < WARP_ROW_TILES; j++) {
#pragma unroll
        // Uniform, point-wise transformations of ALL fragment elements by ALL
        // threads in the warp are well-defined even though element indices
        // within fragment storage are not defined.
        for (int t = 0; t < c[i][j].num_elements; t++) c[i][j].x[t] *= alpha;

        int *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;

        wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT);
      }
    }

    __syncthreads();

    // Now that shared memory contains all the D tiles, stream them to global
    // memory.
    int *dst_gmem_warp_stream_ptr = &D[gmem_idx];

#pragma unroll
    for (int i = 0; i < K; i++) {
      *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
          *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
    }

    __syncthreads();
  }
}
// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming:
//  1) Matrices are packed in memory.
//  2) M, N and K are multiples of 16.
//  3) Neither A nor B are transposed.
// Note: This is a less performant version of the compute_gemm_imma kernel. It
// is designed for
//       demonstration purposes only to show the CUDA WMMA API use without
//       relying on availability of the shared memory.
// Each warp computes one WMMA_M x WMMA_N output tile; the 2D grid/block maps
// warps over output tiles. Requires SM72+ (uint8 tensor-core WMMA).
__global__ void simple_wmma_gemm_imma(const uint8_t *a, const uint8_t *b,
                                      const int *c, int *d, int m_ld, int n_ld,
                                      int k_ld, int alpha, int beta) {
  // Leading dimensions. Packed with no transpositions.
  // NOTE(review): a_frag is loaded row_major with row stride `lda`, so for a
  // packed M x K row-major A the stride should be k_ld. Setting lda = m_ld is
  // only correct when m_ld == k_ld, as with this sample's square defaults —
  // TODO confirm before reusing with non-square shapes.
  int lda = m_ld;
  int ldb = k_ld;
  int ldc = n_ld;

  // Tile using a 2D grid: warpM indexes output-tile rows, warpN columns.
  int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
  int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

  // Declare the fragments
  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, uint8_t,
                 wmma::row_major>
      a_frag;
  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, uint8_t,
                 wmma::col_major>
      b_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> acc_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c_frag;

  // Zero the integer accumulator. Use an int literal: the fragment's element
  // type is int, so the previous 0.0f only worked via implicit conversion.
  wmma::fill_fragment(acc_frag, 0);

  // Loop over k
  for (int i = 0; i < k_ld; i += WMMA_K) {
    int aCol = i;
    int aRow = warpM * WMMA_M;

    int bCol = i;
    int bRow = warpN * WMMA_N;

    // Bounds checking
    if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
      // Load the inputs
      wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
      wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb);

      // Perform the matrix multiplication
      wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
    }
  }

  // Load in the current value of c, scale it by beta, and add this our result
  // scaled by alpha
  int cCol = warpN * WMMA_N;
  int cRow = warpM * WMMA_M;

  if (cRow < m_ld && cCol < n_ld) {
    wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc,
                           wmma::mem_row_major);

    for (int i = 0; i < c_frag.num_elements; i++) {
      c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
    }

    // Store the output
    wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc,
                            wmma::mem_row_major);
  }
}
// Host reference GEMM: C = alpha * (A * B) + beta * C, all-integer math.
// A is numARows x numAColumns, row-major. B is stored so element (k, j)
// lives at B[j * numBRows + k] (column of B contiguous, matching the
// device-side col_major layout). C is numCRows x numCColumns, row-major.
__host__ void matMultiplyOnHost(uint8_t *A, uint8_t *B, int *C, int alpha,
                                int beta, int numARows, int numAColumns,
                                int numBRows, int numBColumns, int numCRows,
                                int numCColumns) {
  for (int row = 0; row < numCRows; ++row) {
    for (int col = 0; col < numCColumns; ++col) {
      // Dot product of A's row with B's column, accumulated in int.
      int acc = 0;
      for (int k = 0; k < numAColumns; ++k) {
        acc += A[row * numAColumns + k] * B[col * numBRows + k];
      }

      // Blend the product into the existing C entry.
      C[row * numCColumns + col] =
          alpha * acc + beta * C[row * numCColumns + col];
    }
  }
}
// Driver: allocates and initializes host/device matrices, picks the
// shared-memory compute_gemm_imma kernel when the GPU has enough shared
// memory per SM (falling back to the simple WMMA kernel otherwise), times the
// run with CUDA events, optionally verifies against a CPU reference
// (CPU_DEBUG), and reports elapsed time and TOPS.
int main(int argc, char **argv) {
  printf("Initializing...\n");

  int dev = findCudaDevice(argc, (const char **)argv);

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

  // Tensor cores require a GPU of Volta (SM72) architecture or higher.
  // (major < 7 already excludes everything below Volta; the second clause
  // waives SM 7.0/7.1, which lack uint8 WMMA support.)
  if (deviceProp.major < 7 || (deviceProp.major <= 7 && deviceProp.minor < 2)) {
    printf(
        "immaTensorCoreGemm requires SM 7.2 or higher to use Tensor Cores. "
        "Exiting...\n");
    exit(EXIT_WAIVED);
  }

  printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES);
  printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
  printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);

  // Host-side copies of the input matrices.
  uint8_t *A_h = NULL;
  uint8_t *B_h = NULL;
  int *C_h = NULL;
#if CPU_DEBUG
  // result_hD: device result copied back; result_host: CPU reference.
  int *result_hD = NULL;
  int *result_host = NULL;
#endif

  A_h = (uint8_t *)malloc(sizeof(uint8_t) * M_GLOBAL * K_GLOBAL);
  B_h = (uint8_t *)malloc(sizeof(uint8_t) * K_GLOBAL * N_GLOBAL);
  C_h = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
#if CPU_DEBUG
  result_hD = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
  result_host = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
#endif

  // Device buffers. D receives the GEMM output.
  uint8_t *A = NULL;
  uint8_t *B = NULL;
  int *C = NULL;
  int *D = NULL;

  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&A), sizeof(uint8_t) * M_GLOBAL * K_GLOBAL));
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&B), sizeof(uint8_t) * N_GLOBAL * K_GLOBAL));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&C), sizeof(int) * M_GLOBAL * N_GLOBAL));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&D), sizeof(int) * M_GLOBAL * N_GLOBAL));

  // The kernels use 128-bit (int4) vectorized accesses; verify alignment.
  assert(((unsigned long long)A) % 128 == 0);
  assert(((unsigned long long)B) % 128 == 0);
  assert(((unsigned long long)C) % 128 == 0);
  assert(((unsigned long long)D) % 128 == 0);

  init_host_matrices(A_h, B_h, C_h);

  checkCudaErrors(cudaMemcpy(A, A_h, sizeof(uint8_t) * M_GLOBAL * K_GLOBAL,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(B, B_h, sizeof(uint8_t) * N_GLOBAL * K_GLOBAL,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(C, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemset(D, 0, sizeof(int) * M_GLOBAL * N_GLOBAL));

  printf("Preparing data for GPU...\n");

  assert(((unsigned long long)A) % 128 == 0);
  assert(((unsigned long long)B) % 128 == 0);
  assert(((unsigned long long)C) % 128 == 0);
  assert(((unsigned long long)D) % 128 == 0);

  enum {
    // Compute the right amount of shared memory to request.
    // We need shared memory to hold per-CTA C and D matrix tiles, and to cache
    // per-CTA chunks
    // of the A and B matrices. Therefore, the right amount to request is the
    // maximum of those
    // two numbers.
    SHMEM_SZ = MAX(sizeof(uint8_t) * (BLOCK_COL_TILES * M) *
                       (CHUNK_K * K + SKEW_UINT8) * 2,
                   M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N *
                       (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(int))
  };

  printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL);

  int alpha = 1;
  int beta = 1;

  cudaEvent_t start, stop;

  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  checkCudaErrors(cudaEventRecord(start));

  // If enough shared memory available on the GPU use high performant kernel
  if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) {
    printf("Computing... using high performance kernel compute_gemm_imma \n");

    // Opt in to >48 KB dynamic shared memory per block where needed.
    checkCudaErrors(cudaFuncSetAttribute(
        compute_gemm_imma, cudaFuncAttributeMaxDynamicSharedMemorySize,
        SHMEM_SZ));
    checkKernelErrors(
        (compute_gemm_imma<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK,
                             SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
#if CPU_DEBUG
    checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL,
                               cudaMemcpyDeviceToHost));
#endif
  } else {
    dim3 gridDim;
    dim3 blockDim;

    // blockDim.x must be a multiple of warpSize
    // 128x4 means we have 16 warps and a block computes a 64x64 output tile
    blockDim.x = 128;
    blockDim.y = 4;

    // Ceil-divide the output into warp-sized WMMA tiles.
    gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) /
                (WMMA_M * blockDim.x / 32);
    gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);

    printf("Computing... using simple_wmma_gemm_imma kernel\n");
    simple_wmma_gemm_imma<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL,
                                                 K_GLOBAL, alpha, beta);
#if CPU_DEBUG
    checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL,
                               cudaMemcpyDeviceToHost));
#endif
  }

  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));

#if CPU_DEBUG
  printf("Verifying correctness of the computations...\n");

  // Seed the CPU reference with the original C, then run the host GEMM.
  memcpy(result_host, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL);

  matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL,
                    K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL);

  // Integer GEMM is exact, so any nonzero difference is a mismatch.
  for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
    if (abs(result_hD[i] - result_host[i]) > 0) {
      printf("mismatch i=%d result_hD=%d result_host=%d\n", i, result_hD[i],
             result_host[i]);
    }
  }
  free(result_host);
  free(result_hD);
#endif

  float milliseconds = 0;

  checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

  printf("Time: %f ms\n", milliseconds);
  // 2*M*N*K integer ops, scaled to tera-ops per second.
  printf("TOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12);

  // NOTE(review): `start`/`stop` events are never destroyed
  // (cudaEventDestroy); harmless at process exit but worth tidying.
  free(A_h);
  free(B_h);
  free(C_h);
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(D)));

  return EXIT_SUCCESS;
}
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2012
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2012</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v110</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 13.00
|
||||||
|
# Visual Studio 2013
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2013</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v120</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 14.00
|
||||||
|
# Visual Studio 2015
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2015</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v140</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2017
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
108
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2017</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v141</PlatformToolset>
|
||||||
|
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -46,6 +46,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Runtime API, Linear Algebra
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Driver API, Matrix Multiply
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -107,6 +107,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
301
Samples/nvJPEG/Makefile
Normal file
|
@ -0,0 +1,301 @@
|
||||||
|
################################################################################
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions
|
||||||
|
# are met:
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived
|
||||||
|
# from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
#
|
||||||
|
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Location of the CUDA Toolkit
|
||||||
|
CUDA_PATH ?= /usr/local/cuda
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# start deprecated interface #
|
||||||
|
##############################
|
||||||
|
ifeq ($(x86_64),1)
|
||||||
|
$(info WARNING - x86_64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||||
|
TARGET_ARCH ?= x86_64
|
||||||
|
endif
|
||||||
|
ifeq ($(ARMv7),1)
|
||||||
|
$(info WARNING - ARMv7 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||||
|
TARGET_ARCH ?= armv7l
|
||||||
|
endif
|
||||||
|
ifeq ($(aarch64),1)
|
||||||
|
$(info WARNING - aarch64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||||
|
TARGET_ARCH ?= aarch64
|
||||||
|
endif
|
||||||
|
ifeq ($(ppc64le),1)
|
||||||
|
$(info WARNING - ppc64le variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||||
|
TARGET_ARCH ?= ppc64le
|
||||||
|
endif
|
||||||
|
ifneq ($(GCC),)
|
||||||
|
$(info WARNING - GCC variable has been deprecated)
|
||||||
|
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||||
|
HOST_COMPILER ?= $(GCC)
|
||||||
|
endif
|
||||||
|
ifneq ($(abi),)
|
||||||
|
$(error ERROR - abi variable has been removed)
|
||||||
|
endif
|
||||||
|
############################
|
||||||
|
# end deprecated interface #
|
||||||
|
############################
|
||||||
|
|
||||||
|
# architecture
|
||||||
|
HOST_ARCH := $(shell uname -m)
|
||||||
|
TARGET_ARCH ?= $(HOST_ARCH)
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
|
||||||
|
TARGET_SIZE := 64
|
||||||
|
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||||
|
TARGET_SIZE := 32
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||||
|
endif
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
|
||||||
|
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||||
|
TARGET_ARCH = armv7l
|
||||||
|
endif
|
||||||
|
|
||||||
|
# operating system
|
||||||
|
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||||
|
TARGET_OS ?= $(HOST_OS)
|
||||||
|
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||||
|
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# host compiler
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||||
|
HOST_COMPILER ?= clang++
|
||||||
|
endif
|
||||||
|
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||||
|
ifeq ($(TARGET_OS),linux)
|
||||||
|
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
ifeq ($(TARGET_OS), linux)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
|
||||||
|
else ifeq ($(TARGET_OS), android)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
HOST_COMPILER ?= g++
|
||||||
|
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||||
|
|
||||||
|
# internal flags
|
||||||
|
NVCCFLAGS := -m${TARGET_SIZE}
|
||||||
|
CCFLAGS :=
|
||||||
|
LDFLAGS :=
|
||||||
|
|
||||||
|
# build flags
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||||
|
CCFLAGS += -arch $(HOST_ARCH)
|
||||||
|
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||||
|
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||||
|
CCFLAGS += -mfloat-abi=hard
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
LDFLAGS += -pie
|
||||||
|
CCFLAGS += -fpie -fpic -fexceptions
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||||
|
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
CCFLAGS += -DWIN_INTERFACE_CUSTOM
|
||||||
|
LDFLAGS += -lsocket
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Install directory of different arch
|
||||||
|
CUDA_INSTALL_TARGET_DIR :=
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debug build flags
|
||||||
|
ifeq ($(dbg),1)
|
||||||
|
NVCCFLAGS += -g -G
|
||||||
|
BUILD_TYPE := debug
|
||||||
|
else
|
||||||
|
BUILD_TYPE := release
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS :=
|
||||||
|
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
# This sample is not supported on Mac OSX
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
$(info >>> WARNING - nvJPEG is not supported on Mac OSX - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
# This sample is not supported on ARMv7
|
||||||
|
ifeq ($(TARGET_ARCH),armv7l)
|
||||||
|
$(info >>> WARNING - nvJPEG is not supported on ARMv7 - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
# This sample is not supported on aarch64
|
||||||
|
ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
$(info >>> WARNING - nvJPEG is not supported on aarch64 - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_LDFLAGS :=
|
||||||
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||||
|
|
||||||
|
# Common includes and paths for CUDA
|
||||||
|
INCLUDES := -I../../Common
|
||||||
|
LIBRARIES :=
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
LIBRARIES += -lnvjpeg
|
||||||
|
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
EXEC ?= @echo "[@]"
|
||||||
|
endif
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Target rules
|
||||||
|
all: build
|
||||||
|
|
||||||
|
build: nvJPEG
|
||||||
|
|
||||||
|
check.deps:
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
@echo "Sample will be waived due to the above missing dependencies"
|
||||||
|
else
|
||||||
|
@echo "Sample is ready - all dependencies have been met"
|
||||||
|
endif
|
||||||
|
|
||||||
|
nvJPEG.o:nvJPEG.cpp
|
||||||
|
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||||
|
|
||||||
|
nvJPEG: nvJPEG.o
|
||||||
|
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||||
|
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
|
||||||
|
run: build
|
||||||
|
$(EXEC) ./nvJPEG
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f nvJPEG nvJPEG.o
|
||||||
|
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/nvJPEG
|
||||||
|
|
||||||
|
clobber: clean
|
58
Samples/nvJPEG/NsightEclipse.xml
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||||
|
<entry>
|
||||||
|
<name>nvJPEG</name>
|
||||||
|
<description><![CDATA[A CUDA Sample that demonstrates single and batched decoding of jpeg images using NVJPEG Library.]]></description>
|
||||||
|
<devicecompilation>whole</devicecompilation>
|
||||||
|
<includepaths>
|
||||||
|
<path>./</path>
|
||||||
|
<path>../</path>
|
||||||
|
<path>../../common/inc</path>
|
||||||
|
</includepaths>
|
||||||
|
<keyconcepts>
|
||||||
|
<concept level="basic">Image Decoding</concept>
|
||||||
|
<concept level="basic">NVJPEG Library</concept>
|
||||||
|
</keyconcepts>
|
||||||
|
<keywords>
|
||||||
|
<keyword>NVJPEG</keyword>
|
||||||
|
<keyword>JPEG Decoding</keyword>
|
||||||
|
</keywords>
|
||||||
|
<libraries>
|
||||||
|
<library>nvjpeg</library>
|
||||||
|
</libraries>
|
||||||
|
<librarypaths>
|
||||||
|
</librarypaths>
|
||||||
|
<nsight_eclipse>true</nsight_eclipse>
|
||||||
|
<primary_file>nvJPEG.cpp</primary_file>
|
||||||
|
<qatests>
|
||||||
|
<qatest>-i ../../../../Samples/nvJPEG/images/</qatest>
|
||||||
|
</qatests>
|
||||||
|
<required_dependencies>
|
||||||
|
<dependency>NVJPEG</dependency>
|
||||||
|
</required_dependencies>
|
||||||
|
<scopes>
|
||||||
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
|
<scope>3:JPEG Decoding</scope>
|
||||||
|
</scopes>
|
||||||
|
<sm-arch>sm30</sm-arch>
|
||||||
|
<sm-arch>sm35</sm-arch>
|
||||||
|
<sm-arch>sm37</sm-arch>
|
||||||
|
<sm-arch>sm50</sm-arch>
|
||||||
|
<sm-arch>sm52</sm-arch>
|
||||||
|
<sm-arch>sm60</sm-arch>
|
||||||
|
<sm-arch>sm61</sm-arch>
|
||||||
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
|
<sm-arch>sm75</sm-arch>
|
||||||
|
<supported_envs>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
</supported_envs>
|
||||||
|
<supported_sm_architectures>
|
||||||
|
<from>3.0</from>
|
||||||
|
</supported_sm_architectures>
|
||||||
|
<title>NVJPEG simple</title>
|
||||||
|
<type>exe</type>
|
||||||
|
</entry>
|
61
Samples/nvJPEG/README.md
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# nvJPEG - NVJPEG simple
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
A CUDA Sample that demonstrates single and batched decoding of jpeg images using NVJPEG Library.
|
||||||
|
|
||||||
|
## Key Concepts
|
||||||
|
|
||||||
|
Image Decoding, NVJPEG Library
|
||||||
|
|
||||||
|
## Supported SM Architectures
|
||||||
|
|
||||||
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
|
## Supported OSes
|
||||||
|
|
||||||
|
Linux
|
||||||
|
|
||||||
|
## Supported CPU Architecture
|
||||||
|
|
||||||
|
x86_64
|
||||||
|
|
||||||
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
## Dependencies needed to build/run
|
||||||
|
[NVJPEG](../../README.md#nvjpeg)
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
|
## Build and Run
|
||||||
|
|
||||||
|
### Linux
|
||||||
|
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64.
|
||||||
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
|
`$ make TARGET_ARCH=x86_64` <br/>
|
||||||
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
|
||||||
|
```
|
||||||
|
$ make SMS="50 60"
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=g++
|
||||||
|
```
|
||||||
|
|
||||||
|
## References (for more details)
|
||||||
|
|
BIN
Samples/nvJPEG/images/img1.jpg
Normal file
After Width: | Height: | Size: 66 KiB |
BIN
Samples/nvJPEG/images/img2.jpg
Normal file
After Width: | Height: | Size: 50 KiB |
BIN
Samples/nvJPEG/images/img3.jpg
Normal file
After Width: | Height: | Size: 34 KiB |
BIN
Samples/nvJPEG/images/img4.jpg
Normal file
After Width: | Height: | Size: 30 KiB |
BIN
Samples/nvJPEG/images/img5.jpg
Normal file
After Width: | Height: | Size: 80 KiB |
BIN
Samples/nvJPEG/images/img6.jpg
Normal file
After Width: | Height: | Size: 63 KiB |
BIN
Samples/nvJPEG/images/img7.jpg
Normal file
After Width: | Height: | Size: 92 KiB |
BIN
Samples/nvJPEG/images/img8.jpg
Normal file
After Width: | Height: | Size: 52 KiB |
559
Samples/nvJPEG/nvJPEG.cpp
Normal file
|
@ -0,0 +1,559 @@
|
||||||
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This sample needs at least CUDA 10.0. It demonstrates usages of the nvJPEG
|
||||||
|
// library nvJPEG supports single and multiple image(batched) decode. Multiple
|
||||||
|
// images can be decoded using the API for batch mode
|
||||||
|
|
||||||
|
#include <cuda_runtime_api.h>
|
||||||
|
#include "nvJPEG_helper.hxx"
|
||||||
|
|
||||||
|
int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); }
|
||||||
|
|
||||||
|
int dev_free(void *p) { return (int)cudaFree(p); }
|
||||||
|
|
||||||
|
typedef std::vector<std::string> FileNames;
|
||||||
|
typedef std::vector<std::vector<char> > FileData;
|
||||||
|
|
||||||
|
struct decode_params_t {
|
||||||
|
std::string input_dir;
|
||||||
|
int batch_size;
|
||||||
|
int total_images;
|
||||||
|
int dev;
|
||||||
|
int warmup;
|
||||||
|
|
||||||
|
nvjpegJpegState_t nvjpeg_state;
|
||||||
|
nvjpegHandle_t nvjpeg_handle;
|
||||||
|
cudaStream_t stream;
|
||||||
|
|
||||||
|
nvjpegOutputFormat_t fmt;
|
||||||
|
bool write_decoded;
|
||||||
|
std::string output_dir;
|
||||||
|
|
||||||
|
bool pipelined;
|
||||||
|
bool batched;
|
||||||
|
};
|
||||||
|
|
||||||
|
int read_next_batch(FileNames &image_names, int batch_size,
|
||||||
|
FileNames::iterator &cur_iter, FileData &raw_data,
|
||||||
|
std::vector<size_t> &raw_len, FileNames ¤t_names) {
|
||||||
|
int counter = 0;
|
||||||
|
|
||||||
|
while (counter < batch_size) {
|
||||||
|
if (cur_iter == image_names.end()) {
|
||||||
|
std::cerr << "Image list is too short to fill the batch, adding files "
|
||||||
|
"from the beginning of the image list"
|
||||||
|
<< std::endl;
|
||||||
|
cur_iter = image_names.begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (image_names.size() == 0) {
|
||||||
|
std::cerr << "No valid images left in the input list, exit" << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read an image from disk.
|
||||||
|
std::ifstream input(cur_iter->c_str(),
|
||||||
|
std::ios::in | std::ios::binary | std::ios::ate);
|
||||||
|
if (!(input.is_open())) {
|
||||||
|
std::cerr << "Cannot open image: " << *cur_iter
|
||||||
|
<< ", removing it from image list" << std::endl;
|
||||||
|
image_names.erase(cur_iter);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the size
|
||||||
|
std::streamsize file_size = input.tellg();
|
||||||
|
input.seekg(0, std::ios::beg);
|
||||||
|
// resize if buffer is too small
|
||||||
|
if (raw_data[counter].size() < file_size) {
|
||||||
|
raw_data[counter].resize(file_size);
|
||||||
|
}
|
||||||
|
if (!input.read(raw_data[counter].data(), file_size)) {
|
||||||
|
std::cerr << "Cannot read from file: " << *cur_iter
|
||||||
|
<< ", removing it from image list" << std::endl;
|
||||||
|
image_names.erase(cur_iter);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
raw_len[counter] = file_size;
|
||||||
|
|
||||||
|
current_names[counter] = *cur_iter;
|
||||||
|
|
||||||
|
counter++;
|
||||||
|
cur_iter++;
|
||||||
|
}
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare buffers for RGBi output format
|
||||||
|
int prepare_buffers(FileData &file_data, std::vector<size_t> &file_len,
|
||||||
|
std::vector<int> &img_width, std::vector<int> &img_height,
|
||||||
|
std::vector<nvjpegImage_t> &ibuf,
|
||||||
|
std::vector<nvjpegImage_t> &isz, FileNames ¤t_names,
|
||||||
|
decode_params_t ¶ms) {
|
||||||
|
int widths[NVJPEG_MAX_COMPONENT];
|
||||||
|
int heights[NVJPEG_MAX_COMPONENT];
|
||||||
|
int channels;
|
||||||
|
nvjpegChromaSubsampling_t subsampling;
|
||||||
|
|
||||||
|
for (int i = 0; i < file_data.size(); i++) {
|
||||||
|
checkCudaErrors(nvjpegGetImageInfo(
|
||||||
|
params.nvjpeg_handle, (unsigned char *)file_data[i].data(), file_len[i],
|
||||||
|
&channels, &subsampling, widths, heights));
|
||||||
|
|
||||||
|
img_width[i] = widths[0];
|
||||||
|
img_height[i] = heights[0];
|
||||||
|
|
||||||
|
std::cout << "Processing: " << current_names[i] << std::endl;
|
||||||
|
std::cout << "Image is " << channels << " channels." << std::endl;
|
||||||
|
for (int c = 0; c < channels; c++) {
|
||||||
|
std::cout << "Channel #" << c << " size: " << widths[c] << " x "
|
||||||
|
<< heights[c] << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (subsampling) {
|
||||||
|
case NVJPEG_CSS_444:
|
||||||
|
std::cout << "YUV 4:4:4 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_440:
|
||||||
|
std::cout << "YUV 4:4:0 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_422:
|
||||||
|
std::cout << "YUV 4:2:2 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_420:
|
||||||
|
std::cout << "YUV 4:2:0 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_411:
|
||||||
|
std::cout << "YUV 4:1:1 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_410:
|
||||||
|
std::cout << "YUV 4:1:0 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_GRAY:
|
||||||
|
std::cout << "Grayscale JPEG " << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_UNKNOWN:
|
||||||
|
std::cout << "Unknown chroma subsampling" << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
|
int mul = 1;
|
||||||
|
// in the case of interleaved RGB output, write only to single channel, but
|
||||||
|
// 3 samples at once
|
||||||
|
if (params.fmt == NVJPEG_OUTPUT_RGBI || params.fmt == NVJPEG_OUTPUT_BGRI) {
|
||||||
|
channels = 1;
|
||||||
|
mul = 3;
|
||||||
|
}
|
||||||
|
// in the case of rgb create 3 buffers with sizes of original image
|
||||||
|
else if (params.fmt == NVJPEG_OUTPUT_RGB ||
|
||||||
|
params.fmt == NVJPEG_OUTPUT_BGR) {
|
||||||
|
channels = 3;
|
||||||
|
widths[1] = widths[2] = widths[0];
|
||||||
|
heights[1] = heights[2] = heights[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
// realloc output buffer if required
|
||||||
|
for (int c = 0; c < channels; c++) {
|
||||||
|
int aw = mul * widths[c];
|
||||||
|
int ah = heights[c];
|
||||||
|
int sz = aw * ah;
|
||||||
|
ibuf[i].pitch[c] = aw;
|
||||||
|
if (sz > isz[i].pitch[c]) {
|
||||||
|
if (ibuf[i].channel[c]) {
|
||||||
|
checkCudaErrors(cudaFree(ibuf[i].channel[c]));
|
||||||
|
}
|
||||||
|
checkCudaErrors(cudaMalloc(&ibuf[i].channel[c], sz));
|
||||||
|
isz[i].pitch[c] = sz;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
void release_buffers(std::vector<nvjpegImage_t> &ibuf) {
|
||||||
|
for (int i = 0; i < ibuf.size(); i++) {
|
||||||
|
for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++)
|
||||||
|
if (ibuf[i].channel[c]) checkCudaErrors(cudaFree(ibuf[i].channel[c]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
|
||||||
|
std::vector<nvjpegImage_t> &out, decode_params_t ¶ms,
|
||||||
|
double &time) {
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
nvjpegStatus_t err;
|
||||||
|
StopWatchInterface *timer = NULL;
|
||||||
|
sdkCreateTimer(&timer);
|
||||||
|
|
||||||
|
if (!params.batched) {
|
||||||
|
if (!params.pipelined) // decode one image at a time
|
||||||
|
{
|
||||||
|
int thread_idx = 0;
|
||||||
|
sdkStartTimer(&timer);
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
checkCudaErrors(nvjpegDecode(params.nvjpeg_handle, params.nvjpeg_state,
|
||||||
|
(const unsigned char *)img_data[i].data(),
|
||||||
|
img_len[i], params.fmt, &out[i],
|
||||||
|
params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int thread_idx = 0;
|
||||||
|
sdkStartTimer(&timer);
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
checkCudaErrors(
|
||||||
|
nvjpegDecodePhaseOne(params.nvjpeg_handle, params.nvjpeg_state,
|
||||||
|
(const unsigned char *)img_data[i].data(),
|
||||||
|
img_len[i], params.fmt, params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
checkCudaErrors(nvjpegDecodePhaseTwo(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, params.stream));
|
||||||
|
checkCudaErrors(nvjpegDecodePhaseThree(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, &out[i], params.stream));
|
||||||
|
}
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
std::vector<const unsigned char *> raw_inputs;
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
raw_inputs.push_back((const unsigned char *)img_data[i].data());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!params.pipelined) // decode multiple images in a single batch
|
||||||
|
{
|
||||||
|
sdkStartTimer(&timer);
|
||||||
|
checkCudaErrors(nvjpegDecodeBatched(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(),
|
||||||
|
img_len.data(), out.data(), params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
} else {
|
||||||
|
int thread_idx = 0;
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
checkCudaErrors(nvjpegDecodeBatchedPhaseOne(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, raw_inputs[i],
|
||||||
|
img_len[i], i, thread_idx, params.stream));
|
||||||
|
}
|
||||||
|
checkCudaErrors(nvjpegDecodeBatchedPhaseTwo(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, params.stream));
|
||||||
|
checkCudaErrors(nvjpegDecodeBatchedPhaseThree(params.nvjpeg_handle,
|
||||||
|
params.nvjpeg_state,
|
||||||
|
out.data(), params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sdkStopTimer(&timer);
|
||||||
|
time = sdkGetAverageTimerValue(&timer)/1000.0f;
|
||||||
|
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
int write_images(std::vector<nvjpegImage_t> &iout, std::vector<int> &widths,
|
||||||
|
std::vector<int> &heights, decode_params_t ¶ms,
|
||||||
|
FileNames &filenames) {
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
// Get the file name, without extension.
|
||||||
|
// This will be used to rename the output file.
|
||||||
|
size_t position = filenames[i].rfind("/");
|
||||||
|
std::string sFileName =
|
||||||
|
(std::string::npos == position)
|
||||||
|
? filenames[i]
|
||||||
|
: filenames[i].substr(position + 1, filenames[i].size());
|
||||||
|
position = sFileName.rfind(".");
|
||||||
|
sFileName = (std::string::npos == position) ? sFileName
|
||||||
|
: sFileName.substr(0, position);
|
||||||
|
std::string fname(params.output_dir + "/" + sFileName + ".bmp");
|
||||||
|
|
||||||
|
int err;
|
||||||
|
if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) {
|
||||||
|
err = writeBMP(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
|
||||||
|
iout[i].channel[1], iout[i].pitch[1], iout[i].channel[2],
|
||||||
|
iout[i].pitch[2], widths[i], heights[i]);
|
||||||
|
} else if (params.fmt == NVJPEG_OUTPUT_RGBI ||
|
||||||
|
params.fmt == NVJPEG_OUTPUT_BGRI) {
|
||||||
|
// Write BMP from interleaved data
|
||||||
|
err = writeBMPi(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
|
||||||
|
widths[i], heights[i]);
|
||||||
|
}
|
||||||
|
if (err) {
|
||||||
|
std::cout << "Cannot write output file: " << fname << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
std::cout << "Done writing decoded image to file: " << fname << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decodes params.total_images images in batches of params.batch_size,
// accumulating the decode time of all measured (non-warmup) batches.
//
// image_names - input JPEG file paths; iteration wraps over the list when
//               fewer files than total_images are available
// params      - decoder configuration and nvJPEG handles; params.stream is
//               created here and destroyed before returning
// total       - out: summed decode time of measured batches
//
// Returns EXIT_SUCCESS / EXIT_FAILURE.
// NOTE(review): return type is `double` but only int exit codes are
// returned; callers treat the result as a boolean - confirm before changing
// the signature.
double process_images(FileNames &image_names, decode_params_t &params,
                      double &total) {
  // vector for storing raw files and file lengths
  FileData file_data(params.batch_size);
  std::vector<size_t> file_len(params.batch_size);
  FileNames current_names(params.batch_size);
  std::vector<int> widths(params.batch_size);
  std::vector<int> heights(params.batch_size);
  // we wrap over image files to process total_images of files
  FileNames::iterator file_iter = image_names.begin();

  // stream for decoding
  checkCudaErrors(
      cudaStreamCreateWithFlags(&params.stream, cudaStreamNonBlocking));

  int total_processed = 0;

  // output buffers
  std::vector<nvjpegImage_t> iout(params.batch_size);
  // output buffer sizes, for convenience
  std::vector<nvjpegImage_t> isz(params.batch_size);

  // Zero out channel pointers and pitches - presumably so prepare_buffers can
  // tell which slots still need allocation (TODO confirm against
  // prepare_buffers).
  for (int i = 0; i < iout.size(); i++) {
    for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) {
      iout[i].channel[c] = NULL;
      iout[i].pitch[c] = 0;
      isz[i].pitch[c] = 0;
    }
  }

  double test_time = 0;
  int warmup = 0;
  while (total_processed < params.total_images) {
    // Read the next batch of files from disk; any failure aborts the run.
    if (read_next_batch(image_names, params.batch_size, file_iter, file_data,
                        file_len, current_names))
      return EXIT_FAILURE;

    // (Re)allocate output buffers as needed for this batch's dimensions.
    if (prepare_buffers(file_data, file_len, widths, heights, iout, isz,
                        current_names, params))
      return EXIT_FAILURE;

    double time;
    if (decode_images(file_data, file_len, iout, params, time))
      return EXIT_FAILURE;
    // The first params.warmup batches are excluded from the timing totals
    // and from the processed-image count.
    if (warmup < params.warmup) {
      warmup++;
    } else {
      total_processed += params.batch_size;
      test_time += time;
    }

    // Optionally dump the decoded batch as BMP files (-o flag).
    if (params.write_decoded)
      write_images(iout, widths, heights, params, current_names);
  }
  total = test_time;

  release_buffers(iout);

  checkCudaErrors(cudaStreamDestroy(params.stream));

  return EXIT_SUCCESS;
}
|
||||||
|
|
||||||
|
// parse parameters
|
||||||
|
// parse parameters
// Scans argv for an exact occurrence of `parm` (compared over at most 100
// characters) and returns its index.
//
// Returns the index of the single occurrence, -1 if the parameter is absent,
// and -1 (after printing an error) if it appears more than once.
int findParamIndex(const char **argv, int argc, const char *parm) {
  int count = 0;
  int index = -1;

  for (int i = 0; i < argc; i++) {
    if (strncmp(argv[i], parm, 100) == 0) {
      index = i;
      count++;
    }
  }

  // Fix: the original had an unreachable trailing `return -1;` after this
  // if/else; both branches already return.
  if (count == 0 || count == 1) {
    return index;
  }

  std::cout << "Error, parameter " << parm
            << " has been specified more than once, exiting\n"
            << std::endl;
  return -1;
}
|
||||||
|
|
||||||
|
int main(int argc, const char *argv[]) {
|
||||||
|
int pidx;
|
||||||
|
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-h")) != -1 ||
|
||||||
|
(pidx = findParamIndex(argv, argc, "--help")) != -1) {
|
||||||
|
std::cout << "Usage: " << argv[0]
|
||||||
|
<< " -i images_dir [-b batch_size] [-t total_images] [-device= "
|
||||||
|
"device_id] [-w warmup_iterations] [-o output_dir] "
|
||||||
|
"[-pipelined] [-batched] [-fmt output_format]\n";
|
||||||
|
std::cout << "Parameters: " << std::endl;
|
||||||
|
std::cout << "\timages_dir\t:\tPath to single image or directory of images"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\tbatch_size\t:\tDecode images from input by batches of "
|
||||||
|
"specified size"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\ttotal_images\t:\tDecode this much images, if there are "
|
||||||
|
"less images \n"
|
||||||
|
<< "\t\t\t\t\tin the input than total images, decoder will loop "
|
||||||
|
"over the input"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\tdevice_id\t:\tWhich device to use for decoding"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\twarmup_iterations\t:\tRun this amount of batches first "
|
||||||
|
"without measuring performance"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout
|
||||||
|
<< "\toutput_dir\t:\tWrite decoded images as BMPs to this directory"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\tpipelined\t:\tUse decoding in phases" << std::endl;
|
||||||
|
std::cout << "\tbatched\t\t:\tUse batched interface" << std::endl;
|
||||||
|
std::cout << "\toutput_format\t:\tnvJPEG output format for decoding. One "
|
||||||
|
"of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]"
|
||||||
|
<< std::endl;
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
decode_params_t params;
|
||||||
|
|
||||||
|
params.input_dir = "./";
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-i")) != -1) {
|
||||||
|
params.input_dir = argv[pidx + 1];
|
||||||
|
} else {
|
||||||
|
std::cerr << "Please specify input directory with encoded images"
|
||||||
|
<< std::endl;
|
||||||
|
return EXIT_WAIVED;
|
||||||
|
}
|
||||||
|
|
||||||
|
params.batch_size = 1;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-b")) != -1) {
|
||||||
|
params.batch_size = std::atoi(argv[pidx + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
params.total_images = -1;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-t")) != -1) {
|
||||||
|
params.total_images = std::atoi(argv[pidx + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
params.dev = 0;
|
||||||
|
params.dev = findCudaDevice(argc, argv);
|
||||||
|
|
||||||
|
params.warmup = 0;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-w")) != -1) {
|
||||||
|
params.warmup = std::atoi(argv[pidx + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
params.batched = false;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-batched")) != -1) {
|
||||||
|
params.batched = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
params.pipelined = false;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-pipelined")) != -1) {
|
||||||
|
params.pipelined = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
params.fmt = NVJPEG_OUTPUT_RGB;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) {
|
||||||
|
std::string sfmt = argv[pidx + 1];
|
||||||
|
if (sfmt == "rgb")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_RGB;
|
||||||
|
else if (sfmt == "bgr")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_BGR;
|
||||||
|
else if (sfmt == "rgbi")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_RGBI;
|
||||||
|
else if (sfmt == "bgri")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_BGRI;
|
||||||
|
else if (sfmt == "yuv")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_YUV;
|
||||||
|
else if (sfmt == "y")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_Y;
|
||||||
|
else if (sfmt == "unchanged")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_UNCHANGED;
|
||||||
|
else {
|
||||||
|
std::cout << "Unknown format: " << sfmt << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
params.write_decoded = false;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-o")) != -1) {
|
||||||
|
params.output_dir = argv[pidx + 1];
|
||||||
|
if (params.fmt != NVJPEG_OUTPUT_RGB && params.fmt != NVJPEG_OUTPUT_BGR &&
|
||||||
|
params.fmt != NVJPEG_OUTPUT_RGBI && params.fmt != NVJPEG_OUTPUT_BGRI) {
|
||||||
|
std::cout << "We can write ony BMPs, which require output format be "
|
||||||
|
"either RGB/BGR or RGBi/BGRi"
|
||||||
|
<< std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
params.write_decoded = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaDeviceProp props;
|
||||||
|
checkCudaErrors(cudaGetDeviceProperties(&props, params.dev));
|
||||||
|
|
||||||
|
printf("Using GPU %d (%s, %d SMs, %d th/SM max, CC %d.%d, ECC %s)\n",
|
||||||
|
params.dev, props.name, props.multiProcessorCount,
|
||||||
|
props.maxThreadsPerMultiProcessor, props.major, props.minor,
|
||||||
|
props.ECCEnabled ? "on" : "off");
|
||||||
|
|
||||||
|
nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free};
|
||||||
|
checkCudaErrors(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, &dev_allocator,
|
||||||
|
¶ms.nvjpeg_handle));
|
||||||
|
checkCudaErrors(
|
||||||
|
nvjpegJpegStateCreate(params.nvjpeg_handle, ¶ms.nvjpeg_state));
|
||||||
|
checkCudaErrors(
|
||||||
|
nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
|
||||||
|
params.batch_size, 1, params.fmt));
|
||||||
|
|
||||||
|
// read source images
|
||||||
|
FileNames image_names;
|
||||||
|
readInput(params.input_dir, image_names);
|
||||||
|
|
||||||
|
if (params.total_images == -1) {
|
||||||
|
params.total_images = image_names.size();
|
||||||
|
} else if (params.total_images % params.batch_size) {
|
||||||
|
params.total_images =
|
||||||
|
((params.total_images) / params.batch_size) * params.batch_size;
|
||||||
|
std::cout << "Changing total_images number to " << params.total_images
|
||||||
|
<< " to be multiple of batch_size - " << params.batch_size
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Decoding images in directory: " << params.input_dir
|
||||||
|
<< ", total " << params.total_images << ", batchsize "
|
||||||
|
<< params.batch_size << std::endl;
|
||||||
|
|
||||||
|
double total;
|
||||||
|
if (process_images(image_names, params, total)) return EXIT_FAILURE;
|
||||||
|
std::cout << "Total decoding time: " << total << std::endl;
|
||||||
|
std::cout << "Avg decoding time per image: " << total / params.total_images
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "Avg images per sec: " << params.total_images / total
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "Avg decoding time per batch: "
|
||||||
|
<< total / ((params.total_images + params.batch_size - 1) /
|
||||||
|
params.batch_size)
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_state));
|
||||||
|
checkCudaErrors(nvjpegDestroy(params.nvjpeg_handle));
|
||||||
|
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
338
Samples/nvJPEG/nvJPEG_helper.hxx
Normal file
|
@ -0,0 +1,338 @@
|
||||||
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This sample needs at least CUDA 10.0.
|
||||||
|
// It demonstrates usages of the nvJPEG library
|
||||||
|
|
||||||
|
#ifndef NV_JPEG_EXAMPLE
|
||||||
|
#define NV_JPEG_EXAMPLE
|
||||||
|
|
||||||
|
#include "cuda_runtime.h"
|
||||||
|
#include "nvjpeg.h"
|
||||||
|
#include "helper_cuda.h"
|
||||||
|
#include "helper_timer.h"
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <string.h> // strcmpi
|
||||||
|
#include <sys/time.h> // timings
|
||||||
|
|
||||||
|
#include <dirent.h> // linux dir traverse
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
// write bmp, input - RGB, device
|
||||||
|
int writeBMP(const char *filename, const unsigned char *d_chanR, int pitchR,
|
||||||
|
const unsigned char *d_chanG, int pitchG,
|
||||||
|
const unsigned char *d_chanB, int pitchB, int width, int height) {
|
||||||
|
unsigned int headers[13];
|
||||||
|
FILE *outfile;
|
||||||
|
int extrabytes;
|
||||||
|
int paddedsize;
|
||||||
|
int x;
|
||||||
|
int y;
|
||||||
|
int n;
|
||||||
|
int red, green, blue;
|
||||||
|
|
||||||
|
std::vector<unsigned char> vchanR(height * width);
|
||||||
|
std::vector<unsigned char> vchanG(height * width);
|
||||||
|
std::vector<unsigned char> vchanB(height * width);
|
||||||
|
unsigned char *chanR = vchanR.data();
|
||||||
|
unsigned char *chanG = vchanG.data();
|
||||||
|
unsigned char *chanB = vchanB.data();
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanR, (size_t)width, d_chanR, (size_t)pitchR,
|
||||||
|
width, height, cudaMemcpyDeviceToHost));
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanG, (size_t)width, d_chanG, (size_t)pitchR,
|
||||||
|
width, height, cudaMemcpyDeviceToHost));
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanB, (size_t)width, d_chanB, (size_t)pitchR,
|
||||||
|
width, height, cudaMemcpyDeviceToHost));
|
||||||
|
|
||||||
|
extrabytes =
|
||||||
|
4 - ((width * 3) % 4); // How many bytes of padding to add to each
|
||||||
|
// horizontal line - the size of which must
|
||||||
|
// be a multiple of 4 bytes.
|
||||||
|
if (extrabytes == 4) extrabytes = 0;
|
||||||
|
|
||||||
|
paddedsize = ((width * 3) + extrabytes) * height;
|
||||||
|
|
||||||
|
// Headers...
|
||||||
|
// Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
|
||||||
|
// "headers".
|
||||||
|
|
||||||
|
headers[0] = paddedsize + 54; // bfSize (whole file size)
|
||||||
|
headers[1] = 0; // bfReserved (both)
|
||||||
|
headers[2] = 54; // bfOffbits
|
||||||
|
headers[3] = 40; // biSize
|
||||||
|
headers[4] = width; // biWidth
|
||||||
|
headers[5] = height; // biHeight
|
||||||
|
|
||||||
|
// Would have biPlanes and biBitCount in position 6, but they're shorts.
|
||||||
|
// It's easier to write them out separately (see below) than pretend
|
||||||
|
// they're a single int, especially with endian issues...
|
||||||
|
|
||||||
|
headers[7] = 0; // biCompression
|
||||||
|
headers[8] = paddedsize; // biSizeImage
|
||||||
|
headers[9] = 0; // biXPelsPerMeter
|
||||||
|
headers[10] = 0; // biYPelsPerMeter
|
||||||
|
headers[11] = 0; // biClrUsed
|
||||||
|
headers[12] = 0; // biClrImportant
|
||||||
|
|
||||||
|
if (!(outfile = fopen(filename, "wb"))) {
|
||||||
|
std::cerr << "Cannot open file: " << filename << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers begin...
|
||||||
|
// When printing ints and shorts, we write out 1 character at a time to avoid
|
||||||
|
// endian issues.
|
||||||
|
//
|
||||||
|
fprintf(outfile, "BM");
|
||||||
|
|
||||||
|
for (n = 0; n <= 5; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
// These next 4 characters are for the biPlanes and biBitCount fields.
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", 1);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
fprintf(outfile, "%c", 24);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
|
||||||
|
for (n = 7; n <= 12; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers done, now write the data...
|
||||||
|
//
|
||||||
|
|
||||||
|
for (y = height - 1; y >= 0;
|
||||||
|
y--) // BMP image format is written from bottom to top...
|
||||||
|
{
|
||||||
|
for (x = 0; x <= width - 1; x++) {
|
||||||
|
red = chanR[y * width + x];
|
||||||
|
green = chanG[y * width + x];
|
||||||
|
blue = chanB[y * width + x];
|
||||||
|
|
||||||
|
if (red > 255) red = 255;
|
||||||
|
if (red < 0) red = 0;
|
||||||
|
if (green > 255) green = 255;
|
||||||
|
if (green < 0) green = 0;
|
||||||
|
if (blue > 255) blue = 255;
|
||||||
|
if (blue < 0) blue = 0;
|
||||||
|
// Also, it's written in (b,g,r) format...
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", blue);
|
||||||
|
fprintf(outfile, "%c", green);
|
||||||
|
fprintf(outfile, "%c", red);
|
||||||
|
}
|
||||||
|
if (extrabytes) // See above - BMP lines must be of lengths divisible by 4.
|
||||||
|
{
|
||||||
|
for (n = 1; n <= extrabytes; n++) {
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(outfile);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// write bmp, input - RGB, device
|
||||||
|
int writeBMPi(const char *filename, const unsigned char *d_RGB, int pitch,
|
||||||
|
int width, int height) {
|
||||||
|
unsigned int headers[13];
|
||||||
|
FILE *outfile;
|
||||||
|
int extrabytes;
|
||||||
|
int paddedsize;
|
||||||
|
int x;
|
||||||
|
int y;
|
||||||
|
int n;
|
||||||
|
int red, green, blue;
|
||||||
|
|
||||||
|
std::vector<unsigned char> vchanRGB(height * width * 3);
|
||||||
|
unsigned char *chanRGB = vchanRGB.data();
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanRGB, (size_t)width * 3, d_RGB, (size_t)pitch,
|
||||||
|
width * 3, height, cudaMemcpyDeviceToHost));
|
||||||
|
|
||||||
|
extrabytes =
|
||||||
|
4 - ((width * 3) % 4); // How many bytes of padding to add to each
|
||||||
|
// horizontal line - the size of which must
|
||||||
|
// be a multiple of 4 bytes.
|
||||||
|
if (extrabytes == 4) extrabytes = 0;
|
||||||
|
|
||||||
|
paddedsize = ((width * 3) + extrabytes) * height;
|
||||||
|
|
||||||
|
// Headers...
|
||||||
|
// Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
|
||||||
|
// "headers".
|
||||||
|
headers[0] = paddedsize + 54; // bfSize (whole file size)
|
||||||
|
headers[1] = 0; // bfReserved (both)
|
||||||
|
headers[2] = 54; // bfOffbits
|
||||||
|
headers[3] = 40; // biSize
|
||||||
|
headers[4] = width; // biWidth
|
||||||
|
headers[5] = height; // biHeight
|
||||||
|
|
||||||
|
// Would have biPlanes and biBitCount in position 6, but they're shorts.
|
||||||
|
// It's easier to write them out separately (see below) than pretend
|
||||||
|
// they're a single int, especially with endian issues...
|
||||||
|
|
||||||
|
headers[7] = 0; // biCompression
|
||||||
|
headers[8] = paddedsize; // biSizeImage
|
||||||
|
headers[9] = 0; // biXPelsPerMeter
|
||||||
|
headers[10] = 0; // biYPelsPerMeter
|
||||||
|
headers[11] = 0; // biClrUsed
|
||||||
|
headers[12] = 0; // biClrImportant
|
||||||
|
|
||||||
|
if (!(outfile = fopen(filename, "wb"))) {
|
||||||
|
std::cerr << "Cannot open file: " << filename << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers begin...
|
||||||
|
// When printing ints and shorts, we write out 1 character at a time to avoid
|
||||||
|
// endian issues.
|
||||||
|
//
|
||||||
|
|
||||||
|
fprintf(outfile, "BM");
|
||||||
|
|
||||||
|
for (n = 0; n <= 5; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
// These next 4 characters are for the biPlanes and biBitCount fields.
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", 1);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
fprintf(outfile, "%c", 24);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
|
||||||
|
for (n = 7; n <= 12; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers done, now write the data...
|
||||||
|
//
|
||||||
|
for (y = height - 1; y >= 0;
|
||||||
|
y--) // BMP image format is written from bottom to top...
|
||||||
|
{
|
||||||
|
for (x = 0; x <= width - 1; x++) {
|
||||||
|
red = chanRGB[(y * width + x) * 3];
|
||||||
|
green = chanRGB[(y * width + x) * 3 + 1];
|
||||||
|
blue = chanRGB[(y * width + x) * 3 + 2];
|
||||||
|
|
||||||
|
if (red > 255) red = 255;
|
||||||
|
if (red < 0) red = 0;
|
||||||
|
if (green > 255) green = 255;
|
||||||
|
if (green < 0) green = 0;
|
||||||
|
if (blue > 255) blue = 255;
|
||||||
|
if (blue < 0) blue = 0;
|
||||||
|
// Also, it's written in (b,g,r) format...
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", blue);
|
||||||
|
fprintf(outfile, "%c", green);
|
||||||
|
fprintf(outfile, "%c", red);
|
||||||
|
}
|
||||||
|
if (extrabytes) // See above - BMP lines must be of lengths divisible by 4.
|
||||||
|
{
|
||||||
|
for (n = 1; n <= extrabytes; n++) {
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(outfile);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collects input file paths: if sInputPath is a regular file it is appended
// to `filelist`; if it is a directory, all regular files in it (recursing
// into subdirectories) are appended.
//
// Returns 0 on success, 1 when the path cannot be stat'ed or opened.
int readInput(const std::string &sInputPath,
              std::vector<std::string> &filelist) {
  int error_code = 1;
  struct stat s;

  if (stat(sInputPath.c_str(), &s) == 0) {
    if (s.st_mode & S_IFREG) {
      filelist.push_back(sInputPath);
    } else if (s.st_mode & S_IFDIR) {
      // Fix: ensure the directory path ends with a separator so that
      // concatenated entry names form valid paths even when the caller
      // omitted the trailing '/' (the original produced e.g. "dirfile.jpg").
      std::string sDirPath = sInputPath;
      if (sDirPath.empty() || sDirPath[sDirPath.size() - 1] != '/') {
        sDirPath += "/";
      }
      // processing each file in directory
      DIR *dir_handle;
      struct dirent *dir;
      dir_handle = opendir(sInputPath.c_str());
      if (dir_handle) {
        error_code = 0;
        while ((dir = readdir(dir_handle)) != NULL) {
          if (dir->d_type == DT_REG) {
            std::string sFileName = sDirPath + dir->d_name;
            filelist.push_back(sFileName);
          } else if (dir->d_type == DT_DIR) {
            std::string sname = dir->d_name;
            // Skip the self and parent entries to avoid infinite recursion.
            if (sname != "." && sname != "..") {
              readInput(sDirPath + sname + "/", filelist);
            }
          }
        }
        closedir(dir_handle);
      } else {
        std::cout << "Cannot open input directory: " << sInputPath << std::endl;
        return error_code;
      }
    } else {
      std::cout << "Cannot open input: " << sInputPath << std::endl;
      return error_code;
    }
  } else {
    std::cout << "Cannot find input path " << sInputPath << std::endl;
    return error_code;
  }

  return 0;
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -48,6 +48,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Performance Strategies, Asynchronous Data Transfers, Unified Virtual Address Spa
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|