diff --git a/Common/FreeImage/lib/linux/sbsa/libfreeimage.a b/Common/FreeImage/lib/linux/sbsa/libfreeimage.a
new file mode 100644
index 00000000..23b3759f
Binary files /dev/null and b/Common/FreeImage/lib/linux/sbsa/libfreeimage.a differ
diff --git a/Common/helper_cuda.h b/Common/helper_cuda.h
index 53f5b2bf..ff803350 100644
--- a/Common/helper_cuda.h
+++ b/Common/helper_cuda.h
@@ -663,6 +663,7 @@ inline int _ConvertSMVer2Cores(int major, int minor) {
       {0x70, 64},
       {0x72, 64},
       {0x75, 64},
+      {0x80, 64},
       {-1, -1}};

   int index = 0;
@@ -707,6 +708,7 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
       {0x70, "Volta"},
       {0x72, "Xavier"},
       {0x75, "Turing"},
+      {0x80, "Ampere"},
       {-1, "Graphics Device"}};

   int index = 0;
@@ -817,7 +819,19 @@ inline int gpuGetMaxGflopsDeviceId() {
     }
     int multiProcessorCount = 0, clockRate = 0;
     checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
-    checkCudaErrors(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device));
+    cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
+    if (result != cudaSuccess) {
+      // If the cudaDevAttrClockRate attribute is not supported, set clockRate
+      // to 1 so the ranking falls back to the GPU with the most SMs and CUDA cores.
+      if (result == cudaErrorInvalidValue) {
+        clockRate = 1;
+      }
+      else {
+        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+                static_cast<int>(result), _cudaGetErrorEnum(result));
+        exit(EXIT_FAILURE);
+      }
+    }
     uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;

     if (compute_perf > max_compute_perf) {
diff --git a/Common/helper_multiprocess.cpp b/Common/helper_multiprocess.cpp
index a339240a..36e224bd 100644
--- a/Common/helper_multiprocess.cpp
+++ b/Common/helper_multiprocess.cpp
@@ -475,38 +475,19 @@ int ipcSendData(HANDLE mailslot, const void *data, size_t sz) {
 }

 int ipcRecvData(ipcHandle *handle, void *data, size_t sz) {
-  DWORD cbMessage, cMessage, cbRead;
-  BOOL fResult;
+  DWORD cbRead = 0;

-  cbMessage = cMessage = cbRead = 0;
-  HANDLE mailslot = handle->hMailslot[0];
-
-pollMailSlot:
-  fResult = GetMailslotInfo(mailslot, (LPDWORD)NULL, &cbMessage, &cMessage,
-                            (LPDWORD)NULL);
-  if (!fResult) {
-    printf("IPC failure: GetMailslotInfo failed with %d.\n", GetLastError());
+  if (!ReadFile(handle->hMailslot[0], data, (DWORD)sz, &cbRead, NULL)) {
+    printf("IPC failure: ReadFile failed with %d.\n", GetLastError());
     return -1;
   }

-  if (cbMessage == MAILSLOT_NO_MESSAGE) {
-    goto pollMailSlot;
+  if (sz != (size_t)cbRead) {
+    printf(
+        "IPC failure: ReadFile didn't receive the expected number of bytes\n");
+    return -1;
   }

-  while (cMessage != 0) {
-    fResult = ReadFile(mailslot, data, (DWORD)sz, &cbRead, NULL);
-    if (!fResult) {
-      printf("IPC failure: ReadFile failed with %d.\n", GetLastError());
-      return -1;
-    }
-
-    fResult = GetMailslotInfo(mailslot, (LPDWORD)NULL, &cbMessage, &cMessage,
-                              (LPDWORD)NULL);
-    if (!fResult) {
-      printf("IPC failure: GetMailslotInfo failed (%d)\n", GetLastError());
-      return -1;
-    }
-  }
   return 0;
 }
@@ -530,7 +511,7 @@ int ipcSendShareableHandles(
       printf("IPC failure: DuplicateHandle failed (%d)\n", GetLastError());
       return -1;
     }
-    checkIpcErrors(ipcSendData(handle->hMailslot[i], &hDup, sizeof(HANDLE)));
+    checkIpcErrors(ipcSendData(handle->hMailslot[i], &hDup, sizeof(hDup)));
   }
   CloseHandle(hProcess);
 }
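Reviewer note: the helper_cuda.h hunk above treats `cudaErrorInvalidValue` returned by `cudaDeviceGetAttribute` as "attribute not supported on this platform" and substitutes `clockRate = 1`, so the device ranking degrades gracefully to SM and CUDA-core count. A minimal standalone sketch of the same pattern, assuming device 0 and a hypothetical host program that is not part of the sample tree:

```
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int clockRate = 0;
  // Query an attribute that some platforms may not report.
  cudaError_t err = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, 0);
  if (err == cudaErrorInvalidValue) {
    // Attribute unsupported here: use a neutral value so a
    // multiProcessorCount * coresPerSM * clockRate ranking still works.
    clockRate = 1;
  } else if (err != cudaSuccess) {
    fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n",
            cudaGetErrorString(err));
    return 1;
  }
  printf("clockRate used for ranking: %d kHz\n", clockRate);
  return 0;
}
```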
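The nvrtc_helper.h change below makes runtime compilation target the attached GPU's compute capability instead of a fixed architecture. For orientation, a hedged sketch of how such a `--gpu-architecture=compute_XY` option reaches NVRTC; the kernel string, program name, and hardcoded 8.0 capability are illustrative only:

```
#include <cstdio>
#include <string>
#include <nvrtc.h>

int main() {
  const char *src =
      "extern \"C\" __global__ void scale(float *p) { p[threadIdx.x] *= 2.f; }";
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, src, "scale.cu", 0, NULL, NULL);

  // In the helper these values come from cuDeviceGetAttribute().
  int major = 8, minor = 0;
  std::string arch = "--gpu-architecture=compute_" +
                     std::to_string(major) + std::to_string(minor);
  const char *opts[] = {arch.c_str()};

  nvrtcResult res = nvrtcCompileProgram(prog, 1, opts);
  size_t ptxSize = 0;
  nvrtcGetPTXSize(prog, &ptxSize);
  printf("compile %s, %zu bytes of PTX\n",
         res == NVRTC_SUCCESS ? "succeeded" : "failed", ptxSize);
  nvrtcDestroyProgram(&prog);
  return 0;
}
```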
diff --git a/Common/nvrtc_helper.h b/Common/nvrtc_helper.h
index 1ba82eb4..92225d76 100644
--- a/Common/nvrtc_helper.h
+++ b/Common/nvrtc_helper.h
@@ -68,7 +68,37 @@ void compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult,
   int numCompileOptions = 0;

-  char *compileParams[1];
+  char *compileParams[2];
+
+  int major = 0, minor = 0;
+  char deviceName[256];
+
+  // Picks the best CUDA device available
+  CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
+
+  // Get the compute capability and the device name
+  checkCudaErrors(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+
+  {
+    // Compile for the GPU arch on which the CUDA kernel will run.
+    std::string compileOptions;
+    compileOptions = "--gpu-architecture=compute_";
+
+    compileParams[numCompileOptions] = reinterpret_cast<char *>(
+        malloc(sizeof(char) * (compileOptions.length() + 10)));
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 10),
+              "%s%d%d", compileOptions.c_str(), major, minor);
+#else
+    snprintf(compileParams[numCompileOptions], compileOptions.size() + 10, "%s%d%d",
+             compileOptions.c_str(), major, minor);
+#endif
+  }
+
+  numCompileOptions++;

   if (requiresCGheaders) {
     std::string compileOptions;
@@ -92,13 +122,13 @@ void compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult,
           argv[0]);
     }
     compileOptions += path.c_str();
-    compileParams[0] = reinterpret_cast<char *>(
+    compileParams[numCompileOptions] = reinterpret_cast<char *>(
         malloc(sizeof(char) * (compileOptions.length() + 1)));
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    sprintf_s(compileParams[0], sizeof(char) * (compileOptions.length() + 1),
+    sprintf_s(compileParams[numCompileOptions], sizeof(char) * (compileOptions.length() + 1),
               "%s", compileOptions.c_str());
 #else
-    snprintf(compileParams[0], compileOptions.size(), "%s",
+    snprintf(compileParams[numCompileOptions], compileOptions.size(), "%s",
              compileOptions.c_str());
 #endif
     numCompileOptions++;
@@ -137,7 +167,9 @@ void compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult,
   *ptxResult = ptx;
   *ptxResultSize = ptxSize;

-  if (requiresCGheaders) free(compileParams[0]);
+  for (int i = 0; i < numCompileOptions; i++) {
+    free(compileParams[i]);
+  }
 }

 CUmodule loadPTX(char *ptx, int argc, char **argv) {
diff --git a/README.md b/README.md
index 0a6a9f42..2650266e 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,25 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA developers that demonstrate features in the CUDA Toolkit. This version supports [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

 This section describes the release notes for the CUDA Samples on GitHub only.

+### CUDA 11.0
+* Added `dmmaTensorCoreGemm`. Demonstrates double-precision GEMM computation using the double-precision Warp Matrix Multiply and Accumulate (WMMA) API introduced with CUDA 11 for Ampere chip family tensor cores.
+* Added `bf16TensorCoreGemm`. Demonstrates __nv_bfloat16 (e8m7) GEMM computation using the __nv_bfloat16 WMMA API introduced with CUDA 11 for Ampere chip family tensor cores.
+* Added `tf32TensorCoreGemm`. Demonstrates tf32 (e8m10) GEMM computation using the tf32 WMMA API introduced with CUDA 11 for Ampere chip family tensor cores.
+* Added `globalToShmemAsyncCopy`. Demonstrates asynchronous copy of data from global to shared memory on compute capability 8.0 or higher, and also demonstrates the arrive-wait barrier for synchronization.
+* Added `simpleAWBarrier`. Demonstrates arrive-wait barriers.
+* Added `simpleAttributes`. Demonstrates the stream attributes that affect L2 locality.
+* Added a warp-aggregated atomic multi-bucket increment kernel using labeled_partition cooperative groups to `warpAggregatedAtomicsCG`; it can be used on GPU architectures of compute capability 7.0 and above.
+* Added `binaryPartitionCG`. Demonstrates binary partition cooperative groups and reduction within the thread block.
+* Added two new reduction kernels in `reduction`: one demonstrates the reduce_add_sync intrinsic supported on compute capability 8.0, and the other uses the cooperative_groups::reduce function, introduced in CUDA 11.0, which performs thread_block_tile-level reduction.
+* Added `simpleVulkanMMAP`. Demonstrates Vulkan-CUDA interop via the cuMemMap APIs.
+* Added `concurrentKernels`. Demonstrates the use of CUDA streams for concurrent execution of several kernels on a GPU.
+* Dropped Mac OSX support from all samples.
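For reviewers new to the tensor-core entries above: a minimal sketch of one warp computing an 8x8 double-precision tile with the WMMA API, assuming the m8n8k4 double-precision fragment shape and an sm_80 build. This is an illustration, not the dmmaTensorCoreGemm kernel itself:

```
#include <mma.h>
using namespace nvcuda;

// One warp computes D = A * B + C for a single tile: A is 8x4 row-major,
// B is 4x8 column-major, the accumulator is 8x8. Compile with -arch=sm_80.
__global__ void dwmma8x8x4(const double *a, const double *b, double *d) {
  wmma::fragment<wmma::matrix_a, 8, 8, 4, double, wmma::row_major> fa;
  wmma::fragment<wmma::matrix_b, 8, 8, 4, double, wmma::col_major> fb;
  wmma::fragment<wmma::accumulator, 8, 8, 4, double> acc;

  wmma::fill_fragment(acc, 0.0);
  wmma::load_matrix_sync(fa, a, 4);   // leading dimension of A
  wmma::load_matrix_sync(fb, b, 4);   // leading dimension of B
  wmma::mma_sync(acc, fa, fb, acc);
  wmma::store_matrix_sync(d, acc, 8, wmma::mem_row_major);
}
```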
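The `globalToShmemAsyncCopy` and `simpleAWBarrier` notes refer to the new asynchronous-copy and arrive-wait primitives from libcu++. A hedged sketch combining the two (cuda::barrier needs compute capability 7.0 or higher and the copy is hardware-accelerated on 8.0; assumes n is a multiple of blockDim.x and a launch with blockDim.x * sizeof(float) bytes of dynamic shared memory):

```
#include <cuda/barrier>

__global__ void stageThroughShmem(const float *in, float *out, int n) {
  extern __shared__ float smem[];
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;

  if (threadIdx.x == 0) init(&bar, blockDim.x);  // one arrival per thread
  __syncthreads();

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Each thread issues an asynchronous global->shared copy tied to the barrier.
  cuda::memcpy_async(&smem[threadIdx.x], &in[i], sizeof(float), bar);
  bar.arrive_and_wait();  // arrive-wait: block until all copies have landed

  if (i < n) out[i] = 2.0f * smem[threadIdx.x];
}
```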
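Likewise, for the `reduction` note: a standalone sketch of the two primitives it mentions, cooperative_groups::reduce over a thread_block_tile (CUDA 11.0) and the __reduce_add_sync warp intrinsic (compute capability 8.0). Grid-size handling is simplified and this is not the sample's kernel:

```
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__global__ void tileSum(const int *in, int *out, int n) {
  cg::thread_block block = cg::this_thread_block();
  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int v = (i < n) ? in[i] : 0;

  // Tile-level reduction, new in CUDA 11.0; every thread receives the result.
  int sum = cg::reduce(tile, v, cg::plus<int>());

#if __CUDA_ARCH__ >= 800
  // Equivalent warp-wide add via the new intrinsic on compute capability 8.0.
  sum = __reduce_add_sync(0xffffffff, v);
#endif

  if (tile.thread_rank() == 0) atomicAdd(out, sum);
}
```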
### CUDA 10.2
* Added `simpleD3D11`. Demonstrates CUDA-D3D11 External Resource Interoperability APIs for updating D3D11 buffers from CUDA and synchronization between D3D11 and CUDA with Keyed Mutexes.
* Added `simpleDrvRuntime`. Demonstrates CUDA Driver and Runtime APIs working together to load fatbinary of a CUDA kernel.
@@ -69,8 +83,8 @@ This is the first release of CUDA Samples on GitHub:

### Prerequisites

-Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
-For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html), and the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html).
+Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+For system requirements and installation instructions of the CUDA Toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).

### Getting the CUDA Samples

@@ -121,68 +135,39 @@ The samples makefiles can take advantage of certain options:
    $ make HOST_COMPILER=g++
    ```

-### Mac
-The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-
-The samples makefiles can take advantage of certain options:
-
-* **dbg=1** - build with debug symbols
-   ```
-   $ make dbg=1
-   ```
-
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
-   ```
-   $ make SMS="A B ..."
-   ```
-
-* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
- ``` - $ make HOST_COMPILER=clang - ``` - ## Samples list ### Samples by OS #### Linux -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | ---|---|---|---| -**[simpleIPC](./Samples/simpleIPC)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | -**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[nvJPEG](./Samples/nvJPEG)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | -**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[cudaNvSci](./Samples/cudaNvSci)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | -**[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | -**[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | -**[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | +**[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | +**[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | +**[nvJPEG](./Samples/nvJPEG)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | +**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[cudaNvSci](./Samples/cudaNvSci)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | +**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | +**[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +**[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | +**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | 
**[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | -**[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | +**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | #### Windows -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | ---|---|---|---| -**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | -**[nvJPEG](./Samples/nvJPEG)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | -**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | -**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | -**[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | -**[simpleD3D11](./Samples/simpleD3D11)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | +**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | +**[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[nvJPEG](./Samples/nvJPEG)** | +**[simpleD3D12](./Samples/simpleD3D12)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | +**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | +**[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | 
**[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | +**[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | +**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | +**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | -**[matrixMul](./Samples/matrixMul)** | - -#### Mac OSX -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | ----|---|---|---| -**[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | -**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | -**[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | -**[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | -**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | +**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[matrixMul](./Samples/matrixMul)** | ## Dependencies diff --git a/Samples/EGLStream_CUDA_Interop/Makefile b/Samples/EGLStream_CUDA_Interop/Makefile index ea0562c6..0048e566 100644 --- a/Samples/EGLStream_CUDA_Interop/Makefile +++ b/Samples/EGLStream_CUDA_Interop/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. 
Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -289,6 +310,10 @@ else CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif @@ -303,12 +328,19 @@ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= 
$(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) diff --git a/Samples/EGLStream_CUDA_Interop/NsightEclipse.xml b/Samples/EGLStream_CUDA_Interop/NsightEclipse.xml index d2151c75..62b856f3 100644 --- a/Samples/EGLStream_CUDA_Interop/NsightEclipse.xml +++ b/Samples/EGLStream_CUDA_Interop/NsightEclipse.xml @@ -49,7 +49,6 @@ 1:CUDA Basic Topics 2:Graphics Interop - sm30 sm35 sm37 sm50 @@ -59,6 +58,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/EGLStream_CUDA_Interop/README.md b/Samples/EGLStream_CUDA_Interop/README.md index ad696626..01fb11ca 100644 --- a/Samples/EGLStream_CUDA_Interop/README.md +++ b/Samples/EGLStream_CUDA_Interop/README.md @@ -10,7 +10,7 @@ EGLStreams Interop ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount, ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/EGLStream_CUDA_Interop/cuda_producer.cpp b/Samples/EGLStream_CUDA_Interop/cuda_producer.cpp index 32d0f7b8..266d2a22 100644 --- a/Samples/EGLStream_CUDA_Interop/cuda_producer.cpp +++ b/Samples/EGLStream_CUDA_Interop/cuda_producer.cpp @@ -245,12 +245,30 @@ CUresult cudaProducerTest(test_cuda_producer_s *cudaProducer, char *file) { cudaEgl.eglColorFormat = eglColorFormat; cudaEgl.cuFormat = CU_AD_FORMAT_UNSIGNED_INT8; + static int numFramesPresented = 0; + // If there is a frame presented before we check if consumer + // is done with it using cuEGLStreamProducerReturnFrame. 
+  while (numFramesPresented) {
+    CUeglFrame returnedCudaEgl;
+    cuStatus = cuEGLStreamProducerReturnFrame(&cudaProducer->cudaConn,
+                                              &returnedCudaEgl, NULL);
+    if (cuStatus == CUDA_ERROR_LAUNCH_TIMEOUT) {
+      continue;
+    } else if (cuStatus != CUDA_SUCCESS) {
+      printf("cuda Producer return frame FAILED with custatus= %d\n", cuStatus);
+      return cuStatus;
+    } else {
+      numFramesPresented--;
+    }
+  }
+
   cuStatus =
       cuEGLStreamProducerPresentFrame(&cudaProducer->cudaConn, cudaEgl, NULL);
   if (cuStatus != CUDA_SUCCESS) {
     printf("cuda Producer present frame FAILED with custatus= %d\n", cuStatus);
     goto done;
   }
+  numFramesPresented++;

 done:
   if (file_p) {
@@ -281,6 +299,13 @@ CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer,
       "%d.%d\n\n",
       device, deviceName, major, minor);

+  if (major < 6) {
+    printf(
+        "EGLStreams_CUDA_Interop requires SM 6.0 or higher arch GPU. "
+        "Exiting...\n");
+    exit(2);  // EXIT_WAIVED
+  }
+
   if (CUDA_SUCCESS !=
       (status = cuCtxCreate(&cudaProducer->context, 0, device))) {
     printf("failed to create CUDA context\n");
diff --git a/Samples/EGLStream_CUDA_Interop/findegl.mk b/Samples/EGLStream_CUDA_Interop/findegl.mk
index 5116fc37..242b8592 100644
--- a/Samples/EGLStream_CUDA_Interop/findegl.mk
+++ b/Samples/EGLStream_CUDA_Interop/findegl.mk
@@ -133,10 +133,14 @@ ifeq ("$(TARGET_OS)","linux")
     else
 endif

+ifeq ("$(TARGET_OS)","qnx")
+    HOST_CCFLAGS := -V5.4.0,gcc_ntoaarch64le
+endif
+
 # Attempt to compile a minimal EGL application and run to check if EGL_SUPPORT_REUSE_NV is supported in the EGL headers available.
 ifneq ($(SAMPLE_ENABLED), 0)
     $(shell printf "#include <EGL/egl.h>\n#include <EGL/eglext.h>\nint main() {\n#ifdef EGL_SUPPORT_REUSE_NV \n #error \"Compatible EGL header found\" \n return 0;\n#endif \n return 1;\n}" > test.c; )
-    EGL_DEFINES := $(shell $(HOST_COMPILER) $(CCFLAGS) $(EXTRA_CCFLAGS) -lEGL test.c -c 2>&1 | grep -ic "Compatible EGL header found";)
+    EGL_DEFINES := $(shell $(HOST_COMPILER) $(HOST_CCFLAGS) $(CCFLAGS) $(EXTRA_CCFLAGS) -lEGL test.c -c 2>&1 | grep -ic "Compatible EGL header found";)
     SHOULD_WAIVE := 0
     ifeq ($(EGL_DEFINES),0)
         SHOULD_WAIVE := 1
diff --git a/Samples/MersenneTwisterGP11213/Makefile b/Samples/MersenneTwisterGP11213/Makefile
index bbfadc91..a9654cbc 100644
--- a/Samples/MersenneTwisterGP11213/Makefile
+++ b/Samples/MersenneTwisterGP11213/Makefile
@@ -72,9 +72,9 @@ endif

 # architecture
 HOST_ARCH := $(shell uname -m)
 TARGET_ARCH ?= $(HOST_ARCH)
-ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
-        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
             TARGET_SIZE := 64
         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
             TARGET_SIZE := 32
@@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 else
     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -251,8 +272,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility diff --git 
a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2012.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2012.vcxproj index fb20727b..c7091104 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2012.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/MersenneTwisterGP11213.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2013.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2013.vcxproj index e01d2796..ce970bc4 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2013.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/MersenneTwisterGP11213.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2015.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2015.vcxproj index 29a7faa7..f12e35e3 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2015.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/MersenneTwisterGP11213.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj index 808565d6..0bceeb33 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/MersenneTwisterGP11213.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj index ef1b3619..c087c18c 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/MersenneTwisterGP11213.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/MersenneTwisterGP11213/NsightEclipse.xml b/Samples/MersenneTwisterGP11213/NsightEclipse.xml index 83e5d736..50ac3a0e 100644 --- a/Samples/MersenneTwisterGP11213/NsightEclipse.xml +++ b/Samples/MersenneTwisterGP11213/NsightEclipse.xml @@ -35,7 +35,6 @@ 1:CUDA Advanced Topics - sm30 sm35 sm37 sm50 @@ -45,6 +44,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/MersenneTwisterGP11213/README.md b/Samples/MersenneTwisterGP11213/README.md index a1f8d15a..efc9a43a 100644 --- a/Samples/MersenneTwisterGP11213/README.md +++ b/Samples/MersenneTwisterGP11213/README.md @@ -10,11 +10,11 @@ CURAND Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 
](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/NV12toBGRandResize/Makefile b/Samples/NV12toBGRandResize/Makefile index 543ec801..90a8e474 100644 --- a/Samples/NV12toBGRandResize/Makefile +++ b/Samples/NV12toBGRandResize/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -253,9 +274,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.vcxproj index 80845ddb..3a92cfdc 100644 --- 
a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/NV12toBGRandResize.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.vcxproj index b4cdb199..6172ba67 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/NV12toBGRandResize.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.vcxproj index da6dd792..c58d8ab8 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/NV12toBGRandResize.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj index df03ae51..e55bffb2 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/NV12toBGRandResize.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj index fd3849e4..db44e72b 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/NV12toBGRandResize.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/NV12toBGRandResize/NsightEclipse.xml 
b/Samples/NV12toBGRandResize/NsightEclipse.xml index d8467d10..8d6ec75b 100644 --- a/Samples/NV12toBGRandResize/NsightEclipse.xml +++ b/Samples/NV12toBGRandResize/NsightEclipse.xml @@ -32,7 +32,6 @@ 2:Image Processing 2:Computer Vision - sm30 sm35 sm37 sm50 @@ -42,6 +41,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/NV12toBGRandResize/README.md b/Samples/NV12toBGRandResize/README.md index a82c38ec..bcda7ae8 100644 --- a/Samples/NV12toBGRandResize/README.md +++ b/Samples/NV12toBGRandResize/README.md @@ -10,11 +10,11 @@ Graphics Interop, Image Processing, Video Processing ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ cudaMemcpy2D, cudaMallocManaged ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +66,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/UnifiedMemoryPerf/Makefile b/Samples/UnifiedMemoryPerf/Makefile index b6c7f2b4..e1ac47c9 100644 --- a/Samples/UnifiedMemoryPerf/Makefile +++ b/Samples/UnifiedMemoryPerf/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ 
-206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -253,9 +274,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/UnifiedMemoryPerf/NsightEclipse.xml b/Samples/UnifiedMemoryPerf/NsightEclipse.xml index b2567d4b..c2918849 100644 --- a/Samples/UnifiedMemoryPerf/NsightEclipse.xml +++ b/Samples/UnifiedMemoryPerf/NsightEclipse.xml @@ -44,16 +44,6 @@ 1:CUDA Systems Integration 1:Unified Memory - sm30 - sm35 - sm37 - sm50 - sm52 - sm60 - sm61 - sm70 - sm72 - sm75 x86_64 diff --git a/Samples/UnifiedMemoryPerf/README.md b/Samples/UnifiedMemoryPerf/README.md index 25fe9798..0ede1f61 100644 --- a/Samples/UnifiedMemoryPerf/README.md +++ b/Samples/UnifiedMemoryPerf/README.md @@ -10,11 +10,9 @@ CUDA Systems Integration, Unified Memory, CUDA Streams and Events, Pinned System ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) - ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -30,7 +28,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -70,29 +68,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj index 9eecd26a..de2d7376 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/UnifiedMemoryPerf.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj index 8dfff87d..2321663c 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/UnifiedMemoryPerf.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj index f87cde90..f028bf77 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/UnifiedMemoryPerf.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj index bbdc209b..6e5ff90d 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/UnifiedMemoryPerf.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj index bbb769c6..d54fbb61 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/UnifiedMemoryPerf.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + 
compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/bandwidthTest/Makefile b/Samples/bandwidthTest/Makefile index de12d615..555fe27f 100644 --- a/Samples/bandwidthTest/Makefile +++ b/Samples/bandwidthTest/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - 
-ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/bandwidthTest/NsightEclipse.xml b/Samples/bandwidthTest/NsightEclipse.xml index 5aeaead5..7a584ce8 100644 --- a/Samples/bandwidthTest/NsightEclipse.xml +++ b/Samples/bandwidthTest/NsightEclipse.xml @@ -41,7 +41,6 @@ 1:CUDA Basic Topics 1:Performance Strategies - sm30 sm35 sm37 sm50 @@ -51,6 +50,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/bandwidthTest/README.md b/Samples/bandwidthTest/README.md index b54a4ffb..fae5ea5a 100644 --- a/Samples/bandwidthTest/README.md +++ b/Samples/bandwidthTest/README.md @@ -10,11 +10,11 @@ CUDA Streams and Events, Performance Strategies ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +66,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." 
is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/bandwidthTest/bandwidthTest.cu b/Samples/bandwidthTest/bandwidthTest.cu index dbb8582e..d945add8 100644 --- a/Samples/bandwidthTest/bandwidthTest.cu +++ b/Samples/bandwidthTest/bandwidthTest.cu @@ -915,7 +915,7 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths, double dSeconds = 0.0; for (i = 0; i < count; i++) { - dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1 << 20)); + dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); printf( "bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " "bytes, NumDevsUsed = %d\n", diff --git a/Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj index c87bb9b3..321de32e 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/bandwidthTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj index 0d24fbc1..2b5266f5 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/bandwidthTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj index 694f6dae..275151fa 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/bandwidthTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj index ea6bf53e..0fed24b5 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/bandwidthTest.exe - 
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj index 7fe91fbd..2ca07898 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/bandwidthTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/bf16TensorCoreGemm/Makefile b/Samples/bf16TensorCoreGemm/Makefile new file mode 100644 index 00000000..fdce3af2 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/Makefile @@ -0,0 +1,362 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.'
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L 
$(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - bf16TensorCoreGemm is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - bf16TensorCoreGemm is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 500) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 5.0.0 <<<) + else + $(info >>> Waiving build. 
Minimum GCC version required is 5.0.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +SMS ?= 80 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: bf16TensorCoreGemm + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +bf16TensorCoreGemm.o:bf16TensorCoreGemm.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +bf16TensorCoreGemm: bf16TensorCoreGemm.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./bf16TensorCoreGemm + +clean: + rm -f bf16TensorCoreGemm bf16TensorCoreGemm.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/bf16TensorCoreGemm + +clobber: clean diff --git a/Samples/bf16TensorCoreGemm/NsightEclipse.xml b/Samples/bf16TensorCoreGemm/NsightEclipse.xml new file mode 100644 index 00000000..b5f1ba54 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/NsightEclipse.xml @@ -0,0 +1,67 @@ + + + + bf16TensorCoreGemm + + --std=c++11 + + + cudaMallocManaged + cudaDeviceSynchronize + cudaFuncSetAttribute + cudaEventCreate + cudaEventRecord + cudaEventSynchronize + cudaEventElapsedTime + cudaFree + + + whole + + ./ + ../ + ../../common/inc + + + Matrix Multiply + WMMA + Tensor Cores + + + matrix multiply + Async copy + CPP11 + GCC 5.0.0 + + + + + + true + bf16TensorCoreGemm.cu + + 1:CUDA Basic Topics + + sm80 + + + x86_64 + linux + + + aarch64 + + + windows7 + + + ppc64le + linux + + + + 8.0 + + bfloat16 Tensor Core GEMM + exe + diff --git a/Samples/bf16TensorCoreGemm/README.md b/Samples/bf16TensorCoreGemm/README.md new file mode 100644 index 00000000..e6dc6c95 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/README.md @@ -0,0 +1,70 @@ +# bf16TensorCoreGemm - bfloat16 Tensor Core GEMM + +## Description + +A CUDA sample demonstrating __nv_bfloat16 (e8m7) GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API introduced with CUDA 11 in Ampere chip family tensor cores for faster matrix operations. This sample also uses the async copy feature of the CUDA pipeline interface for asynchronous gmem-to-shmem loads, which improves kernel performance and reduces register pressure.
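+
+A minimal sketch of the async-copy pattern described above (an editorial illustration rather than code from this sample; the kernel name `stageTileAsync` and the launch sizes are invented), using the primitives interface from `cuda_pipeline.h`:
+
+```
+#include <cuda_pipeline.h>
+
+// Stage one float4 per thread from global memory into dynamic shared memory,
+// then wait for the committed copy to land before reading it back.
+__global__ void stageTileAsync(const float4 *src, float4 *dst, int n) {
+  extern __shared__ float4 tile[];
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) {
+    __pipeline_memcpy_async(&tile[threadIdx.x], &src[i], sizeof(float4));
+    __pipeline_commit();
+    __pipeline_wait_prior(0);  // 0 = wait until all committed batches complete
+    dst[i] = tile[threadIdx.x];
+  }
+}
+
+// Example launch: dynamic shared memory sized for one tile per block.
+// stageTileAsync<<<(n + 255) / 256, 256, 256 * sizeof(float4)>>>(src, dst, n);
+```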
+ +## Key Concepts + +Matrix Multiply, WMMA, Tensor Cores + +## Supported SM Architectures + +[SM 8.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEventRecord, cudaEventSynchronize, cudaEventElapsedTime, cudaFree + +## Prerequisites + +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm.cu b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm.cu new file mode 100644 index 00000000..8e4ae7f4 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm.cu @@ -0,0 +1,838 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// CUDA sample demonstrating a __nv_bfloat16 (E8M7) GEMM computation using the Warp Matrix Multiply +// and Accumulate API introduced in CUDA 11.0. + +// In this program, the compute_gemm kernel computes the result of a matrix multiplication +// and addition: D = alpha * A * B + beta * C. The dimensions of both C and D matrices +// are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x K_GLOBAL (row-major), the B matrix +// is K_GLOBAL x N_GLOBAL (column-major). +// In that kernel, each CTA computes one 128 x 128 tile of the resulting matrix +// per iteration. When the tile is computed, the CTA stores it to the global memory +// and begins a new iteration, selecting a new 128 x 128 tile to compute. +// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes eight +// 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array. 
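+// (Eight warps, each computing eight 16 x 16 subtiles, cover 64 subtiles, i.e. the whole 128 x 128 tile.)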
+// Warps compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by +// moving through the K_GLOBAL dimension of the A and B matrices and accumulating +// the intermediate result in the local thread state. + +// There are a number of simple optimizations used in the algorithm: +// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to +// shared memory. After that is done, each warp loads the C matrix fragments from +// shared memory, thus avoiding a random global memory access. +// - On each internal iteration, the CTA copies a portion of the A and B matrices from +// global memory to shared memory. After that, all warps in the CTA reuse the A and B +// data from shared memory, thus reducing the number of data copies from global memory. +// - The portions of the A and B matrices are stored in shared memory with an additional +// padding (skew) to reduce the number of shared memory access bank conflicts. +// (See a detailed explanation near the SKEW_BF16 macro definition.) +// - When the CTA finishes computing the tiles of the resulting matrix, each warp stores +// its subtiles to shared memory. The CTA then copies the shared memory contents to +// global memory, again avoiding redundant random global memory accesses. +// - Note that the CTA tile size is chosen to maximize the GPU register utilization, +// but carefully enough to avoid local memory use. + +#include <assert.h> +#include <cuda.h> +#include <cuda_bf16.h> +#include <cuda_pipeline.h> +#include <mma.h> +#include <stdio.h> + +// helper functions and utilities to work with CUDA +#include <helper_cuda.h> +#include <helper_functions.h> + +// Externally configurable parameters. + +// Switch for choosing the C++ interface for the CUDA pipeline +// vs. the primitives interface. +#define USE_CPP_API 0 + +#ifndef CPU_DEBUG +// Set this to 1 to verify the correctness of the GPU-computed matrix. +#define CPU_DEBUG 0 +#endif + +#ifndef SHARED_MEMORY_LIMIT_64K +// Set this to 0 to use more than 64 Kb of shared memory to cache data, to +// improve the performance of the computations on GPU. +// Note that you need a GPU that can have more than 64 Kb of shared memory +// per multiprocessor. +#define SHARED_MEMORY_LIMIT_64K 0 +#endif + +// GPU configuration. + +#define WARP_SIZE 32 + +// MMA matrix tile dimensions. + +#define M 16 +#define N 16 +#define K 16 + +// GEMM configuration. + +#define M_TILES 512 +#define N_TILES 512 +#define K_TILES 512 + +#define M_GLOBAL (M * M_TILES) +#define N_GLOBAL (N * N_TILES) +#define K_GLOBAL (K * K_TILES) + +#define C_LAYOUT wmma::mem_row_major + +// Implementation constants. + +#define WARPS_PER_BLOCK 8 +#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) + +#if SHARED_MEMORY_LIMIT_64K +// With only 64 Kb shared memory available, we can fit two 8-tile chunks of +// the A and B matrix data, that is (M = 16) * (K = 16) * 8 * (CHUNK_K = 8) +// * sizeof(__nv_bfloat16) = 32 Kb each. +// (i.e. two 8x8 arrays of tiles of 16x16 __nv_bfloat16-typed elements per CTA). +// But we cannot accommodate the 8 Kb total skew overhead, without which the performance +// would be severely impacted. So we choose to halve the chunk size, +// i.e. the amount of A and B matrix data we cache in shared memory. +// Accordingly, this doubles the number of outer iterations across the global K +// dimension, which only slightly impacts the performance.
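+// (Two such 32 Kb chunks, one for the A data and one for the B data, already fill the 64 Kb budget, leaving no room for the skew padding at CHUNK_K = 8.)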
+#define CHUNK_K 4 +#else +#define CHUNK_K 8 +#endif + +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(__nv_bfloat16)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) + +#define BLOCK_ROW_WARPS 2 +#define BLOCK_COL_WARPS 4 + +#define WARP_ROW_TILES 4 +#define WARP_COL_TILES 2 + +#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS) +#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS) + +#define GLOBAL_MEM_STRIDE N_GLOBAL + +#define SHMEM_STRIDE (N * BLOCK_ROW_TILES) +#define SHMEM_OFFSET (N * WARP_ROW_TILES) + +// The macro below is used to shift rows of the A matrix and columns of the B matrix +// in shared memory to minimize possible bank conflicts. +// Before performing the nvcuda::wmma::mma_sync operation, the warp must load the matrix +// data using the nvcuda::wmma::load_matrix_sync operation. Although the memory access pattern +// is not specified for that function, each lane in the warp can read one or multiple matrix +// elements from different matrix rows or columns. +// For shared memory, such access can result in bank conflicts if different rows / columns +// of the matrix map to the same bank. By shifting each row and column by a few bytes, we +// make sure that they map to different banks, thus reducing the number of possible bank +// conflicts. +// The number of 16 two-byte "__nv_bfloat16" elements is chosen as the minimum possible shift because +// we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. +#define SKEW_BF16 16 + +#define checkKernelErrors(expr) do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ +} while(0) + +enum kernels +{ + bf16mma_shmem_gemm_async_copy = 0, // __nv_bfloat16 MMA shmem using kernel with async_copy + bf16mma_shmem_gemm = 1, // __nv_bfloat16 MMA shmem using kernel normal copy (without async_copy). + simple_bf16mma_gemm = 2 // __nv_bfloat16 MMA non-shmem using simple kernel. +}; + +const char* kernelNames[] = {"compute_bf16gemm_async_copy", "compute_bf16gemm", + "simple_wmma_bf16gemm"}; + +using namespace nvcuda; +namespace nvcuda_namespace = nvcuda::experimental; + +__host__ void init_host_matrices(__nv_bfloat16 *a, __nv_bfloat16 *b, float *c) +{ + for (int i = 0; i < M_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + a[i*K_GLOBAL+j] = (__nv_bfloat16)(rand() % 3); + } + } + + for (int i = 0; i < N_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + b[i*K_GLOBAL+j] = (__nv_bfloat16)(rand() % 3); + } + } + + for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { + c[t] = (float)(rand() % 3); + } +} + +__global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, const float *C, float *D, float alpha, float beta) +{ +#if __CUDA_ARCH__ >= 800 + extern __shared__ __nv_bfloat16 shmem[][CHUNK_K * K + SKEW_BF16]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + // This pointer is used to access the C and D matrix tiles this warp computes. 
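+ // (warpId / BLOCK_ROW_WARPS selects one of four 32-row bands of the 128 x 128 tile and warpId % BLOCK_ROW_WARPS one of its two 64-column halves, giving each warp a 2 x 4 grid of 16 x 16 subtiles.)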
+ float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. + float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + + // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy from global memory to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((int4*)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + } + + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment<wmma::accumulator, M, N, K, float> c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const __nv_bfloat16 *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next.
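+ // (With CHUNK_K = 8, a line is CHUNK_LINE_BYTES = 256 bytes while a warp moves 512 bytes of int4 per iteration, so CHUNK_COPY_LINES_PER_WARP = 2: lanes 0-15 copy one line and lanes 16-31 the next.)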
+ const __nv_bfloat16 *lane_ptr = (warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL); + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + // Copy 16 bytes at once in each lane. + *((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); + + // Advance the global memory pointer and the shared memory index. + lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment<wmma::matrix_a, M, N, K, __nv_bfloat16, wmma::row_major> a[WARP_COL_TILES]; + wmma::fragment<wmma::matrix_b, M, N, K, __nv_bfloat16, wmma::col_major> b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId/BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); + const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_BF16); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_BF16); + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. + float *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +#endif +} + +__global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_bfloat16 *B, const float *C, float *D, float alpha, float beta) +{ +#if __CUDA_ARCH__ >= 800 + extern __shared__ __nv_bfloat16 shmem[][CHUNK_K * K + SKEW_BF16]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + // This pointer is used to access the C and D matrix tiles this warp computes. + float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory.
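+ // (Each of the eight warps streams one 16-row slice of the 128 x 128 tile; a row of SHMEM_STRIDE = 128 floats is covered by the 32 lanes with one 16-byte int4 apiece.)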
+ float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + +#if USE_CPP_API + nvcuda_namespace::pipeline pipe; +#endif + // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy from global memory to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < N; i++) { +#if USE_CPP_API + nvcuda_namespace::memcpy_async(*((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId), + *((int4*)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId), + pipe); + pipe.commit(); +#else + __pipeline_memcpy_async((reinterpret_cast<int4*>(&shmem_warp_stream_ptr[(SHMEM_STRIDE * i)])) + laneId, + (reinterpret_cast<const int4*>(&src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i)])) + laneId, + sizeof(int4)); + __pipeline_commit(); +#endif + } + +#if USE_CPP_API + pipe.wait_prior<0>(); +#else + __pipeline_wait_prior(0); +#endif + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment<wmma::accumulator, M, N, K, float> c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const __nv_bfloat16 *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ?
(M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. + const __nv_bfloat16 *lane_ptr = (warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL); + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + // Copy 16 bytes at once in each lane. +#if USE_CPP_API + nvcuda_namespace::memcpy_async(*((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)), + *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)), pipe); + pipe.commit(); +#else + __pipeline_memcpy_async((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES), + (int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES), sizeof(int4)); + __pipeline_commit(); +#endif + // Advance the global memory pointer and the shared memory index. + lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + +#if USE_CPP_API + pipe.wait_prior<0>(); +#else + __pipeline_wait_prior(0); +#endif + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment<wmma::matrix_a, M, N, K, __nv_bfloat16, wmma::row_major> a[WARP_COL_TILES]; + wmma::fragment<wmma::matrix_b, M, N, K, __nv_bfloat16, wmma::col_major> b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); + const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_BF16); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_BF16); + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. + float *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +#endif +} + +// Performs an MxNxK bf16 GEMM (C=alpha*A*B + beta*C) assuming: +// 1) Matrices are packed in memory. +// 2) M, N and K are multiples of 16, 16 and 16 respectively. +// 3) A is row major, B is column major matrix. +// Note: This is a less performant version of the compute_bf16gemm kernel.
It is designed for +// demonstration purposes only, to show how to use the CUDA WMMA API without relying on +// the availability of shared memory. +__global__ void simple_wmma_bf16gemm(__nv_bfloat16 *a, __nv_bfloat16 *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) +{ +#if __CUDA_ARCH__ >= 800 + // Leading dimensions. Packed with no transpositions. + int lda = k_ld; + int ldb = k_ld; + int ldc = n_ld; + + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Declare the fragments + wmma::fragment<wmma::matrix_a, M, N, K, __nv_bfloat16, wmma::row_major> a_frag; + wmma::fragment<wmma::matrix_b, M, N, K, __nv_bfloat16, wmma::col_major> b_frag; + wmma::fragment<wmma::accumulator, M, N, K, float> acc_frag; + wmma::fragment<wmma::accumulator, M, N, K, float> c_frag; + + wmma::fill_fragment(acc_frag, 0.0f); + + // Loop over k + for (int i = 0; i < k_ld; i += K) { + int aCol = i; + int aRow = warpM * M; + + int bCol = i; + int bRow = warpN * N; + + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb); + + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + + } + } + + // Load in the current value of c, scale it by beta, and add this to our result scaled by alpha + int cCol = warpN * N; + int cRow = warpM * M; + + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); + + for(int i=0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } + + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major); + } +#endif +} + +__host__ void matMultiplyOnHost(__nv_bfloat16 *A, __nv_bfloat16 *B, float *C, + float alpha, float beta, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns) +{ + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + float temp = 0.0; + + for (int k = 0; k < numAColumns; k++) { + temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k]; + } + + C[i*numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + } + } +} + +int main(int argc, char **argv) +{ + printf("Initializing...\n"); + + int dev = findCudaDevice(argc, (const char **)argv); + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + // __nv_bfloat16 Tensor Core operations require a GPU of Ampere (SM 8.X) architecture or higher. + if (deviceProp.major < 8) { + printf("bf16TensorCoreGemm requires SM 8.0 or higher to use Tensor Cores.
Exiting...\n"); + exit(EXIT_WAIVED); + } + + printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); + printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); + printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); + + __nv_bfloat16 *A_h = NULL; + __nv_bfloat16 *B_h = NULL; + float *C_h = NULL; +#if CPU_DEBUG + float *result_hD = NULL; + float *result_host = NULL; +#endif + + A_h = (__nv_bfloat16*) malloc(sizeof(__nv_bfloat16) * M_GLOBAL * K_GLOBAL); + B_h = (__nv_bfloat16*) malloc(sizeof(__nv_bfloat16) * K_GLOBAL * N_GLOBAL); + C_h = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#if CPU_DEBUG + result_hD = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); + result_host = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#endif + + __nv_bfloat16 *A = NULL; + __nv_bfloat16 *B = NULL; + float *C = NULL; + float *D = NULL; + + checkCudaErrors(cudaMalloc((void**)&A, sizeof(__nv_bfloat16) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&B, sizeof(__nv_bfloat16) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&C, sizeof(float) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&D, sizeof(float) * M_GLOBAL * N_GLOBAL)); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + init_host_matrices(A_h, B_h, C_h); + + printf("Preparing data for GPU...\n"); + + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(__nv_bfloat16) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(__nv_bfloat16) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL)); + + enum { + // Compute the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the maximum of those + // two numbers. 
+    SHMEM_SZ = MAX(sizeof(__nv_bfloat16) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_BF16) * 2,
+                   M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float))
+  };
+
+  printf("Required shared memory size: %lu KB\n", SHMEM_SZ / 1024UL);
+
+  const float alpha = 1.1f;
+  const float beta = 1.2f;
+
+  cudaEvent_t start, stop;
+
+  checkCudaErrors(cudaEventCreate(&start));
+  checkCudaErrors(cudaEventCreate(&stop));
+  checkCudaErrors(cudaEventRecord(start));
+
+  // Kernel to run - default is bf16mma_shmem_gemm_async_copy (== 0)
+  kernels selected_kernel = bf16mma_shmem_gemm_async_copy;
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) {
+    int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel");
+    if (kernel_number < 3) {
+      selected_kernel = (kernels)kernel_number;
+    }
+    else {
+      printf("Error: kernel number should be between 0 and 2, you have entered %d\n", kernel_number);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  // If enough shared memory is available on the GPU, use the high-performance kernel
+  if ((deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) && (selected_kernel != simple_bf16mma_gemm)) {
+    printf("Computing using high performance kernel = %d - %s\n", selected_kernel, kernelNames[selected_kernel]);
+
+    switch (selected_kernel)
+    {
+      case bf16mma_shmem_gemm_async_copy :
+      default:
+        checkCudaErrors(cudaFuncSetAttribute(compute_bf16gemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+        checkKernelErrors((compute_bf16gemm_async_copy<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
+        break;
+      case bf16mma_shmem_gemm :
+        checkCudaErrors(cudaFuncSetAttribute(compute_bf16gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+        checkKernelErrors((compute_bf16gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
+        break;
+    }
+#if CPU_DEBUG
+    checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost));
+#endif
+  }
+  else {
+    dim3 gridDim;
+    dim3 blockDim;
+
+    // blockDim.x must be a multiple of warpSize
+    // 128x4 means we have 16 warps and a block computes a 64x64 output tile
+    blockDim.x = 128;
+    blockDim.y = 4;
+
+    gridDim.x = (M_GLOBAL + (M * blockDim.x / 32 - 1)) / (M * blockDim.x / 32);
+    gridDim.y = (N_GLOBAL + N * blockDim.y - 1) / (N * blockDim.y);
+
+    printf("Computing... 
using simple_wmma_gemm kernel\n"); + simple_wmma_bf16gemm<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + + checkCudaErrors(cudaEventRecord(stop)); + checkCudaErrors(cudaEventSynchronize(stop)); + +#if CPU_DEBUG + printf("Verifying correctness of the computations...\n"); + + memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL); + + matMultiplyOnHost(A_h, B_h, result_host, + alpha, beta, + M_GLOBAL, K_GLOBAL, + K_GLOBAL, N_GLOBAL, + M_GLOBAL, N_GLOBAL); + + for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { + if (fabs(result_hD[i] - result_host[i]) > 0.1f) { + printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]); + } + } + free(result_hD); + free(result_host); +#endif + + float milliseconds = 0; + + checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); + + printf("Time: %f ms\n", milliseconds); + printf("TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12); + + free(A_h); + free(B_h); + free(C_h); + checkCudaErrors(cudaFree((void*)A)); + checkCudaErrors(cudaFree((void*)B)); + checkCudaErrors(cudaFree((void*)C)); + checkCudaErrors(cudaFree((void*)D)); + + return 0; +} diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2015.sln b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2015.sln new file mode 100644 index 00000000..4dc1f7bd --- /dev/null +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bf16TensorCoreGemm", "bf16TensorCoreGemm_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2015.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2015.vcxproj new file mode 100644 index 00000000..3e4465d0 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + bf16TensorCoreGemm_vs2015 + bf16TensorCoreGemm + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/bf16TensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) 
+ ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.sln b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.sln new file mode 100644 index 00000000..5af238c8 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bf16TensorCoreGemm", "bf16TensorCoreGemm_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj new file mode 100644 index 00000000..9a539065 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + bf16TensorCoreGemm_vs2017 + bf16TensorCoreGemm + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/bf16TensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.sln b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.sln new file mode 100644 index 00000000..395d03a5 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bf16TensorCoreGemm", "bf16TensorCoreGemm_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = 
postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj new file mode 100644 index 00000000..edbdc041 --- /dev/null +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + bf16TensorCoreGemm_vs2019 + bf16TensorCoreGemm + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/bf16TensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/binaryPartitionCG/Makefile b/Samples/binaryPartitionCG/Makefile new file mode 100644 index 00000000..4a108e13 --- /dev/null +++ b/Samples/binaryPartitionCG/Makefile @@ -0,0 +1,360 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L 
$(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - binaryPartitionCG is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 470) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. 
Minimum GCC version required is 4.7.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 +else +SMS ?= 35 37 50 52 60 61 70 75 80 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: binaryPartitionCG + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +binaryPartitionCG.o:binaryPartitionCG.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +binaryPartitionCG: binaryPartitionCG.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./binaryPartitionCG + +clean: + rm -f binaryPartitionCG binaryPartitionCG.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/binaryPartitionCG + +clobber: clean diff --git a/Samples/binaryPartitionCG/NsightEclipse.xml b/Samples/binaryPartitionCG/NsightEclipse.xml new file mode 100644 index 00000000..fec56867 --- /dev/null +++ b/Samples/binaryPartitionCG/NsightEclipse.xml @@ -0,0 +1,64 @@ + + + + binaryPartitionCG + + --std=c++11 + + + whole + + ./ + ../ + ../../common/inc + + + Cooperative Groups + + + CUDA + Parallel Reduction + Cooperative Groups + CPP11 + + + + + + true + binaryPartitionCG.cu + + 1:CUDA Basic Topics + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + + + x86_64 + linux + + + windows7 + + + arm + + + ppc64le + linux + + + + all + + Binary Partition Cooperative Groups + exe + diff --git a/Samples/binaryPartitionCG/README.md b/Samples/binaryPartitionCG/README.md new file mode 100644 index 00000000..113c76db --- /dev/null +++ b/Samples/binaryPartitionCG/README.md @@ -0,0 +1,67 @@ +# binaryPartitionCG - Binary Partition Cooperative Groups + +## Description + +This sample is a simple code that illustrates binary partition cooperative groups and reduce within the thread block. 
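+
+A minimal sketch of the idea, assuming the CUDA 11 `cooperative_groups` API; the kernel name and the launch-size assumption below are illustrative, not part of the sample:
+
+```
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+
+// Counts odd values and sums them. Assumes n == gridDim.x * blockDim.x so that
+// every thread of each 32-wide tile participates in the partition.
+__global__ void sketchOddCountAndSum(const int *in, int *numOdds, int *oddSum, unsigned int n) {
+  cg::thread_block block = cg::this_thread_block();
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(block);
+  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int elem = in[i];
+
+  // Split the 32-thread tile into two groups according to the predicate: odds vs. evens.
+  auto subTile = cg::binary_partition(tile32, elem & 1);
+  if (elem & 1) {
+    // Reduce within the odd partition only; one thread then publishes the results.
+    int sum = cg::reduce(subTile, elem, cg::plus<int>());
+    if (subTile.thread_rank() == 0) {
+      atomicAdd(numOdds, (int)subTile.size());
+      atomicAdd(oddSum, sum);
+    }
+  }
+}
+```
+
+The sample's kernel (shown further down in this diff) generalizes this with a grid-stride loop, handles the even partition as well, and re-synchronizes the tile after each iteration so subsequent loads stay coalesced.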
+
+## Key Concepts
+
+Cooperative Groups
+
+## Supported SM Architectures
+
+[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l
+
+## CUDA APIs involved
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory:
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/binaryPartitionCG/binaryPartitionCG.cu b/Samples/binaryPartitionCG/binaryPartitionCG.cu new file mode 100644 index 00000000..53021c44 --- /dev/null +++ b/Samples/binaryPartitionCG/binaryPartitionCG.cu @@ -0,0 +1,155 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This sample illustrates basic usage of binary partition cooperative groups + * within the thread block tile when divergent path exists. + * 1.) Each thread loads a value from random array. + * 2.) then checks if it is odd or even. + * 3.) create binary partition group based on the above predicate + * 4.) we count the number of odd/even in the group based on size of the binary groups + * 5.) write it global counter of odd. + * 6.) sum the values loaded by individual threads(using reduce) and write it to global + * even & odd elements sum. + * + * **NOTE** : binary_partition results in splitting warp into divergent thread groups + this is not good from performance perspective, but in cases where warp + divergence is inevitable one can use binary_partition group. 
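+ *
+ * Worked example: if four threads of a tile hold {3, 4, 7, 10}, the predicate
+ * (elem & 1) splits them into an odd group {3, 7} and an even group {4, 10};
+ * cg::reduce within each partition then sums 10 and 14 respectively, and the
+ * odd group's size (2) is what gets added to the global odd counter.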
+*/
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <stdio.h>
+#include <helper_cuda.h>
+
+namespace cg = cooperative_groups;
+
+void initOddEvenArr(int *inputArr, unsigned int size)
+{
+  for (int i = 0; i < size; i++)
+  {
+    inputArr[i] = rand() % 50;
+  }
+}
+
+
+/**
+ * CUDA kernel device code
+ *
+ * Creates cooperative groups and performs odd/even counting & summation.
+ */
+__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, int *sumOfOddAndEvens, unsigned int size)
+{
+  cg::thread_block cta = cg::this_thread_block();
+  cg::grid_group grid = cg::this_grid();
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
+
+  for (int i = grid.thread_rank(); i < size; i += grid.size())
+  {
+    int elem = inputArr[i];
+    auto subTile = cg::binary_partition(tile32, elem & 1);
+    if (elem & 1) // Odd numbers group
+    {
+      int oddGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
+
+      if (subTile.thread_rank() == 0)
+      {
+        // Add the number of odds present in this group of odds.
+        atomicAdd(numOfOdds, subTile.size());
+
+        // Add the local reduction of odds present in this group of odds.
+        atomicAdd(&sumOfOddAndEvens[0], oddGroupSum);
+      }
+    }
+    else // Even numbers group
+    {
+      int evenGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
+
+      if (subTile.thread_rank() == 0)
+      {
+        // Add the local reduction of evens present in this group of evens.
+        atomicAdd(&sumOfOddAndEvens[1], evenGroupSum);
+      }
+    }
+    // Reconverge the warp so that on the next loop iteration the threads that
+    // diverged above perform coalesced loads of inputArr again.
+    cg::sync(tile32);
+  }
+}
+
+
+/**
+ * Host main routine
+ */
+int main(int argc, const char **argv)
+{
+  int deviceId = findCudaDevice(argc, argv);
+  int *h_inputArr, *d_inputArr;
+  int *h_numOfOdds, *d_numOfOdds;
+  int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems;
+  unsigned int arrSize = 1024 * 100;
+
+  h_inputArr = new int[arrSize];
+  h_numOfOdds = new int[1];
+  h_sumOfOddEvenElems = new int[2];
+  initOddEvenArr(h_inputArr, arrSize);
+
+  cudaStream_t stream;
+  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int)*arrSize));
+  checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int)));
+  checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int)*2));
+
+  checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int)*arrSize, cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream));
+  checkCudaErrors(cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2*sizeof(int), stream));
+
+  // Launch the kernel
+  int threadsPerBlock = 1024;
+  int blocksPerGrid = arrSize / threadsPerBlock;
+
+  printf("\nLaunching %d blocks with %d threads...\n\n", blocksPerGrid, threadsPerBlock);
+
+  oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize);
+
+  checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, 2*sizeof(int), cudaMemcpyDeviceToHost, stream));
+  // Wait for the stream so the results copied above are valid to read on the host.
+  checkCudaErrors(cudaStreamSynchronize(stream));
+
+  printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n", arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0], h_sumOfOddEvenElems[1]);
+  printf("\n...Done.\n\n");
+
+  delete[] h_inputArr;
+  delete[] h_numOfOdds;
+  delete[] h_sumOfOddEvenElems;
+
+  checkCudaErrors(cudaFree(d_inputArr));
+  checkCudaErrors(cudaFree(d_numOfOdds));
+  checkCudaErrors(cudaFree(d_sumOfOddEvenElems));
+
+  return EXIT_SUCCESS;
+}
diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2015.sln 
b/Samples/binaryPartitionCG/binaryPartitionCG_vs2015.sln new file mode 100644 index 00000000..310beb8e --- /dev/null +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "binaryPartitionCG", "binaryPartitionCG_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2015.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2015.vcxproj new file mode 100644 index 00000000..f87df486 --- /dev/null +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + binaryPartitionCG_vs2015 + binaryPartitionCG + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/binaryPartitionCG.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.sln b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.sln new file mode 100644 index 00000000..54afd0c1 --- /dev/null +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "binaryPartitionCG", "binaryPartitionCG_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + 
GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj new file mode 100644 index 00000000..52bca786 --- /dev/null +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + binaryPartitionCG_vs2017 + binaryPartitionCG + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/binaryPartitionCG.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.sln b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.sln new file mode 100644 index 00000000..0812edd0 --- /dev/null +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "binaryPartitionCG", "binaryPartitionCG_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj new file mode 100644 index 00000000..52daed00 --- /dev/null +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + binaryPartitionCG_vs2019 + binaryPartitionCG + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + 
WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/binaryPartitionCG.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/boxFilterNPP/Makefile b/Samples/boxFilterNPP/Makefile index 057ed4ca..c0b94aa0 100644 --- a/Samples/boxFilterNPP/Makefile +++ b/Samples/boxFilterNPP/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -259,8 +280,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility diff --git a/Samples/boxFilterNPP/NsightEclipse.xml b/Samples/boxFilterNPP/NsightEclipse.xml new file mode 100644 index 00000000..8b1219f6 --- /dev/null +++ b/Samples/boxFilterNPP/NsightEclipse.xml @@ -0,0 +1,87 @@ + + + + boxFilterNPP + + whole + true + + ./Lena.pgm + + + ../../Common/UtilNPP + ../../Common/FreeImage/include + ./ + ../ + ../../common/inc + + + Performance Strategies + Image Processing + NPP Library + + + CUDA + NPP + Image Processing + box filter + + + nppisu_static + nppif_static + nppc_static + culibos + freeimage + + + ../../Common/FreeImage/lib/$(TARGET_OS) + 
../../Common/FreeImage/lib/$(TARGET_OS)/$(TARGET_ARCH) + + true + boxFilterNPP.cpp + + FreeImage + NPP + + + 1:CUDA Basic Topics + 1:Performance Strategies + 2:Image Processing + 2:Computer Vision + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + all + + Box Filter with NPP + exe + diff --git a/Samples/boxFilterNPP/README.md b/Samples/boxFilterNPP/README.md index 7aa98387..d403359c 100644 --- a/Samples/boxFilterNPP/README.md +++ b/Samples/boxFilterNPP/README.md @@ -10,11 +10,11 @@ Performance Strategies, Image Processing, NPP Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2012.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2012.vcxproj index a584b2f9..38f8880f 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2012.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/boxFilterNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2013.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2013.vcxproj index 09f9b1bc..79fa154f 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2013.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/boxFilterNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2015.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2015.vcxproj index f31e36b4..4a373318 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2015.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/boxFilterNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj index 18ed2356..3c910879 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/boxFilterNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj index eaa59028..4140360b 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/boxFilterNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cannyEdgeDetectorNPP/Makefile b/Samples/cannyEdgeDetectorNPP/Makefile index 688a4539..b5d26abe 100644 --- a/Samples/cannyEdgeDetectorNPP/Makefile +++ b/Samples/cannyEdgeDetectorNPP/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
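+# An SBSA (server-class Arm) CUDA install ships $(CUDA_PATH)/targets/sbsa-linux,
+# which the ls probe below keys off.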
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -259,8 +280,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility diff --git a/Samples/cannyEdgeDetectorNPP/NsightEclipse.xml 
b/Samples/cannyEdgeDetectorNPP/NsightEclipse.xml new file mode 100644 index 00000000..d8bc0479 --- /dev/null +++ b/Samples/cannyEdgeDetectorNPP/NsightEclipse.xml @@ -0,0 +1,83 @@ + + + + cannyEdgeDetectorNPP + + whole + true + + ../../Common/UtilNPP + ../../Common/FreeImage/include + ./ + ../ + ../../common/inc + + + Performance Strategies + Image Processing + NPP Library + + + CUDA + NPP + Image Processing + + + nppisu_static + nppif_static + nppc_static + culibos + freeimage + + + ../../Common/FreeImage/lib/$(TARGET_OS) + ../../Common/FreeImage/lib/$(TARGET_OS)/$(TARGET_ARCH) + + true + cannyEdgeDetectorNPP.cpp + + FreeImage + NPP + + + 1:CUDA Basic Topics + 1:Performance Strategies + 2:Image Processing + 2:Computer Vision + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + all + + Canny Edge Detector NPP + exe + diff --git a/Samples/cannyEdgeDetectorNPP/README.md b/Samples/cannyEdgeDetectorNPP/README.md index 0a96063c..d9a6f025 100644 --- a/Samples/cannyEdgeDetectorNPP/README.md +++ b/Samples/cannyEdgeDetectorNPP/README.md @@ -10,11 +10,11 @@ Performance Strategies, Image Processing, NPP Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. 
See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp index 4a321666..496d53b3 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp @@ -47,6 +47,27 @@ #include #include +inline int cudaDeviceInit(int argc, const char **argv) { + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + + if (deviceCount == 0) { + std::cerr << "CUDA error: no devices supporting CUDA." << std::endl; + exit(EXIT_FAILURE); + } + + int dev = findCudaDevice(argc, argv); + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name + << std::endl; + + checkCudaErrors(cudaSetDevice(dev)); + + return dev; +} + bool printfNPPinfo(int argc, char *argv[]) { const NppLibraryVersion *libVer = nppGetLibVersion(); @@ -74,7 +95,7 @@ int main(int argc, char *argv[]) { std::string sFilename; char *filePath; - int dev = findCudaDevice(argc, (const char **)argv); + cudaDeviceInit(argc, (const char **)argv); if (printfNPPinfo(argc, argv) == false) { exit(EXIT_SUCCESS); diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2012.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2012.vcxproj index d8eedb82..bcb5353d 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2012.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cannyEdgeDetectorNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2013.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2013.vcxproj index f2919adc..ab17f333 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2013.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cannyEdgeDetectorNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2015.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2015.vcxproj index fe20e581..80f14019 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2015.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cannyEdgeDetectorNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj index 65e57a3d..eb51df0b 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/cannyEdgeDetectorNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj index ddc3eb7a..5e4f6045 100644 
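Stepping back from the project files for a moment: the cannyEdgeDetectorNPP.cpp hunk above replaces a bare `findCudaDevice()` call with a local `cudaDeviceInit()` helper that also validates the device count and reports the selected GPU. Its shape, distilled into a standalone sketch (error handling through `checkCudaErrors` is elided, and `dev` is hard-wired where the sample consults the command line):

```
#include <cuda_runtime.h>
#include <iostream>

int main() {
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  if (deviceCount == 0) {
    std::cerr << "CUDA error: no devices supporting CUDA." << std::endl;
    return 1;
  }
  int dev = 0;  // the sample picks this via findCudaDevice(argc, argv)
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name
            << std::endl;
  cudaSetDevice(dev);
  return 0;
}
```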
--- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/cannyEdgeDetectorNPP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/concurrentKernels/Makefile b/Samples/concurrentKernels/Makefile new file mode 100644 index 00000000..847f090b --- /dev/null +++ b/Samples/concurrentKernels/Makefile @@ -0,0 +1,325 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L 
$(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 +else +SMS ?= 35 37 50 52 60 61 70 75 80 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: concurrentKernels + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +concurrentKernels.o:concurrentKernels.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +concurrentKernels: concurrentKernels.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./concurrentKernels + +clean: + rm -f concurrentKernels concurrentKernels.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/concurrentKernels + +clobber: clean diff --git a/Samples/concurrentKernels/NsightEclipse.xml b/Samples/concurrentKernels/NsightEclipse.xml new file mode 
100644 index 00000000..959c0d36 --- /dev/null +++ b/Samples/concurrentKernels/NsightEclipse.xml @@ -0,0 +1,63 @@ + + + + concurrentKernels + + whole + + ./ + ../ + ../../common/inc + + + Performance Strategies + + + CUDA + Concurrent Kernels + + + + + + true + concurrentKernels.cu + + 1:CUDA Advanced Topics + 1:Performance Strategies + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + all + + Concurrent Kernels + diff --git a/Samples/concurrentKernels/README.md b/Samples/concurrentKernels/README.md new file mode 100644 index 00000000..34151489 --- /dev/null +++ b/Samples/concurrentKernels/README.md @@ -0,0 +1,67 @@ +# concurrentKernels - Concurrent Kernels + +## Description + +This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function. + +## Key Concepts + +Performance Strategies + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +## Prerequisites + +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
+`$ make TARGET_ARCH=ppc64le`
+`$ make TARGET_ARCH=armv7l`
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/concurrentKernels/concurrentKernels.cu b/Samples/concurrentKernels/concurrentKernels.cu
new file mode 100644
index 00000000..b54d1c89
--- /dev/null
+++ b/Samples/concurrentKernels/concurrentKernels.cu
@@ -0,0 +1,228 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// This sample demonstrates the use of streams for concurrent execution. It also
+// illustrates how to introduce dependencies between CUDA streams with the
+// cudaStreamWaitEvent function.
+//
+
+// Devices of compute capability 2.0 or higher can overlap the kernels
+//
+#include <cooperative_groups.h>
+#include <stdio.h>
+
+namespace cg = cooperative_groups;
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+// This is a kernel that does no real work but runs at least for a specified
+// number of clocks
+__global__ void clock_block(clock_t *d_o, clock_t clock_count) {
+  unsigned int start_clock = (unsigned int)clock();
+
+  clock_t clock_offset = 0;
+
+  while (clock_offset < clock_count) {
+    unsigned int end_clock = (unsigned int)clock();
+
+    // The code below should work like
+    // this (thanks to modular arithmetic):
+    //
+    // clock_offset = (clock_t) (end_clock > start_clock ?
+    //                           end_clock - start_clock :
+    //                           end_clock + (0xffffffffu - start_clock));
+    //
+    // Indeed, let m = 2^32 then
+    // end - start = end + m - start (mod m).
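The wraparound identity in the comment above is worth pinning down before the assignment on the diff line that follows: because unsigned 32-bit subtraction is performed mod 2^32, a single `end - start` stays correct even when the raw counter wraps between the two samples. A host-side check of that identity, with made-up counter values chosen to straddle the wrap point:

```
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical raw counter samples; the counter wraps between them.
  uint32_t start_clock = 0xFFFFFFF0u;  // just below the 2^32 wrap point
  uint32_t end_clock = 0x00000010u;    // sampled after the wrap
  // end - start is computed mod 2^32, so the elapsed count is still right:
  // 0x10 + (2^32 - 0xFFFFFFF0) = 0x20.
  assert(end_clock - start_clock == 0x20u);
  return 0;
}
```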
+
+    clock_offset = (clock_t)(end_clock - start_clock);
+  }
+
+  d_o[0] = clock_offset;
+}
+
+// Single warp reduction kernel
+__global__ void sum(clock_t *d_clocks, int N) {
+  // Handle to thread block group
+  cg::thread_block cta = cg::this_thread_block();
+  __shared__ clock_t s_clocks[32];
+
+  clock_t my_sum = 0;
+
+  for (int i = threadIdx.x; i < N; i += blockDim.x) {
+    my_sum += d_clocks[i];
+  }
+
+  s_clocks[threadIdx.x] = my_sum;
+  cg::sync(cta);
+
+  for (int i = 16; i > 0; i /= 2) {
+    if (threadIdx.x < i) {
+      s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
+    }
+
+    cg::sync(cta);
+  }
+
+  d_clocks[0] = s_clocks[0];
+}
+
+int main(int argc, char **argv) {
+  int nkernels = 8;             // number of concurrent kernels
+  int nstreams = nkernels + 1;  // use one more stream than concurrent kernels
+  int nbytes = nkernels * sizeof(clock_t);  // number of data bytes
+  float kernel_time = 10;  // time the kernel should run in ms
+  float elapsed_time;      // timing variables
+  int cuda_device = 0;
+
+  printf("[%s] - Starting...\n", argv[0]);
+
+  // get number of kernels if overridden on the command line
+  if (checkCmdLineFlag(argc, (const char **)argv, "nkernels")) {
+    nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
+    nstreams = nkernels + 1;
+  }
+
+  // use command-line specified CUDA device, otherwise use device with highest
+  // Gflops/s
+  cuda_device = findCudaDevice(argc, (const char **)argv);
+
+  cudaDeviceProp deviceProp;
+  checkCudaErrors(cudaGetDevice(&cuda_device));
+
+  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+
+  if ((deviceProp.concurrentKernels == 0)) {
+    printf("> GPU does not support concurrent kernel execution\n");
+    printf("  CUDA kernel runs will be serialized\n");
+  }
+
+  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
+         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+
+  // allocate host memory
+  clock_t *a = 0;  // pointer to the array data in host memory
+  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+
+  // allocate device memory
+  clock_t *d_a = 0;  // pointers to data and init value in the device memory
+  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+
+  // allocate and initialize an array of stream handles
+  cudaStream_t *streams =
+      (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
+
+  for (int i = 0; i < nstreams; i++) {
+    checkCudaErrors(cudaStreamCreate(&(streams[i])));
+  }
+
+  // create CUDA event handles
+  cudaEvent_t start_event, stop_event;
+  checkCudaErrors(cudaEventCreate(&start_event));
+  checkCudaErrors(cudaEventCreate(&stop_event));
+
+  // the events are used for synchronization only and hence do not need to
+  // record timings; this also makes the events not introduce global sync
+  // points when recorded, which is critical to get overlap
+  cudaEvent_t *kernelEvent;
+  kernelEvent = (cudaEvent_t *)malloc(nkernels * sizeof(cudaEvent_t));
+
+  for (int i = 0; i < nkernels; i++) {
+    checkCudaErrors(
+        cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // time execution with nkernels streams
+  clock_t total_clocks = 0;
+#if defined(__arm__) || defined(__aarch64__)
+  // the kernel takes more time than the channel reset time on arm archs, so to
+  // prevent hangs reduce time_clocks.
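CUDA reports `deviceProp.clockRate` in kHz, i.e. clocks per millisecond, so the `time_clocks` computation on the next diff line multiplies a millisecond duration by it directly; the arm branch divides the rate by 1000 first, shortening the run by that factor. A host-side sanity check of the unit arithmetic, using an illustrative 1 GHz clock:

```
#include <cassert>

int main() {
  float kernel_time = 10.0f;  // desired kernel duration in ms, as in the sample
  int clockRate = 1000000;    // illustrative 1 GHz GPU; CUDA reports kHz
  // kHz is clocks per millisecond, so ms * kHz gives a clock count directly.
  long long time_clocks = (long long)(kernel_time * clockRate);
  assert(time_clocks == 10000000LL);  // 10 ms at 1 GHz = 1e7 clocks
  return 0;
}
```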
+ clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000)); +#else + clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate); +#endif + + cudaEventRecord(start_event, 0); + + // queue nkernels in separate streams and record when they are done + for (int i = 0; i < nkernels; ++i) { + clock_block<<<1, 1, 0, streams[i]>>>(&d_a[i], time_clocks); + total_clocks += time_clocks; + checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i])); + + // make the last stream wait for the kernel event to be recorded + checkCudaErrors( + cudaStreamWaitEvent(streams[nstreams - 1], kernelEvent[i], 0)); + } + + // queue a sum kernel and a copy back to host in the last stream. + // the commands in this stream get dispatched as soon as all the kernel events + // have been recorded + sum<<<1, 32, 0, streams[nstreams - 1]>>>(d_a, nkernels); + checkCudaErrors(cudaMemcpyAsync( + a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams - 1])); + + // at this point the CPU has dispatched all work for the GPU and can continue + // processing other tasks in parallel + + // in this sample we just wait until the GPU is done + checkCudaErrors(cudaEventRecord(stop_event, 0)); + checkCudaErrors(cudaEventSynchronize(stop_event)); + checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); + + printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, + nkernels * kernel_time / 1000.0f); + printf("Expected time for concurrent execution of %d kernels = %.3fs\n", + nkernels, kernel_time / 1000.0f); + printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f); + + bool bTestResult = (a[0] > total_clocks); + + // release resources + for (int i = 0; i < nkernels; i++) { + cudaStreamDestroy(streams[i]); + cudaEventDestroy(kernelEvent[i]); + } + + free(streams); + free(kernelEvent); + + cudaEventDestroy(start_event); + cudaEventDestroy(stop_event); + cudaFreeHost(a); + cudaFree(d_a); + + if (!bTestResult) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("Test passed\n"); + exit(EXIT_SUCCESS); +} diff --git a/Samples/concurrentKernels/concurrentKernels_vs2012.sln b/Samples/concurrentKernels/concurrentKernels_vs2012.sln new file mode 100644 index 00000000..9cb8a2df --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/concurrentKernels/concurrentKernels_vs2012.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2012.vcxproj new file mode 100644 index 00000000..9c8802ea --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + 
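Setting the project-file plumbing aside for a moment, the scheduling pattern in the `main()` above is the sample's core: each worker kernel is launched into its own stream, records a timing-disabled event, and the final stream waits on every such event before the gather kernel and copy run. A minimal sketch of just that dependency chain, assuming two worker streams and an empty stand-in kernel (error checks elided):

```
#include <cuda_runtime.h>
#include <cstdio>

__global__ void work() { /* stand-in for clock_block */ }

int main() {
  // Two worker streams plus one gathering stream (the sample uses nkernels + 1).
  const int n = 2;
  cudaStream_t streams[n + 1];
  cudaEvent_t done[n];
  for (int i = 0; i < n + 1; i++) cudaStreamCreate(&streams[i]);
  for (int i = 0; i < n; i++)
    cudaEventCreateWithFlags(&done[i], cudaEventDisableTiming);

  for (int i = 0; i < n; i++) {
    work<<<1, 1, 0, streams[i]>>>();              // workers run concurrently
    cudaEventRecord(done[i], streams[i]);         // mark each worker's finish
    cudaStreamWaitEvent(streams[n], done[i], 0);  // gate the last stream on it
  }
  // Anything launched in streams[n] now runs only after all workers finish.
  cudaStreamSynchronize(streams[n]);
  printf("all workers complete\n");
  return 0;
}
```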
$(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + concurrentKernels_vs2012 + concurrentKernels + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/concurrentKernels.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/concurrentKernels/concurrentKernels_vs2013.sln b/Samples/concurrentKernels/concurrentKernels_vs2013.sln new file mode 100644 index 00000000..d90f8153 --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/concurrentKernels/concurrentKernels_vs2013.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2013.vcxproj new file mode 100644 index 00000000..845be7b5 --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + concurrentKernels_vs2013 + concurrentKernels + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/concurrentKernels.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + 
MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/concurrentKernels/concurrentKernels_vs2015.sln b/Samples/concurrentKernels/concurrentKernels_vs2015.sln new file mode 100644 index 00000000..4a23c0fe --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/concurrentKernels/concurrentKernels_vs2015.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2015.vcxproj new file mode 100644 index 00000000..a50c3010 --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + concurrentKernels_vs2015 + concurrentKernels + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/concurrentKernels.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/concurrentKernels/concurrentKernels_vs2017.sln b/Samples/concurrentKernels/concurrentKernels_vs2017.sln new file mode 100644 index 00000000..64f75783 --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj new file mode 100644 index 00000000..accf8eee --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + concurrentKernels_vs2017 + concurrentKernels + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/concurrentKernels.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/concurrentKernels/concurrentKernels_vs2019.sln b/Samples/concurrentKernels/concurrentKernels_vs2019.sln new file mode 100644 index 00000000..e391d056 --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj new file mode 100644 index 00000000..30501438 --- /dev/null +++ b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + concurrentKernels_vs2019 + concurrentKernels + + + + + 
Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/concurrentKernels.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/conjugateGradientCudaGraphs/Makefile b/Samples/conjugateGradientCudaGraphs/Makefile index 35f79976..cd0c71f6 100644 --- a/Samples/conjugateGradientCudaGraphs/Makefile +++ b/Samples/conjugateGradientCudaGraphs/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml b/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml index 8362bbd9..e07b7518 100644 --- a/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml +++ b/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml @@ -48,7 +48,6 @@ 3:Linear Algebra 1:CUDA Graphs - sm30 sm35 sm37 sm50 @@ -58,6 +57,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/conjugateGradientCudaGraphs/README.md b/Samples/conjugateGradientCudaGraphs/README.md index ef16ddf8..01fc7cd5 100644 --- a/Samples/conjugateGradientCudaGraphs/README.md +++ b/Samples/conjugateGradientCudaGraphs/README.md @@ -10,11 +10,11 @@ Linear Algebra, CUBLAS Library, CUSPARSE Library ## Supported SM Architectures -[SM 3.0 
](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -70,29 +70,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
-  ```
-  $ make HOST_COMPILER=clang
-  ```
-
 ## References (for more details)
 
diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu
index 2096ea55..b1528438 100644
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu
@@ -42,13 +42,10 @@
 #include
 #include
 
-#include <cooperative_groups.h>
-
 // Utilities and system includes
 #include <helper_cuda.h>       // helper function CUDA error checking and initialization
 #include <helper_functions.h>  // helper for shared functions common to CUDA Samples
 
-namespace cg = cooperative_groups;
 
 const char *sSDKname = "conjugateGradientCudaGraphs";
 
@@ -193,6 +190,26 @@ int main(int argc, char **argv) {
   checkCudaErrors(cudaMalloc((void **)&d_na, sizeof(float)));
   checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(float)));
 
+  /* Wrap raw data into cuSPARSE generic API objects */
+  cusparseSpMatDescr_t matA = NULL;
+  checkCudaErrors(cusparseCreateCsr(
+      &matA, N, N, nz, d_row, d_col, d_val, CUSPARSE_INDEX_32I,
+      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+  cusparseDnVecDescr_t vecx = NULL;
+  checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F));
+  cusparseDnVecDescr_t vecp = NULL;
+  checkCudaErrors(cusparseCreateDnVec(&vecp, N, d_p, CUDA_R_32F));
+  cusparseDnVecDescr_t vecAx = NULL;
+  checkCudaErrors(cusparseCreateDnVec(&vecAx, N, d_Ax, CUDA_R_32F));
+
+  /* Allocate workspace for cuSPARSE */
+  size_t bufferSize = 0;
+  checkCudaErrors(cusparseSpMV_bufferSize(
+      cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
+      &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize));
+  void *buffer = NULL;
+  checkCudaErrors(cudaMalloc(&buffer, bufferSize));
+
   cusparseMatDescr_t descr = 0;
   checkCudaErrors(cusparseCreateMatDescr(&descr));
 
@@ -217,9 +234,9 @@
   beta = 0.0;
 
   checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
-  checkCudaErrors(
-      cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
-                     &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax));
+  checkCudaErrors(cusparseSpMV(
+      cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
+      &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer));
 
   checkCudaErrors(cublasSetStream(cublasHandle, stream1));
   checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1));
@@ -231,9 +248,9 @@
   k = 1;
   // First Iteration when k=1 starts
   checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1));
-  checkCudaErrors(
-      cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
-                     &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax));
+  checkCudaErrors(cusparseSpMV(
+      cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp,
+      &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer));
 
   checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
 
@@ -273,9 +290,9 @@
   checkCudaErrors(
       cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
-  checkCudaErrors(
-      cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
-                     &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax));
+  checkCudaErrors(cusparseSpMV(
+      cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp,
+      &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer));
 
   checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1));
   checkCudaErrors(cublasSdot(cublasHandle, N,
d_p, 1, d_Ax, 1, d_dot)); @@ -317,9 +334,9 @@ int main(int argc, char **argv) { cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST); checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); - checkCudaErrors(cusparseScsrmv( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, - descr, d_val, d_row, d_col, d_p, &beta, d_Ax)); + checkCudaErrors(cusparseSpMV( + cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, + &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); @@ -378,6 +395,11 @@ int main(int argc, char **argv) { cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); + if (matA ) { checkCudaErrors(cusparseDestroySpMat(matA)); } + if (vecx ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); } + if (vecAx ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); } + if (vecp ) { checkCudaErrors(cusparseDestroyDnVec(vecp)); } + free(I); free(J); free(val); diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj index 6650c372..51174a20 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj index 1982b5cb..fcbe61f3 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj index e3636933..3b613351 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git 
a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj index f9336466..25ed37df 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/conjugateGradientCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj index 9f466da5..4bedee4c 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/conjugateGradientCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiBlockCG/Makefile b/Samples/conjugateGradientMultiBlockCG/Makefile index c9a9a7a5..8d98ab7a 100644 --- a/Samples/conjugateGradientMultiBlockCG/Makefile +++ b/Samples/conjugateGradientMultiBlockCG/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
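# Note: the sbsa/aarch64 probe above works because `ls -1d` prints the
# $(CUDA_PATH)/targets/sbsa-linux path only when that directory exists, so a
# plain aarch64 toolkit (no sbsa-linux target) leaves HOST_ARCH and
# TARGET_ARCH untouched.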
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -252,6 +273,12 @@ ifeq ($(TARGET_ARCH),aarch64) SAMPLE_ENABLED := 0 endif +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - conjugateGradientMultiBlockCG is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -265,9 +292,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 60 61 70 72 75 +SMS ?= 60 61 70 72 75 80 else -SMS ?= 60 61 70 75 +SMS ?= 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml b/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml index a7b88e1e..d3dfbc29 100644 --- a/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml +++ b/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml @@ -44,6 +44,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/conjugateGradientMultiBlockCG/README.md 
b/Samples/conjugateGradientMultiBlockCG/README.md index fea968c0..b11f8bc7 100644 --- a/Samples/conjugateGradientMultiBlockCG/README.md +++ b/Samples/conjugateGradientMultiBlockCG/README.md @@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiBlock Cooperative Group ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu index 951afbd9..ba8461c3 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu @@ -304,6 +304,7 @@ extern "C" __global__ void gpuConjugateGradient(int *I, int *J, float *val, r0 = r1; + cg::sync(grid); if (threadIdx.x == 0 && blockIdx.x == 0) *dot_result = 0.0; cg::sync(grid); diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj index 5f17f544..b7496b3f 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj index 4cdc7f63..e115cdcc 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj index 970aa8ce..bb4ddd63 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj +++ 
b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj index 5daf578f..582a8405 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj index 26ff0ce0..534f82cf 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiDeviceCG/Makefile b/Samples/conjugateGradientMultiDeviceCG/Makefile index 94ad7385..9f125347 100644 --- a/Samples/conjugateGradientMultiDeviceCG/Makefile +++ b/Samples/conjugateGradientMultiDeviceCG/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -252,6 +273,12 @@ ifeq ($(TARGET_ARCH),aarch64) SAMPLE_ENABLED := 0 endif +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - conjugateGradientMultiDeviceCG is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -263,11 +290,38 @@ LIBRARIES := ################################################################################ +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) 
+# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 470) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 60 61 70 72 75 +SMS ?= 60 61 70 72 75 80 else -SMS ?= 60 61 70 75 +SMS ?= 60 61 70 75 80 endif ifeq ($(SMS),) @@ -286,7 +340,7 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) endif endif -ALL_CCFLAGS += -dc -maxrregcount=64 +ALL_CCFLAGS += -dc -maxrregcount=64 --std=c++11 LIBRARIES += -lcudadevrt diff --git a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml index a0c93fbd..905be0ae 100644 --- a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml +++ b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml @@ -5,6 +5,7 @@ -dc -maxrregcount=64 + --std=c++11 cudaMemAdvise @@ -31,6 +32,7 @@ Sparse Matrix Unified Memory Multi-GPU + CPP11 cudadevrt @@ -52,6 +54,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/conjugateGradientMultiDeviceCG/README.md b/Samples/conjugateGradientMultiDeviceCG/README.md index 8db65502..7aae376c 100644 --- a/Samples/conjugateGradientMultiDeviceCG/README.md +++ b/Samples/conjugateGradientMultiDeviceCG/README.md @@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiDevice Cooperative Grou ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
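For a sense of how the multi-device launch in `conjugateGradientMultiDeviceCG` works: the kernel is started through `cudaLaunchCooperativeKernelMultiDevice`, which takes one `cudaLaunchParams` entry per participating GPU, and every copy of the grid then shares a single `multi_grid` rank space. A minimal sketch of that call sequence, assuming two cooperative-launch-capable GPUs (`myMultiGridKernel` is an illustrative name, not part of the sample):

```
#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

__global__ void myMultiGridKernel(int *flag) {
  cg::multi_grid_group multi_grid = cg::this_multi_grid();
  if (multi_grid.thread_rank() == 0) *flag = 1;  // one rank space spans all GPUs
  cg::sync(multi_grid);  // barrier across every block on every device
}

int main() {
  int *flag;
  cudaMallocManaged(&flag, sizeof(int));

  cudaLaunchParams params[2];
  void *args[] = {(void *)&flag};
  for (int dev = 0; dev < 2; dev++) {
    cudaSetDevice(dev);
    cudaStream_t stream;
    cudaStreamCreate(&stream);            // must be a non-default stream
    params[dev].func = (void *)myMultiGridKernel;
    params[dev].gridDim = dim3(2, 1, 1);  // keep grids within co-resident occupancy
    params[dev].blockDim = dim3(128, 1, 1);
    params[dev].sharedMem = 0;
    params[dev].stream = stream;
    params[dev].args = args;
  }
  cudaLaunchCooperativeKernelMultiDevice(params, 2, 0);

  for (int dev = 0; dev < 2; dev++) {
    cudaSetDevice(dev);
    cudaDeviceSynchronize();
  }
  return 0;
}
```

Build with relocatable device code and the device runtime (`-dc` and `-lcudadevrt`, which the Makefile above already passes); the sample's `multiGpuConjugateGradient` kernel follows this same pattern with per-device streams and occupancy-derived grid sizes.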
## Build and Run diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu index 695a8034..f7fad12d 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu @@ -36,8 +36,10 @@ #include #include #include +#include #include #include +#include #include @@ -46,6 +48,7 @@ #include // helper for shared functions common to CUDA Samples #include +#include namespace cg = cooperative_groups; @@ -201,37 +204,31 @@ __device__ void gpuSaxpy(float *x, float *y, float a, int size, __device__ void gpuDotProduct(float *vecA, float *vecB, int size, const cg::thread_block &cta, const cg::multi_grid_group &multi_grid) { - __shared__ double tmp[THREADS_PER_BLOCK]; + extern __shared__ double tmp[]; double temp_sum = 0.0; + for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) { temp_sum += static_cast(vecA[i] * vecB[i]); } - tmp[cta.thread_rank()] = temp_sum; - - cg::sync(cta); cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - double beta = temp_sum; - double temp; + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); - for (int i = tile32.size() / 2; i > 0; i >>= 1) { - if (tile32.thread_rank() < i) { - temp = tmp[cta.thread_rank() + i]; - beta += temp; - tmp[cta.thread_rank()] = beta; - } - cg::sync(tile32); + if (tile32.thread_rank() == 0) { + tmp[tile32.meta_group_rank()] = temp_sum; } + cg::sync(cta); - if (cta.thread_rank() == 0) { - beta = 0.0; - for (int i = 0; i < cta.size(); i += tile32.size()) { - beta += tmp[i]; + if (tile32.meta_group_rank() == 0) { + temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0; + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); + + if (tile32.thread_rank() == 0) { + atomicAdd(&grid_dot_result, temp_sum); } - atomicAdd(&grid_dot_result, beta); } } @@ -242,24 +239,13 @@ __device__ void gpuCopyVector(float *srcA, float *destB, int size, } } -__device__ void gpuScaleVector(float *vec, float alpha, int size, - const cg::multi_grid_group &multi_grid) { +__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, int size, + const cg::multi_grid_group &multi_grid) { for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) { - vec[i] = alpha * vec[i]; + y[i] = a * x[i] + scale * y[i]; } } -__device__ void setDotResultToZero(double *dot_result) { - unsigned long long int *address_as_ull = (unsigned long long int *)dot_result; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS_system(address_as_ull, assumed, 0); - - } while (assumed != old); -} - extern "C" __global__ void multiGpuConjugateGradient( int *I, int *J, float *val, float *x, float *Ax, float *p, float *r, double *dot_result, int nnz, int N, float tol) { @@ -304,10 +290,7 @@ extern "C" __global__ void multiGpuConjugateGradient( while (r1 > tol * tol && k <= max_iter) { if (k > 1) { b = r1 / r0; - - gpuScaleVector(p, b, N, multi_grid); - cg::sync(grid); - gpuSaxpy(r, p, alpha, N, multi_grid); + gpuScaleVectorAndSaxpy(r, p, alpha, b, N, multi_grid); } else { gpuCopyVector(r, p, N, multi_grid); } @@ -317,7 +300,7 @@ extern "C" __global__ void multiGpuConjugateGradient( gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, cta, multi_grid); if (multi_grid.thread_rank() == 0) { - setDotResultToZero(dot_result); + *dot_result = 0.0; } cg::sync(multi_grid); @@ -343,7 
+326,7 @@ extern "C" __global__ void multiGpuConjugateGradient(
     cg::sync(multi_grid);

     if (multi_grid.thread_rank() == 0) {
-      setDotResultToZero(dot_result);
+      *dot_result = 0.0;
     }
     cg::sync(multi_grid);

@@ -363,68 +346,31 @@
   }
 }

-void getIdenticalGPUs(int num_of_gpus, std::set<int> &identicalGPUs) {
-  int *major_minor =
-      reinterpret_cast<int *>(malloc(sizeof(int) * num_of_gpus * 2));
-  int foundIdenticalGPUs = 0;
+// Map of device version to device number
+std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {
+  int numGpus = 0;
+  checkCudaErrors(cudaGetDeviceCount(&numGpus));

-  for (int i = 0; i < num_of_gpus; i++) {
+  std::multimap<std::pair<int, int>, int> identicalGpus;
+
+  for (int i = 0; i < numGpus; i++) {
     cudaDeviceProp deviceProp;
     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
-    major_minor[i * 2] = deviceProp.major;
-    major_minor[i * 2 + 1] = deviceProp.minor;
+
+    // Filter unsupported devices
+    if (deviceProp.cooperativeMultiDeviceLaunch &&
+        deviceProp.concurrentManagedAccess) {
+      identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), i);
+    }
     printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i,
-           deviceProp.name, deviceProp.major, deviceProp.minor);
+           deviceProp.name, deviceProp.major, deviceProp.minor);
   }

-  int maxMajorMinor[2] = {0, 0};
-
-  for (int i = 0; i < num_of_gpus; i++) {
-    for (int j = i + 1; j < num_of_gpus; j++) {
-      if ((major_minor[i * 2] == major_minor[j * 2]) &&
-          (major_minor[i * 2 + 1] == major_minor[j * 2 + 1])) {
-        identicalGPUs.insert(i);
-        identicalGPUs.insert(j);
-        foundIdenticalGPUs = 1;
-        if (maxMajorMinor[0] < major_minor[i * 2] &&
-            maxMajorMinor[1] < major_minor[i * 2 + 1]) {
-          maxMajorMinor[0] = major_minor[i * 2];
-          maxMajorMinor[1] = major_minor[i * 2 + 1];
-        }
-      }
-    }
-  }
-
-  free(major_minor);
-  if (!foundIdenticalGPUs) {
-    printf(
-        "No Two or more GPUs with same architecture found\nWaiving the "
-        "sample\n");
-    exit(EXIT_WAIVED);
-  }
-
-  std::set<int>::iterator it = identicalGPUs.begin();
-
-  // Iterate over all the identical GPUs found
-  while (it != identicalGPUs.end()) {
-    cudaDeviceProp deviceProp;
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *it));
-    // Remove all the GPUs which are less than the best arch available
-    if (deviceProp.major != maxMajorMinor[0] &&
-        deviceProp.minor != maxMajorMinor[1]) {
-      identicalGPUs.erase(it);
-    }
-    if (!deviceProp.cooperativeMultiDeviceLaunch ||
-        !deviceProp.concurrentManagedAccess) {
-      identicalGPUs.erase(it);
-    }
-    it++;
-  }
-
-  return;
+  return identicalGpus;
 }

 int main(int argc, char **argv) {
+  constexpr size_t kNumGpusRequired = 2;
   int N = 0, nz = 0, *I = NULL, *J = NULL;
   float *val = NULL;
   const float tol = 1e-5f;
@@ -434,33 +380,101 @@
   float *r, *p, *Ax;

   printf("Starting [%s]...\n", sSDKname);
+  auto gpusByArch = getIdenticalGPUs();

-  int num_of_gpus = 0;
-  checkCudaErrors(cudaGetDeviceCount(&num_of_gpus));
+  auto it = gpusByArch.begin();
+  auto end = gpusByArch.end();

-  if (num_of_gpus <= 1) {
-    printf("No. of GPU on node %d\n", num_of_gpus);
-    printf("Minimum Two or more GPUs are required to run this sample code\n");
-    exit(EXIT_WAIVED);
+  auto bestFit = std::make_pair(it, it);
+  // use std::distance to find the largest number of GPUs amongst architectures
+  auto distance = [](decltype(bestFit) p){return std::distance(p.first, p.second);};
+
+  // Read each unique key/pair element in order
+  for (; it != end; it = gpusByArch.upper_bound(it->first)) {
+    // first and second are iterators bounded within the architecture group
+    auto testFit = gpusByArch.equal_range(it->first);
+    // Always use devices with highest architecture version or whichever has the most devices available
+    if (distance(bestFit) <= distance(testFit))
+      bestFit = testFit;
   }

-  std::set<int> identicalGPUs;
-  getIdenticalGPUs(num_of_gpus, identicalGPUs);
-
-  if (identicalGPUs.size() <= 1) {
+  if (distance(bestFit) < kNumGpusRequired) {
     printf(
         "No Two or more GPUs with same architecture capable of "
-        "cooperativeMultiDeviceLaunch & concurrentManagedAccess found. \nWaiving the sample\n");
+        "cooperativeMultiDeviceLaunch & concurrentManagedAccess found. "
+        "\nWaiving the sample\n");
     exit(EXIT_WAIVED);
   }

-  std::set<int>::iterator deviceId = identicalGPUs.begin();
+  std::set<int> bestFitDeviceIds;

-  // We use only 2 GPUs as for input size of N = 10485760*2 two GPUs are enough.
-  while (identicalGPUs.size() > 2) {
-    identicalGPUs.erase(deviceId);
-    deviceId++;
+  // check & select peer-to-peer access capable GPU devices as enabling p2p access between participating
+  // GPUs gives better performance for multi_grid sync.
+  for (auto itr = bestFit.first; itr != bestFit.second; itr++) {
+    int deviceId = itr->second;
+    checkCudaErrors(cudaSetDevice(deviceId));
+
+    std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds](decltype(*itr) mapPair) {
+      if (deviceId != mapPair.second)
+      {
+        int access = 0;
+        checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));
+        printf("Device=%d %s Access Peer Device=%d\n", deviceId, access ? "CAN" : "CANNOT", mapPair.second);
+        if (access && bestFitDeviceIds.size() < kNumGpusRequired) {
+          bestFitDeviceIds.emplace(deviceId);
+          bestFitDeviceIds.emplace(mapPair.second);
+        }
+        else {
+          printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
+        }
+      }
+    });
+
+    if (bestFitDeviceIds.size() >= kNumGpusRequired)
+    {
+      printf("Selected p2p capable devices - ");
+      for (auto devicesItr = bestFitDeviceIds.begin(); devicesItr != bestFitDeviceIds.end(); devicesItr++)
+      {
+        printf("deviceId = %d ", *devicesItr);
+      }
+      printf("\n");
+      break;
+    }
   }
+
+  // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p capable,
+  // hence we add it without p2p capability check.
+  if (!bestFitDeviceIds.size())
+  {
+    printf("Devices involved are not p2p capable.. selecting %zu of them\n", kNumGpusRequired);
+    std::for_each(bestFit.first, bestFit.second, [&bestFitDeviceIds](decltype(*bestFit.first) mapPair) {
+      if (bestFitDeviceIds.size() < kNumGpusRequired) {
+        bestFitDeviceIds.emplace(mapPair.second);
+      }
+      else {
+        printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
+      }
+      // Insert the sequence into the deviceIds set
+    });
+  }
+  else
+  {
+    // perform cudaDeviceEnablePeerAccess in both directions for all participating devices
+    // of a cudaLaunchCooperativeKernelMultiDevice call this gives better performance for multi_grid sync.
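+    // (Note: cudaDeviceEnablePeerAccess is one-directional: it only maps the
+    // named peer's memory into the *current* device's address space, so the
+    // nested loop below issues it from each participating device in turn.)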
+ for (auto p1_itr = bestFitDeviceIds.begin(); p1_itr != bestFitDeviceIds.end(); p1_itr++) + { + checkCudaErrors(cudaSetDevice(*p1_itr)); + for (auto p2_itr = bestFitDeviceIds.begin(); p2_itr != bestFitDeviceIds.end(); p2_itr++) + { + if (*p1_itr != *p2_itr) + { + checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0 )); + checkCudaErrors(cudaSetDevice(*p1_itr)); + } + } + } + } + /* Generate a random tridiagonal symmetric matrix in CSR format */ N = 10485760 * 2; nz = (N - 2) * 3 + 4; @@ -501,87 +515,94 @@ int main(int argc, char **argv) { checkCudaErrors( cudaMallocManaged(reinterpret_cast(&Ax), N * sizeof(float))); - std::cout << "\nRunning on GPUs = " << identicalGPUs.size() << std::endl; - cudaStream_t *nStreams = reinterpret_cast( - malloc(sizeof(cudaStream_t) * identicalGPUs.size())); + std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl; + cudaStream_t nStreams[kNumGpusRequired]; - void *kernelArgs[] = { - (void *)&I, (void *)&J, (void *)&val, (void *)&x, - (void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result, - (void *)&nz, (void *)&N, (void *)&tol, - }; - - int sMemSize = sizeof(double) * THREADS_PER_BLOCK; - int numBlocksPerSm = 0; + int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK/32) + 1); + int numBlocksPerSm = INT_MAX; int numThreads = THREADS_PER_BLOCK; + int numSms = INT_MAX; + auto deviceId = bestFitDeviceIds.begin(); - deviceId = identicalGPUs.begin(); - cudaDeviceProp deviceProp; - checkCudaErrors(cudaSetDevice(*deviceId)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId)); - - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, multiGpuConjugateGradient, numThreads, sMemSize)); - - int numSms = deviceProp.multiProcessorCount; - dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), - dimBlock(THREADS_PER_BLOCK, 1, 1); - - int device_count = 0; - - int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK; - - while (deviceId != identicalGPUs.end()) { + // set numSms & numBlocksPerSm to be lowest of 2 devices + while (deviceId != bestFitDeviceIds.end()) { cudaDeviceProp deviceProp; checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId)); + + int numBlocksPerSm_current=0; + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, sMemSize)); + + if (numBlocksPerSm > numBlocksPerSm_current) + { + numBlocksPerSm = numBlocksPerSm_current; + } + if (numSms > deviceProp.multiProcessorCount) + { + numSms = deviceProp.multiProcessorCount; + } + deviceId++; + } + + if (!numBlocksPerSm) { + printf( + "Max active blocks per SM is returned as 0.\n Hence, Waiving the " + "sample\n"); + exit(EXIT_WAIVED); + } + + int device_count = 0; + int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK; + deviceId = bestFitDeviceIds.begin();; + while (deviceId != bestFitDeviceIds.end()) { + checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaStreamCreate(&nStreams[device_count])); - if (deviceProp.concurrentManagedAccess) { - int perGPUIter = N / (totalThreadsPerGPU * identicalGPUs.size()); - int offset_Ax = device_count * totalThreadsPerGPU; - int offset_r = device_count * totalThreadsPerGPU; - int offset_p = device_count * totalThreadsPerGPU; - int offset_x = device_count * totalThreadsPerGPU; + int perGPUIter = N / (totalThreadsPerGPU * kNumGpusRequired); + int offset_Ax = device_count * totalThreadsPerGPU; + int offset_r = device_count * totalThreadsPerGPU; + int offset_p = 
device_count * totalThreadsPerGPU; + int offset_x = device_count * totalThreadsPerGPU; - checkCudaErrors(cudaMemPrefetchAsync(I, sizeof(int) * N, *deviceId, - nStreams[device_count])); - checkCudaErrors(cudaMemPrefetchAsync(val, sizeof(float) * nz, *deviceId, - nStreams[device_count])); - checkCudaErrors(cudaMemPrefetchAsync(J, sizeof(float) * nz, *deviceId, - nStreams[device_count])); + checkCudaErrors(cudaMemPrefetchAsync(I, sizeof(int) * N, *deviceId, + nStreams[device_count])); + checkCudaErrors(cudaMemPrefetchAsync(val, sizeof(float) * nz, *deviceId, + nStreams[device_count])); + checkCudaErrors(cudaMemPrefetchAsync(J, sizeof(float) * nz, *deviceId, + nStreams[device_count])); - if (offset_Ax <= N) { - for (int i = 0; i < perGPUIter; i++) { - cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); - cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); - cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); - cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); + if (offset_Ax <= N) { + for (int i = 0; i < perGPUIter; i++) { + cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetPreferredLocation, *deviceId); + cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetPreferredLocation, *deviceId); + cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetPreferredLocation, *deviceId); + cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetPreferredLocation, *deviceId); - cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); + cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetAccessedBy, *deviceId); + cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetAccessedBy, *deviceId); + cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetAccessedBy, *deviceId); + cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, + cudaMemAdviseSetAccessedBy, *deviceId); - offset_Ax += totalThreadsPerGPU * identicalGPUs.size(); - offset_r += totalThreadsPerGPU * identicalGPUs.size(); - offset_p += totalThreadsPerGPU * identicalGPUs.size(); - offset_x += totalThreadsPerGPU * identicalGPUs.size(); + offset_Ax += totalThreadsPerGPU * kNumGpusRequired; + offset_r += totalThreadsPerGPU * kNumGpusRequired; + offset_p += totalThreadsPerGPU * kNumGpusRequired; + offset_x += totalThreadsPerGPU * kNumGpusRequired; - if (offset_Ax >= N) { - break; - } + if (offset_Ax >= N) { + break; } } } + device_count++; deviceId++; } @@ -600,11 +621,16 @@ int main(int argc, char **argv) { printf("Total threads per GPU = %d numBlocksPerSm = %d\n", numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm); - cudaLaunchParams *launchParamsList = reinterpret_cast( - malloc(sizeof(cudaLaunchParams) * identicalGPUs.size())); - for (int i = 0; i < identicalGPUs.size(); i++) { - launchParamsList[i].func = - 
reinterpret_cast(multiGpuConjugateGradient); + dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1); + void *kernelArgs[] = { + (void *)&I, (void *)&J, (void *)&val, (void *)&x, + (void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result, + (void *)&nz, (void *)&N, (void *)&tol, + }; + cudaLaunchParams *launchParamsList = (cudaLaunchParams *)malloc( + sizeof(cudaLaunchParams) * kNumGpusRequired); + for (int i = 0; i < kNumGpusRequired; i++) { + launchParamsList[i].func = (void *)multiGpuConjugateGradient; launchParamsList[i].gridDim = dimGrid; launchParamsList[i].blockDim = dimBlock; launchParamsList[i].sharedMem = sMemSize; @@ -613,21 +639,20 @@ int main(int argc, char **argv) { } printf("Launching kernel\n"); + checkCudaErrors(cudaLaunchCooperativeKernelMultiDevice( - launchParamsList, identicalGPUs.size(), + launchParamsList, kNumGpusRequired, cudaCooperativeLaunchMultiDeviceNoPreSync | cudaCooperativeLaunchMultiDeviceNoPostSync)); - if (deviceProp.concurrentManagedAccess) { - checkCudaErrors( - cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); - checkCudaErrors( + checkCudaErrors( + cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); + checkCudaErrors( cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); - } - deviceId = identicalGPUs.begin(); + deviceId = bestFitDeviceIds.begin();; device_count = 0; - while (deviceId != identicalGPUs.end()) { + while (deviceId != bestFitDeviceIds.end()) { checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaStreamSynchronize(nStreams[device_count++])); deviceId++; diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj index 2fd0abae..e078f002 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientMultiDeviceCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj index 4360b2dd..fb8c0978 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/conjugateGradientMultiDeviceCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj index 0ccff425..22e66415 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/conjugateGradientMultiDeviceCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + 
compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cuSolverDn_LinearSolver/Makefile b/Samples/cuSolverDn_LinearSolver/Makefile index 516368ec..f76a3497 100644 --- a/Samples/cuSolverDn_LinearSolver/Makefile +++ b/Samples/cuSolverDn_LinearSolver/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - 
CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) diff --git a/Samples/cuSolverDn_LinearSolver/NsightEclipse.xml b/Samples/cuSolverDn_LinearSolver/NsightEclipse.xml index 237fbcc6..9cf0af78 100644 --- a/Samples/cuSolverDn_LinearSolver/NsightEclipse.xml +++ b/Samples/cuSolverDn_LinearSolver/NsightEclipse.xml @@ -44,7 +44,6 @@ 1:CUDA Basic Topics 3:Linear Algebra - sm30 sm35 sm37 sm50 @@ -54,6 +53,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/cuSolverDn_LinearSolver/README.md b/Samples/cuSolverDn_LinearSolver/README.md index 096de241..41ab8320 100644 --- a/Samples/cuSolverDn_LinearSolver/README.md +++ b/Samples/cuSolverDn_LinearSolver/README.md @@ -10,11 +10,11 @@ Linear Algebra, CUSOLVER Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. 
See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/cuSolverSp_LinearSolver/Makefile b/Samples/cuSolverSp_LinearSolver/Makefile index 137fedc0..7ef0549f 100644 --- a/Samples/cuSolverSp_LinearSolver/Makefile +++ b/Samples/cuSolverSp_LinearSolver/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + 
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -251,9 +272,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/cuSolverSp_LinearSolver/NsightEclipse.xml b/Samples/cuSolverSp_LinearSolver/NsightEclipse.xml index 495dd16d..a76bb7af 100644 --- a/Samples/cuSolverSp_LinearSolver/NsightEclipse.xml +++ b/Samples/cuSolverSp_LinearSolver/NsightEclipse.xml @@ -45,7 +45,6 @@ 1:CUDA Basic Topics 3:Linear Algebra - sm30 sm35 sm37 sm50 @@ -55,6 +54,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/cuSolverSp_LinearSolver/README.md b/Samples/cuSolverSp_LinearSolver/README.md index 7fb1d12b..af7831fc 100644 --- a/Samples/cuSolverSp_LinearSolver/README.md +++ b/Samples/cuSolverSp_LinearSolver/README.md @@ -10,11 +10,11 @@ Linear Algebra, CUSOLVER Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. 
To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp index 26e83831..fabb33fb 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp @@ -24,53 +24,63 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + /* * Test three linear solvers, including Cholesky, LU and QR. - * The user has to prepare a sparse matrix of "matrix market format" (with extension .mtx). - * For example, the user can download matrices in Florida Sparse Matrix Collection. + * The user has to prepare a sparse matrix of "matrix market format" (with + extension .mtx). + * For example, the user can download matrices in Florida Sparse Matrix + Collection. * (http://www.cise.ufl.edu/research/sparse/matrices/) * * The user needs to choose a solver by the switch -R and * to provide the path of the matrix by the switch -F, then * the program solves - * A*x = b + * A*x = b * and reports relative error * |b-A*x|/(|A|*|x|+|b|) * * How does it work? * The example solves A*x = b by the following steps - * step 1: B = A(Q,Q) + * step 1: B = A(Q,Q) * Q is the ordering to minimize zero fill-in. * The user can choose symrcm or symamd. * step 2: solve B*z = Q*b * step 3: x = inv(Q)*z - * + * * Above three steps can be combined by the formula * (Q*A*Q')*(Q*x) = (Q*b) * - * The elapsed time is also reported so the user can compare efficiency of different solvers. + * The elapsed time is also reported so the user can compare efficiency of + different solvers. * * How to use - /cuSolverSp_LinearSolver // Default: Cholesky, symrcm & file=lap2D_5pt_n100.mtx - * ./cuSolverSp_LinearSolver -R=chol -file= // cholesky factorization - * ./cuSolverSp_LinearSolver -R=lu -P=symrcm -file= // symrcm + LU with partial pivoting - * ./cuSolverSp_LinearSolver -R=qr -P=symamd -file= // symamd + QR factorization + /cuSolverSp_LinearSolver // Default: Cholesky, symrcm & + file=lap2D_5pt_n100.mtx + * ./cuSolverSp_LinearSolver -R=chol -file= // cholesky + factorization + * ./cuSolverSp_LinearSolver -R=lu -P=symrcm -file= // symrcm + LU + with partial pivoting + * ./cuSolverSp_LinearSolver -R=qr -P=symamd -file= // symamd + QR + factorization * * - * Remark: the absolute error on solution x is meaningless without knowing condition number of A. - * The relative error on residual should be close to machine zero, i.e. 1.e-15. + * Remark: the absolute error on solution x is meaningless without knowing + condition number of A. + * The relative error on residual should be close to machine zero, + i.e. 
1.e-15. */ +#include +#include #include #include #include -#include -#include #include -#include "cusparse.h" #include "cusolverSp.h" +#include "cusparse.h" #include "helper_cuda.h" #include "helper_cusolver.h" @@ -461,10 +471,38 @@ int main(int argc, char *argv[]) { cudaMemcpyDeviceToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cusparseDcsrmv(cusparseHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, rowsA, colsA, - nnzA, &minus_one, descrA, d_csrValA, - d_csrRowPtrA, d_csrColIndA, d_x, &one, d_r)); + + /* Wrap raw data into cuSPARSE generic API objects */ + cusparseSpMatDescr_t matA = NULL; + if (baseA) { + checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, + d_csrColIndA, d_csrValA, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)); + } else { + checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, + d_csrColIndA, d_csrValA, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); + } + + cusparseDnVecDescr_t vecx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F)); + cusparseDnVecDescr_t vecAx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F)); + + /* Allocate workspace for cuSPARSE */ + size_t bufferSize = 0; + checkCudaErrors(cusparseSpMV_bufferSize( + cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, + &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); + void *buffer = NULL; + checkCudaErrors(cudaMalloc(&buffer, bufferSize)); + + /* Note: cusparseSpMV takes the workspace pointer itself (buffer), not its + address (&buffer). */ + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, + CUSPARSE_MV_ALG_DEFAULT, buffer)); + checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost, stream)); /* wait until h_r is ready */ @@ -518,10 +556,11 @@ int main(int argc, char *argv[]) { printf("step 8: evaluate residual r = b - A*x (result on GPU)\n"); checkCudaErrors(cudaMemcpyAsync(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice, stream)); + + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, + CUSPARSE_MV_ALG_DEFAULT, buffer)); + checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, @@ -566,6 +605,15 @@ int main(int argc, char *argv[]) { if (descrA) { checkCudaErrors(cusparseDestroyMatDescr(descrA)); } + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } if (h_csrValA) { free(h_csrValA); diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2012.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2012.vcxproj index f37b41de..b1969d11 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2012.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cuSolverSp_LinearSolver.exe -
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2013.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2013.vcxproj index db6f8896..4bd70769 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2013.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cuSolverSp_LinearSolver.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2015.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2015.vcxproj index b567d157..df2f48f8 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2015.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cuSolverSp_LinearSolver.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj index 2c7c7c78..8d4e4ae2 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/cuSolverSp_LinearSolver.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj index c6227ce3..6fb20b8d 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/cuSolverSp_LinearSolver.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cudaNvSci/Makefile b/Samples/cudaNvSci/Makefile index 
5033d0de..ca8e28a2 100644 --- a/Samples/cudaNvSci/Makefile +++ b/Samples/cudaNvSci/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) 
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -248,12 +269,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - cudaNvSci is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -265,7 +280,10 @@ LIBRARIES := ################################################################################ -#Detect if installed version of GCC supports C++11 +# Makefile include to help find NVSCI Libraries +include ./findnvsci.mk + +#Detect if installed version of GCC supports required C++11 ifeq ($(TARGET_OS),linux) empty := space := $(empty) $(empty) @@ -287,16 +305,16 @@ ifeq ($(TARGET_OS),linux) ifeq ($(IS_MIN_VERSION), 1) $(info >>> GCC Version is greater or equal to 4.7.0 <<<) else - $(info >>> Waiving build. Minimum GCC version required for C++11 is 4.7.0 <<<) + $(info >>> Waiving build. Minimum GCC version required is 4.7.0 <<<) SAMPLE_ENABLED := 0 endif endif # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) @@ -331,6 +349,10 @@ else CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif @@ -345,12 +367,19 @@ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver.
<<<) diff --git a/Samples/cudaNvSci/NsightEclipse.xml b/Samples/cudaNvSci/NsightEclipse.xml index d10a85d4..f75a0beb 100644 --- a/Samples/cudaNvSci/NsightEclipse.xml +++ b/Samples/cudaNvSci/NsightEclipse.xml @@ -53,6 +53,7 @@ sm70 sm72 sm75 + sm80 aarch64 diff --git a/Samples/cudaNvSci/README.md b/Samples/cudaNvSci/README.md index 0ace4afe..dbde9f48 100644 --- a/Samples/cudaNvSci/README.md +++ b/Samples/cudaNvSci/README.md @@ -10,7 +10,7 @@ CUDA NvSci Interop, Data Parallel Algorithms, Image Processing ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cudaNvSci/cudaNvSci.cpp b/Samples/cudaNvSci/cudaNvSci.cpp index be876970..01b443f3 100644 --- a/Samples/cudaNvSci/cudaNvSci.cpp +++ b/Samples/cudaNvSci/cudaNvSci.cpp @@ -88,7 +88,7 @@ class cudaNvSciSignal { } ~cudaNvSciSignal() { - checkCudaErrors(cudaFreeArray(d_mipLevelArray)); + checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); checkCudaErrors(cudaFreeMipmappedArray(d_mipmapArray)); checkCudaErrors(cudaFree(d_outputBuf)); checkCudaErrors(cudaDestroyExternalSemaphore(signalSem)); @@ -189,6 +189,8 @@ class cudaNvSciSignal { NvSciBufAttrList getNvSciImageBufAttrList() { return m_imageBufAttrList; } void runRotateImageAndSignal(unsigned char *imageData) { + int numOfGPUs = 0; + checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); copyDataToImageArray(imageData); @@ -431,6 +433,8 @@ class cudaNvSciWait { void runImageGrayscale(std::string image_filename, size_t imageWidth, size_t imageHeight) { + int numOfGPUs = 0; + checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); waitExternalSemaphore(); diff --git a/Samples/cudaNvSci/findnvsci.mk b/Samples/cudaNvSci/findnvsci.mk new file mode 100644 index 00000000..8f857bc3 --- /dev/null +++ b/Samples/cudaNvSci/findnvsci.mk @@ -0,0 +1,138 @@ +################################################################################ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# findnvsci.mk is used to find the NVSCI Libraries and headers +# +################################################################################ + +# Determine OS platform and unix distribution +ifeq ("$(TARGET_OS)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + ifeq ("$(DISTRO)","") + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + # ensure data from /etc/issue is valid + ifeq (,$(filter $(DISTRO),ubuntu fedora red rhel centos suse)) + DISTRO = + endif + ifeq ("$(DISTRO)","") + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + endif + endif +endif + +ifeq ("$(TARGET_OS)","linux") + # $(info >>> findnvsci.mk -> LINUX path <<<) + # Different Linux distros keep the NVIDIA driver libraries in different paths + UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu >/dev/null 2>&1; echo $$?) + FEDORA = $(shell echo $(DISTRO) | grep -i fedora >/dev/null 2>&1; echo $$?) + RHEL = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?) + CENTOS = $(shell echo $(DISTRO) | grep -i centos >/dev/null 2>&1; echo $$?) + SUSE = $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?)
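+ # Each variable above holds the exit status of its grep, so a value of "0"
+ # in the checks below means that distro string was found in $(DISTRO).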
+ ifeq ("$(UBUNTU)","0") + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + GLPATH := /usr/arm-linux-gnueabihf/lib + GLLINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf + GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64) + GLPATH := /usr/aarch64-linux-gnu/lib + GLLINK := -L/usr/aarch64-linux-gnu/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib + GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu + GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + endif + else + UBUNTU_PKG_NAME = $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' | grep '^ii' | awk '{print $$2}' | head -1) + ifneq ("$(UBUNTU_PKG_NAME)","") + GLPATH ?= /usr/lib/$(UBUNTU_PKG_NAME) + GLLINK ?= -L/usr/lib/$(UBUNTU_PKG_NAME) + endif + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(SUSE)","0") + GLPATH ?= /usr/X11R6/lib64 + GLLINK ?= -L/usr/X11R6/lib64 + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(FEDORA)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(RHEL)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(CENTOS)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + + NVSCIBUFLIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libnvscibuf.so -print 2>/dev/null) + NVSCISYNCLIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libnvscisync.so -print 2>/dev/null) + + ifeq ("$(NVSCIBUFLIB)","") + $(info >>> WARNING - libnvscibuf.so not found, please install libnvscibuf.so <<<) + SAMPLE_ENABLED := 0 + endif + + ifeq ("$(NVSCISYNCLIB)","") + $(info >>> WARNING - libnvscisync.so not found, please install libnvscisync.so <<<) + SAMPLE_ENABLED := 0 + endif + + HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include + ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux) + HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include + endif + + NVSCIBUFHEADER := $(shell find -L $(HEADER_SEARCH_PATH) -name nvscibuf.h -print 2>/dev/null) + NVSCISYNCHEADER := $(shell find -L $(HEADER_SEARCH_PATH) -name nvscisync.h -print 2>/dev/null) + + ifeq ("$(NVSCIBUFHEADER)","") + $(info >>> WARNING - nvscibuf.h not found, please install nvscibuf.h <<<) + SAMPLE_ENABLED := 0 + endif + ifeq ("$(NVSCISYNCHEADER)","") + $(info >>> WARNING - nvscisync.h not found, please install nvscisync.h <<<) + SAMPLE_ENABLED := 0 + endif +else +endif + diff --git a/Samples/cudaTensorCoreGemm/Makefile b/Samples/cudaTensorCoreGemm/Makefile index 5e0bc9e6..45a028b3 100644 --- a/Samples/cudaTensorCoreGemm/Makefile +++ b/Samples/cudaTensorCoreGemm/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -259,9 +280,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 70 72 75 +SMS ?= 70 72 75 80 else -SMS ?= 70 75 +SMS ?= 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/cudaTensorCoreGemm/NsightEclipse.xml b/Samples/cudaTensorCoreGemm/NsightEclipse.xml index b8b24e8c..3e33a22a 100644 --- a/Samples/cudaTensorCoreGemm/NsightEclipse.xml +++ 
b/Samples/cudaTensorCoreGemm/NsightEclipse.xml @@ -45,6 +45,7 @@ In addition to that, it demonstrates the use of the new CUDA function attribute sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/cudaTensorCoreGemm/README.md b/Samples/cudaTensorCoreGemm/README.md index 6aa42ef7..d9de3e8c 100644 --- a/Samples/cudaTensorCoreGemm/README.md +++ b/Samples/cudaTensorCoreGemm/README.md @@ -14,7 +14,7 @@ Matrix Multiply, WMMA, Tensor Cores ## Supported SM Architectures -[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu index 3677212d..2c17417f 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu @@ -151,20 +151,19 @@ #define SHMEM_STRIDE (N * BLOCK_ROW_TILES) #define SHMEM_OFFSET (N * WARP_ROW_TILES) -// The macro below is used to shift rows of the A matrix and columns of the B -// matrix in shared memory to minimize possible bank conflicts. Before -// performing the nvcuda::wmma::mma_sync operation, the warp must load the -// matrix data using the nvcuda::wmma::load_matrix_sync operation. Although the -// memory access pattern is not specified for that function, each lane in the -// warp can read one or multiple matrix elements from different matrix rows or -// columns. For shared memory, such access can result in bank conflicts if -// different rows / columns of the matrix map to the same bank. By shifting each -// row and column by a few bytes, we make sure that they map to different banks, -// thus reducing the number of possible bank conflicts. The number of 8 two-byte -// "half" elements is chosen as the minimum possible shift because we must keep -// each row and column 128-bit aligned, as required by -// nvcuda::wmma::load_matrix_sync. -#define SKEW_HALF 8 +// The macro below is used to shift rows of the A matrix and columns of the B matrix +// in shared memory to minimize possible bank conflicts. +// Before performing the nvcuda::wmma::mma_sync operation, the warp must load the matrix +// data using the nvcuda::wmma::load_matrix_sync operation. Although the memory access pattern +// is not specified for that function, each lane in the warp can read one or multiple matrix +// elements from different matrix rows or columns. +// For shared memory, such access can result in bank conflicts if different rows / columns +// of the matrix map to the same bank. By shifting each row and column by a few bytes, we +// make sure that they map to different banks, thus reducing the number of possible bank +// conflicts. +// The number of 16 two-byte "half" elements is chosen as the minimum possible shift because +// we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. 
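+// (Concretely: 16 half elements add 32 bytes of skew per row, i.e. 8
+// four-byte shared memory banks, so consecutive rows no longer map onto the
+// same banks.)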
+#define SKEW_HALF 16 #define checkKernelErrors(expr) \ do { \ diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj index c12e80dd..58dd5958 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70;compute_75,sm_75; + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj index b1ba9b83..7f1df1dd 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70;compute_75,sm_75; + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj index 86008f5f..dbfa6f7d 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70;compute_75,sm_75; + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj index 336a565a..cf4dd606 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70;compute_75,sm_75; + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj index c276873a..8634efd8 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70;compute_75,sm_75; + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/deviceQuery/Makefile b/Samples/deviceQuery/Makefile index 45265754..c5e045f8 100644 --- a/Samples/deviceQuery/Makefile +++ b/Samples/deviceQuery/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
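+# The check below treats the presence of a $(CUDA_PATH)/targets/sbsa-linux
+# directory in the installed toolkit as the signal for an SBSA system.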
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/deviceQuery/NsightEclipse.xml b/Samples/deviceQuery/NsightEclipse.xml index 04bfe94a..4b4d81ea 100644 --- a/Samples/deviceQuery/NsightEclipse.xml +++ b/Samples/deviceQuery/NsightEclipse.xml @@ -31,7 
+31,6 @@ 1:CUDA Basic Topics - sm30 sm35 sm37 sm50 @@ -41,6 +40,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/deviceQuery/README.md b/Samples/deviceQuery/README.md index 7c143ee6..5cb523f1 100644 --- a/Samples/deviceQuery/README.md +++ b/Samples/deviceQuery/README.md @@ -10,11 +10,11 @@ CUDA Runtime API, Device Query ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +66,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/deviceQuery/deviceQuery.cpp b/Samples/deviceQuery/deviceQuery.cpp index 0f9f2866..c002cc5c 100644 --- a/Samples/deviceQuery/deviceQuery.cpp +++ b/Samples/deviceQuery/deviceQuery.cpp @@ -189,6 +189,8 @@ int main(int argc, char **argv) { deviceProp.totalConstMem); printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock); + printf(" Total shared memory per multiprocessor: %zu bytes\n", + deviceProp.sharedMemPerMultiprocessor); printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); printf(" Warp size: %d\n", @@ -228,6 +230,8 @@ int main(int argc, char **argv) { #endif printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); + printf(" Device supports Managed Memory: %s\n", + deviceProp.managedMemory ? "Yes" : "No"); printf(" Device supports Compute Preemption: %s\n", deviceProp.computePreemptionSupported ? "Yes" : "No"); printf(" Supports Cooperative Kernel Launch: %s\n", diff --git a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj index 508d9feb..0c4e53bd 100644 --- a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj index 044dbdc0..22213dd4 100644 --- a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj index fa7eadd9..8675906e 100644 --- a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj index a38f849b..f5e0ad15 100644 --- a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + 
compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj index 5a4756b3..19a1a483 100644 --- a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/dmmaTensorCoreGemm/Makefile b/Samples/dmmaTensorCoreGemm/Makefile new file mode 100644 index 00000000..f40af361 --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/Makefile @@ -0,0 +1,362 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux platforms +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L 
$(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - dmmaTensorCoreGemm is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - dmmaTensorCoreGemm is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 500) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 5.0.0 <<<) + else + $(info >>> Waiving build. 
Minimum GCC version required is 5.0.0 <<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +SMS ?= 80 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: dmmaTensorCoreGemm + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +dmmaTensorCoreGemm.o:dmmaTensorCoreGemm.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +dmmaTensorCoreGemm: dmmaTensorCoreGemm.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./dmmaTensorCoreGemm + +clean: + rm -f dmmaTensorCoreGemm dmmaTensorCoreGemm.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/dmmaTensorCoreGemm + +clobber: clean diff --git a/Samples/dmmaTensorCoreGemm/NsightEclipse.xml b/Samples/dmmaTensorCoreGemm/NsightEclipse.xml new file mode 100644 index 00000000..9d7ccc56 --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/NsightEclipse.xml @@ -0,0 +1,67 @@ + + + + dmmaTensorCoreGemm + + --std=c++11 + + + cudaMallocManaged + cudaDeviceSynchronize + cudaFuncSetAttribute + cudaEventCreate + cudaEventRecord + cudaEventSynchronize + cudaEventElapsedTime + cudaFree + + + whole + + ./ + ../ + ../../common/inc + + + Matrix Multiply + WMMA + Tensor Cores + + + matrix multiply + Async copy + CPP11 + GCC 5.0.0 + + + + + + true + dmmaTensorCoreGemm.cu + + 1:CUDA Basic Topics + + sm80 + + + x86_64 + linux + + + aarch64 + + + windows7 + + + ppc64le + linux + + + + 8.0 + + Double Precision Tensor Core GEMM + exe + diff --git a/Samples/dmmaTensorCoreGemm/README.md b/Samples/dmmaTensorCoreGemm/README.md new file mode 100644 index 00000000..6358918e --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/README.md @@ -0,0 +1,70 @@ +# dmmaTensorCoreGemm - Double Precision Tensor Core GEMM + +## Description + +This CUDA sample demonstrates double-precision GEMM computation using the double-precision Warp Matrix Multiply and Accumulate (WMMA) API introduced with CUDA 11 for the Tensor Cores of the Ampere chip family. The sample also uses the async copy feature of the CUDA pipeline interface for gmem to shmem async loads, which improves kernel performance and reduces register pressure. Further, it demonstrates how to use the cooperative groups async copy interface over a group for performing gmem to shmem async loads.
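+
+As a rough sketch of the double-precision WMMA intrinsics this sample is built around (a minimal, hypothetical `dmmaTile` kernel for illustration, not the sample's optimized `compute_dgemm` kernel; double-precision MMA on SM 8.0 uses the fixed 8x8x4 fragment shape):
+
+```
+#include <mma.h>
+using namespace nvcuda;
+
+// One warp (32 threads) computes a single 8x8 double-precision tile D = A*B.
+// A is an 8x4 row-major tile and B is a 4x8 column-major tile.
+__global__ void dmmaTile(const double *a, const double *b, double *d) {
+  wmma::fragment<wmma::matrix_a, 8, 8, 4, double, wmma::row_major> aFrag;
+  wmma::fragment<wmma::matrix_b, 8, 8, 4, double, wmma::col_major> bFrag;
+  wmma::fragment<wmma::accumulator, 8, 8, 4, double> acc;
+
+  wmma::fill_fragment(acc, 0.0);
+  wmma::load_matrix_sync(aFrag, a, 4);  // leading dimension of A
+  wmma::load_matrix_sync(bFrag, b, 4);  // leading dimension of B
+  wmma::mma_sync(acc, aFrag, bFrag, acc);
+  wmma::store_matrix_sync(d, acc, 8, wmma::mem_row_major);
+}
+```
+
+A launch such as `dmmaTile<<<1, 32>>>(dA, dB, dD)` (device pointers assumed) issues one Tensor Core MMA per warp; the real kernel pipelines many such fragments through shared memory using async copies.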
+ +## Key Concepts + +Matrix Multiply, WMMA, Tensor Cores + +## Supported SM Architectures + +[SM 8.0](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architectures + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEventRecord, cudaEventSynchronize, cudaEventElapsedTime, cudaFree + +## Prerequisites + +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples' makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu new file mode 100644 index 00000000..65f91c02 --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu @@ -0,0 +1,1055 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +// CUDA sample demonstrating a Double precision GEMM computation using the Warp +// Matrix Multiply and Accumulate API introduced in CUDA 11.0. + +// In this program, the compute_dgemm kernel computes the result of a matrix multiplication +// and addition: D = alpha * A * B + beta * C. The dimensions of both C and D matrices +// are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x K_GLOBAL (row-major), the B matrix +// is K_GLOBAL x N_GLOBAL (column-major). +// In that kernel, each CTA computes one 64 x 64 tile of the resulting matrix +// per iteration. When the tile is computed, the CTA stores it to the global memory +// and begins a new iteration, selecting a new 64 x 64 tile to compute. +// Each CTA consists of eight warps. For the 64 x 64 tile, each warp computes eight +// 8 x 8 subtiles, organized in a 2 x 4 two-dimensional array. 
+// Warps compute the 8 x 8 subtiles using nvcuda::wmma::mma_sync operations by +// moving through the K_GLOBAL dimension of the A and B matrices and accumulating +// the intermediate result in the local thread state. + +// There are a number of simple optimizations used in the algorithm: +// - The CTA copies the 64 x 64 tile of the C matrix from the global memory to +// shared memory. After that is done, each warp loads the C matrix fragments from +// shared memory, thus avoiding a random global memory access. +// - On each internal iteration, the CTA copies a portion of the A and B matrices from +// global memory to shared memory. After that, all warps in the CTA reuse the A and B +// data from shared memory, thus reducing the number of data copies from global memory. +// - The portions of the A and B matrices are stored in shared memory with an additional +// padding (skew) to reduce the number of shared memory access bank conflicts. +// (See a detailed explanation near the SKEW_DOUBLE macro definition.) +// - When the CTA finishes computing the tiles of the resulting matrix, each warp stores +// its subtiles to shared memory. The CTA then copies the shared memory contents to +// global memory, again avoiding redundant random global memory accesses. +// - Note that the CTA tile size is chosen to maximize the GPU register utilization, +// but carefully enough to avoid local memory use. + +#include <assert.h> +#include <cuda.h> +#include <mma.h> +#include <cuda_pipeline.h> +#include <cooperative_groups.h> +#include <cooperative_groups/memcpy_async.h> +#include <stdio.h> + +// Switch for choosing the C++ interface for the CUDA pipeline +// vs the primitives interface. +#define USE_CPP_API 0 + +// helper functions and utilities to work with CUDA +#include <helper_cuda.h> +#include <helper_functions.h> + +// Externally configurable parameters. + +#ifndef CPU_DEBUG +// Set this to 1 to verify the correctness of the GPU-computed matrix. +#define CPU_DEBUG 0 +#endif + +#ifndef SHARED_MEMORY_LIMIT_64K +// Set this to 0 to use more than 64 Kb of shared memory to cache data, to +// improve the performance of the computations on GPU. +// Note that you need a GPU that provides more than 64 Kb of shared memory +// per multiprocessor. +#define SHARED_MEMORY_LIMIT_64K 0 +#endif + +// GPU configuration. + +#define WARP_SIZE 32 + +// MMA matrix tile dimensions. + +#define M 8 +#define N 8 +#define K 4 + +// GEMM configuration. + +#define M_TILES 1024 +#define N_TILES 1024 +#define K_TILES 1024 + +#define M_GLOBAL (M * M_TILES) +#define N_GLOBAL (N * N_TILES) +#define K_GLOBAL (K * K_TILES) + +#define C_LAYOUT wmma::mem_row_major + +// Implementation constants. + +#define WARPS_PER_BLOCK 8 +#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) + +#if SHARED_MEMORY_LIMIT_64K +// With only 64 Kb of shared memory available, we can fit 8x16-tile chunks of each +// of the A and B matrices, which are (M = 8) * (K = 4) * 8 * (CHUNK_K = 16) * sizeof(double) = 32 Kb each. +// But then there is no room for the 4 Kb of total skew overhead, without which the performance +// would be severely impacted. So we choose to reduce the chunk size in half, +// i.e. the amount of A and B matrix data we cache in shared memory. +// Accordingly, this doubles the number of outer iterations across the global K +// dimension, which only slightly impacts the performance.
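+// For concreteness, a sketch of the arithmetic behind this choice (derived
+// from the macro values in this file): a full CHUNK_K = 16 chunk per matrix
+// is (BLOCK_COL_TILES * M) = 64 rows of (CHUNK_K * K) = 64 doubles, i.e.
+// 64 * 64 * 8 bytes = 32 Kb, so 64 Kb for A and B together. The skew padding
+// adds 2 * 64 rows * SKEW_DOUBLE * 8 bytes = 4 Kb on top, overflowing a
+// 64 Kb limit. Halving CHUNK_K brings the total, skew included, down to
+// 2 * 64 * (8 * 4 + 4) * 8 bytes = 36 Kb, which fits.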
+#define CHUNK_K 8 +#else +#define CHUNK_K 16 +#endif + +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(double)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) + +#define BLOCK_ROW_WARPS 2 +#define BLOCK_COL_WARPS 4 + +#define WARP_ROW_TILES 4 +#define WARP_COL_TILES 2 + +#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS) +#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS) + +#define GLOBAL_MEM_STRIDE N_GLOBAL + +#define SHMEM_STRIDE (N * BLOCK_ROW_TILES) +#define SHMEM_OFFSET (N * WARP_ROW_TILES) + +// The macro below is used to shift rows of the A matrix and columns of the B matrix +// in shared memory to minimize possible bank conflicts. +// Before performing the nvcuda::wmma::mma_sync operation, the warp must load the matrix +// data using the nvcuda::wmma::load_matrix_sync operation. Although the memory access pattern +// is not specified for that function, each lane in the warp can read one or multiple matrix +// elements from different matrix rows or columns. +// For shared memory, such access can result in bank conflicts if different rows / columns +// of the matrix map to the same bank. By shifting each row and column by a few bytes, we +// make sure that they map to different banks, thus reducing the number of possible bank +// conflicts. +// The number of 4 eight-byte "double" elements is chosen as the minimum possible shift because +// we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. +#define SKEW_DOUBLE 4 + +#define checkKernelErrors(expr) do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ +} while(0) + +enum kernels +{ + dmma_shmem_gemm_async_copy = 0, // DMMA shmem using kernel with async_copy + dmma_shmem_gemm_cg_async_copy = 1, // DMMA shmem using kernel with cooperative groups async_copy + dmma_shmem_gemm = 2, // DMMA shmem using kernel normal copy (without async_copy). + simple_dmma_gemm = 3 // DMMA non-shmem using simple kernel. +}; + +const char* kernelNames[] = {"compute_dgemm_async_copy", "compute_dgemm_cg_async_copy", + "compute_dgemm", "simple_wmma_gemm"}; + +using namespace nvcuda; +namespace nvcuda_namespace = nvcuda::experimental; +namespace cg = cooperative_groups; + +__host__ void init_host_matrices(double *a, double *b, double *c) +{ + for (int i = 0; i < M_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + a[i*K_GLOBAL+j] = (double) (rand() % 3); + } + } + + for (int i = 0; i < N_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + b[i*K_GLOBAL+j] = (double) (rand() % 3); + } + } + + for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { + c[t] = (double) (rand() % 3); + } +} + +__global__ void compute_dgemm(const double *A, const double *B, const double *C, double *D, double alpha, double beta) +{ +#if __CUDA_ARCH__ >= 800 + extern __shared__ double shmem[][CHUNK_K * K + SKEW_DOUBLE]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + + // This pointer is used to access the C and D matrix tiles this warp computes. 
+ double *shmem_warp_tile_ptr = (double*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. + double *shmem_warp_stream_ptr = (double*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + + // Each CTA slides along the 64 x 64 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy memory from to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const double *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((int4 *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + } + + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const double *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const double *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. 
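+ // Concretely, with the default CHUNK_K = 16 the derived constants are
+ // CHUNK_LINE_BYTES = 16 * 4 * 8 = 512 bytes and WARP_COPY_BYTES =
+ // 32 * 16 = 512 bytes, so CHUNK_COPY_LINES_PER_WARP = 1 and
+ // CHUNK_COPY_LINE_LANES = 32: all 32 lanes cooperate on a single line per
+ // iteration, each moving one int4. Only under SHARED_MEMORY_LIMIT_64K
+ // (CHUNK_K = 8) does the warp actually split into two 16-lane halves
+ // copying two lines at a time.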
+ const double *lane_ptr = warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL; + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + + +#pragma unroll + for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP); i++) { + // Copy 16 bytes at once in each lane. + *((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); + + // Advance the global memory pointer and the shared memory index. + lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId/2) * M * 2 + (i * M); + const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); + + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + double *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. + double *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +#endif +} + +__global__ void compute_dgemm_async_copy(const double *A, const double *B, const double *C, double *D, double alpha, double beta) +{ +#if __CUDA_ARCH__ >= 800 + extern __shared__ double shmem[][CHUNK_K * K + SKEW_DOUBLE]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + + // This pointer is used to access the C and D matrix tiles this warp computes. + double *shmem_warp_tile_ptr = (double*)&shmem[0][0] + (warpId/BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. 
+ double *shmem_warp_stream_ptr = (double*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + + // Each CTA slides along the 64 x 64 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + +#if USE_CPP_API + nvcuda_namespace::pipeline pipe; +#endif + for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy memory from to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const double *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < N; i++) { +#if USE_CPP_API + nvcuda_namespace::memcpy_async(*((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId), + *((int4*)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId), + pipe); + pipe.commit(); +#else + __pipeline_memcpy_async((reinterpret_cast(&shmem_warp_stream_ptr[(SHMEM_STRIDE * i)])) + laneId, + (reinterpret_cast(&src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i)])) + laneId, + sizeof(int4)); + __pipeline_commit(); +#endif + } + // Now wait for all the above issued 8 batches to complete. +#if USE_CPP_API + pipe.wait_prior<0>(); +#else + __pipeline_wait_prior(0); +#endif + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const double *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const double *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? 
(M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. + const double *lane_ptr = warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL; + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP); i++) { + // Copy 16 bytes at once in each lane. +#if USE_CPP_API + nvcuda_namespace::memcpy_async(*((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)), + *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)), pipe); + pipe.commit(); +#else + __pipeline_memcpy_async((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES), + (int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES), sizeof(int4)); + __pipeline_commit(); +#endif + // Advance the global memory pointer and the shared memory index. + lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + +#if USE_CPP_API + pipe.wait_prior<0>(); +#else + __pipeline_wait_prior(0); +#endif + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId/2) * M * 2 + (i * M); + const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); + + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + double *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. 
+ double *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +#endif +} + + __global__ void compute_dgemm_cg_async_copy(const double *A, const double *B, const double *C, double *D, double alpha, double beta) +{ +#if __CUDA_ARCH__ >= 800 + extern __shared__ double shmem[][CHUNK_K * K + SKEW_DOUBLE]; + auto cta = cg::this_thread_block(); + auto tile32 = cg::tiled_partition<32>(cta); + + constexpr int tileChunkCopySize = WARP_SIZE / CHUNK_COPY_LINES_PER_WARP; + auto tileChunkCopy = cg::tiled_partition(cta); + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + // This pointer is used to access the C and D matrix tiles this warp computes. + double *shmem_warp_tile_ptr = (double*)&shmem[0][0] + (warpId/2) * SHMEM_STRIDE * N * 2 + (warpId%2) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. + double *shmem_warp_stream_ptr = (double*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + + // Each CTA slides along the 64 x 64 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + + nvcuda_namespace::pipeline pipe; + + for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy memory from to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const double *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + + // Stream multiple C tiles to shared memory. + #pragma unroll + for (int i = 0; i < N; i++) + { + int4 *shMemCopy_t = reinterpret_cast(&shmem_warp_stream_ptr[(SHMEM_STRIDE * i)]); + const int4 *gMemCopy_t = reinterpret_cast(&src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i)]); + cg::memcpy_async(tile32, shMemCopy_t, tile32.size(), gMemCopy_t, tile32.size(), pipe); + } + cg::wait_prior<0>(tile32, pipe); + cg::sync(cta); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const double *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. 
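+ // Why the earlier beta /= alpha makes this correct: each tile is first
+ // scaled by beta' = beta / alpha, the A * B products are then accumulated
+ // into it, and the final per-element scaling by alpha (just before the
+ // store to shared memory) yields
+ // alpha * (A * B + (beta / alpha) * C) = alpha * A * B + beta * C.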
+#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const double *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. + int4 *lane_ptr = (int4*)(warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL); + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP); i++) { + // Copy 16 bytes at once in each lane. + int4 *shMemCopy_t = ((int4*)&shmem[shmem_idx][0]); + const int4 *gMemCopy_t = lane_ptr; + cg::memcpy_async(tileChunkCopy, shMemCopy_t, tileChunkCopy.size(), gMemCopy_t, tileChunkCopy.size(), pipe); + + // Advance the global memory pointer and the shared memory index. + lane_ptr = (int4*)((double*)lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + cg::wait_prior<0>(tileChunkCopy, pipe); + cg::sync(cta); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId/2) * M * 2 + (i * M); + const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); + + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. 
+#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + double *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. + double *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +#endif +} + + +// Performs an MxNxK DGEMM (C=alpha*A*B + beta*C) assuming: +// 1) Matrices are packed in memory. +// 2) M, N and K are multiples of 8, 8 and 4 respectively. +// 3) A is row major, B is column major matrix. +// Note: This is a less performant version of the compute_dgemm kernel. It is designed for +// demonstration purposes only to show the CUDA WMMA API use without relying on +// availability of the shared memory. +__global__ void simple_wmma_gemm(double *a, double *b, double *c, double *d, int m_ld, int n_ld, int k_ld, double alpha, double beta) +{ +#if __CUDA_ARCH__ >= 800 + // Leading dimensions. Packed with no transpositions. + int lda = k_ld; + int ldb = k_ld; + int ldc = n_ld; + + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Declare the fragments + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; + + wmma::fill_fragment(acc_frag, 0.0f); + + // Loop over k + for (int i = 0; i < k_ld; i += K) { + int aCol = i; + int aRow = warpM * M; + + int bCol = warpN * N; + int bRow = i; + + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb); + + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } + } + + // Load in the current value of c, scale it by beta, and add this our result scaled by alpha + int cCol = warpN * N; + int cRow = warpM * M; + + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); + + for(int i=0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } + + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major); + } +#endif +} + +__host__ void matMultiplyOnHost(double *A, double *B, double *C, + float alpha, float beta, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns) +{ + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + double temp = 0.0; + + for (int k = 0; k < numAColumns; k++) { + // B matrix is column major. A matrix is row major. + temp += A[i * numAColumns + k] * B[j * numBRows + k]; + } + + C[i*numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + } + } +} + +int main(int argc, char **argv) +{ + printf("Initializing...\n"); + + int dev = findCudaDevice(argc, (const char **)argv); + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + // Double precision Tensor cores require a GPU of Ampere (SM8X) architecture or higher. 
+ if (deviceProp.major < 8) { + printf("dmmaTensorCoreGemm requires SM 8.0 or higher. Exiting...\n"); + exit(EXIT_WAIVED); + } + + printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); + printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); + printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); + + double *A_h = NULL; + double *B_h = NULL; + double *C_h = NULL; +#if CPU_DEBUG + double *result_hD = NULL; + double *result_host = NULL; +#endif + + A_h = (double*) malloc(sizeof(double) * M_GLOBAL * K_GLOBAL); + B_h = (double*) malloc(sizeof(double) * K_GLOBAL * N_GLOBAL); + C_h = (double*) malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); +#if CPU_DEBUG + result_hD = (double*) malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); + result_host = (double*) malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); +#endif + + double *A = NULL; + double *B = NULL; + double *C = NULL; + double *D = NULL; + + checkCudaErrors(cudaMalloc((void**)&A, sizeof(double) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&B, sizeof(double) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&C, sizeof(double) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&D, sizeof(double) * M_GLOBAL * N_GLOBAL)); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + init_host_matrices(A_h, B_h, C_h); + + printf("Preparing data for GPU...\n"); + + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(double) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(double) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(double) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(double) * M_GLOBAL * N_GLOBAL)); + + enum { + // Compute the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the maximum of those + // two numbers. 
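+ // For the default configuration the two candidates work out to:
+ // A/B chunk cache: 8 * (8 * 8) * (16 * 4 + 4) * 2 = 69632 bytes (68 Kb);
+ // C/D tile: 8 * (2 * 4) * 8 * (4 * 2) * 8 = 32768 bytes (32 Kb);
+ // so SHMEM_SZ is 68 Kb, which is why the shmem kernels below must opt in
+ // via cudaFuncAttributeMaxDynamicSharedMemorySize before launching.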
+ SHMEM_SZ = MAX(sizeof(double) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_DOUBLE) * 2, + M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(double)) + }; + + printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); + + const double alpha = 1.1; + const double beta = 1.2; + + cudaEvent_t start, stop; + + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start)); + + kernels selected_kernel = dmma_shmem_gemm_async_copy; + + // kernel to run - default (dmma_shmem_gemm_async_copy == 0) + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + if (kernel_number < 4) + { + selected_kernel = (kernels)kernel_number; + } + else + { + printf("Error: kernel number should be between 0 and 3, you have entered %d\n", kernel_number); + exit(EXIT_FAILURE); + } + } + + // If enough shared memory is available on the GPU, use the high-performance kernel + if ((deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) && (selected_kernel != simple_dmma_gemm)) + { + printf("Computing using high performance kernel = %d - %s\n", selected_kernel, kernelNames[selected_kernel]); + + // Each shmem kernel runs one persistent CTA per SM, with THREADS_PER_BLOCK + // threads and SHMEM_SZ bytes of dynamic shared memory. + switch (selected_kernel) + { + case dmma_shmem_gemm_async_copy : + default: + checkCudaErrors(cudaFuncSetAttribute(compute_dgemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_dgemm_async_copy<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta))); + break; + case dmma_shmem_gemm_cg_async_copy : + checkCudaErrors(cudaFuncSetAttribute(compute_dgemm_cg_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_dgemm_cg_async_copy<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta))); + break; + case dmma_shmem_gemm : + checkCudaErrors(cudaFuncSetAttribute(compute_dgemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_dgemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta))); + break; + } + +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(double)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + else + { + dim3 gridDim; + dim3 blockDim; + + // blockDim.x must be a multiple of warpSize + // 128x4 means we have 16 warps and a block computes a 64x64 output tile + blockDim.x = 128; + blockDim.y = 4; + + gridDim.x = (M_GLOBAL + (M * blockDim.x / 32 - 1)) / (M * blockDim.x / 32); + gridDim.y = (N_GLOBAL + N * blockDim.y - 1) / (N * blockDim.y); + + printf("Computing... 
using simple_wmma_gemm kernel\n"); + simple_wmma_gemm<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(double) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + + checkCudaErrors(cudaEventRecord(stop)); + checkCudaErrors(cudaEventSynchronize(stop)); + +#if CPU_DEBUG + printf("Verifying correctness of the computations...\n"); + + memcpy(result_host, C_h, sizeof(double) * M_GLOBAL * N_GLOBAL); + + matMultiplyOnHost(A_h, B_h, result_host, + alpha, beta, + M_GLOBAL, K_GLOBAL, + K_GLOBAL, N_GLOBAL, + M_GLOBAL, N_GLOBAL); + + size_t number_of_matches = 0; + for (int i = 0; i < N_GLOBAL*M_GLOBAL; i++) { + if (fabs(result_hD[i] - result_host[i]) > 0.1f) + { + printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]); + break; + } + else + { + number_of_matches++; + } + } + printf("number_of_matches = %zu out of %d\n", number_of_matches, N_GLOBAL*M_GLOBAL); + free(result_hD); + free(result_host); +#endif + + float milliseconds = 0; + + checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); + + printf("Time: %f ms\n", milliseconds); + printf("FP64 TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12); + + free(A_h); + free(B_h); + free(C_h); + checkCudaErrors(cudaFree((void*)A)); + checkCudaErrors(cudaFree((void*)B)); + checkCudaErrors(cudaFree((void*)C)); + checkCudaErrors(cudaFree((void*)D)); + + return 0; +} diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2015.sln b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2015.sln new file mode 100644 index 00000000..ba841a1f --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dmmaTensorCoreGemm", "dmmaTensorCoreGemm_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2015.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2015.vcxproj new file mode 100644 index 00000000..8d8af215 --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + dmmaTensorCoreGemm_vs2015 + dmmaTensorCoreGemm + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + 
cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/dmmaTensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.sln b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.sln new file mode 100644 index 00000000..b1542da1 --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dmmaTensorCoreGemm", "dmmaTensorCoreGemm_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj new file mode 100644 index 00000000..f93e5d46 --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + dmmaTensorCoreGemm_vs2017 + dmmaTensorCoreGemm + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/dmmaTensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.sln b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.sln new file mode 100644 index 00000000..6ba525c9 --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = 
"dmmaTensorCoreGemm", "dmmaTensorCoreGemm_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj new file mode 100644 index 00000000..a01153fc --- /dev/null +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + dmmaTensorCoreGemm_vs2019 + dmmaTensorCoreGemm + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/dmmaTensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/globalToShmemAsyncCopy/Makefile b/Samples/globalToShmemAsyncCopy/Makefile new file mode 100644 index 00000000..155b0336 --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/Makefile @@ -0,0 +1,360 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L 
$(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - globalToShmemAsyncCopy is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 500) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 5.0.0 <<<) + else + $(info >>> Waiving build. 
Minimum GCC version required is 5.0.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 +else +SMS ?= 35 37 50 52 60 61 70 75 80 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: globalToShmemAsyncCopy + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +globalToShmemAsyncCopy.o:globalToShmemAsyncCopy.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +globalToShmemAsyncCopy: globalToShmemAsyncCopy.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./globalToShmemAsyncCopy + +clean: + rm -f globalToShmemAsyncCopy globalToShmemAsyncCopy.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/globalToShmemAsyncCopy + +clobber: clean diff --git a/Samples/globalToShmemAsyncCopy/NsightEclipse.xml b/Samples/globalToShmemAsyncCopy/NsightEclipse.xml new file mode 100644 index 00000000..aac3235c --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/NsightEclipse.xml @@ -0,0 +1,93 @@ + + + + globalToShmemAsyncCopy + + --std=c++11 + + + cudaEventCreate + cudaEventRecord + cudaEventQuery + cudaEventDestroy + cudaEventElapsedTime + cudaEventSynchronize + cudaMalloc + cudaFree + cudaMemcpy + + + whole + + ./ + ../ + ../../common/inc + + + CUDA Runtime API + Linear Algebra + CPP11 CUDA + + + CUDA + matrix multiply + Async copy + CPP11 + GCC 5.0.0 + + + + + + true + globalToShmemAsyncCopy.cu + + CPP11 + + + 1:CUDA Basic Topics + 3:Linear Algebra + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + + + x86_64 + linux + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + aarch64 + linux + + + aarch64 + qnx + + + windows7 + + + + all + + Global Memory to Shared Memory Async Copy + diff --git a/Samples/globalToShmemAsyncCopy/README.md b/Samples/globalToShmemAsyncCopy/README.md new file mode 100644 index 00000000..e501458c --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/README.md @@ -0,0 +1,74 @@ +# globalToShmemAsyncCopy - Global Memory to Shared Memory Async Copy + +## Description + +This sample implements matrix multiplication which uses asynchronous copy of data from global to shared memory when on compute capability 8.0 or higher. Also demonstrates arrive-wait barrier for synchronization. 
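+
+For readers new to async-copy, the following minimal device-side sketch (illustrative only, not code from this sample; the kernel name and tile size are assumptions) shows the pattern the kernels build on: each thread issues a non-blocking global-to-shared copy, commits the batch, then waits for it to land before reading shared memory.
+```
+#include <cuda_pipeline.h>
+
+// Illustrative kernel, not part of the sample.
+__global__ void scaleViaSharedTile(const float *in, float *out, int n) {
+  __shared__ float tile[256];                // assumes blockDim.x == 256
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) {
+    // Non-blocking 4-byte copy, global -> shared
+    __pipeline_memcpy_async(&tile[threadIdx.x], &in[i], sizeof(float));
+  }
+  __pipeline_commit();                       // close this batch of copies
+  __pipeline_wait_prior(0);                  // wait until the batch has landed
+  __syncthreads();                           // make the tile visible block-wide
+  if (i < n) out[i] = tile[threadIdx.x] * 2.0f;
+}
+```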
+
+## Key Concepts
+
+CUDA Runtime API, Linear Algebra, CPP11 CUDA
+
+## Supported SM Architectures
+
+[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l, aarch64
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventElapsedTime, cudaEventSynchronize, cudaMalloc, cudaFree, cudaMemcpy
+
+## Dependencies needed to build/run
+[CPP11](../../README.md#cpp11)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
new file mode 100644
index 00000000..4b0de1bc
--- /dev/null
+++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu
@@ -0,0 +1,951 @@
+/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Matrix multiplication: C = A * B.
+ *
+ * This sample implements matrix multiplication which makes use of shared memory
+ * to ensure data reuse; the matrix multiplication is done using a tiling approach.
+ * On compute capability 8.0 or higher, the CUDA kernels involved asynchronously copy data
+ * from global to shared memory, a.k.a. async-copy.
+ * This sample has been written for clarity of exposition to illustrate various CUDA programming
+ * principles, not with the goal of providing the most performant generic kernel for matrix multiplication.
+ */
+
+// System includes
+#include <stdio.h>
+#include <assert.h>
+
+// CUDA runtime
+#include <cuda_runtime.h>
+#include <cuda_pipeline.h>
+#if __CUDA_ARCH__ >= 700
+#include <cuda_awbarrier.h>
+#endif
+
+// Helper functions and utilities to work with CUDA
+#include <helper_functions.h>
+#include <helper_cuda.h>
+
+namespace nvcuda_namespace = nvcuda::experimental;
+
+enum kernels
+{
+  AsyncCopyMultiStageLargeChunk = 0,
+  AsyncCopyLargeChunk = 1,
+  AsyncCopyLargeChunkAWBarrier = 2,
+  AsyncCopyMultiStage = 3,
+  AsyncCopySingleStage = 4,
+  Naive = 5,
+  NaiveLargeChunk = 6
+};
+
+const char* kernelNames[] = {"AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk",
+                             "AsyncCopyLargeChunkAWBarrier", "AsyncCopyMultiStage",
+                             "AsyncCopySingleStage", "Naive", "NaiveLargeChunk"};
+
+#define USE_CPP_API 0
+
+constexpr int blockSize = 16;
+
+// Multi Stage memcpy_async pipeline with large chunk copy
+template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float* __restrict__ C,
+                                                                                 const float* __restrict__ A,
+                                                                                 const float* __restrict__ B, int wA,
+                                                                                 int wB) {
+  // Requires BLOCK_SIZE % 4 == 0
+
+  // Multi-stage pipeline version
+  constexpr size_t maxPipelineStages = 4;
+
+  // Declaration of the shared memory array As used to
+  // store the sub-matrix of A for each stage
+  __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];
+
+  // Declaration of the shared memory array Bs used to
+  // store the sub-matrix of B for each stage
+  __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];
+
+  float Csub = 0.0;
+
+  // Index of the first sub-matrix of A processed by the block
+  const int aBegin = wA * (BLOCK_SIZE) * blockIdx.y;
+
+  // Index of the last sub-matrix of A processed by the block
+  const int aEnd = aBegin + wA - 1;
+
+  // Step size used to iterate through the sub-matrices of A
+  int aStep = BLOCK_SIZE;
+
+  // Index of the first sub-matrix of B processed by the block
+  const int bBegin = BLOCK_SIZE * blockIdx.x;
+
+  // Step size used to iterate through the sub-matrices of B
+  int bStep = BLOCK_SIZE * wB;
+
+  const int t4x = threadIdx.x * 4;
+
+#if USE_CPP_API
+  nvcuda_namespace::pipeline pipe;
+#endif
+  // Loop over all the sub-matrices of A and B
+  // required to compute the block sub-matrix
+  for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; a += aStep, b += bStep, ++i) {
+    // Load the matrices from device memory to shared memory; each thread loads
+    // one element of each matrix
+
+    for ( ; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage )
+    {
+      if ( aStage <= aEnd && t4x < BLOCK_SIZE )
+      {
+        // Rotating buffer
+        const int j = iStage % maxPipelineStages;
+        float4 * const A4s = reinterpret_cast<float4*>(& As[j][threadIdx.y][t4x]);
+        float4 * const B4s = reinterpret_cast<float4*>(& Bs[j][threadIdx.y][t4x]);
+        const float4 * const A4 = reinterpret_cast<const float4*>(& A[aStage + wA * threadIdx.y + t4x]);
+        const float4 * const B4 = reinterpret_cast<const float4*>(& B[aStage + wA * threadIdx.y + t4x]);
+
+#if USE_CPP_API
+        nvcuda_namespace::memcpy_async(*A4s, *A4, pipe);
+        nvcuda_namespace::memcpy_async(*B4s, *B4, pipe);
+#else
+        __pipeline_memcpy_async(A4s, A4, sizeof(float4));
+        __pipeline_memcpy_async(B4s, B4, sizeof(float4));
+#endif
+      }
+
+#if USE_CPP_API
+      pipe.commit();
+#else
+      __pipeline_commit();
+#endif
+    }
+#if USE_CPP_API
+    pipe.wait_prior<maxPipelineStages - 1>();
+#else
+    __pipeline_wait_prior(maxPipelineStages - 1);
+#endif
+    // Synchronize to make sure the matrices are loaded
+    __syncthreads();
+
+    // Rotating buffer
+    const int j = i % maxPipelineStages;
+
+    // Multiply the two matrices together;
+    // each thread computes one element
+    // of the block
sub-matrix + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + + // Don't have to synchronize because + // next iteration is loading to a different buffer + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +// Single Stage memcpy_async pipeline with Large copy chunk (float4) +template __global__ void MatrixMulAsyncCopyLargeChunk(float* __restrict__ C, + const float* __restrict__ A, + const float* __restrict__ B, int wA, + int wB) { + // Requires BLOCK_SIZE % 4 == 0 + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Single-stage pipeline version + float Csub = 0.0; + + const int t4x = threadIdx.x * 4; +#if USE_CPP_API + nvcuda_namespace::pipeline pipe; +#endif + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. 
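+    // A float4 request moves 16 bytes (four consecutive floats) per async
+    // copy, so the t4x < BLOCK_SIZE test below selects the quarter of the
+    // threads in each row that issue the copies.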
+ + // Previously, per-thread: + // As[ty][tx] = A[a + wA * ty + tx]; + // Bs[ty][tx] = B[b + wB * ty + tx]; + + // Now, one fourth of the threads load four elements of each matrix + if ( t4x < BLOCK_SIZE ) { + float4 * const A4s = reinterpret_cast(& As[threadIdx.y][t4x]); + float4 * const B4s = reinterpret_cast(& Bs[threadIdx.y][t4x]); + const float4 * const A4 = reinterpret_cast(& A[a + wA * threadIdx.y + t4x]); + const float4 * const B4 = reinterpret_cast(& B[a + wA * threadIdx.y + t4x]); + +#if USE_CPP_API + nvcuda_namespace::memcpy_async(*A4s,*A4,pipe); + nvcuda_namespace::memcpy_async(*B4s,*B4,pipe); + + pipe.commit_and_wait(); +#else + __pipeline_memcpy_async(A4s, A4, sizeof(float4)); + __pipeline_memcpy_async(B4s, B4, sizeof(float4)); + + __pipeline_commit(); + __pipeline_wait_prior(0); +#endif + } + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +// Single Stage memcpy_async pipeline with Large copy chunk (float4) using arrive-wait barrier +template __global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float* __restrict__ C, + const float* __restrict__ A, + const float* __restrict__ B, int wA, + int wB) { +#if __CUDA_ARCH__ >= 700 + // Requires BLOCK_SIZE % 4 == 0 + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + nvcuda_namespace::pipeline pipe; + __shared__ nvcuda_namespace::awbarrier barrier; + + if (threadIdx.x == 0) { + nvcuda_namespace::init(&barrier, blockDim.x*blockDim.y); + } + __syncthreads(); + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + float Csub = 0.0; + + const int t4x = threadIdx.x * 4; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. 
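+    // The async copies issued below are bound to the arrive-wait barrier via
+    // pipe.arrive_on(barrier); barrier.arrive_and_wait() then acts as the
+    // block-wide synchronization for the load phase, taking the place of the
+    // __pipeline_commit()/__pipeline_wait_prior() pairing used above.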
+ + // Now, one fourth of the threads load four elements of each matrix + if ( t4x < BLOCK_SIZE ) { + float4 * const A4s = reinterpret_cast(& As[threadIdx.y][t4x]); + float4 * const B4s = reinterpret_cast(& Bs[threadIdx.y][t4x]); + const float4 * const A4 = reinterpret_cast(& A[a + wA * threadIdx.y + t4x]); + const float4 * const B4 = reinterpret_cast(& B[a + wA * threadIdx.y + t4x]); + + nvcuda_namespace::memcpy_async(*A4s,*A4,pipe); + nvcuda_namespace::memcpy_async(*B4s,*B4,pipe); + + pipe.arrive_on(barrier); + } + + // Synchronize to make sure the matrices are loaded + barrier.arrive_and_wait(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +#endif +} + +// Single Stage memcpy_async pipeline with float copy +template __global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A, + const float *B, int wA, + int wB) { + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Single-stage pipeline version + float Csub = 0.0; + +#if USE_CPP_API + nvcuda_namespace::pipeline pipe; +#endif + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + { + const float *A_float = reinterpret_cast(A + a + wA * threadIdx.y + threadIdx.x); + const float *B_float = reinterpret_cast(B + b + wB * threadIdx.y + threadIdx.x); + +#if USE_CPP_API + + nvcuda_namespace::memcpy_async(As[threadIdx.y][threadIdx.x], *A_float, pipe); + nvcuda_namespace::memcpy_async(Bs[threadIdx.y][threadIdx.x], *B_float, pipe); + + pipe.commit_and_wait(); +#else + __pipeline_memcpy_async(&As[threadIdx.y][threadIdx.x], A_float, sizeof(float)); + __pipeline_memcpy_async(&Bs[threadIdx.y][threadIdx.x], B_float, sizeof(float)); + + __pipeline_commit(); + __pipeline_wait_prior(0); +#endif + } + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two 
new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +// Multi Stage memcpy_async pipeline with int copy +template __global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C, + const float* __restrict__ A, + const float* __restrict__ B, int wA, + int wB) { + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; + + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + + float Csub = 0.0; + + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + +#if USE_CPP_API + nvcuda_namespace::pipeline pipe; +#endif + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; a += aStep, b += bStep, ++i ) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + + for ( ; aStage <= a + aStep * maxPipelineStages ; aStage += aStep, bStage += bStep, ++iStage ) + { + if ( aStage <= aEnd ) + { + const float *A_float = reinterpret_cast(A + aStage + wA * threadIdx.y + threadIdx.x); + const float *B_float = reinterpret_cast(B + bStage + wB * threadIdx.y + threadIdx.x); + + // Rotating buffer + const int j = iStage % maxPipelineStages; +#if USE_CPP_API + nvcuda_namespace::memcpy_async(As[j][threadIdx.y][threadIdx.x], *A_float, pipe); + nvcuda_namespace::memcpy_async(Bs[j][threadIdx.y][threadIdx.x], *B_float, pipe); +#else + __pipeline_memcpy_async(&As[j][threadIdx.y][threadIdx.x], A_float, sizeof(float)); + __pipeline_memcpy_async(&Bs[j][threadIdx.y][threadIdx.x], B_float, sizeof(float)); +#endif + } +#if USE_CPP_API + pipe.commit(); +#else + __pipeline_commit(); +#endif + } +#if USE_CPP_API + pipe.wait_prior(); +#else + __pipeline_wait_prior(maxPipelineStages-1); +#endif + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + const int j = i % maxPipelineStages; + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + + // Don't have to synchronize because + // next iteration is loading to a different buffer + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +/** + * Matrix multiplication (CUDA Kernel) on the device: C = A * B + * wA is A's width and wB is B's width + */ +template __global__ void MatrixMulNaive(float 
*C, float *A, + float *B, int wA, + int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; + a <= aEnd; + a += aStep, b += bStep) { + + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x]; + Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x]; + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +template __global__ void MatrixMulNaiveLargeChunk(float *C, float *A, + float *B, int wA, + int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + int t4x = threadIdx.x * 4 ; + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; + a <= aEnd; + a += aStep, b += bStep) { + + // Load the matrices from device memory + // to shared memory; + + // One fourth of the threads load four elements of each matrix + if ( t4x < BLOCK_SIZE ) { + float4 * const A4s = reinterpret_cast(& As[threadIdx.y][t4x]); + float4 * const B4s = reinterpret_cast(& Bs[threadIdx.y][t4x]); + const float4 * const A4 = 
reinterpret_cast(& A[a + wA * threadIdx.y + t4x]); + const float4 * const B4 = reinterpret_cast(& B[a + wA * threadIdx.y + t4x]); + *A4s = *A4 ; + *B4s = *B4 ; + } + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + + +void ConstantInit(float *data, int size, float val) { + for (int i = 0; i < size; ++i) { + data[i] = val; + } +} + +/** + * Run matrix multiplication using CUDA + */ +int MatrixMultiply(int argc, char **argv, + const dim3 &dimsA, + const dim3 &dimsB, + kernels kernel_number) { + // Allocate host memory for matrices A and B + unsigned int size_A = dimsA.x * dimsA.y; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A = reinterpret_cast(malloc(mem_size_A)); + unsigned int size_B = dimsB.x * dimsB.y; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B = reinterpret_cast(malloc(mem_size_B)); + cudaStream_t stream; + + // Initialize host memory + const float valB = 2.10f; + ConstantInit(h_A, size_A, 1.0f); + ConstantInit(h_B, size_B, valB); + + // Allocate device memory + float *d_A, *d_B, *d_C; + + // Allocate host matrix C + dim3 dimsC(dimsB.x, dimsA.y, 1); + unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); + float *h_C = reinterpret_cast(malloc(mem_size_C)); + + if (h_C == NULL) { + fprintf(stderr, "Failed to allocate host matrix C!\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); + // Allocate CUDA events that we'll use for timing + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // copy host memory to device + checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream)); + + // Setup execution parameters + dim3 threads(blockSize, blockSize); + dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); + + + printf("Running kernel = %d - %s\n", kernel_number, kernelNames[kernel_number]); + // Create and start timer + printf("Computing result using CUDA Kernel...\n"); + + // Performs warmup operation using matrixMul CUDA kernel + switch (kernel_number) + { + case AsyncCopyMultiStageLargeChunk : + default: + MatrixMulAsyncCopyMultiStageLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunk : + MatrixMulAsyncCopyLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunkAWBarrier : + MatrixMulAsyncCopyLargeChunkAWBarrier<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStage : + MatrixMulAsyncCopyMultiStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + 
case AsyncCopySingleStage : + MatrixMulAsyncCopySingleStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case Naive : + MatrixMulNaive<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case NaiveLargeChunk: + MatrixMulNaiveLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + } + + printf("done\n"); + checkCudaErrors(cudaStreamSynchronize(stream)); + + + // Execute the kernel + int nIter = 100; + + // Record the start event + checkCudaErrors(cudaEventRecord(start, stream)); + + for (int j = 0; j < nIter; j++) { + switch (kernel_number) + { + case AsyncCopyMultiStageLargeChunk : + default: + MatrixMulAsyncCopyMultiStageLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunk : + MatrixMulAsyncCopyLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunkAWBarrier : + MatrixMulAsyncCopyLargeChunkAWBarrier<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStage : + MatrixMulAsyncCopyMultiStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopySingleStage : + MatrixMulAsyncCopySingleStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case Naive : + MatrixMulNaive<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case NaiveLargeChunk: + MatrixMulNaiveLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + } + } + + // Record the stop event + checkCudaErrors(cudaEventRecord(stop, stream)); + + // Wait for the stop event to complete + checkCudaErrors(cudaEventSynchronize(stop)); + + float msecTotal = 0.0f; + checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); + + // Compute and print the performance + float msecPerMatrixMul = msecTotal / nIter; + double flopsPerMatrixMul = 2.0 * static_cast(dimsA.x) * + static_cast(dimsA.y) * + static_cast(dimsB.x); + double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / + (msecPerMatrixMul / 1000.0f); + printf( + "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," \ + " WorkgroupSize= %u threads/block\n", + gigaFlops, + msecPerMatrixMul, + flopsPerMatrixMul, + threads.x * threads.y); + + // Copy result from device to host + checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + printf("Checking computed result for correctness: "); + bool correct = true; + + // test relative error by the formula + // |_cpu - _gpu|/<|x|, |y|> < eps + double eps = 1.e-6; // machine zero + + for (int i = 0; i < static_cast(dimsC.x * dimsC.y); i++) { + double abs_err = fabs(h_C[i] - (dimsA.x * valB)); + double dot_length = dimsA.x; + double abs_val = fabs(h_C[i]); + double rel_err = abs_err / abs_val / dot_length; + + if (rel_err > eps) { + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", + i, h_C[i], dimsA.x * valB, eps); + correct = false; + } + } + + printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + + // Clean up memory + free(h_A); + free(h_B); + free(h_C); + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_C)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + printf("\nNOTE: The CUDA Samples are not meant for performance"\ + "measurements. 
Results may vary when GPU Boost is enabled.\n");
+
+  if (correct) {
+    return EXIT_SUCCESS;
+  } else {
+    return EXIT_FAILURE;
+  }
+}
+
+
+int main(int argc, char **argv) {
+  printf("[Matrix Multiply Using CUDA] - Starting...\n");
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
+      checkCmdLineFlag(argc, (const char **)argv, "?")) {
+    printf("Usage -device=n (n >= 0 for deviceID)\n");
+    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
+    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
+    printf("      -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)\n");
+    printf("                            (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStage)\n");
+    printf("                            (4 - AsyncCopySingleStage; 5 - Naive without memcpy_async)\n");
+    printf("                            (6 - NaiveLargeChunk without memcpy_async)\n");
+    printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+
+    exit(EXIT_SUCCESS);
+  }
+
+  // This will pick the best possible CUDA capable device, otherwise
+  // override the device ID based on input provided at the command line
+  int dev = findCudaDevice(argc, (const char **)argv);
+
+  int matrixBlock = 32;
+  dim3 dimsA(10 * 2 * matrixBlock, 10 * 2 * matrixBlock, 1);
+  dim3 dimsB(10 * 2 * matrixBlock, 10 * 2 * matrixBlock, 1);
+
+  // width of Matrix A
+  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
+    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
+  }
+
+  // height of Matrix A
+  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
+    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
+  }
+
+  // width of Matrix B
+  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
+    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
+  }
+
+  // height of Matrix B
+  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
+    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
+  }
+
+  if (dimsA.x != dimsB.y) {
+    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
+           dimsA.x, dimsB.y);
+    exit(EXIT_FAILURE);
+  }
+
+  kernels selected_kernel = AsyncCopyMultiStageLargeChunk;
+
+  // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0)
+  if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) {
+    int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel");
+    if (kernel_number < 7)
+    {
+      selected_kernel = (kernels)kernel_number;
+    }
+    else
+    {
+      printf("Error: kernel number should be between 0 and 6, you have entered %d\n", kernel_number);
+      exit(EXIT_FAILURE);
+    }
+
+    int major = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+    if ((kernel_number == AsyncCopyLargeChunkAWBarrier) && major < 7)
+    {
+      printf("AsyncCopyLargeChunkAWBarrier kernel requires SM 7.0 or higher.
Exiting...\n"); + exit(EXIT_WAIVED); + } + } + + + printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, + dimsB.x, dimsB.y); + + int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel); + + exit(matrix_result); +} + diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2015.sln b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2015.sln new file mode 100644 index 00000000..aa01fd15 --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2015.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2015.vcxproj new file mode 100644 index 00000000..d8cd0a9c --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + globalToShmemAsyncCopy_vs2015 + globalToShmemAsyncCopy + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/globalToShmemAsyncCopy.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.sln b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.sln new file mode 100644 index 00000000..848a7fc9 --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 
= Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj new file mode 100644 index 00000000..809e05a3 --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + globalToShmemAsyncCopy_vs2017 + globalToShmemAsyncCopy + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/globalToShmemAsyncCopy.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.sln b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.sln new file mode 100644 index 00000000..7e24c8a6 --- /dev/null +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj new file mode 100644 index 00000000..7ee2da9b --- 
/dev/null +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + globalToShmemAsyncCopy_vs2019 + globalToShmemAsyncCopy + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/globalToShmemAsyncCopy.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/immaTensorCoreGemm/Makefile b/Samples/immaTensorCoreGemm/Makefile index 9e08de11..b70ce8f0 100644 --- a/Samples/immaTensorCoreGemm/Makefile +++ b/Samples/immaTensorCoreGemm/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -259,9 +280,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 72 75 +SMS ?= 72 75 80 else -SMS ?= 75 +SMS ?= 75 80 endif ifeq ($(SMS),) diff --git a/Samples/immaTensorCoreGemm/NsightEclipse.xml b/Samples/immaTensorCoreGemm/NsightEclipse.xml index d87551ca..1e030bb5 100644 --- a/Samples/immaTensorCoreGemm/NsightEclipse.xml +++ b/Samples/immaTensorCoreGemm/NsightEclipse.xml @@ -40,6 +40,7 @@ sm72 sm75 + sm80 x86_64 diff --git a/Samples/immaTensorCoreGemm/README.md b/Samples/immaTensorCoreGemm/README.md index 8f44aeec..072a9efb 100644 --- a/Samples/immaTensorCoreGemm/README.md +++ b/Samples/immaTensorCoreGemm/README.md @@ -10,7 +10,7 @@ Matrix Multiply, WMMA, Tensor Cores ## Supported SM Architectures -[SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported 
OSes @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu b/Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu index 2d075447..6245eb95 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu @@ -160,11 +160,11 @@ // columns. For shared memory, such access can result in bank conflicts if // different rows / columns of the matrix map to the same bank. By shifting each // row and column by a few bytes, we make sure that they map to different banks, -// thus reducing the number of possible bank conflicts. The number of 16 +// thus reducing the number of possible bank conflicts. The number of 32 // one-byte "uint8_t" elements is chosen as the minimum possible shift because -// we must keep each row and column 128-bit aligned, as required by +// we must keep each row and column 256-bit aligned, as required by // nvcuda::wmma::load_matrix_sync. -#define SKEW_UINT8 16 +#define SKEW_UINT8 32 #define checkKernelErrors(expr) \ do { \ diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj index 1170b23c..ea13b074 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/immaTensorCoreGemm.exe - compute_75,sm_75; + compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj index fd7e1efd..a0ffbad8 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/immaTensorCoreGemm.exe - compute_75,sm_75; + compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj index 06e8e895..c0513b0c 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/immaTensorCoreGemm.exe - compute_75,sm_75; + compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj index 8e4d582d..6bdf2fb8 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/immaTensorCoreGemm.exe - compute_75,sm_75; + compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj index d7590007..f8b5076c 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj +++ 
b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/immaTensorCoreGemm.exe - compute_75,sm_75; + compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/jacobiCudaGraphs/Makefile b/Samples/jacobiCudaGraphs/Makefile index b1e53f6e..4c0c967e 100644 --- a/Samples/jacobiCudaGraphs/Makefile +++ b/Samples/jacobiCudaGraphs/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir 
$(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/jacobiCudaGraphs/NsightEclipse.xml b/Samples/jacobiCudaGraphs/NsightEclipse.xml index aa7ed599..eb1140ba 100644 --- a/Samples/jacobiCudaGraphs/NsightEclipse.xml +++ b/Samples/jacobiCudaGraphs/NsightEclipse.xml @@ -42,16 +42,6 @@ 1:CUDA - sm30 - sm35 - sm37 - sm50 - sm52 - sm60 - sm61 - sm70 - sm72 - sm75 x86_64 diff --git a/Samples/jacobiCudaGraphs/README.md b/Samples/jacobiCudaGraphs/README.md index deff3787..a5211bd2 100644 --- a/Samples/jacobiCudaGraphs/README.md +++ b/Samples/jacobiCudaGraphs/README.md @@ -10,11 +10,9 @@ CUDA Graphs, Stream Capture, Instantiated CUDA Graph Update, Cooperative Groups ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) - ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +64,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2012.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2012.vcxproj index ef259877..7ffef47a 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2012.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/jacobiCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2013.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2013.vcxproj index 8409ac1d..bb5da1db 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2013.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/jacobiCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2015.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2015.vcxproj index c41d0353..3bd2f880 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2015.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/jacobiCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj index e8bb6ad3..bbf2b2a4 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/jacobiCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj index f18a58fa..1f77230e 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/jacobiCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + 
compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMul/Makefile b/Samples/matrixMul/Makefile index df89c55f..b99c1750 100644 --- a/Samples/matrixMul/Makefile +++ b/Samples/matrixMul/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq 
($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/matrixMul/NsightEclipse.xml b/Samples/matrixMul/NsightEclipse.xml index 364b84c1..6583eb57 100644 --- a/Samples/matrixMul/NsightEclipse.xml +++ b/Samples/matrixMul/NsightEclipse.xml @@ -38,7 +38,6 @@ 1:CUDA Basic Topics 3:Linear Algebra - sm30 sm35 sm37 sm50 @@ -48,6 +47,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/matrixMul/README.md b/Samples/matrixMul/README.md index bf4f7cda..c43508ce 100644 --- a/Samples/matrixMul/README.md +++ b/Samples/matrixMul/README.md @@ -10,11 +10,11 @@ CUDA Runtime API, Linear Algebra ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +66,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. 
For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/matrixMul/matrixMul_vs2012.vcxproj b/Samples/matrixMul/matrixMul_vs2012.vcxproj index d68e275d..a8c2802f 100644 --- a/Samples/matrixMul/matrixMul_vs2012.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMul/matrixMul_vs2013.vcxproj b/Samples/matrixMul/matrixMul_vs2013.vcxproj index a01c6c5f..7d062bbe 100644 --- a/Samples/matrixMul/matrixMul_vs2013.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMul/matrixMul_vs2015.vcxproj b/Samples/matrixMul/matrixMul_vs2015.vcxproj index 66a1e419..4db7b8b6 100644 --- a/Samples/matrixMul/matrixMul_vs2015.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMul/matrixMul_vs2017.vcxproj b/Samples/matrixMul/matrixMul_vs2017.vcxproj index ff75d2b1..11049e43 100644 --- a/Samples/matrixMul/matrixMul_vs2017.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMul/matrixMul_vs2019.vcxproj b/Samples/matrixMul/matrixMul_vs2019.vcxproj index a7387155..44fa1294 100644 --- a/Samples/matrixMul/matrixMul_vs2019.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + 
compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMulDrv/Makefile b/Samples/matrixMulDrv/Makefile index 8eb720e8..26a52302 100644 --- a/Samples/matrixMulDrv/Makefile +++ b/Samples/matrixMulDrv/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq 
($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -257,8 +278,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility @@ -284,6 +305,10 @@ else CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif @@ -298,12 +323,19 @@ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. 
<<<) diff --git a/Samples/matrixMulDrv/README.md b/Samples/matrixMulDrv/README.md index 08ccac03..27c82ed8 100644 --- a/Samples/matrixMulDrv/README.md +++ b/Samples/matrixMulDrv/README.md @@ -10,11 +10,11 @@ CUDA Driver API, Matrix Multiply ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +66,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj index df1dc3b6..a3f2db38 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/matrixMulDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj index 5070411e..f746ae9e 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/matrixMulDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj index 4e3de53f..9e2ddd4c 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/matrixMulDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj index 7b697003..7fa6c84b 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/matrixMulDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj index 5181d238..4387b288 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/matrixMulDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/memMapIPCDrv/Makefile b/Samples/memMapIPCDrv/Makefile index d983f210..38c48d5e 100644 --- a/Samples/memMapIPCDrv/Makefile +++ b/Samples/memMapIPCDrv/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -254,6 +275,12 @@ ifeq ($(TARGET_ARCH),aarch64) SAMPLE_ENABLED := 0 endif +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - memMapIPCDrv is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -275,8 +302,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# 
Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility @@ -302,6 +329,10 @@ else CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif @@ -316,12 +347,19 @@ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) diff --git a/Samples/memMapIPCDrv/README.md b/Samples/memMapIPCDrv/README.md index e5879f22..b4d6ba68 100644 --- a/Samples/memMapIPCDrv/README.md +++ b/Samples/memMapIPCDrv/README.md @@ -10,7 +10,7 @@ CUDA Driver API, cuMemMap IPC, MMAP ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuLaunchKernel, cuMemcpyD ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
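memMapIPCDrv exercises the CUDA virtual memory management API. For orientation, a hedged sketch of the export side that the parent performs for each child; the function name is assumed, the POSIX file descriptor handle type is a Linux assumption, and the sample wraps this differently:

```
// Sketch, not the sample's code: create a physical allocation with cuMemCreate
// and export it as an OS shareable handle that another process can import.
#include <cuda.h>

CUresult allocShareable(CUmemGenericAllocationHandle *outHandle, int *outFd,
                        size_t size, int deviceId) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = deviceId;
  prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

  // Real code first rounds `size` up to the value reported by
  // cuMemGetAllocationGranularity for this prop.
  CUresult status = cuMemCreate(outHandle, size, &prop, 0);
  if (status != CUDA_SUCCESS) return status;
  return cuMemExportToShareableHandle(
      outFd, *outHandle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
}
```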
## Build and Run diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2012.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2012.vcxproj index 63b48c6e..40a38b56 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2012.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2013.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2013.vcxproj index 1d8979f9..a44e3108 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2013.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2015.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2015.vcxproj index ebfba25c..b1586a4e 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2015.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj index 6b1494a5..53391214 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj index c6a2769b..121fe730 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/memMapIPCDrv/memMapIpc.cpp b/Samples/memMapIPCDrv/memMapIpc.cpp index 7c6c3845..729cd231 100644 --- a/Samples/memMapIPCDrv/memMapIpc.cpp +++ b/Samples/memMapIPCDrv/memMapIpc.cpp @@ -302,7 +302,7 @@ static void memMapGetDeviceFunction(char **argv) { cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel")); } -static void childProcess(int id, char **argv) { +static void childProcess(int devId, int id, char **argv) { volatile shmStruct *shm = NULL; sharedMemoryInfo info; ipcHandle *ipcChildHandle = NULL; @@ -329,7 +329,7 @@ static void childProcess(int id, char **argv) { CUstream stream; int multiProcessorCount; - checkCudaErrors(cuDeviceGet(&device, id)); + checkCudaErrors(cuDeviceGet(&device, devId)); checkCudaErrors(cuCtxCreate(&ctx, 0, device)); checkCudaErrors(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); @@ -350,7 +350,7 @@ static void childProcess(int id, char **argv) { // Import the memory allocations shared by the parent with us and map them in // our address space. - memMapImportAndMapMemory(d_ptr, DATA_BUF_SIZE, shHandle, id); + memMapImportAndMapMemory(d_ptr, DATA_BUF_SIZE, shHandle, devId); // Since we have imported allocations shared by the parent with us, we can // close all the ShareableHandles. 
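The import path that the childProcess changes above lean on (the sample's memMapImportAndMapMemory helper) can be summarized in a hedged sketch; variable names here are placeholders, and the POSIX fd handle type is again a Linux assumption:

```
// Sketch, not the sample's code: turn a shareable handle received over IPC
// back into an allocation handle, map it into a reserved VA range, and grant
// the importing device access.
#include <cstdint>
#include <cuda.h>

CUresult importAndMap(CUdeviceptr *dptr, size_t size, int fd, int deviceId) {
  CUmemGenericAllocationHandle handle;
  CUresult status = cuMemImportFromShareableHandle(
      &handle, (void *)(uintptr_t)fd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR);
  if (status != CUDA_SUCCESS) return status;

  cuMemAddressReserve(dptr, size, 0, 0, 0);  // reserve virtual address space
  cuMemMap(*dptr, size, 0, handle, 0);       // back it with the allocation

  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = deviceId;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  return cuMemSetAccess(*dptr, size, &accessDesc, 1);
}
```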
@@ -424,7 +424,6 @@ static void parentProcess(char *app) { checkCudaErrors(cuDeviceGetCount(&devCount)); std::vector<CUdevice> devices(devCount); - std::vector<CUcontext> ctxs(devCount); if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { printf("Failed to create shared memory slab\n"); @@ -436,9 +435,11 @@ for (i = 0; i < devCount; i++) { checkCudaErrors(cuDeviceGet(&devices[i], i)); - checkCudaErrors(cuCtxCreate(&ctxs[i], 0, devices[i])); } + std::vector<CUcontext> ctxs; + std::vector<int> selectedDevices; + // Pick all the devices that can access each other's memory for this test // Keep in mind that CUDA has minimal support for fork() without a // corresponding exec() in the child process, but in this case our @@ -500,6 +501,10 @@ } } if (allPeers) { + CUcontext ctx; + checkCudaErrors(cuCtxCreate(&ctx, 0, devices[i])); + ctxs.push_back(ctx); + // Enable peers here. This isn't necessary for IPC, but it will // setup the peers for the device. For systems that only allow 8 // peers per GPU at a time, this acts to remove devices from CanAccessPeer @@ -509,6 +514,7 @@ checkCudaErrors(cuCtxSetCurrent(ctxs[j])); checkCudaErrors(cuCtxEnablePeerAccess(ctxs[i], 0)); } + selectedDevices.push_back(i); nprocesses++; if (nprocesses >= MAX_DEVICES) { break; @@ -521,13 +527,17 @@ } } + for (int i = 0; i < ctxs.size(); ++i) { + checkCudaErrors(cuCtxDestroy(ctxs[i])); + }; + if (nprocesses == 0) { printf("No CUDA devices support IPC\n"); exit(EXIT_WAIVED); } shm->nprocesses = nprocesses; - unsigned char firstSelectedDevice = 0; + unsigned char firstSelectedDevice = selectedDevices[0]; std::vector<ShareableHandle> shHandles(nprocesses); std::vector<CUmemGenericAllocationHandle> allocationHandles(nprocesses); @@ -540,10 +550,12 @@ // Launch the child processes! for (i = 0; i < nprocesses; i++) { char devIdx[10]; - char *const args[] = {app, devIdx, NULL}; + char procIdx[10]; + char *const args[] = {app, devIdx, procIdx, NULL}; Process process; - SPRINTF(devIdx, "%d", i); + SPRINTF(devIdx, "%d", selectedDevices[i]); + SPRINTF(procIdx, "%d", i); if (spawnProcess(&process, app, args)) { printf("Failed to create process\n"); @@ -593,7 +605,7 @@ int main(int argc, char **argv) { if (argc == 1) { parentProcess(argv[0]); } else { - childProcess(atoi(argv[1]), argv); + childProcess(atoi(argv[1]), atoi(argv[2]), argv); } return EXIT_SUCCESS; #endif diff --git a/Samples/nvJPEG/Makefile b/Samples/nvJPEG/Makefile index 57322b71..1d3c8c56 100644 --- a/Samples/nvJPEG/Makefile +++ b/Samples/nvJPEG/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -246,12 +267,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - nvJPEG is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG/NsightEclipse.xml b/Samples/nvJPEG/NsightEclipse.xml index 6d7cc56c..2c30ca3c 100644 --- a/Samples/nvJPEG/NsightEclipse.xml +++ 
b/Samples/nvJPEG/NsightEclipse.xml @@ -31,16 +31,6 @@ 1:CUDA Basic Topics 3:JPEG Decoding - sm30 - sm35 - sm37 - sm50 - sm52 - sm60 - sm61 - sm70 - sm72 - sm75 x86_64 diff --git a/Samples/nvJPEG/README.md b/Samples/nvJPEG/README.md index 45b5a019..b556bddf 100644 --- a/Samples/nvJPEG/README.md +++ b/Samples/nvJPEG/README.md @@ -10,8 +10,6 @@ Image Decoding, NVJPEG Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) - ## Supported OSes Linux, Windows @@ -27,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/nvJPEG/nvJPEG.cpp b/Samples/nvJPEG/nvJPEG.cpp index 16dcbd19..1822949e 100644 --- a/Samples/nvJPEG/nvJPEG.cpp +++ b/Samples/nvJPEG/nvJPEG.cpp @@ -36,6 +36,10 @@ int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } int dev_free(void *p) { return (int)cudaFree(p); } +int host_malloc(void** p, size_t s, unsigned int f) { return (int)cudaHostAlloc(p, s, f); } + +int host_free(void* p) { return (int)cudaFreeHost(p); } + typedef std::vector<std::string> FileNames; typedef std::vector<std::vector<char> > FileData; @@ -50,6 +54,14 @@ struct decode_params_t { nvjpegHandle_t nvjpeg_handle; cudaStream_t stream; + // used with decoupled API + nvjpegJpegState_t nvjpeg_decoupled_state; + nvjpegBufferPinned_t pinned_buffers[2]; // 2 buffers for pipelining + nvjpegBufferDevice_t device_buffer; + nvjpegJpegStream_t jpeg_streams[2]; // 2 streams for pipelining + nvjpegDecodeParams_t nvjpeg_decode_params; + nvjpegJpegDecoder_t nvjpeg_decoder; + nvjpegOutputFormat_t fmt; bool write_decoded; std::string output_dir; @@ -195,6 +207,33 @@ int prepare_buffers(FileData &file_data, std::vector<size_t> &file_len, return EXIT_SUCCESS; } +void create_decoupled_api_handles(decode_params_t& params){ + + checkCudaErrors(nvjpegDecoderCreate(params.nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, &params.nvjpeg_decoder)); + checkCudaErrors(nvjpegDecoderStateCreate(params.nvjpeg_handle, params.nvjpeg_decoder, &params.nvjpeg_decoupled_state)); + + checkCudaErrors(nvjpegBufferPinnedCreate(params.nvjpeg_handle, NULL, &params.pinned_buffers[0])); + checkCudaErrors(nvjpegBufferPinnedCreate(params.nvjpeg_handle, NULL, &params.pinned_buffers[1])); + checkCudaErrors(nvjpegBufferDeviceCreate(params.nvjpeg_handle, NULL, &params.device_buffer)); + + checkCudaErrors(nvjpegJpegStreamCreate(params.nvjpeg_handle, &params.jpeg_streams[0])); + checkCudaErrors(nvjpegJpegStreamCreate(params.nvjpeg_handle, &params.jpeg_streams[1])); + + checkCudaErrors(nvjpegDecodeParamsCreate(params.nvjpeg_handle, &params.nvjpeg_decode_params)); +} + +void destroy_decoupled_api_handles(decode_params_t& params){ + + checkCudaErrors(nvjpegDecodeParamsDestroy(params.nvjpeg_decode_params)); + checkCudaErrors(nvjpegJpegStreamDestroy(params.jpeg_streams[0])); +
checkCudaErrors(nvjpegJpegStreamDestroy(params.jpeg_streams[1])); + checkCudaErrors(nvjpegBufferPinnedDestroy(params.pinned_buffers[0])); + checkCudaErrors(nvjpegBufferPinnedDestroy(params.pinned_buffers[1])); + checkCudaErrors(nvjpegBufferDeviceDestroy(params.device_buffer)); + checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_decoupled_state)); + checkCudaErrors(nvjpegDecoderDestroy(params.nvjpeg_decoder)); +} + void release_buffers(std::vector<nvjpegImage_t> &ibuf) { for (int i = 0; i < ibuf.size(); i++) { for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) @@ -206,37 +245,52 @@ int decode_images(const FileData &img_data, const std::vector<size_t> &img_len, std::vector<nvjpegImage_t> &out, decode_params_t &params, double &time) { checkCudaErrors(cudaStreamSynchronize(params.stream)); - nvjpegStatus_t err; - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + cudaEvent_t startEvent = NULL, stopEvent = NULL; + float loopTime = 0; + + checkCudaErrors(cudaEventCreate(&startEvent, cudaEventBlockingSync)); + checkCudaErrors(cudaEventCreate(&stopEvent, cudaEventBlockingSync)); if (!params.batched) { if (!params.pipelined) // decode one image at a time { - int thread_idx = 0; - sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(startEvent, params.stream)); for (int i = 0; i < params.batch_size; i++) { checkCudaErrors(nvjpegDecode(params.nvjpeg_handle, params.nvjpeg_state, (const unsigned char *)img_data[i].data(), img_len[i], params.fmt, &out[i], params.stream)); - checkCudaErrors(cudaStreamSynchronize(params.stream)); } + checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); } else { - int thread_idx = 0; - sdkStartTimer(&timer); + // use de-coupled API in pipelined mode + checkCudaErrors(cudaEventRecord(startEvent, params.stream)); + checkCudaErrors(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer)); + int buffer_index = 0; + checkCudaErrors(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt)); for (int i = 0; i < params.batch_size; i++) { - checkCudaErrors( - nvjpegDecodePhaseOne(params.nvjpeg_handle, params.nvjpeg_state, - (const unsigned char *)img_data[i].data(), - img_len[i], params.fmt, params.stream)); - checkCudaErrors(cudaStreamSynchronize(params.stream)); - checkCudaErrors(nvjpegDecodePhaseTwo( - params.nvjpeg_handle, params.nvjpeg_state, params.stream)); - checkCudaErrors(nvjpegDecodePhaseThree( - params.nvjpeg_handle, params.nvjpeg_state, &out[i], params.stream)); - } + checkCudaErrors( - nvjpegJpegStreamParse(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], + 0, 0, params.jpeg_streams[buffer_index])); + + checkCudaErrors(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state, + params.pinned_buffers[buffer_index])); + + checkCudaErrors(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, + params.nvjpeg_decode_params, params.jpeg_streams[buffer_index])); + checkCudaErrors(cudaStreamSynchronize(params.stream)); + + checkCudaErrors(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, + params.jpeg_streams[buffer_index], params.stream)); + + buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync + + checkCudaErrors(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, + &out[i], params.stream)); + + } + checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); } } else { std::vector<const unsigned char *> raw_inputs; @@ -244,30 +298,16 @@ int
decode_images(const FileData &img_data, const std::vector<size_t> &img_len, raw_inputs.push_back((const unsigned char *)img_data[i].data()); } - if (!params.pipelined) // decode multiple images in a single batch - { - sdkStartTimer(&timer); - checkCudaErrors(nvjpegDecodeBatched( - params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(), - img_len.data(), out.data(), params.stream)); - checkCudaErrors(cudaStreamSynchronize(params.stream)); - } else { - int thread_idx = 0; - for (int i = 0; i < params.batch_size; i++) { - checkCudaErrors(nvjpegDecodeBatchedPhaseOne( - params.nvjpeg_handle, params.nvjpeg_state, raw_inputs[i], - img_len[i], i, thread_idx, params.stream)); - } - checkCudaErrors(nvjpegDecodeBatchedPhaseTwo( - params.nvjpeg_handle, params.nvjpeg_state, params.stream)); - checkCudaErrors(nvjpegDecodeBatchedPhaseThree(params.nvjpeg_handle, - params.nvjpeg_state, - out.data(), params.stream)); - checkCudaErrors(cudaStreamSynchronize(params.stream)); - } + checkCudaErrors(cudaEventRecord(startEvent, params.stream)); + checkCudaErrors(nvjpegDecodeBatched( + params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(), + img_len.data(), out.data(), params.stream)); + checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); + } - sdkStopTimer(&timer); - time = sdkGetAverageTimerValue(&timer)/1000.0f; + checkCudaErrors(cudaEventSynchronize(stopEvent)); + checkCudaErrors(cudaEventElapsedTime(&loopTime, startEvent, stopEvent)); + time = static_cast<double>(loopTime); return EXIT_SUCCESS; } @@ -518,14 +558,20 @@ int main(int argc, const char *argv[]) { props.ECCEnabled ? "on" : "off"); nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free}; - checkCudaErrors(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, &dev_allocator, - &params.nvjpeg_handle)); + nvjpegPinnedAllocator_t pinned_allocator ={&host_malloc, &host_free}; + int flags = 0; + checkCudaErrors(nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &dev_allocator, + &pinned_allocator,flags, &params.nvjpeg_handle)); + checkCudaErrors( nvjpegJpegStateCreate(params.nvjpeg_handle, &params.nvjpeg_state)); checkCudaErrors( nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state, params.batch_size, 1, params.fmt)); + if(params.pipelined ){ + create_decoupled_api_handles(params); + } // read source images FileNames image_names; readInput(params.input_dir, image_names); @@ -556,6 +602,10 @@ params.batch_size) << std::endl; + if(params.pipelined ){ + destroy_decoupled_api_handles(params); + } + checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_state)); checkCudaErrors(nvjpegDestroy(params.nvjpeg_handle)); diff --git a/Samples/nvJPEG_encoder/Makefile b/Samples/nvJPEG_encoder/Makefile index 0f50b0c8..1d31e70a 100644 --- a/Samples/nvJPEG_encoder/Makefile +++ b/Samples/nvJPEG_encoder/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -246,12 +267,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - nvJPEG_encoder is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG_encoder/NsightEclipse.xml b/Samples/nvJPEG_encoder/NsightEclipse.xml index 0f9debba..c1df7f98 100644 --- 
a/Samples/nvJPEG_encoder/NsightEclipse.xml +++ b/Samples/nvJPEG_encoder/NsightEclipse.xml @@ -31,16 +31,6 @@ 1:CUDA Basic Topics 4:JPEG Encoding - sm30 - sm35 - sm37 - sm50 - sm52 - sm60 - sm61 - sm70 - sm72 - sm75 x86_64 diff --git a/Samples/nvJPEG_encoder/README.md b/Samples/nvJPEG_encoder/README.md index 89358216..97321d97 100644 --- a/Samples/nvJPEG_encoder/README.md +++ b/Samples/nvJPEG_encoder/README.md @@ -10,8 +10,6 @@ Image Encoding, NVJPEG Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) - ## Supported OSes Linux, Windows @@ -27,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/nvJPEG_encoder/nvJPEG_encoder.cpp b/Samples/nvJPEG_encoder/nvJPEG_encoder.cpp index 7390731b..ae37cb9e 100644 --- a/Samples/nvJPEG_encoder/nvJPEG_encoder.cpp +++ b/Samples/nvJPEG_encoder/nvJPEG_encoder.cpp @@ -35,8 +35,6 @@ int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } int dev_free(void *p) { return (int)cudaFree(p); } -StopWatchInterface *timer = NULL; - bool is_interleaved(nvjpegOutputFormat_t format) { if (format == NVJPEG_OUTPUT_RGBI || format == NVJPEG_OUTPUT_BGRI) @@ -51,6 +49,7 @@ struct encode_params_t { std::string format; std::string subsampling; int quality; + int huf; int dev; }; @@ -62,6 +61,11 @@ nvjpegEncoderState_t encoder_state; int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double &time, nvjpegOutputFormat_t output_format, nvjpegInputFormat_t input_format) { time = 0.; + cudaEvent_t startEvent = NULL, stopEvent = NULL; + float loopTime = 0; + checkCudaErrors(cudaEventCreate(&startEvent, cudaEventBlockingSync)); + checkCudaErrors(cudaEventCreate(&stopEvent, cudaEventBlockingSync)); + // Get the file name, without extension. // This will be used to rename the output file. size_t position = sImagePath.rfind("/"); @@ -87,7 +91,7 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double // Image buffers. 
unsigned char * pBuffer = NULL; - double decode_time = 0.; + double encoder_time = 0.; std::vector vBuffer(nSize); @@ -169,9 +173,7 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double int nReturnCode = 0; cudaDeviceSynchronize(); - // Create the CUTIL timer - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + nReturnCode = nvjpegDecode(nvjpeg_handle, jpeg_state, dpImage, nSize, output_format, &imgdesc, NULL); // alternatively decode by stages @@ -179,14 +181,14 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double nReturnCode = nvjpegDecodeMixed(nvjpeg_handle, NULL); nReturnCode = nvjpegDecodeGPU(nvjpeg_handle, NULL);*/ cudaDeviceSynchronize(); - sdkStopTimer(&timer); - decode_time =sdkGetTimerValue(&timer); + if (nReturnCode != 0) { std::cerr << "Error in nvjpegDecode." << std::endl; return 1; } + checkCudaErrors(cudaEventRecord(startEvent, NULL)); /////////////////////// encode //////////////////// if (NVJPEG_OUTPUT_YUV == output_format) { @@ -226,6 +228,12 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double obuffer.data(), &length, NULL)); + + checkCudaErrors(cudaEventRecord(stopEvent, NULL)); + checkCudaErrors(cudaEventSynchronize(stopEvent)); + checkCudaErrors(cudaEventElapsedTime(&loopTime, startEvent, stopEvent)); + encoder_time = static_cast<double>(loopTime); + std::string output_filename = sOutputPath + "/" + sFileName + ".jpg"; char directory[120]; char mkdir_cmd[256]; @@ -250,7 +258,7 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double } } - time = decode_time; + time = encoder_time; return 0; } @@ -352,14 +360,14 @@ int processArgs(encode_params_t param) return error_code; } - double total_time = 0., decode_time = 0.; + double total_time = 0., encoder_time = 0.; int total_images = 0; for (unsigned int i = 0; i < inputFiles.size(); i++) { std::string &sFileName = inputFiles[i]; std::cout << "Processing file: " << sFileName << std::endl; - int image_error_code = decodeEncodeOneImage(sFileName, sOutputPath, decode_time, oformat, iformat); + int image_error_code = decodeEncodeOneImage(sFileName, sOutputPath, encoder_time, oformat, iformat); if (image_error_code) { std::cerr << "Error processing file: " << sFileName << std::endl; @@ -368,11 +376,11 @@ int processArgs(encode_params_t param) else { total_images++; - total_time += decode_time; + total_time += encoder_time; } } std::cout << "Total images processed: " << total_images << std::endl; - std::cout << "Total time spent on decoding: " << total_time << std::endl; + std::cout << "Total time spent on encoding: " << total_time << std::endl; std::cout << "Avg time/image: " << total_time/total_images << std::endl; return 0; @@ -411,7 +419,7 @@ int main(int argc, const char *argv[]) (pidx = findParamIndex(argv, argc, "--help")) != -1) { std::cout << "Usage: " << argv[0] << " -i images_dir [-o output_dir] [-device=device_id]" - "[-q quality][-s 420/444] [-fmt output_format]\n"; + "[-q quality][-s 420/444] [-fmt output_format] [-huf 0]\n"; std::cout << "Parameters: " << std::endl; std::cout << "\timages_dir\t:\tPath to single image or directory of images" << std::endl; std::cout << "\toutput_dir\t:\tWrite encoded images as jpeg to this directory" << std::endl; @@ -421,6 +429,7 @@ int main(int argc, const char *argv[]) std::cout << "\toutput_format\t:\tnvJPEG output format for encoding.
One " "of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]" << std::endl; + std::cout << "\tHuffman Optimization\t:\tUse Huffman optimization [default 0]" << std::endl; return EXIT_SUCCESS; } @@ -465,6 +474,11 @@ int main(int argc, const char *argv[]) params.format = "yuv"; } + params.huf = 0; + if ((pidx = findParamIndex(argv, argc, "-huf")) != -1) { + params.huf = std::atoi(argv[pidx + 1]); + } + cudaDeviceProp props; checkCudaErrors(cudaGetDeviceProperties(&props, params.dev)); @@ -481,7 +495,7 @@ int main(int argc, const char *argv[]) // sample input parameters checkCudaErrors(nvjpegEncoderParamsSetQuality(encode_params, params.quality, NULL)); - checkCudaErrors(nvjpegEncoderParamsSetOptimizedHuffman(encode_params, 1, NULL)); + checkCudaErrors(nvjpegEncoderParamsSetOptimizedHuffman(encode_params, params.huf, NULL)); pidx = processArgs(params); diff --git a/Samples/p2pBandwidthLatencyTest/Makefile b/Samples/p2pBandwidthLatencyTest/Makefile index 64941813..0f830b77 100644 --- a/Samples/p2pBandwidthLatencyTest/Makefile +++ b/Samples/p2pBandwidthLatencyTest/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml b/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml index 7ce8df08..65b1a254 100644 --- a/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml +++ b/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml @@ -40,7 +40,6 @@ 1:CUDA Basic Topics 1:Performance Strategies - sm30 sm35 sm37 sm50 @@ -50,6 +49,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/p2pBandwidthLatencyTest/README.md b/Samples/p2pBandwidthLatencyTest/README.md index 1bb08468..130d0879 100644 --- a/Samples/p2pBandwidthLatencyTest/README.md +++ b/Samples/p2pBandwidthLatencyTest/README.md @@ -10,11 +10,11 @@ Performance Strategies, Asynchronous Data Transfers, Unified Virtual Address Spa ## Supported SM Architectures 
-[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +66,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index 51f7cc09..4eb75bee 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -163,8 +163,12 @@ void outputBandwidthMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) { cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); cudaMalloc(&buffers[d], numElems * sizeof(int)); cudaCheckError(); + cudaMemset(buffers[d], 0, numElems * sizeof(int)); + cudaCheckError(); cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); cudaCheckError(); + cudaMemset(buffersD2D[d], 0, numElems * sizeof(int)); + cudaCheckError(); cudaEventCreate(&start[d]); cudaCheckError(); cudaEventCreate(&stop[d]); @@ -300,7 +304,9 @@ void outputBidirectionalBandwidthMatrix(int numGPUs, bool p2p) { for (int d = 0; d < numGPUs; d++) { cudaSetDevice(d); cudaMalloc(&buffers[d], numElems * sizeof(int)); + cudaMemset(buffers[d], 0, numElems * sizeof(int)); cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); + cudaMemset(buffersD2D[d], 0, numElems * sizeof(int)); cudaCheckError(); cudaEventCreate(&start[d]); cudaCheckError(); @@ -463,7 +469,9 @@ void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) { cudaSetDevice(d); cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); cudaMalloc(&buffers[d], sizeof(int) * numElems); + cudaMemset(buffers[d], 0, sizeof(int) * numElems); cudaMalloc(&buffersD2D[d], sizeof(int) * numElems); + cudaMemset(buffersD2D[d], 0, sizeof(int) * numElems); cudaCheckError(); cudaEventCreate(&start[d]); cudaCheckError(); diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj index ddbccb80..5f06f965 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/p2pBandwidthLatencyTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj index 2a1c23d8..90a4785b 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/p2pBandwidthLatencyTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj index 249330ee..2b7a26cb 100644 --- 
a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/p2pBandwidthLatencyTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj index bb6125b7..6408e78d 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/p2pBandwidthLatencyTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj index 93559f84..b4b26eff 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/p2pBandwidthLatencyTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/reduction/Makefile b/Samples/reduction/Makefile index 17aa7ef6..ee4a3b23 100644 --- a/Samples/reduction/Makefile +++ b/Samples/reduction/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
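The p2pBandwidthLatencyTest change above follows every cudaMalloc with a cudaMemset, so each buffer is touched and initialized before the timed copy loops run. A minimal sketch of that allocate-then-clear pattern; the CUDA_CHECK macro is an illustrative stand-in for the sample's cudaCheckError():

```
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                      \
  do {                                                        \
    cudaError_t e = (call);                                   \
    if (e != cudaSuccess) {                                   \
      fprintf(stderr, "CUDA error %s at %s:%d\n",             \
              cudaGetErrorString(e), __FILE__, __LINE__);     \
      exit(EXIT_FAILURE);                                     \
    }                                                         \
  } while (0)

int main() {
  const size_t numElems = 10000000;
  int *buffer = NULL;
  CUDA_CHECK(cudaMalloc(&buffer, numElems * sizeof(int)));
  // Touch the allocation up front so page mapping does not land
  // inside the timed peer-to-peer copies.
  CUDA_CHECK(cudaMemset(buffer, 0, numElems * sizeof(int)));
  // ... timed cudaMemcpyPeerAsync loops would go here ...
  CUDA_CHECK(cudaFree(buffer));
  return 0;
}
```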
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -234,6 +255,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - reduction is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -245,11 +272,38 @@ LIBRARIES := ################################################################################ +#Detect if installed 
version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 470) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 4.7.0 <<<) + SAMPLE_ENABLED := 0 + endif +endif + # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) @@ -268,6 +322,8 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) endif endif +ALL_CCFLAGS += --std=c++11 + ifeq ($(SAMPLE_ENABLED),0) EXEC ?= @echo "[@]" endif diff --git a/Samples/reduction/NsightEclipse.xml b/Samples/reduction/NsightEclipse.xml index 48c36bb7..c0696c61 100644 --- a/Samples/reduction/NsightEclipse.xml +++ b/Samples/reduction/NsightEclipse.xml @@ -2,7 +2,10 @@ reduction - + + --std=c++11 + + whole ./ @@ -17,6 +20,7 @@ CUDA GPGPU Parallel Reduction + CPP11 @@ -24,12 +28,23 @@ true reduction.cpp + + -kernel 0 + -kernel 1 + -kernel 2 + -kernel 3 + -kernel 4 + -kernel 5 + -kernel 6 + + + CPP11 + 1:CUDA Advanced Topics 1:Data-Parallel Algorithms 1:Performance Strategies - sm30 sm35 sm37 sm50 @@ -39,6 +54,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/reduction/README.md b/Samples/reduction/README.md index 6e465b94..b2a9b5fd 100644 --- a/Samples/reduction/README.md +++ b/Samples/reduction/README.md @@ -2,7 +2,7 @@ ## Description -A parallel sum reduction that computes the sum of a large arrays of values. This sample demonstrates several important optimization strategies for Data-Parallel Algorithms like reduction. +A parallel sum reduction that computes the sum of a large array of values. This sample demonstrates several important optimization strategies for Data-Parallel Algorithms like reduction, using shared memory, __shfl_down_sync, __reduce_add_sync and cooperative_groups reduce.
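As a rough illustration of the warp-level primitives this description refers to, here is a minimal standalone kernel in the spirit of the warpReduceSum helper added to reduction_kernel.cu below; kernel and variable names are illustrative, not the sample's own:

```
#include <cuda_runtime.h>

// Each warp sums its 32 lanes; lane 0 of the warp ends up with the total.
__device__ __forceinline__ int warpSum(unsigned int mask, int v) {
#if __CUDA_ARCH__ >= 800
  // Ampere (SM 8.0) adds a hardware warp-wide integer add reduction.
  return __reduce_add_sync(mask, v);
#else
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v += __shfl_down_sync(mask, v, offset);
  return v;
#endif
}

__global__ void sumKernel(const int *in, int *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int v = (i < n) ? in[i] : 0;   // out-of-range lanes contribute zero
  v = warpSum(0xffffffffu, v);   // full-warp mask; blockDim.x % 32 == 0 assumed
  if ((threadIdx.x & 31) == 0) atomicAdd(out, v);
}
```

The accumulator pointed to by out is assumed to be zero-initialized (for example with cudaMemset) before launch.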
## Key Concepts @@ -10,11 +10,11 @@ Data-Parallel Algorithms, Performance Strategies ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -22,9 +22,13 @@ x86_64, ppc64le, armv7l ## CUDA APIs involved +## Dependencies needed to build/run +[CPP11](../../README.md#cpp11) + ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -63,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/reduction/reduction.cpp b/Samples/reduction/reduction.cpp index 972ae361..e6e5eb82 100644 --- a/Samples/reduction/reduction.cpp +++ b/Samples/reduction/reduction.cpp @@ -229,7 +229,7 @@ void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, threads *= 2; } - if (whichKernel == 6) { + if (whichKernel >= 6) { blocks = MIN(maxBlocks, blocks); } } @@ -361,7 +361,7 @@ void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, cudaMemcpyHostToDevice)); // warm-up - for (int kernel = 0; kernel < 7; kernel++) { + for (int kernel = 0; kernel < 8; kernel++) { reduce(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); } @@ -380,7 +380,7 @@ void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, printf(", %d", i); } - for (int kernel = 0; kernel < 7; kernel++) { + for (int kernel = 0; kernel < 8; kernel++) { printf("\n%d", kernel); for (int i = minN; i <= maxN; i *= 2) { @@ -421,7 +421,7 @@ template <class T> bool runTest(int argc, char **argv, ReduceType datatype) { int size = 1 << 24; // number of elements to reduce int maxThreads = 256; // number of threads per block - int whichKernel = 6; + int whichKernel = 7; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; diff --git a/Samples/reduction/reduction_kernel.cu b/Samples/reduction/reduction_kernel.cu index 0aaee929..67c536a2 100644 --- a/Samples/reduction/reduction_kernel.cu +++ b/Samples/reduction/reduction_kernel.cu @@ -33,6 +33,7 @@ #define _REDUCE_KERNEL_H_ #include <cooperative_groups.h> +#include <cooperative_groups/reduce.h> #include <stdio.h> namespace cg = cooperative_groups; @@ -67,6 +68,25 @@ struct SharedMemory { } }; +template <typename T> +__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) { + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(mask, mySum, offset); + } + return mySum; +} + +#if __CUDA_ARCH__ >= 800 +// Specialize warpReduceFunc for int inputs to use __reduce_add_sync intrinsic +// when on SM 8.0 or higher +template <> +__device__ __forceinline__ int warpReduceSum<int>(unsigned int mask, + int mySum) { + mySum = __reduce_add_sync(mask, mySum); + return mySum; +} +#endif + /* Parallel sum reduction using shared memory - takes log(n) steps for n input elements @@ -341,22 +361,32 @@ __global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) { // perform first level of reduction, // reading from global memory, writing to shared memory unsigned int tid = threadIdx.x; - unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; - unsigned int gridSize = blockSize * 2 * gridDim.x; + unsigned int gridSize = blockSize * gridDim.x; T mySum = 0; // we reduce multiple elements per thread. The number is determined by the // number of active thread blocks (via gridDim).
More blocks will result // in a larger gridSize and therefore fewer elements per thread - while (i < n) { - mySum += g_idata[i]; + if (nIsPow2) { + unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; + gridSize = gridSize << 1; - // ensure we don't read out of bounds -- this is optimized away for powerOf2 - // sized arrays - if (nIsPow2 || i + blockSize < n) mySum += g_idata[i + blockSize]; - - i += gridSize; + while (i < n) { + mySum += g_idata[i]; + // ensure we don't read out of bounds -- this is optimized away for + // powerOf2 sized arrays + if ((i + blockSize) < n) { + mySum += g_idata[i + blockSize]; + } + i += gridSize; + } + } else { + unsigned int i = blockIdx.x * blockSize + threadIdx.x; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } } // each thread puts its local sum into shared memory @@ -397,6 +427,129 @@ __global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) { if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum; } +template <typename T, unsigned int blockSize, bool nIsPow2> +__global__ void reduce7(const T *__restrict__ g_idata, T *__restrict__ g_odata, + unsigned int n) { + T *sdata = SharedMemory<T>(); + + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int gridSize = blockSize * gridDim.x; + unsigned int maskLength = (blockSize & 31); // 31 = warpSize-1 + maskLength = (maskLength > 0) ? (32 - maskLength) : maskLength; + const unsigned int mask = (0xffffffff) >> maskLength; + + T mySum = 0; + + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). More blocks will result + // in a larger gridSize and therefore fewer elements per thread + if (nIsPow2) { + unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; + gridSize = gridSize << 1; + + while (i < n) { + mySum += g_idata[i]; + // ensure we don't read out of bounds -- this is optimized away for + // powerOf2 sized arrays + if ((i + blockSize) < n) { + mySum += g_idata[i + blockSize]; + } + i += gridSize; + } + } else { + unsigned int i = blockIdx.x * blockSize + threadIdx.x; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } + } + + // Reduce within warp using shuffle or reduce_add if T==int & CUDA_ARCH == + // SM 8.0 + mySum = warpReduceSum<T>(mask, mySum); + + // each thread puts its local sum into shared memory + if ((tid % warpSize) == 0) { + sdata[tid / warpSize] = mySum; + } + + __syncthreads(); + + const unsigned int shmem_extent = + (blockSize / warpSize) > 0 ?
(blockSize / warpSize) : 1; + const unsigned int ballot_result = __ballot_sync(mask, tid < shmem_extent); + if (tid < shmem_extent) { + mySum = sdata[tid]; + // Reduce final warp using shuffle or reduce_add if T==int & CUDA_ARCH == + // SM 8.0 + mySum = warpReduceSum<T>(ballot_result, mySum); + } + + // write result for this block to global mem + if (tid == 0) { + g_odata[blockIdx.x] = mySum; + } +} + +// Performs a reduction step and updates numTotal with how many are remaining +template <typename T, typename Group> +__device__ T cg_reduce_n(T in, Group &threads) { + return cg::reduce(threads, in, cg::plus<T>()); +} + +template <typename T> +__global__ void cg_reduce(T *g_idata, T *g_odata, unsigned int n) { + // Shared memory for intermediate steps + T *sdata = SharedMemory<T>(); + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Handle to tile in thread block + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta); + + unsigned int ctaSize = cta.size(); + unsigned int numCtas = gridDim.x; + unsigned int threadRank = cta.thread_rank(); + unsigned int threadIndex = (blockIdx.x * ctaSize) + threadRank; + + T threadVal = 0; + { + unsigned int i = threadIndex; + unsigned int indexStride = (numCtas * ctaSize); + while (i < n) { + threadVal += g_idata[i]; + i += indexStride; + } + sdata[threadRank] = threadVal; + } + + // Wait for all tiles to finish and reduce within CTA + { + unsigned int ctaSteps = tile.meta_group_size(); + unsigned int ctaIndex = ctaSize >> 1; + while (ctaIndex >= 32) { + cta.sync(); + if (threadRank < ctaIndex) { + threadVal += sdata[threadRank + ctaIndex]; + sdata[threadRank] = threadVal; + } + ctaSteps >>= 1; + ctaIndex >>= 1; + } + } + + // Shuffle redux instead of smem redux + { + cta.sync(); + if (tile.meta_group_rank() == 0) { + threadVal = cg_reduce_n(threadVal, tile); + } + } + + if (threadRank == 0) g_odata[blockIdx.x] = threadVal; +} + extern "C" bool isPow2(unsigned int x); //////////////////////////////////////////////////////////////////////////////// @@ -542,7 +695,6 @@ void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, break; case 6: - default: if (isPow2(size)) { switch (threads) { case 512: @@ -650,6 +802,122 @@ void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, } break; + + case 7: + // For reduce7 kernel we require only blockSize/warpSize + // number of elements in shared memory + smemSize = ((threads / 32) + 1) * sizeof(T); + if (isPow2(size)) { + switch (threads) { + case 512: + reduce7<T, 512, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 256: + reduce7<T, 256, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 128: + reduce7<T, 128, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 64: + reduce7<T, 64, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 32: + reduce7<T, 32, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 16: + reduce7<T, 16, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 8: + reduce7<T, 8, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 4: + reduce7<T, 4, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 2: + reduce7<T, 2, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 1: + reduce7<T, 1, true> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + } + } else { + switch (threads) { + case 512: + reduce7<T, 512, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 256: + reduce7<T, 256, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 128: + reduce7<T, 128, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 64: + reduce7<T, 64, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 32: + reduce7<T, 32, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 16: + reduce7<T, 16, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; +
case 8: + reduce7<T, 8, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 4: + reduce7<T, 4, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 2: + reduce7<T, 2, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + + case 1: + reduce7<T, 1, false> + <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + } + } + + break; + case 8: + default: + cg_reduce<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + } } diff --git a/Samples/reduction/reduction_vs2015.vcxproj b/Samples/reduction/reduction_vs2015.vcxproj index 39152dad..3b8769ef 100644 --- a/Samples/reduction/reduction_vs2015.vcxproj +++ b/Samples/reduction/reduction_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/reduction.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/reduction/reduction_vs2017.vcxproj b/Samples/reduction/reduction_vs2017.vcxproj index 29be33b6..27437e62 100644 --- a/Samples/reduction/reduction_vs2017.vcxproj +++ b/Samples/reduction/reduction_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/reduction.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/reduction/reduction_vs2019.vcxproj b/Samples/reduction/reduction_vs2019.vcxproj index 92ea9778..f0d5b9b9 100644 --- a/Samples/reduction/reduction_vs2019.vcxproj +++ b/Samples/reduction/reduction_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/reduction.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/shfl_scan/Makefile b/Samples/shfl_scan/Makefile index 89834209..20052c72 100644 --- a/Samples/shfl_scan/Makefile +++ b/Samples/shfl_scan/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
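The cg_reduce kernel added above funnels each thread's partial sum into a warp-size tile and finishes with cg::reduce from the new cooperative_groups/reduce.h header. A minimal sketch of that building block in isolation (assumes CUDA 11; names are illustrative):

```
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace cg = cooperative_groups;

__global__ void tileSum(const int *in, int *out, unsigned int n) {
  cg::thread_block cta = cg::this_thread_block();
  // Carve the block into 32-thread tiles (one warp each).
  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);

  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  int v = (i < n) ? in[i] : 0;

  // cg::reduce returns the tile-wide sum to every thread of the tile.
  int tileTotal = cg::reduce(tile, v, cg::plus<int>());

  if (tile.thread_rank() == 0) atomicAdd(out, tileTotal);
}
```

On SM 8.0 the integer case can map to the same hardware reduction that the __reduce_add_sync intrinsic exposes; on older architectures it falls back to shuffles.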
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/shfl_scan/NsightEclipse.xml b/Samples/shfl_scan/NsightEclipse.xml index 4899f979..933951d8 100644 --- a/Samples/shfl_scan/NsightEclipse.xml +++ b/Samples/shfl_scan/NsightEclipse.xml @@ -34,16 +34,6 @@ 
1:Data-Parallel Algorithms 1:Performance Strategies - sm30 - sm35 - sm37 - sm50 - sm52 - sm60 - sm61 - sm70 - sm72 - sm75 x86_64 diff --git a/Samples/shfl_scan/README.md b/Samples/shfl_scan/README.md index af7cf283..68695af0 100644 --- a/Samples/shfl_scan/README.md +++ b/Samples/shfl_scan/README.md @@ -10,11 +10,9 @@ Data-Parallel Algorithms, Performance Strategies ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) - ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -24,7 +22,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -63,29 +61,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/shfl_scan/shfl_scan_vs2012.vcxproj b/Samples/shfl_scan/shfl_scan_vs2012.vcxproj index 3a42e827..7a87c68a 100644 --- a/Samples/shfl_scan/shfl_scan_vs2012.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/shfl_scan/shfl_scan_vs2013.vcxproj b/Samples/shfl_scan/shfl_scan_vs2013.vcxproj index f7f2265b..af4478d0 100644 --- a/Samples/shfl_scan/shfl_scan_vs2013.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/shfl_scan/shfl_scan_vs2015.vcxproj b/Samples/shfl_scan/shfl_scan_vs2015.vcxproj index 66164cf6..3349cfb2 100644 --- a/Samples/shfl_scan/shfl_scan_vs2015.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj index dbaea3f8..3220b9c2 100644 --- a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj index 9c448a53..401a32f5 100644 --- a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleAWBarrier/Makefile b/Samples/simpleAWBarrier/Makefile new file mode 100644 index 00000000..d11c54c6 --- /dev/null +++ b/Samples/simpleAWBarrier/Makefile @@ -0,0 +1,366 @@ 
+################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. 
Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) 
-dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - simpleAWBarrier is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleAWBarrier is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) 
+ GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 500) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 5.0.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 5.0.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 70 72 75 80 +else +SMS ?= 70 75 80 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleAWBarrier + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +simpleAWBarrier.o:simpleAWBarrier.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleAWBarrier: simpleAWBarrier.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleAWBarrier + +clean: + rm -f simpleAWBarrier simpleAWBarrier.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleAWBarrier + +clobber: clean diff --git a/Samples/simpleAWBarrier/NsightEclipse.xml b/Samples/simpleAWBarrier/NsightEclipse.xml new file mode 100644 index 00000000..5c73108c --- /dev/null +++ b/Samples/simpleAWBarrier/NsightEclipse.xml @@ -0,0 +1,75 @@ + + + + simpleAWBarrier + + --std=c++11 + + + cudaMalloc + cudaFree + cudaMemcpyAsync + + + whole + + ./ + ../ + ../../common/inc + + + Arrive Wait Barrier + + + CUDA + GPGPU + CPP11 + GCC 5.0.0 + + + + + + true + simpleAWBarrier.cu + + CPP11 + MBCG + + + 1:CUDA Basic Topics + + sm70 + sm72 + sm75 + sm80 + + + x86_64 + linux + + + windows7 + + + arm + + + ppc64le + linux + + + aarch64 + linux + + + aarch64 + qnx + + + + 7.0 + + Simple Arrive Wait Barrier + exe + diff --git a/Samples/simpleAWBarrier/README.md b/Samples/simpleAWBarrier/README.md new file mode 100644 index 00000000..ab36896b --- /dev/null +++ b/Samples/simpleAWBarrier/README.md @@ -0,0 +1,74 @@ +# simpleAWBarrier - Simple Arrive Wait Barrier + +## Description + +A simple demonstration of arrive wait barriers. 
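+For orientation, the following is a minimal illustrative sketch of the arrive-wait pattern, not part of the sample itself: it assumes the experimental `cuda_awbarrier.h` header shipped with CUDA 11 and mirrors the `init`/`arrive`/`wait` calls used in `simpleAWBarrier.cu` below; the kernel name and the placeholder work are hypothetical.
+
+```
+#include <cuda_awbarrier.h>
+namespace nve = nvcuda::experimental;
+
+__global__ void twoPhaseKernel(/* ... */) {
+  __shared__ nve::awbarrier barrier;
+
+  if (threadIdx.x == 0) {
+    // One thread initializes the barrier with the number of participating threads.
+    nve::init(&barrier, blockDim.x);
+  }
+  __syncthreads();
+
+  // ... phase 1: produce per-thread results ...
+
+  // arrive() signals that this thread is done with phase 1 and returns a token.
+  const auto token = barrier.arrive();
+
+  // ... independent work that does not depend on phase 1 results ...
+
+  barrier.wait(token);  // blocks until all blockDim.x threads have arrived
+
+  // ... phase 2: safely consume phase 1 results ...
+}
+```
+
+Unlike a plain `__syncthreads()`, splitting `arrive()` from `wait()` lets each thread overlap independent work between the two calls.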
+
+## Key Concepts
+
+Arrive Wait Barrier
+
+## Supported SM Architectures
+
+[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l, aarch64
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaMalloc, cudaFree, cudaMemcpyAsync
+
+## Dependencies needed to build/run
+[CPP11](../../README.md#cpp11), [MBCG](../../README.md#mbcg)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples' makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/simpleAWBarrier/simpleAWBarrier.cu b/Samples/simpleAWBarrier/simpleAWBarrier.cu
new file mode 100644
index 00000000..2ec0a6f7
--- /dev/null
+++ b/Samples/simpleAWBarrier/simpleAWBarrier.cu
@@ -0,0 +1,263 @@
+/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+// Includes, system
+#include <stdio.h>
+
+// Includes CUDA
+#include <cuda_runtime.h>
+#include <cuda_awbarrier.h>
+#include <cooperative_groups.h>
+
+// Utilities and timing functions
+#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
+
+// CUDA helper functions
+#include <helper_cuda.h>  // helper functions for CUDA error check
+
+namespace nvcuda_namespace = nvcuda::experimental;
+namespace cg = cooperative_groups;
+
+
+#if __CUDA_ARCH__ >= 700
+template <bool writeSquareRoot> __device__ void reduceBlockData(
+    nvcuda_namespace::awbarrier &barrier,
+    cg::thread_block_tile<32> &tile32, double &threadSum, double *result)
+{
+  extern __shared__ double tmp[];
+
+  #pragma unroll
+  for (int offset = tile32.size()/2; offset > 0; offset /= 2)
+  {
+    threadSum += tile32.shfl_down(threadSum, offset);
+  }
+  if (tile32.thread_rank() == 0)
+  {
+    tmp[tile32.meta_group_rank()] = threadSum;
+  }
+
+  const auto token = barrier.arrive();
+
+  // The warp that arrives last at the barrier performs the
+  // final round of reduction.
+  if (tile32.any(token.pending_count() == 1)) {
+
+    double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
+
+    #pragma unroll
+    for (int offset = tile32.size()/2; offset > 0; offset /= 2)
+    {
+      beta += tile32.shfl_down(beta, offset);
+    }
+
+    if (tile32.thread_rank() == 0)
+    {
+      if (writeSquareRoot)
+        *result = sqrt(beta);
+      else
+        *result = beta;
+    }
+  }
+
+  barrier.wait(token);
+}
+#endif
+
+__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
+{
+#if __CUDA_ARCH__ >= 700
+  cg::thread_block cta = cg::this_thread_block();
+  cg::grid_group grid = cg::this_grid();
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
+
+  __shared__ nvcuda_namespace::awbarrier barrier;
+
+  if (threadIdx.x == 0) {
+    nvcuda_namespace::init(&barrier, blockDim.x);
+  }
+
+  cg::sync(cta);
+
+  double threadSum = 0.0;
+  for (int i = grid.thread_rank(); i < size; i += grid.size())
+  {
+    threadSum += (double) (vecA[i] * vecB[i]);
+  }
+
+  // Each thread block performs a reduction of its partial dot products and
+  // writes the result to global memory.
+  reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);
+
+  cg::sync(grid);
+
+  // One block performs the final summation of the partial dot products
+  // of all the thread blocks and writes the sqrt of the final dot product.
+  if (blockIdx.x == 0)
+  {
+    threadSum = 0.0;
+    for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size())
+    {
+      threadSum += partialResults[i];
+    }
+    reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
+  }
+
+  cg::sync(grid);
+
+  const double finalValue = partialResults[0];
+
+  // Perform normalization of vecA & vecB.
+  for (int i = grid.thread_rank(); i < size; i += grid.size())
+  {
+    vecA[i] = (float)vecA[i] / finalValue;
+    vecB[i] = (float)vecB[i] / finalValue;
+  }
+#endif
+}
+
+
+int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv)
+{
+  printf("%s starting...\n", argv[0]);
+
+  // This will pick the best possible CUDA capable device
+  int dev = findCudaDevice(argc, (const char **)argv);
+
+  int major = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+
+  // Arrive-Wait Barriers require a GPU of Volta (SM 7.x) architecture or higher.
+  if (major < 7) {
+    printf("simpleAWBarrier requires SM 7.0 or higher.
Exiting...\n"); + exit(EXIT_WAIVED); + } + + int supportsCooperativeLaunch = 0; + checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev)); + + if (!supportsCooperativeLaunch) + { + printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run\n", dev); + exit(EXIT_WAIVED); + } + + int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev); + + printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); +} + +int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) +{ + float *vecA, *d_vecA; + float *vecB, *d_vecB; + double *d_partialResults; + int size = 10000000; + + vecA = new float[size]; + vecB = new float[size]; + + checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float)*size)); + checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float)*size)); + + float baseVal = 2.0; + for (int i = 0; i < size; i++) + { + vecA[i] = vecB[i] = baseVal; + } + + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float)*size, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float)*size, cudaMemcpyHostToDevice, stream)); + + // Kernel configuration, where a one-dimensional + // grid and one-dimensional blocks are configured. + int minGridSize = 0, blockSize = 0; + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( + &minGridSize, + &blockSize, + (void*)normVecByDotProductAWBarrier, + 0, + size)); + + int smemSize = ((blockSize/32)+1) * sizeof(double); + + int numBlocksPerSm = 0; + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); + + int multiProcessorCount = 0; + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); + + minGridSize = multiProcessorCount * numBlocksPerSm; + checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize*sizeof(double))); + + printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d blockSize = %d\n", minGridSize, blockSize); + + dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); + + void *kernelArgs[] = { + (void*)&d_vecA, + (void*)&d_vecB, + (void*)&d_partialResults, + (void*)&size + }; + + checkCudaErrors(cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream)); + + checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float)*size, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + float expectedResult = (baseVal / sqrt(size*baseVal*baseVal)); + unsigned int matches = 0; + for (int i=0; i < size; i++) + { + if ((vecA[i] - expectedResult) > 0.00001) + { + printf("mismatch at i = %d\n", i); + break; + } + else + { + matches++; + } + } + + checkCudaErrors(cudaFree(d_vecA)); + checkCudaErrors(cudaFree(d_vecB)); + checkCudaErrors(cudaFree(d_partialResults)); + + delete[] vecA; + delete[] vecB; + return matches == size; +} diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2015.sln b/Samples/simpleAWBarrier/simpleAWBarrier_vs2015.sln new file mode 100644 index 00000000..304ea248 --- /dev/null +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAWBarrier", 
"simpleAWBarrier_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2015.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2015.vcxproj new file mode 100644 index 00000000..0cfb81c0 --- /dev/null +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleAWBarrier_vs2015 + simpleAWBarrier + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleAWBarrier.exe + + + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.sln b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.sln new file mode 100644 index 00000000..e048a242 --- /dev/null +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAWBarrier", "simpleAWBarrier_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj new file mode 100644 index 00000000..902ed57f --- /dev/null +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + 
simpleAWBarrier_vs2017 + simpleAWBarrier + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleAWBarrier.exe + + + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.sln b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.sln new file mode 100644 index 00000000..84ab9783 --- /dev/null +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAWBarrier", "simpleAWBarrier_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj new file mode 100644 index 00000000..0907f8fb --- /dev/null +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleAWBarrier_vs2019 + simpleAWBarrier + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleAWBarrier.exe + + + compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git 
a/Samples/simpleAttributes/Makefile b/Samples/simpleAttributes/Makefile new file mode 100644 index 00000000..42678edd --- /dev/null +++ b/Samples/simpleAttributes/Makefile @@ -0,0 +1,331 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L 
$(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - simpleAttributes is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 +else +SMS ?= 35 37 50 52 60 61 70 75 80 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleAttributes + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +simpleAttributes.o:simpleAttributes.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleAttributes: simpleAttributes.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleAttributes + +clean: + rm -f simpleAttributes simpleAttributes.o + rm -rf 
../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleAttributes
+
+clobber: clean
diff --git a/Samples/simpleAttributes/NsightEclipse.xml b/Samples/simpleAttributes/NsightEclipse.xml
new file mode 100644
index 00000000..29538676
--- /dev/null
+++ b/Samples/simpleAttributes/NsightEclipse.xml
@@ -0,0 +1,68 @@
+
+
+
+  simpleAttributes
+
+  cudaCtxResetPersistingL2Cache
+  cudaDeviceSetLimit
+  cudaFree
+  cudaGetDeviceProperties
+  cudaMalloc
+  cudaMemcpy
+  cudaStreamCreate
+  cudaStreamSetAttribute
+
+
+  whole
+
+  ./
+  ../
+  ../../common/inc
+
+
+  Attributes usage on stream
+
+
+  GPGPU
+
+
+
+
+
+  true
+  simpleAttributes.cu
+
+  1:CUDA Basic Topics
+
+  sm35
+  sm37
+  sm50
+  sm52
+  sm60
+  sm61
+  sm70
+  sm72
+  sm75
+  sm80
+
+
+  x86_64
+  linux
+
+
+  windows7
+
+
+  arm
+
+
+  ppc64le
+  linux
+
+
+
+  all
+
+  simpleAttributes
+  exe
+
diff --git a/Samples/simpleAttributes/README.md b/Samples/simpleAttributes/README.md
new file mode 100644
index 00000000..6d034226
--- /dev/null
+++ b/Samples/simpleAttributes/README.md
@@ -0,0 +1,70 @@
+# simpleAttributes - simpleAttributes
+
+## Description
+
+This CUDA Runtime API sample is a basic example that demonstrates how to use the stream attributes that affect L2 locality. The performance improvement from using an L2 access policy window is only observable on devices of compute capability 8.0 or higher.
+
+## Key Concepts
+
+Attributes usage on stream
+
+## Supported SM Architectures
+
+[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaCtxResetPersistingL2Cache, cudaDeviceSetLimit, cudaFree, cudaGetDeviceProperties, cudaMalloc, cudaMemcpy, cudaStreamCreate, cudaStreamSetAttribute
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples' makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/simpleAttributes/simpleAttributes.cu b/Samples/simpleAttributes/simpleAttributes.cu
new file mode 100644
index 00000000..46f00aae
--- /dev/null
+++ b/Samples/simpleAttributes/simpleAttributes.cu
@@ -0,0 +1,205 @@
+/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+// includes, project
+#include <helper_cuda.h>
+#include <helper_functions.h>  // helper functions for SDK examples
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest(int argc, char **argv);
+
+cudaAccessPolicyWindow initAccessPolicyWindow(void) {
+  cudaAccessPolicyWindow accessPolicyWindow = {0};
+  accessPolicyWindow.base_ptr = (void *)0;
+  accessPolicyWindow.num_bytes = 0;
+  accessPolicyWindow.hitRatio = 0.f;
+  accessPolicyWindow.hitProp = cudaAccessPropertyNormal;
+  accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
+  return accessPolicyWindow;
+}
+////////////////////////////////////////////////////////////////////////////////
+//! Simple test kernel for device functionality
+//! @param data input data in global memory
+//! @param dataSize input data size
+//! @param trash input bigData in global memory
+//! @param bigDataSize input bigData size
+//! @param hitCount how many data accesses are done within a block
+////////////////////////////////////////////////////////////////////////////////
+static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
+{
+  __shared__ unsigned int hit;
+  int row = blockIdx.y * blockDim.y + threadIdx.y;
+  int col = blockIdx.x * blockDim.x + threadIdx.x;
+  int tID = row * blockDim.y + col;
+  uint32_t psRand = tID;
+
+  atomicExch(&hit, 0);
+  __syncthreads();
+  while (hit < hitCount) {
+    psRand ^= psRand << 13;
+    psRand ^= psRand >> 17;
+    psRand ^= psRand << 5;
+
+    int idx = tID - psRand;
+    if (idx < 0) {
+      idx = -idx;
+    }
+
+    if ((tID % 2) == 0) {
+      data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
+    } else {
+      trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
+    }
+
+    atomicAdd(&hit, 1);
+  }
+}
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv)
+{
+  runTest(argc, argv);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test for CUDA
+////////////////////////////////////////////////////////////////////////////////
+void runTest(int argc, char **argv)
+{
+  bool bTestResult = true;
+  cudaAccessPolicyWindow accessPolicyWindow;
+  cudaDeviceProp deviceProp;
+  cudaStreamAttrValue streamAttrValue;
+  cudaStream_t stream;
+  cudaStreamAttrID streamAttrID;
+  dim3 threads(32, 32);
+  int *dataDevicePointer;
+  int *dataHostPointer;
+  int dataSize;
+  int *bigDataDevicePointer;
+  int *bigDataHostPointer;
+  int bigDataSize;
+  StopWatchInterface *timer = 0;
+
+  printf("%s Starting...\n\n", argv[0]);
+
+  // use command-line specified CUDA device, otherwise use device with highest Gflops/s
+  int devID = findCudaDevice(argc, (const char **)argv);
+  sdkCreateTimer(&timer);
+  sdkStartTimer(&timer);
+  // Get device properties
+  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+  dim3 blocks(deviceProp.maxGridSize[1], 1);
+
+  // Make sure the device supports the persisting L2 cache optimization
+  if (deviceProp.persistingL2CacheMaxSize == 0) {
+    printf("Waiving execution as device %d does not support persisting L2 Caching\n", devID);
+    exit(EXIT_WAIVED);
+  }
+
+  // Create stream to associate with window
+  checkCudaErrors(cudaStreamCreate(&stream));
+
+  // Set the amount of persisting L2 cache to the maximum the device supports
+  checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));
+
+  // Stream attribute to set
+  streamAttrID = cudaStreamAttributeAccessPolicyWindow;
+
+  // Default window
+  streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
+  accessPolicyWindow = initAccessPolicyWindow();
+
+  // Allocate size of both buffers
+  bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
+  dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int);
+
+  // Allocate data
+  dataHostPointer = (int *)malloc(dataSize * sizeof(int));
+  bigDataHostPointer = (int *)malloc(bigDataSize * sizeof(int));
+
+  for (int i = 0; i < bigDataSize; ++i) {
+    if (i < dataSize) {
+      dataHostPointer[i] = i;
+    }
+
+    bigDataHostPointer[bigDataSize - i - 1] = i;
+  }
+
+  checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
+  checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
+  checkCudaErrors(cudaMemcpyAsync(dataDevicePointer,
dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
+
+  // Make a window for the buffer of interest
+  accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
+  accessPolicyWindow.num_bytes = dataSize * sizeof(int);
+  accessPolicyWindow.hitRatio = 1.f;
+  accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
+  accessPolicyWindow.missProp = cudaAccessPropertyNormal;
+  streamAttrValue.accessPolicyWindow = accessPolicyWindow;
+
+  // Assign window to stream
+  checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
+
+  // Demote any previously persisting lines
+  checkCudaErrors(cudaCtxResetPersistingL2Cache());
+
+  checkCudaErrors(cudaStreamSynchronize(stream));
+  kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);
+
+  checkCudaErrors(cudaStreamSynchronize(stream));
+  // check if kernel execution generated an error
+  getLastCudaError("Kernel execution failed");
+
+  // Free memory
+  free(dataHostPointer);
+  free(bigDataHostPointer);
+  checkCudaErrors(cudaFree(dataDevicePointer));
+  checkCudaErrors(cudaFree(bigDataDevicePointer));
+
+  sdkStopTimer(&timer);
+  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
+  sdkDeleteTimer(&timer);
+
+  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
+}
diff --git a/Samples/simpleAttributes/simpleAttributes_vs2012.sln b/Samples/simpleAttributes/simpleAttributes_vs2012.sln
new file mode 100644
index 00000000..f00b1367
--- /dev/null
+++ b/Samples/simpleAttributes/simpleAttributes_vs2012.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAttributes", "simpleAttributes_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/Samples/simpleAttributes/simpleAttributes_vs2012.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2012.vcxproj
new file mode 100644
index 00000000..5361460b
--- /dev/null
+++ b/Samples/simpleAttributes/simpleAttributes_vs2012.vcxproj
@@ -0,0 +1,107 @@
+
+
+
+  $(VCTargetsPath)\BuildCustomizations
+
+
+
+  Debug
+  x64
+
+
+  Release
+  x64
+
+
+
+  {997E0757-EA74-4A4E-A0FC-47D8C8831A15}
+  simpleAttributes_vs2012
+  simpleAttributes
+
+
+
+
+  Application
+  MultiByte
+  v110
+
+
+  true
+
+
+  true
+
+
+
+
+
+
+
+
+
+
+  $(Platform)/$(Configuration)/
+  $(IncludePath)
+  AllRules.ruleset
+
+
+
+
+  ../../bin/win64/$(Configuration)/
+
+
+
+  Level3
+  WIN32;_MBCS;%(PreprocessorDefinitions)
+  ./;$(CudaToolkitDir)/include;../../Common;
+
+
+  Console
+  cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)
+  $(CudaToolkitLibDir);
+
$(OutDir)/simpleAttributes.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleAttributes/simpleAttributes_vs2013.sln b/Samples/simpleAttributes/simpleAttributes_vs2013.sln new file mode 100644 index 00000000..7e15975e --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAttributes", "simpleAttributes_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleAttributes/simpleAttributes_vs2013.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2013.vcxproj new file mode 100644 index 00000000..a3d725bb --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleAttributes_vs2013 + simpleAttributes + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleAttributes.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleAttributes/simpleAttributes_vs2015.sln b/Samples/simpleAttributes/simpleAttributes_vs2015.sln new file mode 100644 index 00000000..7e8273fe --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAttributes", "simpleAttributes_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) 
= preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleAttributes/simpleAttributes_vs2015.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2015.vcxproj new file mode 100644 index 00000000..61aa52d9 --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleAttributes_vs2015 + simpleAttributes + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleAttributes.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleAttributes/simpleAttributes_vs2017.sln b/Samples/simpleAttributes/simpleAttributes_vs2017.sln new file mode 100644 index 00000000..55bb578f --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAttributes", "simpleAttributes_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj new file mode 100644 index 00000000..7c28c99c --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleAttributes_vs2017 + 
simpleAttributes + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleAttributes.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleAttributes/simpleAttributes_vs2019.sln b/Samples/simpleAttributes/simpleAttributes_vs2019.sln new file mode 100644 index 00000000..b33dffb9 --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleAttributes", "simpleAttributes_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj new file mode 100644 index 00000000..fa055a57 --- /dev/null +++ b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleAttributes_vs2019 + simpleAttributes + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleAttributes.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + 
MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUBLAS/Makefile b/Samples/simpleCUBLAS/Makefile index 5509cdd3..d44a8ecc 100644 --- a/Samples/simpleCUBLAS/Makefile +++ b/Samples/simpleCUBLAS/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM 
- LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -251,8 +272,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility diff --git a/Samples/simpleCUBLAS/NsightEclipse.xml b/Samples/simpleCUBLAS/NsightEclipse.xml index 05c2a4fd..9e9022ae 100644 --- a/Samples/simpleCUBLAS/NsightEclipse.xml +++ b/Samples/simpleCUBLAS/NsightEclipse.xml @@ -33,7 +33,6 @@ 1:CUDA Basic Topics 3:Linear Algebra - sm30 sm35 sm37 sm50 @@ -43,6 +42,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/simpleCUBLAS/README.md b/Samples/simpleCUBLAS/README.md index 9ef77821..58cea27f 100644 --- a/Samples/simpleCUBLAS/README.md +++ b/Samples/simpleCUBLAS/README.md @@ -10,11 +10,11 @@ Image Processing, CUBLAS Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. 
To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj index f73118e6..f293d3cb 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUBLAS.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj index 6bae265a..9d6ad2b4 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUBLAS.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj index 241ed2a0..73f61d44 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUBLAS.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj index ed8a564e..46d7d9d1 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleCUBLAS.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj index 45419f71..2ac8a443 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleCUBLAS.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLASXT/Makefile b/Samples/simpleCUBLASXT/Makefile index 9a7091c6..355b1e3b 100644 --- a/Samples/simpleCUBLASXT/Makefile +++ b/Samples/simpleCUBLASXT/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 
32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -259,8 +280,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate 
PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility diff --git a/Samples/simpleCUBLASXT/NsightEclipse.xml b/Samples/simpleCUBLASXT/NsightEclipse.xml index 10f2775d..14138971 100644 --- a/Samples/simpleCUBLASXT/NsightEclipse.xml +++ b/Samples/simpleCUBLASXT/NsightEclipse.xml @@ -32,7 +32,6 @@ 1:CUDA Basic Topics 3:Linear Algebra - sm30 sm35 sm37 sm50 @@ -42,6 +41,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/simpleCUBLASXT/README.md b/Samples/simpleCUBLASXT/README.md index 1b0c76f3..fea4f1c9 100644 --- a/Samples/simpleCUBLASXT/README.md +++ b/Samples/simpleCUBLASXT/README.md @@ -10,11 +10,11 @@ CUBLAS-XT Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj index 5f765b61..906a3cc3 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUBLASXT.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj index bede111d..951f0510 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUBLASXT.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj index 89017735..dfd4c019 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUBLASXT.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj index ae0d9ae7..f2479b36 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleCUBLASXT.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj index 93560496..08f67b9c 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleCUBLASXT.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUFFT/Makefile b/Samples/simpleCUFFT/Makefile index 0b8295e6..d0178106 100644 --- a/Samples/simpleCUFFT/Makefile +++ b/Samples/simpleCUFFT/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/simpleCUFFT/NsightEclipse.xml b/Samples/simpleCUFFT/NsightEclipse.xml index 1d8ac3b1..c1a79133 100644 --- a/Samples/simpleCUFFT/NsightEclipse.xml +++ b/Samples/simpleCUFFT/NsightEclipse.xml @@ -31,7 
+31,6 @@ 1:CUDA Basic Topics 2:Image Processing - sm30 sm35 sm37 sm50 @@ -41,6 +40,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/simpleCUFFT/README.md b/Samples/simpleCUFFT/README.md index 602f7709..b96b46b8 100644 --- a/Samples/simpleCUFFT/README.md +++ b/Samples/simpleCUFFT/README.md @@ -10,11 +10,11 @@ Image Processing, CUFFT Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj index 5641d7ca..2efcc959 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj index 564d325b..0a8c8e65 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj index 561eb3d1..4b5094ad 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj index 7aa1bd05..9481704c 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj index eb017591..00d4fa86 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCudaGraphs/Makefile b/Samples/simpleCudaGraphs/Makefile index ae3242da..0b2dab99 100644 --- 
a/Samples/simpleCudaGraphs/Makefile +++ b/Samples/simpleCudaGraphs/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR 
= targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/simpleCudaGraphs/NsightEclipse.xml b/Samples/simpleCudaGraphs/NsightEclipse.xml index 9137ab80..e5b751d1 100644 --- a/Samples/simpleCudaGraphs/NsightEclipse.xml +++ b/Samples/simpleCudaGraphs/NsightEclipse.xml @@ -41,16 +41,6 @@ 1:CUDA - sm30 - sm35 - sm37 - sm50 - sm52 - sm60 - sm61 - sm70 - sm72 - sm75 x86_64 diff --git a/Samples/simpleCudaGraphs/README.md b/Samples/simpleCudaGraphs/README.md index dd4f4a41..792f4049 100644 --- a/Samples/simpleCudaGraphs/README.md +++ b/Samples/simpleCudaGraphs/README.md @@ -10,11 +10,9 @@ CUDA Graphs, Stream Capture ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) - ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaLaunchHostFunc, cudaGraphCreat ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +64,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj index 1ad4de3d..061a5f45 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj index 7638e92b..92727fce 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj index 41dcf104..d0faa0cb 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj index ea712c78..69d3e0d9 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj index 038d873c..773ee60c 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + 
compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D11/README.md b/Samples/simpleD3D11/README.md index 4a59faa2..c63fbcce 100644 --- a/Samples/simpleD3D11/README.md +++ b/Samples/simpleD3D11/README.md @@ -10,7 +10,7 @@ Graphics Interop, Image Processing ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaD3D11GetDevice, cudaImportExternalSemaphore, cudaImportExternalMemory, cudaE ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
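
Across these project files the CodeGeneration lists follow the same pattern as the Makefile SMS changes: SM 3.0 (Kepler GK10x) is dropped and compute_80,sm_80 is added for the Ampere architecture, with PTX still emitted for the highest architecture to preserve forward compatibility. As a rough sketch of what the updated default list expands to on the nvcc command line (the source and output names here are placeholders, not from this patch):

```
# Illustrative nvcc invocation matching the new default SMS list
# (SMS ?= 35 37 50 52 60 61 70 75 80); kernel.cu is a placeholder name.
nvcc -m64 \
  -gencode arch=compute_35,code=sm_35 \
  -gencode arch=compute_37,code=sm_37 \
  -gencode arch=compute_50,code=sm_50 \
  -gencode arch=compute_52,code=sm_52 \
  -gencode arch=compute_60,code=sm_60 \
  -gencode arch=compute_61,code=sm_61 \
  -gencode arch=compute_70,code=sm_70 \
  -gencode arch=compute_75,code=sm_75 \
  -gencode arch=compute_80,code=sm_80 \
  -gencode arch=compute_80,code=compute_80 \
  -o kernel kernel.cu
```
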
## Build and Run diff --git a/Samples/simpleD3D11/simpleD3D11_vs2012.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2012.vcxproj index 6af8a160..c26e752d 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2012.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleD3D11.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D11/simpleD3D11_vs2013.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2013.vcxproj index 22d8e535..e5c8b879 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2013.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleD3D11.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D11/simpleD3D11_vs2015.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2015.vcxproj index 1aa92634..82059ed6 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2015.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleD3D11.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj index 68411364..69240309 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleD3D11.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj index 74fd4584..f06414f4 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleD3D11.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D12/NsightEclipse.xml b/Samples/simpleD3D12/NsightEclipse.xml index 902723ba..32fcba5e 100644 --- a/Samples/simpleD3D12/NsightEclipse.xml +++ 
b/Samples/simpleD3D12/NsightEclipse.xml @@ -40,7 +40,6 @@ 1:CUDA Basic Topics 2:Graphics Interop - sm30 sm35 sm37 sm50 @@ -50,6 +49,7 @@ sm70 sm72 sm75 + sm80 windows10 diff --git a/Samples/simpleD3D12/README.md b/Samples/simpleD3D12/README.md index f3a8c9a5..b268341a 100644 --- a/Samples/simpleD3D12/README.md +++ b/Samples/simpleD3D12/README.md @@ -10,7 +10,7 @@ Graphics Interop, CUDA DX12 Interop, Image Processing ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaWaitExternalSemaphoresAsync, cudaSignalExternalSemaphoresAsync, cudaImportEx ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
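
The Makefile changes that recur throughout this patch make sbsa a first-class target: it is auto-detected on aarch64 hosts whose toolkit has a targets/sbsa-linux directory, and x86_64-sbsa is added to the permitted cross-compilation pairs, with aarch64-linux-gnu-g++ as the default host compiler. A hedged example of how such a cross build might be invoked, given these Makefiles (the rootfs path is a placeholder, and HOST_COMPILER is shown only to make the default explicit):

```
# Cross-compiling a sample from an x86_64 host for an SBSA (server aarch64)
# target; TARGET_FS optionally points the sysroot at the target filesystem.
make TARGET_ARCH=sbsa SMS="80" HOST_COMPILER=aarch64-linux-gnu-g++ \
     TARGET_FS=/path/to/target/rootfs
```
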
## Build and Run diff --git a/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj index 6901137d..4d102fa3 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleD3D12.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj index dae2d8e3..0a02c129 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleD3D12.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj index d6c6826d..3487b4ac 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj @@ -68,7 +68,7 @@ $(OutDir)/simpleD3D12.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleDrvRuntime/Makefile b/Samples/simpleDrvRuntime/Makefile index ea67af23..2491545b 100644 --- a/Samples/simpleDrvRuntime/Makefile +++ b/Samples/simpleDrvRuntime/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -251,9 +272,9 @@ FATBIN_FILE := vectorAdd_kernel${TARGET_SIZE}.fatbin # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) @@ -288,6 +309,10 @@ else CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif @@ -302,12 +327,19 @@ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif + ifeq 
($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) diff --git a/Samples/simpleDrvRuntime/README.md b/Samples/simpleDrvRuntime/README.md index 2d965ddd..45233aed 100644 --- a/Samples/simpleDrvRuntime/README.md +++ b/Samples/simpleDrvRuntime/README.md @@ -10,11 +10,11 @@ CUDA Driver API, CUDA Runtime API, Vector Addition ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -28,7 +28,7 @@ cudaMemcpy, cudaMalloc, cudaStreamCreateWithFlags ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -67,29 +67,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2012.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2012.vcxproj index 585fd8b5..0e512f7c 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2012.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleDrvRuntime.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2013.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2013.vcxproj index f7091548..50c11387 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2013.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleDrvRuntime.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2015.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2015.vcxproj index 2c2074b2..2bd207e7 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2015.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleDrvRuntime.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj index e477c1ed..014e4e71 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleDrvRuntime.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj index e0d554ab..5f71989d 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleDrvRuntime.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + 
compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleIPC/Makefile b/Samples/simpleIPC/Makefile index 5a4a906b..90df2682 100644 --- a/Samples/simpleIPC/Makefile +++ b/Samples/simpleIPC/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq 
($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -252,6 +273,12 @@ ifeq ($(TARGET_ARCH),aarch64) SAMPLE_ENABLED := 0 endif +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - simpleIPC is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -265,9 +292,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/simpleIPC/NsightEclipse.xml b/Samples/simpleIPC/NsightEclipse.xml index 8ac258f2..3fd5b6bd 100644 --- a/Samples/simpleIPC/NsightEclipse.xml +++ b/Samples/simpleIPC/NsightEclipse.xml @@ -39,7 +39,6 @@ 1:CUDA Basic Topics 1:CUDA Systems Integration - sm30 sm35 sm37 sm50 @@ -49,6 +48,7 @@ sm70 sm72 sm75 + sm80 ../../Common/helper_multiprocess.cpp ../../Common/helper_multiprocess.h diff --git a/Samples/simpleIPC/README.md b/Samples/simpleIPC/README.md index 7668288d..f9b9fef9 100644 --- a/Samples/simpleIPC/README.md +++ b/Samples/simpleIPC/README.md @@ -10,7 +10,7 @@ CUDA Systems Integration, Peer to Peer, InterProcess Communication ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaIpcGetEventHandle, cudaIpcOpenMemHandle, cudaIpcCloseMemHandle, cudaMemcpyAs ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
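For context on what simpleIPC exercises (and why the Makefile hunk above now waives it on `sbsa`): one process exports a device allocation as a small POD handle, and a second process maps the same memory. A minimal sketch, with hypothetical function names; the handle transport between processes (done by `helper_multiprocess` in the sample) is omitted:

```
#include <cuda_runtime.h>

// Process A: allocate device memory and export an IPC handle for it.
void exportBuffer(cudaIpcMemHandle_t *handleOut, void **dptrOut, size_t bytes) {
    cudaMalloc(dptrOut, bytes);
    cudaIpcGetMemHandle(handleOut, *dptrOut);  // handle is POD; send it over any IPC channel
}

// Process B: map the exported allocation, use it, then unmap.
void importBuffer(cudaIpcMemHandle_t handle, size_t bytes) {
    void *dptr = nullptr;
    cudaIpcOpenMemHandle(&dptr, handle, cudaIpcMemLazyEnablePeerAccess);
    cudaMemset(dptr, 0, bytes);   // writes are visible to process A
    cudaIpcCloseMemHandle(dptr);  // unmap only; process A still owns the allocation
}
```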
## Build and Run diff --git a/Samples/simpleIPC/simpleIPC_vs2012.vcxproj b/Samples/simpleIPC/simpleIPC_vs2012.vcxproj index a4761acd..9fcbb221 100644 --- a/Samples/simpleIPC/simpleIPC_vs2012.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleIPC.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleIPC/simpleIPC_vs2013.vcxproj b/Samples/simpleIPC/simpleIPC_vs2013.vcxproj index a27bebeb..bba0dca0 100644 --- a/Samples/simpleIPC/simpleIPC_vs2013.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleIPC.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleIPC/simpleIPC_vs2015.vcxproj b/Samples/simpleIPC/simpleIPC_vs2015.vcxproj index 44af781f..c6150a14 100644 --- a/Samples/simpleIPC/simpleIPC_vs2015.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleIPC.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj index 66b4694e..e1ebb906 100644 --- a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleIPC.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj index 6593da88..4bfc659f 100644 --- a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleIPC.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleVoteIntrinsics/Makefile b/Samples/simpleVoteIntrinsics/Makefile index 1c9bbedd..c38ad0ca 100644 --- a/Samples/simpleVoteIntrinsics/Makefile +++ b/Samples/simpleVoteIntrinsics/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) 
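simpleVoteIntrinsics, whose build files are updated next, demonstrates warp-vote operations. As a refresher (this kernel is a sketch of the intrinsics, not the sample's own test kernels), each vote reduces a per-lane predicate across the 32 lanes of a warp:

```
#include <cstdio>
#include <cuda_runtime.h>

__global__ void voteKernel(const int *in, int *out) {
    unsigned mask = 0xffffffffu;                  // all 32 lanes participate
    int pred = in[threadIdx.x] > 0;
    int anySet = __any_sync(mask, pred);          // 1 if any lane's pred is true
    int allSet = __all_sync(mask, pred);          // 1 only if every lane's pred is true
    unsigned ballot = __ballot_sync(mask, pred);  // bit i mirrors lane i's pred
    if (threadIdx.x == 0)
        printf("any=%d all=%d ballot=0x%08x\n", anySet, allSet, ballot);
    out[threadIdx.x] = anySet + allSet;
}

int main() {
    int *in, *out;
    cudaMallocManaged(&in, 32 * sizeof(int));
    cudaMallocManaged(&out, 32 * sizeof(int));
    for (int i = 0; i < 32; i++) in[i] = i % 2;   // alternating predicate
    voteKernel<<<1, 32>>>(in, out);               // one full warp
    cudaDeviceSynchronize();
    cudaFree(in);
    cudaFree(out);
    return 0;
}
```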
TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -247,9 +268,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/simpleVoteIntrinsics/NsightEclipse.xml b/Samples/simpleVoteIntrinsics/NsightEclipse.xml index f7c2618f..3c5f4a00 100644 --- a/Samples/simpleVoteIntrinsics/NsightEclipse.xml +++ b/Samples/simpleVoteIntrinsics/NsightEclipse.xml @@ -32,7 +32,6 @@ 1:CUDA Basic Topics - sm30 sm35 sm37 sm50 @@ -42,6 +41,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/simpleVoteIntrinsics/README.md b/Samples/simpleVoteIntrinsics/README.md index e05e69a0..7fd65b51 100644 --- a/Samples/simpleVoteIntrinsics/README.md +++ b/Samples/simpleVoteIntrinsics/README.md @@ -10,11 +10,11 @@ Vote Intrinsics ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -27,7 +27,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -66,29 +66,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj index bdeb4881..f8a1d44c 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj index 51b7ae5e..1c1db5dc 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj index 70b35133..a6b15b1b 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj index 06432414..085bb1e4 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj index fb752084..5f4076f9 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - 
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/simpleVulkan/Build_instructions.txt b/Samples/simpleVulkan/Build_instructions.txt index e7517d42..7e4e5a2f 100644 --- a/Samples/simpleVulkan/Build_instructions.txt +++ b/Samples/simpleVulkan/Build_instructions.txt @@ -10,14 +10,17 @@ To add the GLFW3 headers path -- In Property pages window go to "VC++ Directories" section. Here in "Include Directories" edit and add path to GLFW3 headers include directory location. ** Make sure to add path to glfw3.dll in your PATH environment variable** + For Linux: -- Install the Vulkan SDK from https://www.lunarg.com/vulkan-sdk/ and follow environment setup instructions. --- Install GLFW3 library through your OS package repository. For example: apt-get for Ubuntu and dnf for RHEL/CentOS +-- Install GLFW3 library through your OS package repository. For example: apt-get for Ubuntu and dnf for RHEL/CentOS. On Ubuntu: + sudo apt-get install libglfw3 + sudo apt-get install libglfw3-dev -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 depends on them -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH For Linux aarch64(L4T): --- Install GLFW3 library using "apt-get install libglfw3-dev" this will provide glfw3 +-- Install GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3 -- The install above will also provide libvulkan-dev as a dependency -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH --- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=', VULKAN_SDK_PATH in this scenario is typically "/usr" +-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=', VULKAN_SDK_PATH in this scenario is typically "/usr" \ No newline at end of file diff --git a/Samples/simpleVulkan/Makefile b/Samples/simpleVulkan/Makefile index 520ed07c..390b6eb1 100644 --- a/Samples/simpleVulkan/Makefile +++ b/Samples/simpleVulkan/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
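simpleVulkan itself drives the CUDA-Vulkan interop APIs its README lists below (`cudaImportExternalMemory`, `cudaExternalMemoryGetMappedBuffer`, and the external-semaphore imports). On Linux the hand-off is through opaque file descriptors that Vulkan exports; a hedged sketch of the CUDA import side, with illustrative helper names and the FDs assumed to come from `vkGetMemoryFdKHR`/`vkGetSemaphoreFdKHR`:

```
#include <cuda_runtime.h>

// Import a Vulkan semaphore exported as an opaque FD (Linux path).
cudaExternalSemaphore_t importVkSemaphore(int fd) {
    cudaExternalSemaphoreHandleDesc desc = {};
    desc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
    desc.handle.fd = fd;                 // CUDA takes ownership of the fd
    cudaExternalSemaphore_t sem = nullptr;
    cudaImportExternalSemaphore(&sem, &desc);
    return sem;
}

// Import a Vulkan device allocation exported the same way.
cudaExternalMemory_t importVkMemory(int fd, size_t sizeBytes) {
    cudaExternalMemoryHandleDesc desc = {};
    desc.type = cudaExternalMemoryHandleTypeOpaqueFd;
    desc.handle.fd = fd;
    desc.size = sizeBytes;
    cudaExternalMemory_t mem = nullptr;
    cudaImportExternalMemory(&mem, &desc);
    return mem;
}
```
On Windows the sample instead selects the `OpaqueWin32`/`OpaqueWin32Kmt` handle types depending on OS version, as `getDefaultMemHandleType()` in the new VulkanBaseApp.cpp below shows.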
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -268,12 +289,18 @@ include ./findvulkan.mk # Vulkan specific libraries ifeq ($(TARGET_OS),linux) - LIBRARIES += -L $(VULKAN_SDK_PATH)/lib + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + LIBRARIES += -L$(VULKAN_SDK_LIB) -lvulkan + LIBRARIES += -lglfw + INCLUDES += -I$(VULKAN_HEADER) + else + LIBRARIES += -L$(VULKAN_SDK_LIB) LIBRARIES += `pkg-config --static --libs glfw3` -lvulkan - INCLUDES += `pkg-config --static --cflags glfw3` -I$(VULKAN_SDK_PATH)/include + INCLUDES += `pkg-config --static --cflags glfw3` -I$(VULKAN_HEADER) + endif endif -#Detect if installed version of GCC supports C++11 +#Detect if installed version of GCC supports required C++11 ifeq ($(TARGET_OS),linux) empty := space := $(empty) $(empty) @@ -295,16 +322,16 @@ ifeq ($(TARGET_OS),linux) ifeq ($(IS_MIN_VERSION), 1) $(info >>> GCC Version is greater or equal to 4.7.0 <<<) else - $(info >>> Waiving build. Minimum GCC version required for C++11 is 4.7.0 <<<) + $(info >>> Waiving build. 
Minimum GCC version required is 4.7.0<<<) SAMPLE_ENABLED := 0 endif endif # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) @@ -343,10 +370,16 @@ else @echo "Sample is ready - all dependencies have been met" endif -vulkanCUDASinewave.o:vulkanCUDASinewave.cu +SineWaveSimulation.o:SineWaveSimulation.cu $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< -simpleVulkan: vulkanCUDASinewave.o +VulkanBaseApp.o:VulkanBaseApp.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +main.o:main.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleVulkan: SineWaveSimulation.o VulkanBaseApp.o main.o $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) @@ -355,7 +388,7 @@ run: build $(EXEC) ./simpleVulkan clean: - rm -f simpleVulkan vulkanCUDASinewave.o + rm -f simpleVulkan SineWaveSimulation.o VulkanBaseApp.o main.o rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleVulkan clobber: clean diff --git a/Samples/simpleVulkan/NsightEclipse.xml b/Samples/simpleVulkan/NsightEclipse.xml index 34f7fae4..8b3c8bd1 100644 --- a/Samples/simpleVulkan/NsightEclipse.xml +++ b/Samples/simpleVulkan/NsightEclipse.xml @@ -17,6 +17,10 @@
whole + + sinewave.vert + sinewave.frag + ./ ../ @@ -38,7 +42,7 @@ true - vulkanCUDASinewave.cu + main.cpp X11 VULKAN @@ -48,7 +52,6 @@ 1:CUDA Advanced Topics 1:CUDA Vulkan Interop - sm30 sm35 sm37 sm50 @@ -58,6 +61,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/simpleVulkan/README.md b/Samples/simpleVulkan/README.md index 91225856..66e03928 100644 --- a/Samples/simpleVulkan/README.md +++ b/Samples/simpleVulkan/README.md @@ -10,7 +10,7 @@ Graphics Interop, CUDA Vulkan Interop, Data Parallel Algorithms ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaImportExternalS ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleVulkan/SineWaveSimulation.cu b/Samples/simpleVulkan/SineWaveSimulation.cu new file mode 100644 index 00000000..68e63d7a --- /dev/null +++ b/Samples/simpleVulkan/SineWaveSimulation.cu @@ -0,0 +1,138 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "SineWaveSimulation.h" +#include <algorithm> +#include <helper_cuda.h> + +__global__ void sinewave(float *heightMap, unsigned int width, unsigned int height, float time) +{ + const float freq = 4.0f; + const size_t stride = gridDim.x * blockDim.x; + + // Iterate through the entire array in a way that is + // independent of the grid configuration + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < width * height; tid += stride) { + // Calculate the x, y coordinates + const size_t y = tid / width; + const size_t x = tid - y * width; + // Map x, y into the [-1,1] range + const float u = ((2.0f * x) / width) - 1.0f; + const float v = ((2.0f * y) / height) - 1.0f; + // Calculate the new height value + const float w = 0.5f * sinf(u * freq + time) * cosf(v * freq + time); + // Store this new height value + heightMap[tid] = w; + } +} + +SineWaveSimulation::SineWaveSimulation(size_t width, size_t height) + : m_heightMap(nullptr), m_width(width), m_height(height) +{ +} + +void SineWaveSimulation::initCudaLaunchConfig(int device) +{ + cudaDeviceProp prop = {}; + checkCudaErrors(cudaSetDevice(device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); + + // We don't need large block sizes, since there's not much inter-thread communication + m_threads = prop.warpSize; + + // Use the occupancy calculator and fill the GPU as best we can + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, sinewave, prop.warpSize, 0)); + m_blocks *= prop.multiProcessorCount; + + // Clamp the block count to the minimum needed for this width/height + m_blocks = std::min(m_blocks, (int)((m_width * m_height + m_threads - 1) / m_threads)); +} + +int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE) +{ + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the GPU which is selected by Vulkan + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); + + if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + // Compare the cuda device UUID with vulkan UUID + int ret = memcmp((void*)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE); + if (ret == 0) + { + checkCudaErrors(cudaSetDevice(current_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, deviceProp.name, deviceProp.major, + deviceProp.minor); + + return current_device; + } + + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No Vulkan-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +SineWaveSimulation::~SineWaveSimulation() +{ + m_heightMap = NULL; +} + +void
SineWaveSimulation::initSimulation(float *heights) +{ + m_heightMap = heights; +} + +void SineWaveSimulation::stepSimulation(float time, cudaStream_t stream) +{ + sinewave <<< m_blocks, m_threads, 0, stream >>> (m_heightMap, m_width, m_height, time); + getLastCudaError("Failed to launch CUDA simulation"); +} diff --git a/Samples/simpleVulkan/SineWaveSimulation.h b/Samples/simpleVulkan/SineWaveSimulation.h new file mode 100644 index 00000000..dc889b4b --- /dev/null +++ b/Samples/simpleVulkan/SineWaveSimulation.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once +#ifndef __SINESIM_H__ +#define __SINESIM_H__ + +#include +#include +#include +#include "linmath.h" + +class SineWaveSimulation +{ + float *m_heightMap; + size_t m_width, m_height; + int m_blocks, m_threads; +public: + SineWaveSimulation(size_t width, size_t height); + ~SineWaveSimulation(); + void initSimulation(float *heightMap); + void stepSimulation(float time, cudaStream_t stream = 0); + void initCudaLaunchConfig(int device); + int initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE); + + size_t getWidth() const { + return m_width; + } + size_t getHeight() const { + return m_height; + } +}; + +#endif // __SINESIM_H__ diff --git a/Samples/simpleVulkan/VulkanBaseApp.cpp b/Samples/simpleVulkan/VulkanBaseApp.cpp new file mode 100644 index 00000000..05dece53 --- /dev/null +++ b/Samples/simpleVulkan/VulkanBaseApp.cpp @@ -0,0 +1,1719 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file contains basic cross-platform setup paths in working with Vulkan + * and rendering window. It is largely based off of tutorials provided here: + * https://vulkan-tutorial.com/ +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "VulkanBaseApp.h" + +#define GLFW_INCLUDE_VULKAN +#define GLM_FORCE_DEPTH_ZERO_TO_ONE +#include + +#ifdef _WIN64 +#include +#include +#include +#endif /* _WIN64 */ + +#ifndef countof +#define countof(x) (sizeof(x) / sizeof(*(x))) +#endif + +static const char *validationLayers[] = { "VK_LAYER_KHRONOS_validation" }; +static const size_t MAX_FRAMES_IN_FLIGHT = 5; + +void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) +{ + VulkanBaseApp *app = reinterpret_cast(glfwGetWindowUserPointer(window)); + app->m_framebufferResized = true; +} + +static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageType, const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, void *pUserData) +{ + std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; + + return VK_FALSE; +} + +VulkanBaseApp::VulkanBaseApp(const std::string& appName, bool enableValidation) : + m_appName(appName), + m_enableValidation(enableValidation), + m_instance(VK_NULL_HANDLE), + m_window(nullptr), + m_debugMessenger(VK_NULL_HANDLE), + m_surface(VK_NULL_HANDLE), + m_physicalDevice(VK_NULL_HANDLE), + m_device(VK_NULL_HANDLE), + m_graphicsQueue(VK_NULL_HANDLE), + m_presentQueue(VK_NULL_HANDLE), + m_swapChain(VK_NULL_HANDLE), + m_vkDeviceUUID(), + m_swapChainImages(), + m_swapChainFormat(), + m_swapChainExtent(), + m_swapChainImageViews(), + m_shaderFiles(), + m_renderPass(), + m_pipelineLayout(VK_NULL_HANDLE), + m_graphicsPipeline(VK_NULL_HANDLE), + m_swapChainFramebuffers(), + m_commandPool(VK_NULL_HANDLE), + m_commandBuffers(), + m_imageAvailableSemaphores(), + m_renderFinishedSemaphores(), + m_inFlightFences(), + m_uniformBuffers(), + m_uniformMemory(), + m_descriptorSetLayout(VK_NULL_HANDLE), + m_descriptorPool(VK_NULL_HANDLE), + m_descriptorSets(), + m_depthImage(VK_NULL_HANDLE), + m_depthImageMemory(VK_NULL_HANDLE), + m_depthImageView(VK_NULL_HANDLE), + m_currentFrame(0), + m_framebufferResized(false) +{ +} + +VkExternalSemaphoreHandleTypeFlagBits VulkanBaseApp::getDefaultSemaphoreHandleType() +{ +#ifdef _WIN64 + return IsWindows8OrGreater() ? 
+ VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif +} + +VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() +{ +#ifdef _WIN64 + return IsWindows8Point1OrGreater() ? + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT : + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif +} + +VulkanBaseApp::~VulkanBaseApp() +{ + cleanupSwapChain(); + + if (m_descriptorSetLayout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); + } + + for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { + vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); + vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); + vkDestroyFence(m_device, m_inFlightFences[i], nullptr); + } + if (m_commandPool != VK_NULL_HANDLE) { + vkDestroyCommandPool(m_device, m_commandPool, nullptr); + } + + if (m_device != VK_NULL_HANDLE) { + vkDestroyDevice(m_device, nullptr); + } + + if (m_enableValidation) { + PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func != nullptr) { + func(m_instance, m_debugMessenger, nullptr); + } + } + + if (m_surface != VK_NULL_HANDLE) { + vkDestroySurfaceKHR(m_instance, m_surface, nullptr); + } + + if (m_instance != VK_NULL_HANDLE) { + vkDestroyInstance(m_instance, nullptr); + } + + if (m_window) { + glfwDestroyWindow(m_window); + } + + glfwTerminate(); +} + +void VulkanBaseApp::init() +{ + initWindow(); + initVulkan(); +} + +VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() +{ + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandPool = m_commandPool; + allocInfo.commandBufferCount = 1; + + VkCommandBuffer commandBuffer; + vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer); + + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(commandBuffer, &beginInfo); + + return commandBuffer; +} + +void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) +{ + vkEndCommandBuffer(commandBuffer); + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffer; + + vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); + vkQueueWaitIdle(m_graphicsQueue); + + vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); +} + +void VulkanBaseApp::initWindow() +{ + glfwInit(); + + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); + + m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); + glfwSetWindowUserPointer(m_window, this); + glfwSetFramebufferSizeCallback(m_window, resizeCallback); +} + + +std::vector VulkanBaseApp::getRequiredExtensions() const +{ + return std::vector(); +} + +std::vector VulkanBaseApp::getRequiredDeviceExtensions() const +{ + return std::vector(); +} + +void VulkanBaseApp::initVulkan() +{ + createInstance(); + createSurface(); + createDevice(); + createSwapChain(); + createImageViews(); + 
createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createCommandPool(); + createDepthResources(); + createFramebuffers(); + initVulkanApp(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + createSyncObjects(); +} + +#ifdef _WIN64 +class WindowsSecurityAttributes +{ +protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; + +public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES *operator&(); + ~WindowsSecurityAttributes(); +}; + +WindowsSecurityAttributes::WindowsSecurityAttributes() +{ + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); + if (!m_winPSecurityDescriptor) { + throw std::runtime_error("Failed to allocate memory for security descriptor"); + } + + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + + InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); + + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); + + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR) * ppSID; + + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; +} + +SECURITY_ATTRIBUTES * +WindowsSecurityAttributes::operator&() +{ + return &m_winSecurityAttributes; +} + +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); +} +#endif /* _WIN64 */ + + +static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, const std::vector& candidates, VkImageTiling tiling, VkFormatFeatureFlags features) +{ + for (VkFormat format : candidates) { + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); + if (tiling == VK_IMAGE_TILING_LINEAR && (props.linearTilingFeatures & features) == features) { + return format; + } + else if (tiling == VK_IMAGE_TILING_OPTIMAL && (props.optimalTilingFeatures & features) == features) { + return format; + } + } + throw std::runtime_error("Failed to find supported format!"); +} + +static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, uint32_t typeFilter, VkMemoryPropertyFlags properties) +{ + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == 
properties) { + return i; + } + } + return ~0; +} + +static bool supportsValidationLayers() +{ + std::vector availableLayers; + uint32_t layerCount; + + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + availableLayers.resize(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + + for (const char * layerName : validationLayers) { + bool layerFound = false; + + for (const auto & layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } + } + + return true; +} + +void VulkanBaseApp::createInstance() +{ + if (m_enableValidation && !supportsValidationLayers()) { + throw std::runtime_error("Validation requested, but not supported!"); + } + + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = m_appName.c_str(); + appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_0; + + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; + + std::vector exts = getRequiredExtensions(); + + { + uint32_t glfwExtensionCount = 0; + const char **glfwExtensions; + + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + + exts.insert(exts.begin(), glfwExtensions, glfwExtensions + glfwExtensionCount); + + if (m_enableValidation) { + exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + } + + createInfo.enabledExtensionCount = static_cast(exts.size()); + createInfo.ppEnabledExtensionNames = exts.data(); + VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; + if (m_enableValidation) { + createInfo.enabledLayerCount = static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + + debugCreateInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + debugCreateInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + debugCreateInfo.pfnUserCallback = debugCallback; + + createInfo.pNext = &debugCreateInfo; + } + else { + createInfo.enabledLayerCount = 0; + createInfo.pNext = nullptr; + } + + if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { + throw std::runtime_error("Failed to create Vulkan instance!"); + } + + if (m_enableValidation) { + PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkCreateDebugUtilsMessengerEXT"); + if (func == nullptr || func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != VK_SUCCESS) { + throw std::runtime_error("Failed to set up debug messenger!"); + } + } +} + +void VulkanBaseApp::createSurface() +{ + if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != VK_SUCCESS) { + throw std::runtime_error("failed to create window surface!"); + } +} + +static bool findGraphicsQueueIndicies(VkPhysicalDevice device, VkSurfaceKHR surface, uint32_t& graphicsFamily, uint32_t& presentFamily) +{ + uint32_t queueFamilyCount = 0; + + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, 
nullptr); + + std::vector queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); + + graphicsFamily = presentFamily = ~0; + + for (uint32_t i = 0; i < queueFamilyCount; i++) { + + if (queueFamilies[i].queueCount > 0) { + if (graphicsFamily == ~0 && queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + graphicsFamily = i; + } + uint32_t presentSupport = 0; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + if (presentFamily == ~0 && presentSupport) { + presentFamily = i; + } + if (presentFamily != ~0 && graphicsFamily != ~0) { + break; + } + } + } + + return graphicsFamily != ~0 && presentFamily != ~0; +} + +static bool hasAllExtensions(VkPhysicalDevice device, const std::vector& deviceExtensions) +{ + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); + std::vector availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); + + std::set requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); + + for (const auto & extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); + } + + return requiredExtensions.empty(); +} + +static void getSwapChainProperties(VkPhysicalDevice device, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR& capabilities, std::vector& formats, std::vector& presentModes) +{ + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + if (formatCount != 0) { + formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, formats.data()); + } + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); + if (presentModeCount != 0) { + presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, presentModes.data()); + } +} + +bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const +{ + uint32_t graphicsQueueIndex, presentQueueIndex; + std::vector deviceExtensions = getRequiredDeviceExtensions(); + VkSurfaceCapabilitiesKHR caps; + std::vector formats; + std::vector presentModes; + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + getSwapChainProperties(dev, m_surface, caps, formats, presentModes); + return hasAllExtensions(dev, deviceExtensions) + && !formats.empty() && !presentModes.empty() + && findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, presentQueueIndex); +} + +void VulkanBaseApp::createDevice() +{ + { + uint32_t deviceCount = 0; + vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); + if (deviceCount == 0) { + throw std::runtime_error("Failed to find Vulkan capable GPUs!"); + } + std::vector phyDevs(deviceCount); + vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); + std::vector::iterator it = std::find_if(phyDevs.begin(), phyDevs.end(), + std::bind(&VulkanBaseApp::isSuitableDevice, this, std::placeholders::_1)); + if (it == phyDevs.end()) { + throw std::runtime_error("No suitable device found!"); + } + m_physicalDevice = *it; + } + + uint32_t graphicsQueueIndex, presentQueueIndex; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, presentQueueIndex); + + std::vector queueCreateInfos; + std::set uniqueFamilyIndices = { 
graphicsQueueIndex, presentQueueIndex }; + + float queuePriority = 1.0f; + + for (uint32_t queueFamily : uniqueFamilyIndices) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = queueFamily; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + deviceFeatures.fillModeNonSolid = true; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = static_cast<uint32_t>(queueCreateInfos.size()); + + createInfo.pEnabledFeatures = &deviceFeatures; + + std::vector<const char *> deviceExtensions = getRequiredDeviceExtensions(); + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + + createInfo.enabledExtensionCount = static_cast<uint32_t>(deviceExtensions.size()); + createInfo.ppEnabledExtensionNames = deviceExtensions.data(); + + if (m_enableValidation) { + createInfo.enabledLayerCount = static_cast<uint32_t>(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + } + else { + createInfo.enabledLayerCount = 0; + } + + if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); + } + + vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); + vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); + + VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; + vkPhysicalDeviceIDProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + vkPhysicalDeviceIDProperties.pNext = NULL; + + VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; + vkPhysicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; + + PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; + fpGetPhysicalDeviceProperties2 = (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr(m_instance, "vkGetPhysicalDeviceProperties2"); + if (fpGetPhysicalDeviceProperties2 == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2\" not found.\n"); + } + + fpGetPhysicalDeviceProperties2(m_physicalDevice, &vkPhysicalDeviceProperties2); + + memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, VK_UUID_SIZE); +} + +static VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector<VkSurfaceFormatKHR>& availableFormats) +{ + if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return { VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR }; + } + + for (const auto & availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } + + return availableFormats[0]; +} + +static VkPresentModeKHR chooseSwapPresentMode(const std::vector<VkPresentModeKHR>& availablePresentModes) +{ + VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto & availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } + else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } + } + + return bestMode; +} + +static VkExtent2D 
chooseSwapExtent(GLFWwindow *window, const VkSurfaceCapabilitiesKHR& capabilities) +{ + if (capabilities.currentExtent.width != std::numeric_limits::max()) { + return capabilities.currentExtent; + } + else { + int width, height; + glfwGetFramebufferSize(window, &width, &height); + VkExtent2D actualExtent = { static_cast(width), static_cast(height) }; + + actualExtent.width = std::max(capabilities.minImageExtent.width, std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } +} + +void VulkanBaseApp::createSwapChain() +{ + VkSurfaceCapabilitiesKHR capabilities; + VkSurfaceFormatKHR format; + VkPresentModeKHR presentMode; + VkExtent2D extent; + uint32_t imageCount; + + { + std::vector formats; + std::vector presentModes; + + getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, presentModes); + format = chooseSwapSurfaceFormat(formats); + presentMode = chooseSwapPresentMode(presentModes); + extent = chooseSwapExtent(m_window, capabilities); + imageCount = capabilities.minImageCount + 1; + if (capabilities.maxImageCount > 0 && imageCount > capabilities.maxImageCount) { + imageCount = capabilities.maxImageCount; + } + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = m_surface; + + createInfo.minImageCount = imageCount; + createInfo.imageFormat = format.format; + createInfo.imageColorSpace = format.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + uint32_t queueFamilyIndices[2]; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], queueFamilyIndices[1]); + + if (queueFamilyIndices[0] != queueFamilyIndices[1]) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } + else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + } + + createInfo.preTransform = capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + + createInfo.oldSwapchain = VK_NULL_HANDLE; + + if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } + + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); + m_swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, m_swapChainImages.data()); + + m_swapChainFormat = format.format; + m_swapChainExtent = extent; +} + +static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, VkImageAspectFlags aspectFlags) +{ + VkImageView imageView; + VkImageViewCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + createInfo.image = image; + createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + createInfo.format = format; + createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.subresourceRange.aspectMask = aspectFlags; + 
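// Every image in this app uses a single mip level and array layer, so the view covers the whole resource. + 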
createInfo.subresourceRange.baseMipLevel = 0; + createInfo.subresourceRange.levelCount = 1; + createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image views!"); + } + + return imageView; +} + +static void createImage(VkPhysicalDevice physicalDevice, VkDevice device, uint32_t width, uint32_t height, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage, VkMemoryPropertyFlags properties, VkImage& image, VkDeviceMemory& imageMemory) +{ + VkImageCreateInfo imageInfo = {}; + imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + imageInfo.imageType = VK_IMAGE_TYPE_2D; + imageInfo.extent.width = width; + imageInfo.extent.height = height; + imageInfo.extent.depth = 1; + imageInfo.mipLevels = 1; + imageInfo.arrayLayers = 1; + imageInfo.format = format; + imageInfo.tiling = tiling; + imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageInfo.usage = usage; + imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { + throw std::runtime_error("failed to create image!"); + } + + VkMemoryRequirements memRequirements; + vkGetImageMemoryRequirements(device, image, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate image memory!"); + } + + vkBindImageMemory(device, image, imageMemory, 0); +} + +void VulkanBaseApp::createImageViews() +{ + m_swapChainImageViews.resize(m_swapChainImages.size()); + + for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { + m_swapChainImageViews[i] = createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, VK_IMAGE_ASPECT_COLOR_BIT); + } +} + +void VulkanBaseApp::createRenderPass() +{ + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = m_swapChainFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + + VkAttachmentDescription depthAttachment = {}; + depthAttachment.format = findSupportedFormat(m_physicalDevice, + { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + depthAttachment.finalLayout = 
VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkAttachmentReference depthAttachmentRef = {}; + depthAttachmentRef.attachment = 1; + depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + subpass.pDepthStencilAttachment = &depthAttachmentRef; + + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + + VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = countof(attachments); + renderPassInfo.pAttachments = attachments; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } +} + +void VulkanBaseApp::createDescriptorSetLayout() +{ + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.pImmutableSamplers = nullptr; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &uboLayoutBinding; + + if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, &m_descriptorSetLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor set layout!"); + } +} + +VkShaderModule createShaderModule(VkDevice device, const char *filename) +{ + std::vector shaderContents; + std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); + VkShaderModuleCreateInfo createInfo = {}; + VkShaderModule shaderModule; + + if (!shaderFile.good()) { + throw std::runtime_error("Failed to load shader contents"); + } + readFile(shaderFile, shaderContents); + + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = shaderContents.size(); + createInfo.pCode = reinterpret_cast(shaderContents.data()); + + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { + throw std::runtime_error("Failed to create shader module!"); + } + + return shaderModule; +} + +void VulkanBaseApp::getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc) +{ +} + +void VulkanBaseApp::getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) +{ + +} + +void VulkanBaseApp::createGraphicsPipeline() +{ + std::vector shaderStageInfos(m_shaderFiles.size()); + for (size_t i = 0; i < m_shaderFiles.size(); i++) { + shaderStageInfos[i] = {}; + shaderStageInfos[i].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shaderStageInfos[i].stage = m_shaderFiles[i].first; + shaderStageInfos[i].module 
= createShaderModule(m_device, m_shaderFiles[i].second.c_str()); + shaderStageInfos[i].pName = "main"; + } + + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; + + std::vector vertexBindingDescriptions; + std::vector vertexAttributeDescriptions; + + getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); + + vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertexInputInfo.vertexBindingDescriptionCount = static_cast(vertexBindingDescriptions.size()); + vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); + vertexInputInfo.vertexAttributeDescriptionCount = static_cast(vertexAttributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = vertexAttributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + getAssemblyStateInfo(inputAssembly); + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)m_swapChainExtent.width; + viewport.height = (float)m_swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset = { 0, 0 }; + scissor.extent = m_swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_LINE; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_NONE; + rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineDepthStencilStateCreateInfo depthStencil = {}; + depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depthStencil.depthTestEnable = VK_TRUE; + depthStencil.depthWriteEnable = VK_TRUE; + depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; + depthStencil.depthBoundsTestEnable = VK_FALSE; + depthStencil.stencilTestEnable = VK_FALSE; + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; + colorBlending.blendConstants[1] = 0.0f; + colorBlending.blendConstants[2] = 0.0f; + colorBlending.blendConstants[3] = 0.0f; + + 
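// The pipeline layout exposes only the uniform-buffer descriptor set layout; no push constants are used. + 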
VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, &m_pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); + pipelineInfo.pStages = shaderStageInfos.data(); + + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = &depthStencil; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + + pipelineInfo.layout = m_pipelineLayout; + + pipelineInfo.renderPass = m_renderPass; + pipelineInfo.subpass = 0; + + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &m_graphicsPipeline) != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + + for (size_t i = 0; i < shaderStageInfos.size(); i++) { + vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); + } +} + +void VulkanBaseApp::createFramebuffers() +{ + m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + VkImageView attachments[] = { + m_swapChainImageViews[i], + m_depthImageView + }; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = m_renderPass; + framebufferInfo.attachmentCount = countof(attachments); + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = m_swapChainExtent.width; + framebufferInfo.height = m_swapChainExtent.height; + framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, &m_swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); + } + } +} + +void VulkanBaseApp::createCommandPool() +{ + VkCommandPoolCreateInfo poolInfo = {}; + uint32_t graphicsIndex, presentIndex; + + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, presentIndex); + + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = graphicsIndex; + poolInfo.flags = 0; // Optional + + if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != VK_SUCCESS) { + throw std::runtime_error("Failed to create command pool!"); + } +} + +static void transitionImageLayout(VulkanBaseApp *app, VkImage image, VkFormat format, VkImageLayout oldLayout, VkImageLayout newLayout) +{ + VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); + + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + + if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_D24_UNORM_S8_UINT) { + barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + } + else { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + } + + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags sourceStage; + VkPipelineStageFlags destinationStage; + + if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; + } + else { + throw std::invalid_argument("unsupported layout transition!"); + } + + vkCmdPipelineBarrier( + commandBuffer, + sourceStage, destinationStage, + 0, + 0, nullptr, + 0, nullptr, + 1, &barrier + ); + + app->endSingleTimeCommands(commandBuffer); +} + +void VulkanBaseApp::createDepthResources() +{ + VkFormat depthFormat = findSupportedFormat(m_physicalDevice, + { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + createImage(m_physicalDevice, m_device, m_swapChainExtent.width, m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, m_depthImageMemory); + m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, VK_IMAGE_ASPECT_DEPTH_BIT); + transitionImageLayout(this, m_depthImage, depthFormat, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); +} + +void VulkanBaseApp::createUniformBuffers() +{ + VkDeviceSize size = getUniformSize(); + if (size > 0) { + m_uniformBuffers.resize(m_swapChainImages.size()); + m_uniformMemory.resize(m_swapChainImages.size()); + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + createBuffer(getUniformSize(), + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + m_uniformBuffers[i], m_uniformMemory[i]); + } + } +} + +void VulkanBaseApp::createDescriptorPool() +{ + VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = static_cast(m_swapChainImages.size()); + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = 
VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = static_cast<uint32_t>(m_swapChainImages.size()); + if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } +} + +void VulkanBaseApp::createDescriptorSets() +{ + std::vector<VkDescriptorSetLayout> layouts(m_swapChainImages.size(), m_descriptorSetLayout); + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = m_descriptorPool; + allocInfo.descriptorSetCount = static_cast<uint32_t>(m_swapChainImages.size()); + allocInfo.pSetLayouts = layouts.data(); + m_descriptorSets.resize(m_swapChainImages.size()); + + if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor sets!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + for (size_t i = 0; i < m_swapChainImages.size(); i++) { + bufferInfo.buffer = m_uniformBuffers[i]; + descriptorWrite.dstSet = m_descriptorSets[i]; + vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); + } +} + +void VulkanBaseApp::createCommandBuffers() +{ + m_commandBuffers.resize(m_swapChainFramebuffers.size()); + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = m_commandPool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size(); + + if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate command buffers!"); + } + + for (size_t i = 0; i < m_commandBuffers.size(); i++) { + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + beginInfo.pInheritanceInfo = nullptr; // Optional + + if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { + throw std::runtime_error("failed to begin recording command buffer!"); + } + + VkRenderPassBeginInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassInfo.renderPass = m_renderPass; + renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; + + renderPassInfo.renderArea.offset = { 0, 0 }; + renderPassInfo.renderArea.extent = m_swapChainExtent; + + VkClearValue clearColors[2]; + clearColors[0].color = { 0.0f, 0.0f, 0.0f, 1.0f }; + clearColors[1].depthStencil = { 1.0f, 0 }; + renderPassInfo.clearValueCount = countof(clearColors); + renderPassInfo.pClearValues = clearColors; + + vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); + + vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_graphicsPipeline); + + vkCmdBindDescriptorSets(m_commandBuffers[i], 
VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout, 0, 1, &m_descriptorSets[i], 0, nullptr); + + fillRenderingCommandBuffer(m_commandBuffers[i]); + + vkCmdEndRenderPass(m_commandBuffers[i]); + + if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to record command buffer!"); + } + } +} + +void VulkanBaseApp::createSyncObjects() +{ + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkFenceCreateInfo fenceInfo = {}; + fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; + + m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); + m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image available semaphore!"); + } + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create render finished semaphore!"); + } + if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create in-flight fence!"); + } + } +} + +void VulkanBaseApp::getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector<VkPipelineStageFlags>& waitStages) const +{ +} + +void VulkanBaseApp::getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const +{ +} + +VkDeviceSize VulkanBaseApp::getUniformSize() const +{ + return VkDeviceSize(0); +} + +void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex) +{ +} + +void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void VulkanBaseApp::createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + +#ifdef _WIN64 + 
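// On Windows the exported allocation carries explicit security attributes so the shared handle can later be opened with read/write access. + 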
WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR ? &vulkanExportMemoryWin32HandleInfoKHR : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; +#else + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void *VulkanBaseApp::getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) +{ +#ifdef _WIN64 + HANDLE handle = 0; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = memory; + vkMemoryGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + fpGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryWin32HandleKHR"); + if (!fpGetMemoryWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for memory!"); + } + return (void *)handle; +#else + int fd = -1; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = memory; + vkMemoryGetFdInfoKHR.handleType = handleType; + + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); + if (!fpGetMemoryFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryFdKHR!"); + } + if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for memory!"); + } + return (void *)(uintptr_t)fd; +#endif /* _WIN64 */ +} + +void *VulkanBaseApp::getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) +{ +#ifdef _WIN64 + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = 
{}; + semaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + semaphoreGetWin32HandleInfoKHR.pNext = NULL; + semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; + semaphoreGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; + fpGetSemaphoreWin32HandleKHR = (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreWin32HandleKHR"); + if (!fpGetSemaphoreWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetSemaphoreWin32HandleKHR!"); + } + if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for semaphore!"); + } + + return (void *)handle; +#else + int fd; + + VkSemaphoreGetFdInfoKHR semaphoreGetFdInfoKHR = {}; + semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + semaphoreGetFdInfoKHR.pNext = NULL; + semaphoreGetFdInfoKHR.semaphore = semaphore; + semaphoreGetFdInfoKHR.handleType = handleType; + + PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; + fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreFdKHR"); + if (!fpGetSemaphoreFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetSemaphoreFdKHR!"); + } + if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for semaphore!"); + } + + return (void *)(uintptr_t)fd; +#endif +} + +void VulkanBaseApp::createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) +{ + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {}; + exportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {}; + exportSemaphoreWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; + exportSemaphoreWin32HandleInfoKHR.pNext = NULL; + exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + exportSemaphoreWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; + exportSemaphoreCreateInfo.pNext = (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) ? 
&exportSemaphoreWin32HandleInfoKHR : NULL; +#else + exportSemaphoreCreateInfo.pNext = NULL; +#endif + exportSemaphoreCreateInfo.handleTypes = handleType; + semaphoreInfo.pNext = &exportSemaphoreCreateInfo; + + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) != VK_SUCCESS) { + throw std::runtime_error("failed to create a shareable semaphore for CUDA-Vulkan interop!"); + } +} + +void VulkanBaseApp::importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + +#ifdef _WIN64 + VkImportMemoryWin32HandleInfoKHR handleInfo = {}; + handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + handleInfo.pNext = NULL; + handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; + handleInfo.handle = handle; + handleInfo.name = NULL; +#else + VkImportMemoryFdInfoKHR handleInfo = {}; + handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR; + handleInfo.pNext = NULL; + handleInfo.fd = (int)(uintptr_t)handle; + handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ + + VkMemoryAllocateInfo memAllocation = {}; + memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + memAllocation.pNext = (void *)&handleInfo; + memAllocation.allocationSize = size; + memAllocation.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) != VK_SUCCESS) { + throw std::runtime_error("Failed to import allocation!"); + } + + vkBindBufferMemory(m_device, buffer, memory, 0); +} + +void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) +{ + VkCommandBuffer commandBuffer = beginSingleTimeCommands(); + + VkBufferCopy copyRegion = {}; + copyRegion.size = size; + vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion); + + endSingleTimeCommands(commandBuffer); +} + +void VulkanBaseApp::drawFrame() +{ + size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; + vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, std::numeric_limits<uint64_t>::max()); + + uint32_t imageIndex; + VkResult result = vkAcquireNextImageKHR(m_device, m_swapChain, std::numeric_limits<uint64_t>::max(), m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex); + if (result == VK_ERROR_OUT_OF_DATE_KHR) { + recreateSwapChain(); + return; + } + else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + updateUniformBuffer(imageIndex); + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + std::vector<VkSemaphore> waitSemaphores; + std::vector<VkPipelineStageFlags> waitStages; + + waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); + waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + getWaitFrameSemaphores(waitSemaphores, waitStages); + + submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); + 
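// The wait list pairs the swap-chain acquire semaphore with any extra semaphores a derived app added via getWaitFrameSemaphores() (e.g. a CUDA signal). + 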
submitInfo.pWaitSemaphores = waitSemaphores.data(); + submitInfo.pWaitDstStageMask = waitStages.data(); + + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; + + std::vector<VkSemaphore> signalSemaphores; + getSignalFrameSemaphores(signalSemaphores); + signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); + submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); + submitInfo.pSignalSemaphores = signalSemaphores.data(); + + vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]); + + if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; + + VkSwapchainKHR swapChains[] = { m_swapChain }; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + + result = vkQueuePresentKHR(m_presentQueue, &presentInfo); + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || m_framebufferResized) { + recreateSwapChain(); + m_framebufferResized = false; + } + else if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to present swap chain image!"); + } + + m_currentFrame++; +} + +void VulkanBaseApp::cleanupSwapChain() +{ + if (m_depthImageView != VK_NULL_HANDLE) { + vkDestroyImageView(m_device, m_depthImageView, nullptr); + } + if (m_depthImage != VK_NULL_HANDLE) { + vkDestroyImage(m_device, m_depthImage, nullptr); + } + if (m_depthImageMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_depthImageMemory, nullptr); + } + + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); + vkFreeMemory(m_device, m_uniformMemory[i], nullptr); + } + + if (m_descriptorPool != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); + } + + for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { + vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); + } + + if (m_graphicsPipeline != VK_NULL_HANDLE) { + vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); + } + + if (m_pipelineLayout != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); + } + + if (m_renderPass != VK_NULL_HANDLE) { + vkDestroyRenderPass(m_device, m_renderPass, nullptr); + } + + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); + } + + if (m_swapChain != VK_NULL_HANDLE) { + vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); + } +} + +void VulkanBaseApp::recreateSwapChain() +{ + int width, height; + + glfwGetFramebufferSize(m_window, &width, &height); + while (width == 0 || height == 0) { + glfwWaitEvents(); + glfwGetFramebufferSize(m_window, &width, &height); + } + + vkDeviceWaitIdle(m_device); + + cleanupSwapChain(); + + createSwapChain(); + createImageViews(); + createRenderPass(); + createGraphicsPipeline(); + createDepthResources(); + createFramebuffers(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); +} + +void VulkanBaseApp::mainLoop() +{ + while (!glfwWindowShouldClose(m_window)) { + glfwPollEvents(); + drawFrame(); + } + vkDeviceWaitIdle(m_device); +} + +void readFile(std::istream& 
s, std::vector& data) +{ + s.seekg(0, std::ios_base::end); + data.resize(s.tellg()); + s.clear(); + s.seekg(0, std::ios_base::beg); + s.read(data.data(), data.size()); +} diff --git a/Samples/simpleVulkan/VulkanBaseApp.h b/Samples/simpleVulkan/VulkanBaseApp.h new file mode 100644 index 00000000..5cb7396d --- /dev/null +++ b/Samples/simpleVulkan/VulkanBaseApp.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once +#ifndef __VULKANBASEAPP_H__ +#define __VULKANBASEAPP_H__ + +#include <string> +#include <vector> +#include <vulkan/vulkan.h> +#ifdef _WIN64 +#define NOMINMAX +#include <windows.h> +#include <vulkan/vulkan_win32.h> +#endif /* _WIN64 */ + +struct GLFWwindow; + +class VulkanBaseApp +{ +public: + VulkanBaseApp(const std::string& appName, bool enableValidation = false); + static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); + static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); + virtual ~VulkanBaseApp(); + void init(); + void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); + void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory); + void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory); + void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory); + void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); + VkCommandBuffer beginSingleTimeCommands(); + void endSingleTimeCommands(VkCommandBuffer commandBuffer); + void mainLoop(); +protected: + const std::string m_appName; + const bool m_enableValidation; + VkInstance m_instance; + VkDebugUtilsMessengerEXT m_debugMessenger; + VkSurfaceKHR m_surface; + VkPhysicalDevice m_physicalDevice; + VkDevice m_device; + VkQueue m_graphicsQueue; + VkQueue m_presentQueue; + VkSwapchainKHR m_swapChain; + std::vector<VkImage> m_swapChainImages; + VkFormat m_swapChainFormat; + VkExtent2D m_swapChainExtent; + std::vector<VkImageView> m_swapChainImageViews; + std::vector<std::pair<VkShaderStageFlagBits, std::string> > m_shaderFiles; + VkRenderPass m_renderPass; + VkPipelineLayout m_pipelineLayout; + VkPipeline m_graphicsPipeline; + std::vector<VkFramebuffer> m_swapChainFramebuffers; + VkCommandPool m_commandPool; + std::vector<VkCommandBuffer> m_commandBuffers; + std::vector<VkSemaphore> m_imageAvailableSemaphores; + std::vector<VkSemaphore> m_renderFinishedSemaphores; + std::vector<VkFence> m_inFlightFences; + std::vector<VkBuffer> m_uniformBuffers; + std::vector<VkDeviceMemory> m_uniformMemory; + VkDescriptorSetLayout m_descriptorSetLayout; + VkDescriptorPool m_descriptorPool; + std::vector<VkDescriptorSet> m_descriptorSets; + VkImage m_depthImage; + VkDeviceMemory m_depthImageMemory; + VkImageView m_depthImageView; + size_t m_currentFrame; + bool m_framebufferResized; + uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; + + virtual void initVulkanApp() {} + virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} + virtual std::vector<const char *> getRequiredExtensions() const; + virtual std::vector<const char *> getRequiredDeviceExtensions() const; + virtual void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc); + virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info); + virtual void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector<VkPipelineStageFlags>& waitStages) const; + virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const; + virtual VkDeviceSize getUniformSize() const; + virtual void updateUniformBuffer(uint32_t imageIndex); + virtual void drawFrame(); +private: + GLFWwindow *m_window; + + void initWindow(); + void initVulkan(); + void createInstance(); + void createSurface(); + 
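// The helpers below are driven by initVulkan(); cleanupSwapChain() and recreateSwapChain() rebuild the swap-chain-dependent subset on resize. + 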
+    void createDevice();
+    void createSwapChain();
+    void createImageViews();
+    void createRenderPass();
+    void createDescriptorSetLayout();
+    void createGraphicsPipeline();
+    void createFramebuffers();
+    void createCommandPool();
+    void createDepthResources();
+    void createUniformBuffers();
+    void createDescriptorPool();
+    void createDescriptorSets();
+    void createCommandBuffers();
+    void createSyncObjects();
+
+    void cleanupSwapChain();
+    void recreateSwapChain();
+
+    bool isSuitableDevice(VkPhysicalDevice dev) const;
+    static void resizeCallback(GLFWwindow *window, int width, int height);
+};
+
+void readFile(std::istream& s, std::vector<char>& data);
+
+#endif /* __VULKANBASEAPP_H__ */
diff --git a/Samples/simpleVulkan/findvulkan.mk b/Samples/simpleVulkan/findvulkan.mk
index 8946b6d5..47016fd7 100644
--- a/Samples/simpleVulkan/findvulkan.mk
+++ b/Samples/simpleVulkan/findvulkan.mk
@@ -51,7 +51,7 @@ ifeq ("$(TARGET_OS)","linux")
 endif
 
 ifeq ("$(TARGET_OS)","linux")
-  # Each set of Linux Distros have different paths for where to find their GLM/GLFW3 libraries reside
+  # Each family of Linux distros keeps these libraries in different paths
   UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu >/dev/null 2>&1; echo $$?)
   FEDORA = $(shell echo $(DISTRO) | grep -i fedora >/dev/null 2>&1; echo $$?)
   RHEL   = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?)
@@ -107,16 +107,17 @@ ifeq ("$(TARGET_OS)","linux")
   VULKAN_SDK_PATH ?= ${VULKAN_SDK}
 
   ifeq ("$(VULKAN_SDK_PATH)","")
-    $(info >>> WARNING - Vulkan SDK not found, please install Vulkan SDK <<<)
-    SAMPLE_ENABLED := 0
+    VULKAN_SDK_PATH := $(DFLT_PATH)
   endif
 
-  VULKAN_SDK_LIB := $(shell find -L $(VULKAN_SDK_PATH)/lib -name libvulkan.so -print 2>/dev/null)
+  VULKAN_SDK_LIB := $(shell find -L $(VULKAN_SDK_PATH) -name libvulkan.so -print 2>/dev/null)
   X11LIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libX11.so -print 2>/dev/null)
 
   ifeq ("$(VULKAN_SDK_LIB)","")
-    $(info >>> WARNING - libvulkan.so not found, please install libvulkan.so <<<)
+    $(info >>> WARNING - libvulkan.so not found, please install Vulkan SDK and pass VULKAN_SDK_PATH= <<<)
     SAMPLE_ENABLED := 0
+  else
+    VULKAN_SDK_LIB := $(shell echo $(VULKAN_SDK_LIB) | sed "s/ .*//" | sed "s/\/libvulkan.so//" )
   endif
 
   ifeq ("$(X11LIB)","")
@@ -132,11 +133,13 @@ ifeq ("$(TARGET_OS)","linux")
     HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include
   endif
 
-  VULKANHEADER := $(shell find -L $(VULKAN_SDK_PATH)/include -name vulkan.h -print 2>/dev/null)
+  VULKAN_HEADER := $(shell find -L $(VULKAN_SDK_PATH) $(HEADER_SEARCH_PATH) -name vulkan.h -print 2>/dev/null)
 
-  ifeq ("$(VULKANHEADER)","")
+  ifeq ("$(VULKAN_HEADER)","")
     $(info >>> WARNING - vulkan.h not found, please install vulkan.h <<<)
     SAMPLE_ENABLED := 0
+  else
+    VULKAN_HEADER := $(shell echo $(VULKAN_HEADER) | sed "s/ .*//" | sed "s/\/vulkan\/vulkan.h//" )
   endif
 else
 endif
diff --git a/Samples/simpleVulkan/linmath.h b/Samples/simpleVulkan/linmath.h
index b4d386cc..dbedbc16 100644
--- a/Samples/simpleVulkan/linmath.h
+++ b/Samples/simpleVulkan/linmath.h
@@ -21,6 +21,7 @@
 #ifndef LINMATH_H
 #define LINMATH_H
 
+#define _USE_MATH_DEFINES
 #include <math.h>
 
 // Converts degrees to radians.
diff --git a/Samples/simpleVulkan/main.cpp b/Samples/simpleVulkan/main.cpp
new file mode 100644
index 00000000..303361b1
--- /dev/null
+++ b/Samples/simpleVulkan/main.cpp
@@ -0,0 +1,441 @@
+/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "VulkanBaseApp.h"
+
+#include <algorithm>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include "linmath.h"
+
+#include "SineWaveSimulation.h"
+
+#include <helper_cuda.h>
+
+typedef float vec2[2];
+std::string execution_path;
+
+#ifdef NDEBUG
+#define ENABLE_VALIDATION (false)
+#else
+#define ENABLE_VALIDATION (true)
+#endif
+
+class VulkanCudaSineWave : public VulkanBaseApp
+{
+
+    typedef struct UniformBufferObject_st {
+        mat4x4 modelViewProj;
+    } UniformBufferObject;
+
+    VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer;
+    VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory;
+    UniformBufferObject m_ubo;
+    VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore;
+    SineWaveSimulation m_sim;
+    cudaStream_t m_stream;
+    cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore;
+    cudaExternalMemory_t m_cudaVertMem;
+    float *m_cudaHeightMap;
+    using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>;
+    chrono_tp m_lastTime;
+    size_t m_lastFrame;
+public:
+    VulkanCudaSineWave(size_t width, size_t height) :
+        VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION),
+        m_heightBuffer(VK_NULL_HANDLE),
+        m_xyBuffer(VK_NULL_HANDLE),
+        m_indexBuffer(VK_NULL_HANDLE),
+        m_heightMemory(VK_NULL_HANDLE),
+        m_xyMemory(VK_NULL_HANDLE),
+        m_indexMemory(VK_NULL_HANDLE),
+        m_ubo(),
+        m_sim(width, height),
+        m_stream(0),
+        m_vkWaitSemaphore(VK_NULL_HANDLE),
+        m_vkSignalSemaphore(VK_NULL_HANDLE),
+        m_cudaWaitSemaphore(),
+        m_cudaSignalSemaphore(),
+        m_cudaVertMem(),
+        m_cudaHeightMap(nullptr),
+        m_lastFrame(0) {
+        // Our index buffer can only address 32 bits' worth of the vertex buffer
+        if ((width * height) > (1ULL << 32ULL)) {
+            throw std::runtime_error("Requested height and width are too large for this sample!");
+        }
+        // Add our compiled vulkan shader files
+        char* vertex_shader_path = sdkFindFilePath("sinewave.vert", execution_path.c_str());
+        char* fragment_shader_path = sdkFindFilePath("sinewave.frag", execution_path.c_str());
+        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
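+        // sdkFindFilePath() (from the samples' helper_string.h) searches a set
+        // of known locations relative to the executable and returns nullptr
+        // when the compiled SPIR-V file cannot be found.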
+        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
+    }
+    ~VulkanCudaSineWave() {
+        // Make sure there's no pending work before we start tearing down
+        checkCudaErrors(cudaStreamSynchronize(m_stream));
+
+        if (m_vkSignalSemaphore != VK_NULL_HANDLE) {
+            checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore));
+            vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr);
+        }
+        if (m_vkWaitSemaphore != VK_NULL_HANDLE) {
+            checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore));
+            vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr);
+        }
+
+        if (m_xyBuffer != VK_NULL_HANDLE) {
+            vkDestroyBuffer(m_device, m_xyBuffer, nullptr);
+        }
+        if (m_xyMemory != VK_NULL_HANDLE) {
+            vkFreeMemory(m_device, m_xyMemory, nullptr);
+        }
+
+        if (m_heightBuffer != VK_NULL_HANDLE) {
+            vkDestroyBuffer(m_device, m_heightBuffer, nullptr);
+        }
+        if (m_heightMemory != VK_NULL_HANDLE) {
+            vkFreeMemory(m_device, m_heightMemory, nullptr);
+        }
+        if (m_cudaHeightMap) {
+            checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem));
+        }
+
+        if (m_indexBuffer != VK_NULL_HANDLE) {
+            vkDestroyBuffer(m_device, m_indexBuffer, nullptr);
+        }
+        if (m_indexMemory != VK_NULL_HANDLE) {
+            vkFreeMemory(m_device, m_indexMemory, nullptr);
+        }
+    }
+
+    void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) {
+        VkBuffer vertexBuffers[] = { m_heightBuffer, m_xyBuffer };
+        VkDeviceSize offsets[] = { 0, 0 };
+        vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets);
+        vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32);
+        vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6), 1, 0, 0, 0);
+    }
+
+    void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc) {
+        bindingDesc.resize(2);
+        attribDesc.resize(2);
+
+        bindingDesc[0].binding = 0;
+        bindingDesc[0].stride = sizeof(float);
+        bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+        bindingDesc[1].binding = 1;
+        bindingDesc[1].stride = sizeof(vec2);
+        bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+        attribDesc[0].binding = 0;
+        attribDesc[0].location = 0;
+        attribDesc[0].format = VK_FORMAT_R32_SFLOAT;
+        attribDesc[0].offset = 0;
+
+        attribDesc[1].binding = 1;
+        attribDesc[1].location = 1;
+        attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT;
+        attribDesc[1].offset = 0;
+    }
+
+    void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) {
+        info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+        info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
+        info.primitiveRestartEnable = VK_FALSE;
+    }
+
+    void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector<VkPipelineStageFlags>& waitStages) const {
+        if (m_currentFrame != 0) {
+            // Have vulkan wait until cuda is done with the vertex buffer before rendering
+            // We don't do this on the first frame, as the wait semaphore hasn't been initialized yet
+            wait.push_back(m_vkWaitSemaphore);
+            // We want to wait until all the pipeline commands are complete before letting cuda work
+            waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
+        }
+    }
+
+    void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const {
+        // Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify
+        signal.push_back(m_vkSignalSemaphore);
+    }
+
+    void initVulkanApp() {
+        int cuda_device = -1;
+
+        // Select cuda device where vulkan is running.
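+        // m_sim.initCuda() lives in SineWaveSimulation.cu, which is not part
+        // of this diff; judging from the equivalent setCudaVkDevice() in the
+        // deleted vulkanCUDASinewave.cu below, it is expected to do roughly:
+        //
+        //   int count = 0;
+        //   checkCudaErrors(cudaGetDeviceCount(&count));
+        //   for (int dev = 0; dev < count; dev++) {
+        //     cudaDeviceProp prop;
+        //     checkCudaErrors(cudaGetDeviceProperties(&prop, dev));
+        //     if (memcmp(&prop.uuid, m_vkDeviceUUID, VK_UUID_SIZE) == 0) {
+        //       checkCudaErrors(cudaSetDevice(dev));
+        //       return dev;  // CUDA now targets the GPU Vulkan selected
+        //     }
+        //   }
+        //   return -1;  // no interop-capable device matched
+        //
+        // so that CUDA and Vulkan operate on the same physical GPU.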
+        cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE);
+        if (cuda_device == -1)
+        {
+            printf("Error: No CUDA-Vulkan interop capable device found\n");
+            exit(EXIT_FAILURE);
+        }
+
+        m_sim.initCudaLaunchConfig(cuda_device);
+
+        // Create the cuda stream we'll be using
+        checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
+
+        const size_t nVerts = m_sim.getWidth() * m_sim.getHeight();
+        const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6;
+
+        // Create the height map cuda will write to
+        createExternalBuffer(nVerts * sizeof(float),
+                             VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+                             VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                             getDefaultMemHandleType(),
+                             m_heightBuffer, m_heightMemory);
+
+        // Create the vertex buffer that will hold the xy coordinates for the grid
+        createBuffer(nVerts * sizeof(vec2),
+                     VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                     m_xyBuffer, m_xyMemory);
+
+        // Create the index buffer that references both buffers above
+        createBuffer(nInds * sizeof(uint32_t),
+                     VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
+                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                     m_indexBuffer, m_indexMemory);
+
+        // Import the height map into cuda and retrieve a device pointer to use
+        importCudaExternalMemory((void **)&m_cudaHeightMap, m_cudaVertMem, m_heightMemory, nVerts * sizeof(*m_cudaHeightMap), getDefaultMemHandleType());
+        // Set the height map to use in the simulation
+        m_sim.initSimulation(m_cudaHeightMap);
+
+        {
+            // Set up the initial values for the vertex buffers with Vulkan
+            void *stagingBase;
+            VkBuffer stagingBuffer;
+            VkDeviceMemory stagingMemory;
+            VkDeviceSize stagingSz = std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t));
+            createBuffer(stagingSz, VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBuffer, stagingMemory);
+
+            vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase);
+
+            memset(stagingBase, 0, nVerts * sizeof(float));
+            copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float));
+
+            for (size_t y = 0; y < m_sim.getHeight(); y++) {
+                for (size_t x = 0; x < m_sim.getWidth(); x++) {
+                    vec2 *stagedVert = (vec2 *)stagingBase;
+                    stagedVert[y * m_sim.getWidth() + x][0] = (2.0f * x) / (m_sim.getWidth() - 1) - 1;
+                    stagedVert[y * m_sim.getWidth() + x][1] = (2.0f * y) / (m_sim.getHeight() - 1) - 1;
+                }
+            }
+            copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2));
+
+            {
+                uint32_t *indices = (uint32_t *)stagingBase;
+                for (size_t y = 0; y < m_sim.getHeight() - 1; y++) {
+                    for (size_t x = 0; x < m_sim.getWidth() - 1; x++) {
+                        indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0));
+                        indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0));
+                        indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1));
+                        indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0));
+                        indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1));
+                        indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1));
+                        indices += 6;
+                    }
+                }
+            }
+            copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t));
+
+            vkUnmapMemory(m_device, stagingMemory);
+            vkDestroyBuffer(m_device, stagingBuffer, nullptr);
+            vkFreeMemory(m_device, stagingMemory, nullptr);
+        }
+
+        // Create the semaphore vulkan will signal when it's done with the vertex buffer
+        createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
+        // Create the semaphore vulkan will wait for before using the vertex buffer
+        createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
+        // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
+        importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
+        // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
+        importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
+    }
+
+    void importCudaExternalMemory(void **cudaPtr, cudaExternalMemory_t& cudaMem, VkDeviceMemory& vkMem, VkDeviceSize size, VkExternalMemoryHandleTypeFlagBits handleType) {
+        cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
+
+        if (handleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
+            externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+        }
+        else if (handleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
+            externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32Kmt;
+        }
+        else if (handleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT) {
+            externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
+        }
+        else {
+            throw std::runtime_error("Unknown handle type requested!");
+        }
+
+        externalMemoryHandleDesc.size = size;
+
+#ifdef _WIN64
+        externalMemoryHandleDesc.handle.win32.handle = (HANDLE)getMemHandle(vkMem, handleType);
+#else
+        externalMemoryHandleDesc.handle.fd = (int)(uintptr_t)getMemHandle(vkMem, handleType);
+#endif
+
+        checkCudaErrors(cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc));
+
+        cudaExternalMemoryBufferDesc externalMemBufferDesc = {};
+        externalMemBufferDesc.offset = 0;
+        externalMemBufferDesc.size = size;
+        externalMemBufferDesc.flags = 0;
+
+        checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem, &externalMemBufferDesc));
+    }
+
+    void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) {
+        cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
+
+        if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
+            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32;
+        }
+        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
+            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
+        }
+        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
+            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
+        }
+        else {
+            throw std::runtime_error("Unknown handle type requested!");
+        }
+
+#ifdef _WIN64
+        externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType);
+#else
+        externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
+#endif
+
+        externalSemaphoreHandleDesc.flags = 0;
+
+        checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
+    }
+
+    VkDeviceSize getUniformSize() const {
+        return sizeof(UniformBufferObject);
+    }
+
+    void updateUniformBuffer(uint32_t imageIndex) {
+        {
+            mat4x4 view, proj;
+            vec3 eye = { 1.75f, 1.75f, 1.25f };
+            vec3 center = { 0.0f, 0.0f, -0.25f };
+            vec3 up = { 0.0f, 0.0f, 1.0f };
+
+            mat4x4_perspective(proj, (float)degreesToRadians(45.0f), m_swapChainExtent.width / (float)m_swapChainExtent.height, 0.1f, 10.0f);
+            proj[1][1] *= -1.0f; // Flip y axis
+
+            mat4x4_look_at(view, eye, center, up);
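+            // modelViewProj = proj * view; the grid itself never moves, so no
+            // separate model matrix is needed.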
+            mat4x4_mul(m_ubo.modelViewProj, proj, view);
+        }
+
+        void *data;
+        vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data);
+        memcpy(data, &m_ubo, sizeof(m_ubo));
+        vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
+    }
+
+    std::vector<const char *> getRequiredExtensions() const {
+        std::vector<const char *> extensions;
+        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
+        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
+        return extensions;
+    }
+
+    std::vector<const char *> getRequiredDeviceExtensions() const {
+        std::vector<const char *> extensions;
+        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
+        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
+#ifdef _WIN64
+        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
+        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME);
+#else
+        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
+        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
+#endif /* _WIN64 */
+        return extensions;
+    }
+
+    void drawFrame() {
+        static chrono_tp startTime = std::chrono::high_resolution_clock::now();
+
+        chrono_tp currentTime = std::chrono::high_resolution_clock::now();
+        float time = std::chrono::duration<float>(currentTime - startTime).count();
+
+        if (m_currentFrame == 0) {
+            m_lastTime = startTime;
+        }
+
+        float frame_time = std::chrono::duration<float>(currentTime - m_lastTime).count();
+
+        cudaExternalSemaphoreWaitParams waitParams = {};
+        waitParams.flags = 0;
+        waitParams.params.fence.value = 0;
+
+        cudaExternalSemaphoreSignalParams signalParams = {};
+        signalParams.flags = 0;
+        signalParams.params.fence.value = 0;
+
+        // Have vulkan draw the current frame...
+        VulkanBaseApp::drawFrame();
+        // Wait for vulkan to complete its work
+        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream));
+        // Now step the simulation
+        m_sim.stepSimulation(time, m_stream);
+        // Signal vulkan to continue with the updated buffers
+        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream));
+
+        // Output a naive measurement of the frames per second every five seconds
+        if (frame_time > 5) {
+            std::cout << "Average FPS (over "
+                      << std::fixed << std::setprecision(2) << frame_time
+                      << " seconds): "
+                      << std::fixed << std::setprecision(2)
+                      << ((m_currentFrame - m_lastFrame) / frame_time)
+                      << std::endl;
+            m_lastFrame = m_currentFrame;
+            m_lastTime = currentTime;
+        }
+    }
+};
+
+int main(int argc, char **argv)
+{
+    execution_path = argv[0];
+    VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL));
+    app.init();
+    app.mainLoop();
+    return 0;
+}
diff --git a/Samples/simpleVulkan/shader_sine.frag b/Samples/simpleVulkan/shader_sine.frag
deleted file mode 100644
index b096569c..00000000
--- a/Samples/simpleVulkan/shader_sine.frag
+++ /dev/null
@@ -1,11 +0,0 @@
-#version 450
-#extension GL_ARB_separate_shader_objects : enable
-#extension GL_NV_gpu_shader5 : enable
-
-layout(location = 0) in vec3 fragColor;
-
-layout(location = 0) out vec4 outColor;
-
-void main() {
-    outColor = vec4(fragColor, 1.0);
-}
\ No newline at end of file
diff --git a/Samples/simpleVulkan/shader_sine.vert b/Samples/simpleVulkan/shader_sine.vert
deleted file mode 100644
index 849558b3..00000000
--- a/Samples/simpleVulkan/shader_sine.vert
+++ /dev/null
@@ -1,23 +0,0 @@
-#version 450
-#extension GL_ARB_separate_shader_objects : enable
-#extension GL_NV_gpu_shader5 : enable
-
-layout(binding = 0) uniform UniformBufferObject {
-    mat4
model; - mat4 view; - mat4 proj; -} ubo; - -layout(location = 0) in vec4 inPosition; -layout(location = 1) in vec3 inColor; - -layout(location = 0) out vec3 fragColor; - -out gl_PerVertex { - vec4 gl_Position; -}; - -void main() { - gl_Position = ubo.proj * ubo.view * ubo.model * inPosition; - fragColor = inColor; -} \ No newline at end of file diff --git a/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj index 826a2c4b..63185b7d 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/simpleVulkan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -107,7 +107,11 @@ - + + + + + diff --git a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj index 47cef491..17a7cfdc 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/simpleVulkan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -112,7 +112,11 @@ - + + + + + diff --git a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj index 606acc4c..e936df25 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/simpleVulkan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -108,7 +108,11 @@ - + + + + + diff --git a/Samples/simpleVulkan/sinewave.frag b/Samples/simpleVulkan/sinewave.frag new file mode 100644 index 00000000..82fa0519 --- /dev/null +++ b/Samples/simpleVulkan/sinewave.frag @@ -0,0 +1,38 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#version 450 +#extension GL_ARB_separate_shader_objects : enable + +layout(location = 0) in vec3 fragColor; + +layout(location = 0) out vec4 outColor; + +void main() { + outColor = vec4(fragColor, 1.0); +} \ No newline at end of file diff --git a/Samples/simpleVulkan/sinewave.vert b/Samples/simpleVulkan/sinewave.vert new file mode 100644 index 00000000..881329c6 --- /dev/null +++ b/Samples/simpleVulkan/sinewave.vert @@ -0,0 +1,43 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#version 450 +#extension GL_ARB_separate_shader_objects : enable + +layout(binding = 0) uniform UniformBufferObject { + mat4 modelViewProj; +} ubo; + +layout(location = 0) in float height; +layout(location = 1) in vec2 xyPos; + +layout(location = 0) out vec3 fragColor; + +void main() { + gl_Position = ubo.modelViewProj * vec4(xyPos.xy, height, 1.0f); + fragColor = vec3(0.0f, (height + 0.5f), 0.0f); +} diff --git a/Samples/simpleVulkan/vulkanCUDASinewave.cu b/Samples/simpleVulkan/vulkanCUDASinewave.cu deleted file mode 100644 index 08af43c8..00000000 --- a/Samples/simpleVulkan/vulkanCUDASinewave.cu +++ /dev/null @@ -1,1863 +0,0 @@ -/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#define GLFW_INCLUDE_VULKAN -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN64 -#include -#include -#include -#include -#include -#define _USE_MATH_DEFINES -#endif - -#include -#include -#include - -#include "linmath.h" - -#define WIDTH 800 -#define HEIGHT 600 - -#define VULKAN_VALIDATION 0 - -const std::vector validationLayers = { - "VK_LAYER_LUNARG_standard_validation"}; - -#if VULKAN_VALIDATION -const bool enableValidationLayers = true; -#else -const bool enableValidationLayers = false; -#endif - -struct QueueFamilyIndices { - int graphicsFamily = -1; - int presentFamily = -1; - - bool isComplete() { return graphicsFamily >= 0 && presentFamily >= 0; } -}; - -const std::vector deviceExtensions = { - VK_KHR_SWAPCHAIN_EXTENSION_NAME, - VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME, - VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME, -#ifdef _WIN64 - VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, - VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, -#else - VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, - VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, -#endif -}; - -#ifdef _WIN64 -class WindowsSecurityAttributes { - protected: - SECURITY_ATTRIBUTES m_winSecurityAttributes; - PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; - - public: - WindowsSecurityAttributes(); - SECURITY_ATTRIBUTES* operator&(); - ~WindowsSecurityAttributes(); -}; - -WindowsSecurityAttributes::WindowsSecurityAttributes() { - m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( - 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void**)); - // CHECK_NEQ(m_winPSecurityDescriptor, (PSECURITY_DESCRIPTOR)NULL); - - PSID* ppSID = - (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*)); - - InitializeSecurityDescriptor(m_winPSecurityDescriptor, - SECURITY_DESCRIPTOR_REVISION); - - SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = - SECURITY_WORLD_SID_AUTHORITY; - 
AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, - 0, 0, 0, 0, 0, ppSID); - - EXPLICIT_ACCESS explicitAccess; - ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); - explicitAccess.grfAccessPermissions = - STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; - explicitAccess.grfAccessMode = SET_ACCESS; - explicitAccess.grfInheritance = INHERIT_ONLY; - explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; - explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; - explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; - - SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); - - SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); - - m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); - m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; - m_winSecurityAttributes.bInheritHandle = TRUE; -} - -SECURITY_ATTRIBUTES* WindowsSecurityAttributes::operator&() { - return &m_winSecurityAttributes; -} - -WindowsSecurityAttributes::~WindowsSecurityAttributes() { - PSID* ppSID = - (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*)); - - if (*ppSID) { - FreeSid(*ppSID); - } - if (*ppACL) { - LocalFree(*ppACL); - } - free(m_winPSecurityDescriptor); -} -#endif - -struct UniformBufferObject { - mat4x4 model; - mat4x4 view; - mat4x4 proj; -}; - -struct SwapChainSupportDetails { - VkSurfaceCapabilitiesKHR capabilities; - std::vector formats; - std::vector presentModes; -}; - -struct Vertex { - vec4 pos; - vec3 color; - - static VkVertexInputBindingDescription getBindingDescription() { - VkVertexInputBindingDescription bindingDescription = {}; - - bindingDescription.binding = 0; - bindingDescription.stride = sizeof(Vertex); - bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - return bindingDescription; - } - - static std::array - getAttributeDescriptions() { - std::array attributeDescriptions = {}; - attributeDescriptions[0].binding = 0; - attributeDescriptions[0].location = 0; - attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT; - attributeDescriptions[0].offset = offsetof(Vertex, pos); - - attributeDescriptions[1].binding = 0; - attributeDescriptions[1].location = 1; - attributeDescriptions[1].format = VK_FORMAT_R32G32B32_SFLOAT; - attributeDescriptions[1].offset = offsetof(Vertex, color); - return attributeDescriptions; - } -}; - -size_t mesh_width = 0, mesh_height = 0; -std::string execution_path; - -__global__ void sinewave_gen_kernel(Vertex* vertices, unsigned int width, - unsigned int height, float time) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - - // calculate uv coordinates - float u = x / (float)width; - float v = y / (float)height; - u = u * 2.0f - 1.0f; - v = v * 2.0f - 1.0f; - - // calculate simple sine wave pattern - float freq = 4.0f; - float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; - - if (y < height && x < width) { - // write output vertex - vertices[y * width + x].pos[0] = u; - vertices[y * width + x].pos[1] = w; - vertices[y * width + x].pos[2] = v; - vertices[y * width + x].pos[3] = 1.0f; - vertices[y * width + x].color[0] = 1.0f; - vertices[y * width + x].color[1] = 0.0f; - vertices[y * width + x].color[2] = 0.0f; - } -} - -class vulkanCudaApp { - public: - void run() { - initWindow(); - initVulkan(); - initCuda(); - mainLoop(); - cleanup(); - } - - private: - GLFWwindow* window; - VkInstance instance; - 
VkPhysicalDevice physicalDevice = VK_NULL_HANDLE; - uint8_t vkDeviceUUID[VK_UUID_SIZE]; - VkDevice device; - VkQueue graphicsQueue; - VkQueue presentQueue; - VkSurfaceKHR surface; - VkSwapchainKHR swapChain; - std::vector swapChainImages; - VkFormat swapChainImageFormat; - VkExtent2D swapChainExtent; - std::vector swapChainImageViews; - VkDescriptorSetLayout descriptorSetLayout; - VkDescriptorPool descriptorPool; - VkDescriptorSet descriptorSet; - VkPipelineLayout pipelineLayout; - VkRenderPass renderPass; - VkPipeline graphicsPipeline; - std::vector swapChainFramebuffers; - VkCommandPool commandPool; - VkBuffer vertexBuffer; - VkDeviceMemory vertexBufferMemory; - VkBuffer uniformBuffer; - VkDeviceMemory uniformBufferMemory; - std::vector commandBuffers; - VkSemaphore imageAvailableSemaphore; - VkSemaphore renderFinishedSemaphore; - VkSemaphore cudaUpdateVkVertexBufSemaphore; - VkSemaphore vkUpdateCudaVertexBufSemaphore; - - size_t vertexBufSize = 0; - bool startSubmit = 0; - double AnimTime = 1.0f; - - VkDebugReportCallbackEXT callback; - -#ifdef _WIN64 - PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; - PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; -#else - PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; - PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; -#endif - - PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; - - // CUDA stuff - cudaExternalMemory_t cudaExtMemVertexBuffer; - cudaExternalSemaphore_t cudaExtCudaUpdateVkVertexBufSemaphore; - cudaExternalSemaphore_t cudaExtVkUpdateCudaVertexBufSemaphore; - void* cudaDevVertptr = NULL; - cudaStream_t streamToRun; - - bool checkValidationLayerSupport() { - uint32_t layerCount; - vkEnumerateInstanceLayerProperties(&layerCount, nullptr); - - std::vector availableLayers(layerCount); - vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); - - for (const char* layerName : validationLayers) { - bool layerFound = false; - - for (const auto& layerProperties : availableLayers) { - if (strcmp(layerName, layerProperties.layerName) == 0) { - layerFound = true; - break; - } - } - - if (!layerFound) { - return false; - } - } - - return true; - } - - static VKAPI_ATTR VkBool32 VKAPI_CALL - debugCallback(VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objType, - uint64_t obj, size_t location, int32_t code, - const char* layerPrefix, const char* msg, void* userData) { - std::cerr << "validation layer: " << msg << std::endl; - - return VK_FALSE; - } - - VkResult CreateDebugReportCallbackEXT( - VkInstance instance, - const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkDebugReportCallbackEXT* pCallback) { - auto func = (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkCreateDebugReportCallbackEXT"); - if (func != nullptr) { - return func(instance, pCreateInfo, pAllocator, pCallback); - } else { - return VK_ERROR_EXTENSION_NOT_PRESENT; - } - } - - void DestroyDebugReportCallbackEXT(VkInstance instance, - VkDebugReportCallbackEXT callback, - const VkAllocationCallbacks* pAllocator) { - auto func = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkDestroyDebugReportCallbackEXT"); - if (func != nullptr) { - func(instance, callback, pAllocator); - } - } - - void setupDebugCallback() { - if (!enableValidationLayers) return; - - VkDebugReportCallbackCreateInfoEXT createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT; - createInfo.flags = - VK_DEBUG_REPORT_ERROR_BIT_EXT | 
VK_DEBUG_REPORT_WARNING_BIT_EXT; - createInfo.pfnCallback = debugCallback; - - if (CreateDebugReportCallbackEXT(instance, &createInfo, nullptr, - &callback) != VK_SUCCESS) { - throw std::runtime_error("failed to set up debug callback!"); - } - } - void initWindow() { - glfwInit(); - glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); - glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); - window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan-CUDA Interop Sinewave", - nullptr, nullptr); - } - - void createInstance() { - if (enableValidationLayers && !checkValidationLayerSupport()) { - throw std::runtime_error( - "validation layers requested, but not available!"); - } - - VkApplicationInfo appInfo = {}; - appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - appInfo.pApplicationName = "Vulkan CUDA Sinewave"; - appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.pEngineName = "No Engine"; - appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.apiVersion = VK_API_VERSION_1_0; - - VkInstanceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - createInfo.pApplicationInfo = &appInfo; - - uint32_t glfwExtensionCount = 0; - const char** glfwExtensions; - - glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); - - std::vector enabledExtensionNameList; - enabledExtensionNameList.push_back( - VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); - enabledExtensionNameList.push_back( - VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); - enabledExtensionNameList.push_back( - VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); - - for (int i = 0; i < glfwExtensionCount; i++) { - enabledExtensionNameList.push_back(glfwExtensions[i]); - } - if (enableValidationLayers) { - enabledExtensionNameList.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); - createInfo.enabledLayerCount = - static_cast(validationLayers.size()); - createInfo.ppEnabledLayerNames = validationLayers.data(); - } else { - createInfo.enabledLayerCount = 0; - } - - createInfo.enabledExtensionCount = enabledExtensionNameList.size(); - createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); - - if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) { - throw std::runtime_error("failed to create instance!"); - } else { - std::cout << "Instance created successfully!!\n"; - } - - fpGetPhysicalDeviceProperties2 = - (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr( - instance, "vkGetPhysicalDeviceProperties2"); - if (fpGetPhysicalDeviceProperties2 == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not " - "found.\n"); - } - -#ifdef _WIN64 - fpGetMemoryWin32HandleKHR = - (PFN_vkGetMemoryWin32HandleKHR)vkGetInstanceProcAddr( - instance, "vkGetMemoryWin32HandleKHR"); - if (fpGetMemoryWin32HandleKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetMemoryWin32HandleKHR\" not " - "found.\n"); - } -#else - fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetInstanceProcAddr( - instance, "vkGetMemoryFdKHR"); - if (fpGetMemoryFdKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetMemoryFdKHR\" not found.\n"); - } -#endif - } - - void initVulkan() { - createInstance(); - setupDebugCallback(); - createSurface(); - pickPhysicalDevice(); - createLogicalDevice(); - getKhrExtensionsFn(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createDescriptorSetLayout(); - createGraphicsPipeline(); - createFramebuffers(); - createCommandPool(); - 
createVertexBuffer(); - createUniformBuffer(); - createDescriptorPool(); - createDescriptorSet(); - createCommandBuffers(); - createSyncObjects(); - createSyncObjectsExt(); - } - - void initCuda() { - setCudaVkDevice(); - cudaVkImportVertexMem(); - cudaInitVertexMem(); - cudaVkImportSemaphore(); - } - - void createSurface() { - if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != - VK_SUCCESS) { - throw std::runtime_error("failed to create window surface!"); - } - } - - void pickPhysicalDevice() { - uint32_t deviceCount = 0; - - vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr); - - if (deviceCount == 0) { - throw std::runtime_error("failed to find GPUs with Vulkan support!"); - } - - std::vector devices(deviceCount); - vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data()); - - for (const auto& device : devices) { - if (isDeviceSuitable(device)) { - physicalDevice = device; - break; - } - } - if (physicalDevice == VK_NULL_HANDLE) { - throw std::runtime_error("failed to find a suitable GPU!"); - } - - std::cout << "Selected physical device = " << physicalDevice << std::endl; - - VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; - vkPhysicalDeviceIDProperties.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; - vkPhysicalDeviceIDProperties.pNext = NULL; - - VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; - vkPhysicalDeviceProperties2.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; - - fpGetPhysicalDeviceProperties2(physicalDevice, - &vkPhysicalDeviceProperties2); - - memcpy(vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, - sizeof(vkDeviceUUID)); - } - - int setCudaVkDevice() { - int current_device = 0; - int device_count = 0; - int devices_prohibited = 0; - - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the GPU which is selected by Vulkan - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); - - if ((deviceProp.computeMode != cudaComputeModeProhibited)) { - // Compare the cuda device UUID with vulkan UUID - int ret = memcmp(&deviceProp.uuid, &vkDeviceUUID, VK_UUID_SIZE); - if (ret == 0) { - checkCudaErrors(cudaSetDevice(current_device)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, deviceProp.name, deviceProp.major, - deviceProp.minor); - - return current_device; - } - - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, - "CUDA error:" - " No Vulkan-CUDA Interop capable GPU found.\n"); - exit(EXIT_FAILURE); - } - - return -1; - } - - bool isDeviceSuitable(VkPhysicalDevice device) { - QueueFamilyIndices indices = findQueueFamilies(device); - bool extensionsSupported = checkDeviceExtensionSupport(device); - - bool swapChainAdequate = false; - if (extensionsSupported) { - SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device); - swapChainAdequate = !swapChainSupport.formats.empty() && - !swapChainSupport.presentModes.empty(); - } - - return indices.isComplete() && extensionsSupported && swapChainAdequate; - } - - bool checkDeviceExtensionSupport(VkPhysicalDevice device) { - uint32_t extensionCount; - 
vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, - nullptr); - - std::vector availableExtensions(extensionCount); - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, - availableExtensions.data()); - - std::set requiredExtensions(deviceExtensions.begin(), - deviceExtensions.end()); - - for (const auto& extension : availableExtensions) { - requiredExtensions.erase(extension.extensionName); - } - - return requiredExtensions.empty(); - } - - QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) { - QueueFamilyIndices indices; - uint32_t queueFamilyCount = 0; - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, - nullptr); - - std::vector queueFamilies(queueFamilyCount); - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, - queueFamilies.data()); - - int i = 0; - for (const auto& queueFamily : queueFamilies) { - if (queueFamily.queueCount > 0 && - queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) { - indices.graphicsFamily = i; - } - - VkBool32 presentSupport = false; - vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); - - if (queueFamily.queueCount > 0 && presentSupport) { - indices.presentFamily = i; - } - - if (indices.isComplete()) { - break; - } - i++; - } - return indices; - } - - SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) { - SwapChainSupportDetails details; - vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, - &details.capabilities); - - uint32_t formatCount; - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, - nullptr); - - if (formatCount != 0) { - details.formats.resize(formatCount); - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, - details.formats.data()); - } - - uint32_t presentModeCount; - vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, - &presentModeCount, nullptr); - - if (presentModeCount != 0) { - details.presentModes.resize(presentModeCount); - vkGetPhysicalDeviceSurfacePresentModesKHR( - device, surface, &presentModeCount, details.presentModes.data()); - } - - return details; - } - - VkSurfaceFormatKHR chooseSwapSurfaceFormat( - const std::vector& availableFormats) { - if (availableFormats.size() == 1 && - availableFormats[0].format == VK_FORMAT_UNDEFINED) { - return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; - } - - for (const auto& availableFormat : availableFormats) { - if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && - availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { - return availableFormat; - } - } - - return availableFormats[0]; - } - - VkPresentModeKHR chooseSwapPresentMode( - const std::vector availablePresentModes) { - VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; - - for (const auto& availablePresentMode : availablePresentModes) { - if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { - return availablePresentMode; - } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { - bestMode = availablePresentMode; - } - } - - return bestMode; - } - - VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities) { - if (capabilities.currentExtent.width != - std::numeric_limits::max()) { - return capabilities.currentExtent; - } else { - VkExtent2D actualExtent = {WIDTH, HEIGHT}; - - actualExtent.width = std::max( - capabilities.minImageExtent.width, - std::min(capabilities.maxImageExtent.width, actualExtent.width)); - actualExtent.height = std::max( - capabilities.minImageExtent.height, - 
std::min(capabilities.maxImageExtent.height, actualExtent.height)); - - return actualExtent; - } - } - - void createLogicalDevice() { - QueueFamilyIndices indices = findQueueFamilies(physicalDevice); - - std::vector queueCreateInfos; - std::set uniqueQueueFamilies = {indices.graphicsFamily, - indices.presentFamily}; - - float queuePriority = 1.0f; - for (int queueFamily : uniqueQueueFamilies) { - VkDeviceQueueCreateInfo queueCreateInfo = {}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueCreateInfo.queueFamilyIndex = queueFamily; - queueCreateInfo.queueCount = 1; - queueCreateInfo.pQueuePriorities = &queuePriority; - queueCreateInfos.push_back(queueCreateInfo); - } - - VkPhysicalDeviceFeatures deviceFeatures = {}; - - VkDeviceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - - createInfo.pQueueCreateInfos = queueCreateInfos.data(); - createInfo.queueCreateInfoCount = queueCreateInfos.size(); - - createInfo.pEnabledFeatures = &deviceFeatures; - std::vector enabledExtensionNameList; - - for (int i = 0; i < deviceExtensions.size(); i++) { - enabledExtensionNameList.push_back(deviceExtensions[i]); - } - if (enableValidationLayers) { - createInfo.enabledLayerCount = - static_cast(validationLayers.size()); - createInfo.ppEnabledLayerNames = validationLayers.data(); - } else { - createInfo.enabledLayerCount = 0; - } - createInfo.enabledExtensionCount = - static_cast(enabledExtensionNameList.size()); - createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); - - if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != - VK_SUCCESS) { - throw std::runtime_error("failed to create logical device!"); - } - vkGetDeviceQueue(device, indices.graphicsFamily, 0, &graphicsQueue); - vkGetDeviceQueue(device, indices.presentFamily, 0, &presentQueue); - } - - void createSwapChain() { - SwapChainSupportDetails swapChainSupport = - querySwapChainSupport(physicalDevice); - - VkSurfaceFormatKHR surfaceFormat = - chooseSwapSurfaceFormat(swapChainSupport.formats); - VkPresentModeKHR presentMode = - chooseSwapPresentMode(swapChainSupport.presentModes); - VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities); - - uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; - if (swapChainSupport.capabilities.maxImageCount > 0 && - imageCount > swapChainSupport.capabilities.maxImageCount) { - imageCount = swapChainSupport.capabilities.maxImageCount; - } - - VkSwapchainCreateInfoKHR createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - createInfo.surface = surface; - createInfo.minImageCount = imageCount; - createInfo.imageFormat = surfaceFormat.format; - createInfo.imageColorSpace = surfaceFormat.colorSpace; - createInfo.imageExtent = extent; - createInfo.imageArrayLayers = 1; - createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - - QueueFamilyIndices indices = findQueueFamilies(physicalDevice); - uint32_t queueFamilyIndices[] = {(uint32_t)indices.graphicsFamily, - (uint32_t)indices.presentFamily}; - - if (indices.graphicsFamily != indices.presentFamily) { - createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; - createInfo.queueFamilyIndexCount = 2; - createInfo.pQueueFamilyIndices = queueFamilyIndices; - } else { - createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - createInfo.queueFamilyIndexCount = 0; // Optional - createInfo.pQueueFamilyIndices = nullptr; // Optional - } - - createInfo.preTransform = swapChainSupport.capabilities.currentTransform; 
- createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - createInfo.presentMode = presentMode; - createInfo.clipped = VK_TRUE; - createInfo.oldSwapchain = VK_NULL_HANDLE; - - if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != - VK_SUCCESS) { - throw std::runtime_error("failed to create swap chain!"); - } else { - std::cout << "Swapchain created!!\n"; - } - - vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr); - swapChainImages.resize(imageCount); - vkGetSwapchainImagesKHR(device, swapChain, &imageCount, - swapChainImages.data()); - - swapChainImageFormat = surfaceFormat.format; - swapChainExtent = extent; - } - - void createImageViews() { - swapChainImageViews.resize(swapChainImages.size()); - - for (size_t i = 0; i < swapChainImages.size(); i++) { - VkImageViewCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - createInfo.image = swapChainImages[i]; - createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; - createInfo.format = swapChainImageFormat; - - createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; - - createInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - createInfo.subresourceRange.baseMipLevel = 0; - createInfo.subresourceRange.levelCount = 1; - createInfo.subresourceRange.baseArrayLayer = 0; - createInfo.subresourceRange.layerCount = 1; - - if (vkCreateImageView(device, &createInfo, nullptr, - &swapChainImageViews[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to create image views!"); - } - } - } - - void createDescriptorSetLayout() { - VkDescriptorSetLayoutBinding uboLayoutBinding = {}; - uboLayoutBinding.binding = 0; - uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - uboLayoutBinding.descriptorCount = 1; - uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - uboLayoutBinding.pImmutableSamplers = nullptr; // Optional - - VkDescriptorSetLayoutCreateInfo layoutInfo = {}; - layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - layoutInfo.bindingCount = 1; - layoutInfo.pBindings = &uboLayoutBinding; - - if (vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, - &descriptorSetLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor set layout!"); - } - } - - void createGraphicsPipeline() { - auto vertShaderCode = readFile("shader_sine.vert"); - auto fragShaderCode = readFile("shader_sine.frag"); - - VkShaderModule vertShaderModule; - VkShaderModule fragShaderModule; - - vertShaderModule = createShaderModule(vertShaderCode); - fragShaderModule = createShaderModule(fragShaderCode); - - VkPipelineShaderStageCreateInfo vertShaderStageInfo = {}; - vertShaderStageInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT; - vertShaderStageInfo.module = vertShaderModule; - vertShaderStageInfo.pName = "main"; - - VkPipelineShaderStageCreateInfo fragShaderStageInfo = {}; - fragShaderStageInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT; - fragShaderStageInfo.module = fragShaderModule; - fragShaderStageInfo.pName = "main"; - - VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, - fragShaderStageInfo}; - - VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - 
vertexInputInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - auto bindingDescription = Vertex::getBindingDescription(); - auto attributeDescriptions = Vertex::getAttributeDescriptions(); - vertexInputInfo.vertexBindingDescriptionCount = 1; - vertexInputInfo.pVertexBindingDescriptions = &bindingDescription; - vertexInputInfo.vertexAttributeDescriptionCount = - static_cast(attributeDescriptions.size()); - vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data(); - - VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; - inputAssembly.sType = - VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST; - inputAssembly.primitiveRestartEnable = VK_FALSE; - - VkViewport viewport = {}; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = (float)swapChainExtent.width; - viewport.height = (float)swapChainExtent.height; - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - - VkRect2D scissor = {}; - scissor.offset = {0, 0}; - scissor.extent = swapChainExtent; - - VkPipelineViewportStateCreateInfo viewportState = {}; - viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewportState.viewportCount = 1; - viewportState.pViewports = &viewport; - viewportState.scissorCount = 1; - viewportState.pScissors = &scissor; - - VkPipelineRasterizationStateCreateInfo rasterizer = {}; - rasterizer.sType = - VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterizer.depthClampEnable = VK_FALSE; - rasterizer.rasterizerDiscardEnable = VK_FALSE; - rasterizer.polygonMode = VK_POLYGON_MODE_FILL; - rasterizer.lineWidth = 1.0f; - rasterizer.cullMode = VK_CULL_MODE_BACK_BIT; - rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; - rasterizer.depthBiasEnable = VK_FALSE; - rasterizer.depthBiasConstantFactor = 0.0f; // Optional - rasterizer.depthBiasClamp = 0.0f; // Optional - rasterizer.depthBiasSlopeFactor = 0.0f; // Optional - - VkPipelineMultisampleStateCreateInfo multisampling = {}; - multisampling.sType = - VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling.sampleShadingEnable = VK_FALSE; - multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisampling.minSampleShading = 1.0f; // Optional - multisampling.pSampleMask = nullptr; // Optional - multisampling.alphaToCoverageEnable = VK_FALSE; // Optional - multisampling.alphaToOneEnable = VK_FALSE; // Optional - - VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; - colorBlendAttachment.colorWriteMask = - VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - colorBlendAttachment.blendEnable = VK_FALSE; - colorBlendAttachment.srcColorBlendFactor = VK_BLEND_FACTOR_ONE; // Optional - colorBlendAttachment.dstColorBlendFactor = - VK_BLEND_FACTOR_ZERO; // Optional - colorBlendAttachment.colorBlendOp = VK_BLEND_OP_ADD; // Optional - colorBlendAttachment.srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE; // Optional - colorBlendAttachment.dstAlphaBlendFactor = - VK_BLEND_FACTOR_ZERO; // Optional - colorBlendAttachment.alphaBlendOp = VK_BLEND_OP_ADD; // Optional - - VkPipelineColorBlendStateCreateInfo colorBlending = {}; - colorBlending.sType = - VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - colorBlending.logicOpEnable = VK_FALSE; - colorBlending.logicOp = VK_LOGIC_OP_COPY; // Optional - colorBlending.attachmentCount = 1; - colorBlending.pAttachments = &colorBlendAttachment; - colorBlending.blendConstants[0] 
= 0.0f; // Optional - colorBlending.blendConstants[1] = 0.0f; // Optional - colorBlending.blendConstants[2] = 0.0f; // Optional - colorBlending.blendConstants[3] = 0.0f; // Optional - -#if 0 - VkDynamicState dynamicStates[] = { - VK_DYNAMIC_STATE_VIEWPORT, - VK_DYNAMIC_STATE_LINE_WIDTH - }; - - VkPipelineDynamicStateCreateInfo dynamicState = {}; - dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; - dynamicState.dynamicStateCount = 2; - dynamicState.pDynamicStates = dynamicStates; -#endif - VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; - pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutInfo.setLayoutCount = 1; // Optional - pipelineLayoutInfo.pSetLayouts = &descriptorSetLayout; // Optional - pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional - pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional - - if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, - &pipelineLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create pipeline layout!"); - } - - VkGraphicsPipelineCreateInfo pipelineInfo = {}; - pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipelineInfo.stageCount = 2; - pipelineInfo.pStages = shaderStages; - pipelineInfo.pVertexInputState = &vertexInputInfo; - pipelineInfo.pInputAssemblyState = &inputAssembly; - pipelineInfo.pViewportState = &viewportState; - pipelineInfo.pRasterizationState = &rasterizer; - pipelineInfo.pMultisampleState = &multisampling; - pipelineInfo.pDepthStencilState = nullptr; // Optional - pipelineInfo.pColorBlendState = &colorBlending; - pipelineInfo.pDynamicState = nullptr; // Optional - pipelineInfo.layout = pipelineLayout; - pipelineInfo.renderPass = renderPass; - pipelineInfo.subpass = 0; - pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional - pipelineInfo.basePipelineIndex = -1; // Optional - - if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, - nullptr, &graphicsPipeline) != VK_SUCCESS) { - throw std::runtime_error("failed to create graphics pipeline!"); - } else { - std::cout << "Pipeline created successfully!!\n"; - } - vkDestroyShaderModule(device, fragShaderModule, nullptr); - vkDestroyShaderModule(device, vertShaderModule, nullptr); - } - - void createRenderPass() { - VkAttachmentDescription colorAttachment = {}; - colorAttachment.format = swapChainImageFormat; - colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - - colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - - colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - - colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - - VkAttachmentReference colorAttachmentRef = {}; - colorAttachmentRef.attachment = 0; - colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass = {}; - subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - - subpass.colorAttachmentCount = 1; - subpass.pColorAttachments = &colorAttachmentRef; - - VkRenderPassCreateInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderPassInfo.attachmentCount = 1; - renderPassInfo.pAttachments = &colorAttachment; - renderPassInfo.subpassCount = 1; - renderPassInfo.pSubpasses = &subpass; - - VkSubpassDependency dependency = {}; - dependency.srcSubpass = 
VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - renderPassInfo.dependencyCount = 1; - renderPassInfo.pDependencies = &dependency; - - if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != - VK_SUCCESS) { - throw std::runtime_error("failed to create render pass!"); - } - } - - void createFramebuffers() { - swapChainFramebuffers.resize(swapChainImageViews.size()); - - for (size_t i = 0; i < swapChainImageViews.size(); i++) { - VkImageView attachments[] = {swapChainImageViews[i]}; - - VkFramebufferCreateInfo framebufferInfo = {}; - framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebufferInfo.renderPass = renderPass; - framebufferInfo.attachmentCount = 1; - framebufferInfo.pAttachments = attachments; - framebufferInfo.width = swapChainExtent.width; - framebufferInfo.height = swapChainExtent.height; - framebufferInfo.layers = 1; - - if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, - &swapChainFramebuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to create framebuffer!"); - } - } - } - - void createCommandPool() { - QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice); - - VkCommandPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily; - poolInfo.flags = 0; // Optional - - if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != - VK_SUCCESS) { - throw std::runtime_error("failed to create command pool!"); - } - } - - void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, - VkMemoryPropertyFlags properties, VkBuffer& buffer, - VkDeviceMemory& bufferMemory) { - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(device, buffer, &memRequirements); - - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = - findMemoryType(memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate buffer memory!"); - } - - vkBindBufferMemory(device, buffer, bufferMemory, 0); - } - - void createBufferExtMem(VkDeviceSize size, VkBufferUsageFlags usage, - VkMemoryPropertyFlags properties, - VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, - VkBuffer& buffer, VkDeviceMemory& bufferMemory) { - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(device, buffer, 
&memRequirements); - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; - vulkanExportMemoryWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; - vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - vulkanExportMemoryWin32HandleInfoKHR.dwAccess = - DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; -#endif - VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; - vulkanExportMemoryAllocateInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; -#ifdef _WIN64 - vulkanExportMemoryAllocateInfoKHR.pNext = - extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR - ? &vulkanExportMemoryWin32HandleInfoKHR - : NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; -#else - vulkanExportMemoryAllocateInfoKHR.pNext = NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = - findMemoryType(memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate external buffer memory!"); - } - - vkBindBufferMemory(device, buffer, bufferMemory, 0); - } - - void createVertexBuffer() { - mesh_width = swapChainExtent.width / 2; - mesh_height = swapChainExtent.height / 2; - vertexBufSize = mesh_height * mesh_width; - - VkDeviceSize bufferSize = sizeof(Vertex) * vertexBufSize; -#ifdef _WIN64 - if (IsWindows8OrGreater()) { - createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, - vertexBuffer, vertexBufferMemory); - } else { - createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, - vertexBuffer, vertexBufferMemory); - } -#else - createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, - vertexBuffer, vertexBufferMemory); -#endif - } - - void cudaInitVertexMem() { - checkCudaErrors(cudaStreamCreate(&streamToRun)); - - dim3 block(16, 16, 1); - dim3 grid(mesh_width / 16, mesh_height / 16, 1); - Vertex* vertices = (Vertex*)cudaDevVertptr; - sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(vertices, mesh_width, - mesh_height, 1.0); - checkCudaErrors(cudaStreamSynchronize(streamToRun)); - } - - void createUniformBuffer() { - VkDeviceSize bufferSize = sizeof(UniformBufferObject); - createBuffer(bufferSize, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - uniformBuffer, uniformBufferMemory); - } - - uint32_t findMemoryType(uint32_t typeFilter, - VkMemoryPropertyFlags properties) { - VkPhysicalDeviceMemoryProperties memProperties; - vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); - - for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { - if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & - properties)
== properties) { - return i; - } - } - - throw std::runtime_error("failed to find suitable memory type!"); - } - - void getKhrExtensionsFn() { -#ifdef _WIN64 - - fpGetSemaphoreWin32HandleKHR = - (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr( - device, "vkGetSemaphoreWin32HandleKHR"); - if (fpGetSemaphoreWin32HandleKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetSemaphoreWin32HandleKHR\" not " - "found.\n"); - } -#else - fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr( - device, "vkGetSemaphoreFdKHR"); - if (fpGetSemaphoreFdKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetSemaphoreFdKHR\" not found.\n"); - } -#endif - } - - void createCommandBuffers() { - commandBuffers.resize(swapChainFramebuffers.size()); - - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.commandPool = commandPool; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandBufferCount = (uint32_t)commandBuffers.size(); - - if (vkAllocateCommandBuffers(device, &allocInfo, commandBuffers.data()) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate command buffers!"); - } - - for (size_t i = 0; i < commandBuffers.size(); i++) { - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - beginInfo.pInheritanceInfo = nullptr; // Optional - - if (vkBeginCommandBuffer(commandBuffers[i], &beginInfo) != VK_SUCCESS) { - throw std::runtime_error("failed to begin recording command buffer!"); - } - - VkRenderPassBeginInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderPassInfo.renderPass = renderPass; - renderPassInfo.framebuffer = swapChainFramebuffers[i]; - renderPassInfo.renderArea.offset = {0, 0}; - renderPassInfo.renderArea.extent = swapChainExtent; - - VkClearValue clearColor = {0.0f, 0.0f, 0.0f, 1.0f}; - renderPassInfo.clearValueCount = 1; - renderPassInfo.pClearValues = &clearColor; - - vkCmdBeginRenderPass(commandBuffers[i], &renderPassInfo, - VK_SUBPASS_CONTENTS_INLINE); - vkCmdBindPipeline(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, - graphicsPipeline); - VkBuffer vertexBuffers[] = {vertexBuffer}; - VkDeviceSize offsets[] = {0}; - vkCmdBindVertexBuffers(commandBuffers[i], 0, 1, vertexBuffers, offsets); - vkCmdBindDescriptorSets(commandBuffers[i], - VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, - 0, 1, &descriptorSet, 0, nullptr); - vkCmdDraw(commandBuffers[i], static_cast<uint32_t>(vertexBufSize), 1, 0, - 0); - vkCmdEndRenderPass(commandBuffers[i]); - if (vkEndCommandBuffer(commandBuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to record command buffer!"); - } - } - } - - VkShaderModule createShaderModule(const std::vector<char>& code) { - VkShaderModuleCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.codeSize = code.size(); - createInfo.pCode = reinterpret_cast<const uint32_t*>(code.data()); - - VkShaderModule shaderModule; - if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != - VK_SUCCESS) { - throw std::runtime_error("failed to create shader module!"); - } - - return shaderModule; - } - - static std::vector<char> readFile(const std::string& filename) { - char* file_path = sdkFindFilePath(filename.c_str(), execution_path.c_str()); - - std::ifstream file(file_path, std::ios::ate | std::ios::binary); - - if
(!file.is_open()) { - throw std::runtime_error("failed to open shader spv file!\n"); - } - size_t fileSize = (size_t)file.tellg(); - std::vector<char> buffer(fileSize); - file.seekg(0); - file.read(buffer.data(), fileSize); - file.close(); - - return buffer; - } - - void mainLoop() { - updateUniformBuffer(); - - while (!glfwWindowShouldClose(window)) { - glfwPollEvents(); - drawFrame(); - } - - vkDeviceWaitIdle(device); - } - - void updateUniformBuffer() { - UniformBufferObject ubo = {}; - - mat4x4_identity(ubo.model); - mat4x4 Model; - mat4x4_dup(Model, ubo.model); - mat4x4_rotate(ubo.model, Model, 1.0f, 0.0f, 1.0f, degreesToRadians(45.0f)); - - vec3 eye = {2.0f, 2.0f, 2.0f}; - vec3 center = {0.0f, 0.0f, 0.0f}; - vec3 up = {0.0f, 0.0f, 1.0f}; - mat4x4_look_at(ubo.view, eye, center, up); - mat4x4_perspective(ubo.proj, degreesToRadians(45.0f), - swapChainExtent.width / (float)swapChainExtent.height, - 0.1f, 10.0f); - ubo.proj[1][1] *= -1; - void* data; - vkMapMemory(device, uniformBufferMemory, 0, sizeof(ubo), 0, &data); - memcpy(data, &ubo, sizeof(ubo)); - vkUnmapMemory(device, uniformBufferMemory); - } - - void createDescriptorPool() { - VkDescriptorPoolSize poolSize = {}; - poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - poolSize.descriptorCount = 1; - - VkDescriptorPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - poolInfo.poolSizeCount = 1; - poolInfo.pPoolSizes = &poolSize; - poolInfo.maxSets = 1; - - if (vkCreateDescriptorPool(device, &poolInfo, nullptr, &descriptorPool) != - VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor pool!"); - } - } - - void createDescriptorSet() { - VkDescriptorSetLayout layouts[] = {descriptorSetLayout}; - VkDescriptorSetAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocInfo.descriptorPool = descriptorPool; - allocInfo.descriptorSetCount = 1; - allocInfo.pSetLayouts = layouts; - - if (vkAllocateDescriptorSets(device, &allocInfo, &descriptorSet) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate descriptor set!"); - } - - VkDescriptorBufferInfo bufferInfo = {}; - bufferInfo.buffer = uniformBuffer; - bufferInfo.offset = 0; - bufferInfo.range = sizeof(UniformBufferObject); - - VkWriteDescriptorSet descriptorWrite = {}; - descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrite.dstSet = descriptorSet; - descriptorWrite.dstBinding = 0; - descriptorWrite.dstArrayElement = 0; - descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - descriptorWrite.descriptorCount = 1; - descriptorWrite.pBufferInfo = &bufferInfo; - descriptorWrite.pImageInfo = nullptr; // Optional - descriptorWrite.pTexelBufferView = nullptr; // Optional - - vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, nullptr); - } - - void drawFrame() { - uint32_t imageIndex; - vkAcquireNextImageKHR(device, swapChain, - std::numeric_limits<uint64_t>::max(), - imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex); - - if (!startSubmit) { - submitVulkan(imageIndex); - startSubmit = 1; - } else { - submitVulkanCuda(imageIndex); - } - - VkPresentInfoKHR presentInfo = {}; - presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - - VkSemaphore signalSemaphores[] = {renderFinishedSemaphore}; - - presentInfo.waitSemaphoreCount = 1; - presentInfo.pWaitSemaphores = signalSemaphores; - - VkSwapchainKHR swapChains[] = {swapChain}; - presentInfo.swapchainCount = 1; - presentInfo.pSwapchains = swapChains; - presentInfo.pImageIndices = &imageIndex; -
presentInfo.pResults = nullptr; // Optional - - vkQueuePresentKHR(presentQueue, &presentInfo); - - cudaUpdateVertexBuffer(); - // Added sleep of 5 millisecs so that CPU does not submit too much work to - // GPU - std::this_thread::sleep_for(std::chrono::microseconds(5000)); - } - - void submitVulkan(uint32_t imageIndex) { - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - - VkSemaphore waitSemaphores[] = {imageAvailableSemaphore}; - VkPipelineStageFlags waitStages[] = { - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; - submitInfo.waitSemaphoreCount = 1; - submitInfo.pWaitSemaphores = waitSemaphores; - submitInfo.pWaitDstStageMask = waitStages; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; - - VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, - vkUpdateCudaVertexBufSemaphore}; - - submitInfo.signalSemaphoreCount = 2; - submitInfo.pSignalSemaphores = signalSemaphores; - - if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != - VK_SUCCESS) { - throw std::runtime_error("failed to submit draw command buffer!"); - } - } - - void submitVulkanCuda(uint32_t imageIndex) { - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - - VkSemaphore waitSemaphores[] = {imageAvailableSemaphore, - cudaUpdateVkVertexBufSemaphore}; - VkPipelineStageFlags waitStages[] = { - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT}; - submitInfo.waitSemaphoreCount = 2; - submitInfo.pWaitSemaphores = waitSemaphores; - submitInfo.pWaitDstStageMask = waitStages; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; - - VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, - vkUpdateCudaVertexBufSemaphore}; - - submitInfo.signalSemaphoreCount = 2; - submitInfo.pSignalSemaphores = signalSemaphores; - - if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != - VK_SUCCESS) { - throw std::runtime_error("failed to submit draw command buffer!"); - } - } - - void createSyncObjects() { - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - - if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &imageAvailableSemaphore) != VK_SUCCESS || - vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &renderFinishedSemaphore) != VK_SUCCESS) { - throw std::runtime_error( - "failed to create synchronization objects for a frame!"); - } - } - - void createSyncObjectsExt() { - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - - memset(&semaphoreInfo, 0, sizeof(semaphoreInfo)); - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportSemaphoreWin32HandleInfoKHR - vulkanExportSemaphoreWin32HandleInfoKHR = {}; - vulkanExportSemaphoreWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; - vulkanExportSemaphoreWin32HandleInfoKHR.pNext = NULL; - vulkanExportSemaphoreWin32HandleInfoKHR.pAttributes = - &winSecurityAttributes; - vulkanExportSemaphoreWin32HandleInfoKHR.dwAccess = - DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; -#endif - VkExportSemaphoreCreateInfoKHR vulkanExportSemaphoreCreateInfo = {}; - vulkanExportSemaphoreCreateInfo.sType = - VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; -#ifdef _WIN64 - 
vulkanExportSemaphoreCreateInfo.pNext = - IsWindows8OrGreater() ? &vulkanExportSemaphoreWin32HandleInfoKHR : NULL; - vulkanExportSemaphoreCreateInfo.handleTypes = - IsWindows8OrGreater() - ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; -#else - vulkanExportSemaphoreCreateInfo.pNext = NULL; - vulkanExportSemaphoreCreateInfo.handleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif - semaphoreInfo.pNext = &vulkanExportSemaphoreCreateInfo; - - if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &cudaUpdateVkVertexBufSemaphore) != VK_SUCCESS || - vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &vkUpdateCudaVertexBufSemaphore) != VK_SUCCESS) { - throw std::runtime_error( - "failed to create synchronization objects for a CUDA-Vulkan!"); - } - } - - void cudaVkImportVertexMem() { - cudaExternalMemoryHandleDesc cudaExtMemHandleDesc; - memset(&cudaExtMemHandleDesc, 0, sizeof(cudaExtMemHandleDesc)); -#ifdef _WIN64 - cudaExtMemHandleDesc.type = - IsWindows8OrGreater() ? cudaExternalMemoryHandleTypeOpaqueWin32 - : cudaExternalMemoryHandleTypeOpaqueWin32Kmt; - cudaExtMemHandleDesc.handle.win32.handle = getVkMemHandle( - IsWindows8OrGreater() - ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT); -#else - cudaExtMemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; - cudaExtMemHandleDesc.handle.fd = - getVkMemHandle(VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT); -#endif - cudaExtMemHandleDesc.size = sizeof(Vertex) * vertexBufSize; - - checkCudaErrors(cudaImportExternalMemory(&cudaExtMemVertexBuffer, - &cudaExtMemHandleDesc)); - - cudaExternalMemoryBufferDesc cudaExtBufferDesc; - cudaExtBufferDesc.offset = 0; - cudaExtBufferDesc.size = sizeof(Vertex) * vertexBufSize; - cudaExtBufferDesc.flags = 0; - - checkCudaErrors(cudaExternalMemoryGetMappedBuffer( - &cudaDevVertptr, cudaExtMemVertexBuffer, &cudaExtBufferDesc)); - printf("CUDA Imported Vulkan vertex buffer\n"); - } - - void cudaVkImportSemaphore() { - cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc; - memset(&externalSemaphoreHandleDesc, 0, - sizeof(externalSemaphoreHandleDesc)); -#ifdef _WIN64 - externalSemaphoreHandleDesc.type = - IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 - : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle( - IsWindows8OrGreater() - ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, - cudaUpdateVkVertexBufSemaphore); -#else - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; - externalSemaphoreHandleDesc.handle.fd = - getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, - cudaUpdateVkVertexBufSemaphore); -#endif - externalSemaphoreHandleDesc.flags = 0; - - checkCudaErrors(cudaImportExternalSemaphore( - &cudaExtCudaUpdateVkVertexBufSemaphore, &externalSemaphoreHandleDesc)); - - memset(&externalSemaphoreHandleDesc, 0, - sizeof(externalSemaphoreHandleDesc)); -#ifdef _WIN64 - externalSemaphoreHandleDesc.type = - IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 - : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - ; - externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle( - IsWindows8OrGreater() - ? 
VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, - vkUpdateCudaVertexBufSemaphore); -#else - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; - externalSemaphoreHandleDesc.handle.fd = - getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, - vkUpdateCudaVertexBufSemaphore); -#endif - externalSemaphoreHandleDesc.flags = 0; - checkCudaErrors(cudaImportExternalSemaphore( - &cudaExtVkUpdateCudaVertexBufSemaphore, &externalSemaphoreHandleDesc)); - printf("CUDA Imported Vulkan semaphore\n"); - } - -#ifdef _WIN64 // For windows - HANDLE getVkMemHandle( - VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) { - HANDLE handle; - - VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; - vkMemoryGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; - vkMemoryGetWin32HandleInfoKHR.pNext = NULL; - vkMemoryGetWin32HandleInfoKHR.memory = vertexBufferMemory; - vkMemoryGetWin32HandleInfoKHR.handleType = - (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType; - - fpGetMemoryWin32HandleKHR(device, &vkMemoryGetWin32HandleInfoKHR, &handle); - return handle; - } -#else - int getVkMemHandle( - VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) { - if (externalMemoryHandleType == - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT) { - int fd; - - VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; - vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; - vkMemoryGetFdInfoKHR.pNext = NULL; - vkMemoryGetFdInfoKHR.memory = vertexBufferMemory; - vkMemoryGetFdInfoKHR.handleType = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; - - fpGetMemoryFdKHR(device, &vkMemoryGetFdInfoKHR, &fd); - - return fd; - } - return -1; - } -#endif - -#ifdef _WIN64 - HANDLE getVkSemaphoreHandle( - VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, - VkSemaphore& semVkCuda) { - HANDLE handle; - - VkSemaphoreGetWin32HandleInfoKHR vulkanSemaphoreGetWin32HandleInfoKHR = {}; - vulkanSemaphoreGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; - vulkanSemaphoreGetWin32HandleInfoKHR.pNext = NULL; - vulkanSemaphoreGetWin32HandleInfoKHR.semaphore = semVkCuda; - vulkanSemaphoreGetWin32HandleInfoKHR.handleType = - externalSemaphoreHandleType; - - fpGetSemaphoreWin32HandleKHR(device, &vulkanSemaphoreGetWin32HandleInfoKHR, - &handle); - - return handle; - } -#else - int getVkSemaphoreHandle( - VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, - VkSemaphore& semVkCuda) { - if (externalSemaphoreHandleType == - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - int fd; - - VkSemaphoreGetFdInfoKHR vulkanSemaphoreGetFdInfoKHR = {}; - vulkanSemaphoreGetFdInfoKHR.sType = - VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; - vulkanSemaphoreGetFdInfoKHR.pNext = NULL; - vulkanSemaphoreGetFdInfoKHR.semaphore = semVkCuda; - vulkanSemaphoreGetFdInfoKHR.handleType = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; - - fpGetSemaphoreFdKHR(device, &vulkanSemaphoreGetFdInfoKHR, &fd); - - return fd; - } - return -1; - } -#endif - - void cudaVkSemaphoreSignal(cudaExternalSemaphore_t& extSemaphore) { - cudaExternalSemaphoreSignalParams extSemaphoreSignalParams; - memset(&extSemaphoreSignalParams, 0, sizeof(extSemaphoreSignalParams)); - - extSemaphoreSignalParams.params.fence.value = 0; - extSemaphoreSignalParams.flags = 0; - checkCudaErrors(cudaSignalExternalSemaphoresAsync( - &extSemaphore, 
&extSemaphoreSignalParams, 1, streamToRun)); - } - - void cudaVkSemaphoreWait(cudaExternalSemaphore_t& extSemaphore) { - cudaExternalSemaphoreWaitParams extSemaphoreWaitParams; - - memset(&extSemaphoreWaitParams, 0, sizeof(extSemaphoreWaitParams)); - - extSemaphoreWaitParams.params.fence.value = 0; - extSemaphoreWaitParams.flags = 0; - - checkCudaErrors(cudaWaitExternalSemaphoresAsync( - &extSemaphore, &extSemaphoreWaitParams, 1, streamToRun)); - } - - void cudaUpdateVertexBuffer() { - cudaVkSemaphoreWait(cudaExtVkUpdateCudaVertexBufSemaphore); - - dim3 block(16, 16, 1); - dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); - Vertex* pos = (Vertex*)cudaDevVertptr; - AnimTime += 0.01f; - sinewave_gen_kernel<<>>(pos, mesh_width, - mesh_height, AnimTime); - cudaVkSemaphoreSignal(cudaExtCudaUpdateVkVertexBufSemaphore); - } - - void cleanup() { - if (enableValidationLayers) { - DestroyDebugReportCallbackEXT(instance, callback, nullptr); - } - - vkDestroySemaphore(device, renderFinishedSemaphore, nullptr); - vkDestroySemaphore(device, imageAvailableSemaphore, nullptr); - checkCudaErrors( - cudaDestroyExternalSemaphore(cudaExtCudaUpdateVkVertexBufSemaphore)); - vkDestroySemaphore(device, cudaUpdateVkVertexBufSemaphore, nullptr); - checkCudaErrors( - cudaDestroyExternalSemaphore(cudaExtVkUpdateCudaVertexBufSemaphore)); - vkDestroySemaphore(device, vkUpdateCudaVertexBufSemaphore, nullptr); - - vkDestroyCommandPool(device, commandPool, nullptr); - for (auto framebuffer : swapChainFramebuffers) { - vkDestroyFramebuffer(device, framebuffer, nullptr); - } - for (auto imageView : swapChainImageViews) { - vkDestroyImageView(device, imageView, nullptr); - } - vkDestroyPipeline(device, graphicsPipeline, nullptr); - vkDestroyPipelineLayout(device, pipelineLayout, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); - vkDestroyBuffer(device, uniformBuffer, nullptr); - vkFreeMemory(device, uniformBufferMemory, nullptr); - vkDestroyRenderPass(device, renderPass, nullptr); - vkDestroySwapchainKHR(device, swapChain, nullptr); - checkCudaErrors(cudaDestroyExternalMemory(cudaExtMemVertexBuffer)); - vkDestroyBuffer(device, vertexBuffer, nullptr); - vkFreeMemory(device, vertexBufferMemory, nullptr); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDevice(device, nullptr); - vkDestroySurfaceKHR(instance, surface, nullptr); - vkDestroyInstance(instance, nullptr); - glfwDestroyWindow(window); - glfwTerminate(); - } -}; - -int main(int argc, char* argv[]) { - execution_path = argv[0]; - vulkanCudaApp app; - - try { - app.run(); - } catch (const std::runtime_error& e) { - std::cerr << e.what() << std::endl; - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} \ No newline at end of file diff --git a/Samples/simpleVulkanMMAP/Makefile b/Samples/simpleVulkanMMAP/Makefile new file mode 100644 index 00000000..92651d14 --- /dev/null +++ b/Samples/simpleVulkanMMAP/Makefile @@ -0,0 +1,456 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
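+# (An sbsa-flavoured CUDA toolkit installs its target libraries under
+# $(CUDA_PATH)/targets/sbsa-linux, so probing for that directory below
+# distinguishes an sbsa install from a plain aarch64 one.)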
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + 
CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - simpleVulkanMMAP is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - simpleVulkanMMAP is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleVulkanMMAP is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Makefile include to help find Vulkan SDK and dependencies +include ./findvulkan.mk + +# Vulkan specific libraries +ifeq ($(TARGET_OS),linux) + ifneq 
($(TARGET_ARCH),$(HOST_ARCH)) + LIBRARIES += -L$(VULKAN_SDK_LIB) -lvulkan + LIBRARIES += -lglfw + INCLUDES += -I$(VULKAN_HEADER) + else + LIBRARIES += -L$(VULKAN_SDK_LIB) + LIBRARIES += `pkg-config --static --libs glfw3` -lvulkan + INCLUDES += `pkg-config --static --cflags glfw3` -I$(VULKAN_HEADER) + endif +endif + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 470) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 +else +SMS ?= 35 37 50 52 60 61 70 75 80 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(TARGET_OS),darwin) + ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA +else + ifeq ($(TARGET_ARCH),x86_64) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif + endif + + ifeq ($(TARGET_ARCH),ppc64le) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs + endif + + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += 
$(CUDA_PATH)/lib64/stubs + endif + + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) + ifeq ("$(CUDALIB)","") + $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) + SAMPLE_ENABLED := 0 + else + CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) + LIBRARIES += -L$(CUDALIB) -lcuda + endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleVulkanMMAP + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +helper_multiprocess.o:../../Common/helper_multiprocess.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +MonteCarloPi.o:MonteCarloPi.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +VulkanBaseApp.o:VulkanBaseApp.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +main.o:main.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleVulkanMMAP: helper_multiprocess.o MonteCarloPi.o VulkanBaseApp.o main.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleVulkanMMAP + +clean: + rm -f simpleVulkanMMAP helper_multiprocess.o MonteCarloPi.o VulkanBaseApp.o main.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleVulkanMMAP + +clobber: clean diff --git a/Samples/simpleVulkanMMAP/MonteCarloPi.cu b/Samples/simpleVulkanMMAP/MonteCarloPi.cu new file mode 100644 index 00000000..bb275cf2 --- /dev/null +++ b/Samples/simpleVulkanMMAP/MonteCarloPi.cu @@ -0,0 +1,275 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + /* + * See: https://www.piday.org/million/ + */ + +#include "MonteCarloPi.h" +#include <cstdio> +#define CUDA_DRIVER_API +#include <helper_cuda.h> +#include <curand_kernel.h> + +#define ROUND_UP_TO_GRANULARITY(x, n) (((x + n - 1) / n) * n) + + // `ipcHandleTypeFlag` specifies the platform-specific handle type this sample + // uses for importing and exporting memory allocations. On Linux this sample + // specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that + // file descriptors will be used. On Windows this sample specifies the type as + // CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The + // ipcHandleTypeFlag variable is a convenience variable and is passed by value + // to individual requests. +#if defined(__linux__) +CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#else +CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32; +#endif + +// Windows-specific LPSECURITYATTRIBUTES +void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) { +#if defined(__linux__) + return; +#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)"; + static OBJECT_ATTRIBUTES objAttributes; + static bool objAttributesConfigured = false; + + if (!objAttributesConfigured) { + PSECURITY_DESCRIPTOR secDesc; + BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA( + sddl, SDDL_REVISION_1, &secDesc, NULL); + if (result == 0) { + printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n", + GetLastError()); + } + + InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc); + + objAttributesConfigured = true; + } + + prop->win32HandleMetaData = &objAttributes; + return; +#endif +} + +__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle, float *numPointsInCircle, unsigned int numPoints, float time) +{ + const size_t stride = gridDim.x * blockDim.x; + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + float count = 0.0f; + + curandState rgnState; + curand_init((unsigned long long)time, tid, 0, &rgnState); + + for (; tid < numPoints; tid += stride) { + float x = curand_uniform(&rgnState); + float y = curand_uniform(&rgnState); + x = (2.0f * x) - 1.0f; + y = (2.0f * y) - 1.0f; + xyVector[tid][0] = x; + xyVector[tid][1] = y; + + // Compute the distance of this point from the center (0, 0) + float dist = sqrtf((x*x) + (y*y)); + + // If distance is less than the radius of the unit circle, the point lies in the circle.
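+ // Each (x, y) is uniform on the square [-1, 1] x [-1, 1], whose area is 4;
+ // the unit circle inscribed in it has area pi. A sample therefore lands
+ // inside the circle with probability pi / 4, which is why the callback
+ // below estimates pi as 4 * pointsInsideCircle / totalPoints.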
+ pointsInsideCircle[tid] = (dist <= 1.0f); + count += (dist <= 1.0f); + } + atomicAdd(numPointsInCircle, count); +} + +MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points) : + m_xyVector(nullptr), + m_pointsInsideCircle(nullptr), + m_totalPointsInsideCircle(0), + m_totalPointsSimulated(0), + m_numPoints(num_points) +{ +} + +MonteCarloPiSimulation::~MonteCarloPiSimulation() +{ + if (m_numPointsInCircle) { + checkCudaErrors(cudaFree(m_numPointsInCircle)); + m_numPointsInCircle = nullptr; + } + if (m_hostNumPointsInCircle) { + checkCudaErrors(cudaFreeHost(m_hostNumPointsInCircle)); + m_hostNumPointsInCircle = nullptr; + } + + cleanupSimulationAllocations(); +} + +void MonteCarloPiSimulation::initSimulation(int cudaDevice, cudaStream_t stream) +{ + m_cudaDevice = cudaDevice; + getIdealExecutionConfiguration(); + + // Allocate a position buffer that contains the random locations of the points in the XY cartesian plane. + // Allocate a bitmap buffer which records whether a point in the position buffer is inside the unit circle or not. + setupSimulationAllocations(); + + checkCudaErrors(cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle))); + checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle, sizeof(*m_hostNumPointsInCircle))); +} + +void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream) +{ + + checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0, sizeof(*m_numPointsInCircle), stream)); + + monte_carlo_kernel<<<m_blocks, m_threads, 0, stream>>>(m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time); + getLastCudaError("Failed to launch CUDA simulation"); + + checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle, sizeof(*m_numPointsInCircle), cudaMemcpyDeviceToHost, stream)); + + // Queue up a stream callback to compute and print the PI value.
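+ // cudaLaunchHostFunc runs the callback on a host thread only after all
+ // previously enqueued work in `stream` (the kernel launch and the async
+ // copy above) has completed, so the host-side total it reads is valid.
+ // Host functions enqueued this way must not themselves call CUDA APIs.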
+ checkCudaErrors(cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this)); +} + +void MonteCarloPiSimulation::computePiCallback(void *args) +{ + MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args; + cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle); + cbData->m_totalPointsSimulated += cbData->m_numPoints; + double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle / (double)cbData->m_totalPointsSimulated); + printf("Approximate Pi value for %zd data points: %lf \n", cbData->m_totalPointsSimulated, piValue); +} + +void MonteCarloPiSimulation::getIdealExecutionConfiguration() +{ + int warpSize = 0; + int multiProcessorCount = 0; + + checkCudaErrors(cudaSetDevice(m_cudaDevice)); + checkCudaErrors(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice)); + + // We don't need large block sizes, since there's not much inter-thread communication + m_threads = warpSize; + + // Use the occupancy calculator and fill the GPU as best as we can + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, monte_carlo_kernel, warpSize, 0)); + + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice)); + m_blocks *= multiProcessorCount; + + // Clamp the block count to the minimum needed for this height/width + m_blocks = std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads)); +} + +void MonteCarloPiSimulation::setupSimulationAllocations() +{ + CUdeviceptr d_ptr = 0U; + size_t granularity = 0; + CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle; + + CUmemAllocationProp allocProp = { }; + allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; + allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + allocProp.location.id = m_cudaDevice; + allocProp.win32HandleMetaData = NULL; + allocProp.requestedHandleTypes = ipcHandleTypeFlag; + + // Windows-specific LPSECURITYATTRIBUTES is required when + // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope + // to which exported allocations may be transferred to other processes. For all + // other handle types, pass NULL. + getDefaultSecurityDescriptor(&allocProp); + + // Get the recommended granularity for m_cudaDevice. + checkCudaErrors(cuMemGetAllocationGranularity(&granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + + size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector); + size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle); + + size_t xyPositionSize = ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity); + size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity); + m_totalAllocationSize = (xyPositionSize + inCircleSize); + + // Reserve the required contiguous VA space for the allocations + checkCudaErrors(cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0)); + + // Create the allocations as a pinned allocation on this device. + // Create an allocation to store all the positions of points on the xy plane and a second + // allocation which stores information if the corresponding position is inside the unit circle or not. + checkCudaErrors(cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0)); + checkCudaErrors(cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0)); + + // Export the allocation to a platform-specific handle. The type of handle + // requested here must match the requestedHandleTypes field in the prop + // structure passed to cuMemCreate.
+    // The handle obtained here will be passed to Vulkan to import the allocation.
+    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
+    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle, cudaInCircleHandle, ipcHandleTypeFlag, 0));
+
+    CUdeviceptr va_position = d_ptr;
+    CUdeviceptr va_InCircle = va_position + xyPositionSize;
+    m_pointsInsideCircle = (float *)va_InCircle;
+    m_xyVector = (vec2 *)va_position;
+
+    // Map each allocation into the appropriate part of the reserved VA range
+    checkCudaErrors(cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
+    checkCudaErrors(cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
+
+    // Release the handles for the allocations. Since each allocation is currently
+    // mapped into a VA range by the preceding calls to cuMemMap, the actual freeing
+    // of the memory happens on an eventual call to cuMemUnmap; until then the
+    // allocations are kept alive.
+    checkCudaErrors(cuMemRelease(cudaPositionHandle));
+    checkCudaErrors(cuMemRelease(cudaInCircleHandle));
+
+    CUmemAccessDesc accessDescriptor = {};
+    accessDescriptor.location.id = m_cudaDevice;
+    accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    // Apply the access descriptor to the whole VA range, enabling read-write access to it.
+    checkCudaErrors(cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
+}
+
+void MonteCarloPiSimulation::cleanupSimulationAllocations()
+{
+    if (m_xyVector && m_pointsInsideCircle) {
+        // Unmap the mapped virtual memory region. Since the handles to the mapped
+        // backing stores have already been released by cuMemRelease, and these are
+        // the only/last mappings referencing them, the backing stores are freed here.
+        checkCudaErrors(cuMemUnmap((CUdeviceptr)m_xyVector, m_totalAllocationSize));
+
+        checkIpcErrors(ipcCloseShareableHandle(m_posShareableHandle));
+        checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));
+
+        // Free the virtual address region.
+        checkCudaErrors(cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
+
+        m_xyVector = nullptr;
+        m_pointsInsideCircle = nullptr;
+    }
+}
diff --git a/Samples/simpleVulkanMMAP/MonteCarloPi.h b/Samples/simpleVulkanMMAP/MonteCarloPi.h
new file mode 100644
index 00000000..09f0854a
--- /dev/null
+++ b/Samples/simpleVulkanMMAP/MonteCarloPi.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+#ifndef __PISIM_H__
+#define __PISIM_H__
+
+#include <cuda_runtime_api.h>
+#include <cstddef>
+
+#include "helper_multiprocess.h"
+
+typedef float vec2[2];
+
+class MonteCarloPiSimulation
+{
+    size_t m_numPoints;
+
+    // Pointers to CUDA-allocated buffers which are imported and used by Vulkan
+    // as the vertex buffer
+    vec2 *m_xyVector;
+    float *m_pointsInsideCircle;
+
+    // Pointers to device- and host-allocated memory storing the number of
+    // points that are inside the unit circle
+    float *m_numPointsInCircle;
+    float *m_hostNumPointsInCircle;
+
+    int m_blocks, m_threads;
+
+    // Total size of the allocations created by the cuMemMap APIs. This size is
+    // the sum of the sizes of the m_xyVector and m_pointsInsideCircle buffers.
+    size_t m_totalAllocationSize;
+
+    // Shareable handles (a file descriptor on Linux, an NT handle on Windows),
+    // used for sharing CUDA-allocated memory with Vulkan
+    ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;
+
+    // CUDA device corresponding to the Vulkan physical device
+    int m_cudaDevice;
+
+    // Track and accumulate the total points that have been simulated since the
+    // start of the sample. The idea is to get a closer approximation to PI with time.
+    size_t m_totalPointsInsideCircle;
+    size_t m_totalPointsSimulated;
+
+    void setupSimulationAllocations();
+    void cleanupSimulationAllocations();
+    void getIdealExecutionConfiguration();
+
+public:
+    MonteCarloPiSimulation(size_t num_points);
+    ~MonteCarloPiSimulation();
+    void initSimulation(int cudaDevice, cudaStream_t stream = 0);
+    void stepSimulation(float time, cudaStream_t stream = 0);
+    static void computePiCallback(void *args);
+
+    size_t getNumPoints() const {
+        return m_numPoints;
+    }
+
+    float getNumPointsInCircle() const {
+        return *m_hostNumPointsInCircle;
+    }
+
+    ShareableHandle &getPositionShareableHandle() {
+        return m_posShareableHandle;
+    }
+    ShareableHandle &getInCircleShareableHandle() {
+        return m_inCircleShareableHandle;
+    }
+};
+
+#endif // __PISIM_H__
diff --git a/Samples/simpleVulkanMMAP/NsightEclipse.xml b/Samples/simpleVulkanMMAP/NsightEclipse.xml
new file mode 100644
index 00000000..6fa08e36
--- /dev/null
+++ b/Samples/simpleVulkanMMAP/NsightEclipse.xml
@@ -0,0 +1,101 @@
+
+
+ simpleVulkanMMAP
+
+ --std=c++11
+
+ cuDeviceGetAttribute
+ cuMemAddressReserve
+ cuMemCreate
+ cuMemRelease
+ cuCtxSetCurrent
+ cuMemExportToShareableHandle
+ cuMemImportFromShareableHandle
+ cuMemMap
+ cuMemSetAccess
+ cuMemUnmap
+ cuMemAddressFree
+ cudaGetDeviceProperties
+ cudaImportExternalMemory
+ cudaExternalMemoryGetMappedBuffer
+ cudaImportExternalSemaphore
+ cudaSignalExternalSemaphoresAsync
+ cudaWaitExternalSemaphoresAsync
+ cudaDestroyExternalSemaphore
+ cudaDestroyExternalMemory
+
+ whole
+
+ montecarlo.vert
+ montecarlo.frag
+
+ ./
+ ../
+ ../../common/inc
+
+ cuMemMap IPC
+ MMAP
+ Graphics Interop
+ CUDA Vulkan Interop
+ Data Parallel Algorithms
+
+ CUDA
+ CPP11
+ monte-carlo
+ Vulkan
+
+ cuda
+ CUDA
+
+ true
+
+ main.cpp
+
+ X11
+ VULKAN
+
+ 2:Graphics Interop
+ 1:CUDA Advanced Topics
+ 1:CUDA Vulkan Interop
+
+ sm35
+ sm37
+ sm50
+ sm52
+ sm60
+ sm61
+ sm70
+ sm72
+ sm75
+ sm80
+
+ ../../Common/helper_multiprocess.cpp
+ ../../Common/helper_multiprocess.h
+
+ x86_64
+ linux
+
+ windows7
+
+ aarch64
+
+ all
+
+ Vulkan CUDA Interop PI Approximation
+
diff --git a/Samples/simpleVulkanMMAP/README.md b/Samples/simpleVulkanMMAP/README.md
new file mode 100644
index 00000000..42a9c554
--- /dev/null
+++ b/Samples/simpleVulkanMMAP/README.md
@@ -0,0 +1,75 @@
+# simpleVulkanMMAP - Vulkan CUDA Interop PI Approximation
+
+## Description
+
+This sample demonstrates Vulkan CUDA Interop via the cuMemMap APIs. CUDA exports buffers that Vulkan imports as a vertex buffer. CUDA invokes kernels to operate on the vertices and synchronizes with Vulkan through Vulkan semaphores imported by CUDA. This sample depends on the Vulkan SDK and GLFW3 libraries; for build instructions please refer to "Build_instructions.txt" in this sample's directory.
+
+## Key Concepts
+
+cuMemMap IPC, MMAP, Graphics Interop, CUDA Vulkan Interop, Data Parallel Algorithms
+
+## Supported SM Architectures
+
+[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, aarch64
+
+## CUDA APIs involved
+
+### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
+cuDeviceGetAttribute, cuMemAddressReserve, cuMemCreate, cuMemRelease, cuCtxSetCurrent, cuMemExportToShareableHandle, cuMemImportFromShareableHandle, cuMemMap, cuMemSetAccess, cuMemUnmap, cuMemAddressFree
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaGetDeviceProperties, cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaImportExternalSemaphore, cudaSignalExternalSemaphoresAsync, cudaWaitExternalSemaphoresAsync, cudaDestroyExternalSemaphore, cudaDestroyExternalMemory
+
+## Dependencies needed to build/run
+
+[X11](../../README.md#x11), [VULKAN](../../README.md#vulkan)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory:
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles.
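+For example, a typical build-and-run session for this sample might look like the following (an illustrative sketch, assuming the Vulkan SDK and GLFW3 dependencies from "Build_instructions.txt" are already installed):
+```
+$ cd Samples/simpleVulkanMMAP
+$ make
+$ ./simpleVulkanMMAP
+```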
+To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=aarch64`
    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
    ```
    $ make dbg=1
    ```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
    ```
    $ make SMS="50 60"
    ```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
    ```
    $ make HOST_COMPILER=g++
    ```

## References (for more details)

diff --git a/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp b/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
new file mode 100644
index 00000000..91fc83a9
--- /dev/null
+++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
@@ -0,0 +1,1727 @@
+/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file contains the basic cross-platform setup for working with Vulkan
+ * and a rendering window.
It is largely based off of tutorials provided here: + * https://vulkan-tutorial.com/ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "VulkanBaseApp.h" +#include "VulkanCudaInterop.h" + +#define GLFW_INCLUDE_VULKAN +#define GLM_FORCE_DEPTH_ZERO_TO_ONE +#include + +#ifdef _WIN64 +#include +#include +#include +#endif /* _WIN64 */ + +#ifndef countof +#define countof(x) (sizeof(x) / sizeof(*(x))) +#endif + +static const char *validationLayers[] = { "VK_LAYER_KHRONOS_validation" }; +static const size_t MAX_FRAMES_IN_FLIGHT = 5; + +void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) +{ + VulkanBaseApp *app = reinterpret_cast(glfwGetWindowUserPointer(window)); + app->m_framebufferResized = true; +} + +static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageType, const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, void *pUserData) +{ + std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; + + return VK_FALSE; +} + +VulkanBaseApp::VulkanBaseApp(const std::string& appName, bool enableValidation) : + m_appName(appName), + m_enableValidation(enableValidation), + m_instance(VK_NULL_HANDLE), + m_window(nullptr), + m_debugMessenger(VK_NULL_HANDLE), + m_surface(VK_NULL_HANDLE), + m_physicalDevice(VK_NULL_HANDLE), + m_device(VK_NULL_HANDLE), + m_graphicsQueue(VK_NULL_HANDLE), + m_presentQueue(VK_NULL_HANDLE), + m_swapChain(VK_NULL_HANDLE), + m_swapChainImages(), + m_swapChainFormat(), + m_swapChainExtent(), + m_swapChainImageViews(), + m_shaderFiles(), + m_renderPass(), + m_pipelineLayout(VK_NULL_HANDLE), + m_graphicsPipeline(VK_NULL_HANDLE), + m_swapChainFramebuffers(), + m_commandPool(VK_NULL_HANDLE), + m_commandBuffers(), + m_imageAvailableSemaphores(), + m_renderFinishedSemaphores(), + m_inFlightFences(), + m_uniformBuffers(), + m_uniformMemory(), + m_descriptorSetLayout(VK_NULL_HANDLE), + m_descriptorPool(VK_NULL_HANDLE), + m_descriptorSets(), + m_depthImage(VK_NULL_HANDLE), + m_depthImageMemory(VK_NULL_HANDLE), + m_depthImageView(VK_NULL_HANDLE), + m_currentFrame(0), + m_framebufferResized(false) +{ +} + +VkExternalSemaphoreHandleTypeFlagBits VulkanBaseApp::getDefaultSemaphoreHandleType() +{ +#ifdef _WIN64 + return IsWindows8OrGreater() ? + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif +} + +VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() +{ +#ifdef _WIN64 + return IsWindows8Point1OrGreater() ? 
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT : + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif +} + +VulkanBaseApp::~VulkanBaseApp() +{ + cleanupSwapChain(); + + if (m_descriptorSetLayout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); + } + + for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { + vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); + vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); + vkDestroyFence(m_device, m_inFlightFences[i], nullptr); + } + if (m_commandPool != VK_NULL_HANDLE) { + vkDestroyCommandPool(m_device, m_commandPool, nullptr); + } + + if (m_device != VK_NULL_HANDLE) { + vkDestroyDevice(m_device, nullptr); + } + + if (m_enableValidation) { + PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func != nullptr) { + func(m_instance, m_debugMessenger, nullptr); + } + } + + if (m_surface != VK_NULL_HANDLE) { + vkDestroySurfaceKHR(m_instance, m_surface, nullptr); + } + + if (m_instance != VK_NULL_HANDLE) { + vkDestroyInstance(m_instance, nullptr); + } + + if (m_window) { + glfwDestroyWindow(m_window); + } + + glfwTerminate(); +} + +void VulkanBaseApp::init() +{ + initWindow(); + initVulkan(); +} + +VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() +{ + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandPool = m_commandPool; + allocInfo.commandBufferCount = 1; + + VkCommandBuffer commandBuffer; + vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer); + + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(commandBuffer, &beginInfo); + + return commandBuffer; +} + +void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) +{ + vkEndCommandBuffer(commandBuffer); + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffer; + + vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); + vkQueueWaitIdle(m_graphicsQueue); + + vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); +} + +void VulkanBaseApp::initWindow() +{ + glfwInit(); + + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); + + m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); + glfwSetWindowUserPointer(m_window, this); + glfwSetFramebufferSizeCallback(m_window, resizeCallback); +} + + +std::vector VulkanBaseApp::getRequiredExtensions() const +{ + return std::vector(); +} + +std::vector VulkanBaseApp::getRequiredDeviceExtensions() const +{ + return std::vector(); +} + +void VulkanBaseApp::initVulkan() +{ + createInstance(); + createSurface(); + createDevice(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createCommandPool(); + createDepthResources(); + createFramebuffers(); + initVulkanApp(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + createSyncObjects(); +} + +#ifdef _WIN64 +class 
WindowsSecurityAttributes +{ +protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; + +public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES *operator&(); + ~WindowsSecurityAttributes(); +}; + +WindowsSecurityAttributes::WindowsSecurityAttributes() +{ + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); + if (!m_winPSecurityDescriptor) { + throw std::runtime_error("Failed to allocate memory for security descriptor"); + } + + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + + InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); + + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); + + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)* ppSID; + + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; +} + +SECURITY_ATTRIBUTES * +WindowsSecurityAttributes::operator&() +{ + return &m_winSecurityAttributes; +} + +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); +} +#endif /* _WIN64 */ + + +static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, const std::vector& candidates, VkImageTiling tiling, VkFormatFeatureFlags features) +{ + for (VkFormat format : candidates) { + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); + if (tiling == VK_IMAGE_TILING_LINEAR && (props.linearTilingFeatures & features) == features) { + return format; + } + else if (tiling == VK_IMAGE_TILING_OPTIMAL && (props.optimalTilingFeatures & features) == features) { + return format; + } + } + throw std::runtime_error("Failed to find supported format!"); +} + +static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, uint32_t typeFilter, VkMemoryPropertyFlags properties) +{ + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } + } + return ~0; +} + +static bool supportsValidationLayers() +{ + std::vector availableLayers; + uint32_t layerCount; + + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + availableLayers.resize(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + + 
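+    // The two calls above follow the standard Vulkan enumeration idiom: query
+    // the element count first, size the buffer, then fetch the data. The loop
+    // below then verifies that every requested validation layer is available.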
for (const char * layerName : validationLayers) { + bool layerFound = false; + + for (const auto & layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } + } + + return true; +} + +void VulkanBaseApp::createInstance() +{ + if (m_enableValidation && !supportsValidationLayers()) { + throw std::runtime_error("Validation requested, but not supported!"); + } + + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = m_appName.c_str(); + appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_0; + + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; + + std::vector exts = getRequiredExtensions(); + + { + uint32_t glfwExtensionCount = 0; + const char **glfwExtensions; + + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + + exts.insert(exts.begin(), glfwExtensions, glfwExtensions + glfwExtensionCount); + + if (m_enableValidation) { + exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + } + + createInfo.enabledExtensionCount = static_cast(exts.size()); + createInfo.ppEnabledExtensionNames = exts.data(); + VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; + if (m_enableValidation) { + createInfo.enabledLayerCount = static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + + debugCreateInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + debugCreateInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + debugCreateInfo.pfnUserCallback = debugCallback; + + createInfo.pNext = &debugCreateInfo; + } + else { + createInfo.enabledLayerCount = 0; + createInfo.pNext = nullptr; + } + + if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { + throw std::runtime_error("Failed to create Vulkan instance!"); + } + + if (m_enableValidation) { + PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkCreateDebugUtilsMessengerEXT"); + if (func == nullptr || func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != VK_SUCCESS) { + throw std::runtime_error("Failed to set up debug messenger!"); + } + } +} + +void VulkanBaseApp::createSurface() +{ + if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != VK_SUCCESS) { + throw std::runtime_error("failed to create window surface!"); + } +} + +static bool findGraphicsQueueIndicies(VkPhysicalDevice device, VkSurfaceKHR surface, uint32_t& graphicsFamily, uint32_t& presentFamily) +{ + uint32_t queueFamilyCount = 0; + + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); + + std::vector queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); + + graphicsFamily = presentFamily = ~0; + + for (uint32_t i = 0; i < queueFamilyCount; i++) { + + if (queueFamilies[i].queueCount > 0) { + if (graphicsFamily == ~0 && 
queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + graphicsFamily = i; + } + uint32_t presentSupport = 0; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + if (presentFamily == ~0 && presentSupport) { + presentFamily = i; + } + if (presentFamily != ~0 && graphicsFamily != ~0) { + break; + } + } + } + + return graphicsFamily != ~0 && presentFamily != ~0; +} + +static bool hasAllExtensions(VkPhysicalDevice device, const std::vector& deviceExtensions) +{ + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); + std::vector availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); + + std::set requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); + + for (const auto & extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); + } + + return requiredExtensions.empty(); +} + +static void getSwapChainProperties(VkPhysicalDevice device, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR& capabilities, std::vector& formats, std::vector& presentModes) +{ + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + if (formatCount != 0) { + formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, formats.data()); + } + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); + if (presentModeCount != 0) { + presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, presentModes.data()); + } +} + +bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const +{ + bool isSuitable = false; + uint32_t graphicsQueueIndex, presentQueueIndex; + std::vector deviceExtensions = getRequiredDeviceExtensions(); + VkSurfaceCapabilitiesKHR caps; + std::vector formats; + std::vector presentModes; + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + getSwapChainProperties(dev, m_surface, caps, formats, presentModes); + + VkPhysicalDeviceIDPropertiesKHR vkPhysicalDeviceIDPropertiesKHR = {}; + vkPhysicalDeviceIDPropertiesKHR.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR; + vkPhysicalDeviceIDPropertiesKHR.pNext = NULL; + + VkPhysicalDeviceProperties2KHR vkPhysicalDeviceProperties2KHR = {}; + vkPhysicalDeviceProperties2KHR.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; + vkPhysicalDeviceProperties2KHR.pNext = &vkPhysicalDeviceIDPropertiesKHR; + + vkGetPhysicalDeviceProperties2(dev, &vkPhysicalDeviceProperties2KHR); + + isSuitable = hasAllExtensions(dev, deviceExtensions) + && isDeviceCompatible(vkPhysicalDeviceIDPropertiesKHR.deviceUUID, (size_t)VK_UUID_SIZE) + && !formats.empty() && !presentModes.empty() + && findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, presentQueueIndex); + + if (isSuitable) { + memcpy((void *)m_deviceUUID, vkPhysicalDeviceIDPropertiesKHR.deviceUUID, sizeof(m_deviceUUID)); + } + + return isSuitable; +} + +bool VulkanBaseApp::isVkPhysicalDeviceUuid(void *Uuid) { + return !memcmp((void *)m_deviceUUID, Uuid, (size_t)VK_UUID_SIZE); +} + +void VulkanBaseApp::createDevice() +{ + { + uint32_t deviceCount = 0; + vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); + if (deviceCount == 0) { + throw std::runtime_error("Failed to find Vulkan capable GPUs!"); + 
} + std::vector phyDevs(deviceCount); + vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); + std::vector::iterator it = std::find_if(phyDevs.begin(), phyDevs.end(), + std::bind(&VulkanBaseApp::isSuitableDevice, this, std::placeholders::_1)); + if (it == phyDevs.end()) { + throw std::runtime_error("No suitable device found!"); + } + m_physicalDevice = *it; + } + + uint32_t graphicsQueueIndex, presentQueueIndex; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, presentQueueIndex); + + std::vector queueCreateInfos; + std::set uniqueFamilyIndices = { graphicsQueueIndex, presentQueueIndex }; + + float queuePriority = 1.0f; + + for (uint32_t queueFamily : uniqueFamilyIndices) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = graphicsQueueIndex; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + deviceFeatures.fillModeNonSolid = true; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = static_cast(queueCreateInfos.size()); + + createInfo.pEnabledFeatures = &deviceFeatures; + + std::vector deviceExtensions = getRequiredDeviceExtensions(); + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + + createInfo.enabledExtensionCount = static_cast(deviceExtensions.size()); + createInfo.ppEnabledExtensionNames = deviceExtensions.data(); + + if (m_enableValidation) { + createInfo.enabledLayerCount = static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + } + else { + createInfo.enabledLayerCount = 0; + } + + if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); + } + + vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); + vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); +} + +static VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector& availableFormats) +{ + if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return { VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR }; + } + + for (const auto & availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } + + return availableFormats[0]; +} + +static VkPresentModeKHR chooseSwapPresentMode(const std::vector& availablePresentModes) +{ + VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto & availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } + else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } + } + + return bestMode; +} + +static VkExtent2D chooseSwapExtent(GLFWwindow *window, const VkSurfaceCapabilitiesKHR& capabilities) +{ + if (capabilities.currentExtent.width != std::numeric_limits::max()) { + return capabilities.currentExtent; + } + else { + int width, height; + glfwGetFramebufferSize(window, &width, &height); + VkExtent2D actualExtent = { static_cast(width), static_cast(height) }; + + 
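+        // Clamp the framebuffer dimensions to the surface's supported
+        // [minImageExtent, maxImageExtent] range, since the swap chain extent
+        // must lie within these bounds.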
actualExtent.width = std::max(capabilities.minImageExtent.width, std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } +} + +void VulkanBaseApp::createSwapChain() +{ + VkSurfaceCapabilitiesKHR capabilities; + VkSurfaceFormatKHR format; + VkPresentModeKHR presentMode; + VkExtent2D extent; + uint32_t imageCount; + + { + std::vector formats; + std::vector presentModes; + + getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, presentModes); + format = chooseSwapSurfaceFormat(formats); + presentMode = chooseSwapPresentMode(presentModes); + extent = chooseSwapExtent(m_window, capabilities); + imageCount = capabilities.minImageCount + 1; + if (capabilities.maxImageCount > 0 && imageCount > capabilities.maxImageCount) { + imageCount = capabilities.maxImageCount; + } + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = m_surface; + + createInfo.minImageCount = imageCount; + createInfo.imageFormat = format.format; + createInfo.imageColorSpace = format.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + uint32_t queueFamilyIndices[2]; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], queueFamilyIndices[1]); + + if (queueFamilyIndices[0] != queueFamilyIndices[1]) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } + else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + } + + createInfo.preTransform = capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + + createInfo.oldSwapchain = VK_NULL_HANDLE; + + if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } + + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); + m_swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, m_swapChainImages.data()); + + m_swapChainFormat = format.format; + m_swapChainExtent = extent; +} + +static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, VkImageAspectFlags aspectFlags) +{ + VkImageView imageView; + VkImageViewCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + createInfo.image = image; + createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + createInfo.format = format; + createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.subresourceRange.aspectMask = aspectFlags; + createInfo.subresourceRange.baseMipLevel = 0; + createInfo.subresourceRange.levelCount = 1; + createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image views!"); + } + + return imageView; +} + +static void 
createImage(VkPhysicalDevice physicalDevice, VkDevice device, uint32_t width, uint32_t height, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage, VkMemoryPropertyFlags properties, VkImage& image, VkDeviceMemory& imageMemory) +{ + VkImageCreateInfo imageInfo = {}; + imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + imageInfo.imageType = VK_IMAGE_TYPE_2D; + imageInfo.extent.width = width; + imageInfo.extent.height = height; + imageInfo.extent.depth = 1; + imageInfo.mipLevels = 1; + imageInfo.arrayLayers = 1; + imageInfo.format = format; + imageInfo.tiling = tiling; + imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageInfo.usage = usage; + imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { + throw std::runtime_error("failed to create image!"); + } + + VkMemoryRequirements memRequirements; + vkGetImageMemoryRequirements(device, image, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate image memory!"); + } + + vkBindImageMemory(device, image, imageMemory, 0); +} + +void VulkanBaseApp::createImageViews() +{ + m_swapChainImageViews.resize(m_swapChainImages.size()); + + for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { + m_swapChainImageViews[i] = createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, VK_IMAGE_ASPECT_COLOR_BIT); + } +} + +void VulkanBaseApp::createRenderPass() +{ + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = m_swapChainFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + // Set up the render pass to preserve the contents of the attachment while rendering. 
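+    // (VK_ATTACHMENT_LOAD_OP_LOAD is used below instead of the more common
+    // VK_ATTACHMENT_LOAD_OP_CLEAR.)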
+ // By doing this the points already rendered are not cleared and thus displays growing number of + // points with time eventhough the number of points rendered per frame is constant + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + + VkAttachmentDescription depthAttachment = {}; + depthAttachment.format = findSupportedFormat(m_physicalDevice, + { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + depthAttachment.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkAttachmentReference depthAttachmentRef = {}; + depthAttachmentRef.attachment = 1; + depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + subpass.pDepthStencilAttachment = &depthAttachmentRef; + + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + + VkAttachmentDescription attachments[] = { colorAttachment, depthAttachment }; + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = countof(attachments); + renderPassInfo.pAttachments = attachments; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } +} + +void VulkanBaseApp::createDescriptorSetLayout() +{ + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.pImmutableSamplers = nullptr; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &uboLayoutBinding; + + if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, &m_descriptorSetLayout) != VK_SUCCESS) 
{ + throw std::runtime_error("failed to create descriptor set layout!"); + } +} + +VkShaderModule createShaderModule(VkDevice device, const char *filename) +{ + std::vector shaderContents; + std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); + VkShaderModuleCreateInfo createInfo = {}; + VkShaderModule shaderModule; + + if (!shaderFile.good()) { + throw std::runtime_error("Failed to load shader contents"); + } + readFile(shaderFile, shaderContents); + + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = shaderContents.size(); + createInfo.pCode = reinterpret_cast(shaderContents.data()); + + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { + throw std::runtime_error("Failed to create shader module!"); + } + + return shaderModule; +} + +void VulkanBaseApp::getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc) +{ +} + +void VulkanBaseApp::getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) +{ + +} + +void VulkanBaseApp::createGraphicsPipeline() +{ + std::vector shaderStageInfos(m_shaderFiles.size()); + for (size_t i = 0; i < m_shaderFiles.size(); i++) { + shaderStageInfos[i] = {}; + shaderStageInfos[i].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shaderStageInfos[i].stage = m_shaderFiles[i].first; + shaderStageInfos[i].module = createShaderModule(m_device, m_shaderFiles[i].second.c_str()); + shaderStageInfos[i].pName = "main"; + } + + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; + + std::vector vertexBindingDescriptions; + std::vector vertexAttributeDescriptions; + + getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); + + vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertexInputInfo.vertexBindingDescriptionCount = static_cast(vertexBindingDescriptions.size()); + vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); + vertexInputInfo.vertexAttributeDescriptionCount = static_cast(vertexAttributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = vertexAttributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + getAssemblyStateInfo(inputAssembly); + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)m_swapChainExtent.width; + viewport.height = (float)m_swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset = { 0, 0 }; + scissor.extent = m_swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_POINT; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_NONE; + rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + 
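+    // Multisampling is left off: one sample per pixel and no sample shading.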
multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineDepthStencilStateCreateInfo depthStencil = {}; + depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depthStencil.depthTestEnable = VK_TRUE; + depthStencil.depthWriteEnable = VK_TRUE; + depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; + depthStencil.depthBoundsTestEnable = VK_FALSE; + depthStencil.stencilTestEnable = VK_FALSE; + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; + colorBlending.blendConstants[1] = 0.0f; + colorBlending.blendConstants[2] = 0.0f; + colorBlending.blendConstants[3] = 0.0f; + + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, &m_pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); + pipelineInfo.pStages = shaderStageInfos.data(); + + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = &depthStencil; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + + pipelineInfo.layout = m_pipelineLayout; + + pipelineInfo.renderPass = m_renderPass; + pipelineInfo.subpass = 0; + + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &m_graphicsPipeline) != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + + for (size_t i = 0; i < shaderStageInfos.size(); i++) { + vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); + } +} + +void VulkanBaseApp::createFramebuffers() +{ + m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + VkImageView attachments[] = { + m_swapChainImageViews[i], + m_depthImageView + }; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = 
VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = m_renderPass; + framebufferInfo.attachmentCount = countof(attachments); + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = m_swapChainExtent.width; + framebufferInfo.height = m_swapChainExtent.height; + framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, &m_swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); + } + } +} + +void VulkanBaseApp::createCommandPool() +{ + VkCommandPoolCreateInfo poolInfo = {}; + uint32_t graphicsIndex, presentIndex; + + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, presentIndex); + + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = graphicsIndex; + poolInfo.flags = 0; // Optional + + if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != VK_SUCCESS) { + throw std::runtime_error("Failed to create command pool!"); + } +} + +static void transitionImageLayout(VulkanBaseApp *app, VkImage image, VkFormat format, VkImageLayout oldLayout, VkImageLayout newLayout) +{ + VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); + + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + + if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_D24_UNORM_S8_UINT) { + barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + } + else { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + } + + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags sourceStage; + VkPipelineStageFlags destinationStage; + + if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; + } + else { + throw std::invalid_argument("unsupported layout transition!"); + } + + vkCmdPipelineBarrier( + commandBuffer, + sourceStage, destinationStage, + 0, + 0, nullptr, + 0, nullptr, + 1, &barrier + ); + + app->endSingleTimeCommands(commandBuffer); +} + +void VulkanBaseApp::createDepthResources() +{ + VkFormat depthFormat = 
findSupportedFormat(m_physicalDevice, + { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + createImage(m_physicalDevice, m_device, m_swapChainExtent.width, m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, m_depthImageMemory); + m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, VK_IMAGE_ASPECT_DEPTH_BIT); + transitionImageLayout(this, m_depthImage, depthFormat, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); +} + +void VulkanBaseApp::createUniformBuffers() +{ + VkDeviceSize size = getUniformSize(); + if (size > 0) { + m_uniformBuffers.resize(m_swapChainImages.size()); + m_uniformMemory.resize(m_swapChainImages.size()); + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + createBuffer(getUniformSize(), + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + m_uniformBuffers[i], m_uniformMemory[i]); + } + } +} + +void VulkanBaseApp::createDescriptorPool() +{ + VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = static_cast(m_swapChainImages.size()); + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = static_cast(m_swapChainImages.size());; + if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } +} + +void VulkanBaseApp::createDescriptorSets() +{ + std::vector layouts(m_swapChainImages.size(), m_descriptorSetLayout); + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = m_descriptorPool; + allocInfo.descriptorSetCount = static_cast(m_swapChainImages.size()); + allocInfo.pSetLayouts = layouts.data(); + m_descriptorSets.resize(m_swapChainImages.size()); + + if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor sets!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + for (size_t i = 0; i < m_swapChainImages.size(); i++) { + bufferInfo.buffer = m_uniformBuffers[i]; + descriptorWrite.dstSet = m_descriptorSets[i]; + vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); + } +} + +void VulkanBaseApp::createCommandBuffers() +{ + m_commandBuffers.resize(m_swapChainFramebuffers.size()); + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = m_commandPool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size(); 
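+    // One primary command buffer is allocated per swap-chain framebuffer; each
+    // is recorded once below and re-submitted every frame, which is why the
+    // begin info uses VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT.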
+ + if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate command buffers!"); + } + + for (size_t i = 0; i < m_commandBuffers.size(); i++) { + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + beginInfo.pInheritanceInfo = nullptr; // Optional + + if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { + throw std::runtime_error("failed to begin recording command buffer!"); + } + + VkRenderPassBeginInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassInfo.renderPass = m_renderPass; + renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; + + renderPassInfo.renderArea.offset = { 0, 0 }; + renderPassInfo.renderArea.extent = m_swapChainExtent; + + VkClearValue clearColors[2]; + clearColors[0].color = { 0.0f, 0.0f, 0.0f, 1.0f }; + clearColors[1].depthStencil = { 1.0f, 0 }; + renderPassInfo.clearValueCount = countof(clearColors); + renderPassInfo.pClearValues = clearColors; + + vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); + + vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_graphicsPipeline); + + vkCmdBindDescriptorSets(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout, 0, 1, &m_descriptorSets[i], 0, nullptr); + + fillRenderingCommandBuffer(m_commandBuffers[i]); + + vkCmdEndRenderPass(m_commandBuffers[i]); + + if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to record command buffer!"); + } + } +} + +void VulkanBaseApp::createSyncObjects() +{ + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkFenceCreateInfo fenceInfo = {}; + fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; + + m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); + m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image available semaphore!"); + } + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create render finished semaphore!"); + } + if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create in-flight fence!"); + } + } +} + +void VulkanBaseApp::getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector<VkPipelineStageFlags>& waitStages) const +{ +} + +void VulkanBaseApp::getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const +{ +} + +VkDeviceSize VulkanBaseApp::getUniformSize() const +{ + return VkDeviceSize(0); +} + +void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex, size_t frame) +{ +} + +void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if
(vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void VulkanBaseApp::createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR ? 
&vulkanExportMemoryWin32HandleInfoKHR : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; +#else + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void *VulkanBaseApp::getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) +{ +#ifdef _WIN64 + HANDLE handle = 0; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = memory; + vkMemoryGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + fpGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryWin32HandleKHR"); + if (!fpGetMemoryWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)handle; +#else + int fd = -1; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = memory; + vkMemoryGetFdInfoKHR.handleType = handleType; + + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); + if (!fpGetMemoryFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryFdKHR!"); + } + if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)(uintptr_t)fd; +#endif /* _WIN64 */ +} + +void *VulkanBaseApp::getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) +{ +#ifdef _WIN64 + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; + semaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + semaphoreGetWin32HandleInfoKHR.pNext = NULL; + semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; + semaphoreGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; + fpGetSemaphoreWin32HandleKHR = (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreWin32HandleKHR"); + if (!fpGetSemaphoreWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetSemaphoreWin32HandleKHR!"); + } + if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for semaphore!"); + } + + return (void *)handle; +#else + int fd; + + VkSemaphoreGetFdInfoKHR
semaphoreGetFdInfoKHR = {}; + semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + semaphoreGetFdInfoKHR.pNext = NULL; + semaphoreGetFdInfoKHR.semaphore = semaphore; + semaphoreGetFdInfoKHR.handleType = handleType; + + PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; + fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreFdKHR"); + if (!fpGetSemaphoreFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetSemaphoreFdKHR!"); + } + if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for semaphore!"); + } + + return (void *)(uintptr_t)fd; +#endif +} + +void VulkanBaseApp::createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) +{ + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {}; + exportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {}; + exportSemaphoreWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; + exportSemaphoreWin32HandleInfoKHR.pNext = NULL; + exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + exportSemaphoreWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; + exportSemaphoreCreateInfo.pNext = (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) ? &exportSemaphoreWin32HandleInfoKHR : NULL; +#else + exportSemaphoreCreateInfo.pNext = NULL; +#endif + exportSemaphoreCreateInfo.handleTypes = handleType; + semaphoreInfo.pNext = &exportSemaphoreCreateInfo; + + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) != VK_SUCCESS) { + throw std::runtime_error("failed to create synchronization objects for CUDA-Vulkan interop!"); + } +} + +void VulkanBaseApp::importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + +#ifdef _WIN64 + VkImportMemoryWin32HandleInfoKHR handleInfo = {}; + handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + handleInfo.pNext = NULL; + handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; + handleInfo.handle = handle; + handleInfo.name = NULL; +#else + VkImportMemoryFdInfoKHR handleInfo = {}; + handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR; + handleInfo.pNext = NULL; + handleInfo.fd = (int)(uintptr_t)handle; + handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ + + VkMemoryAllocateInfo memAllocation = {}; + memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + memAllocation.pNext = (void *)&handleInfo; +
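+ // Chaining the VkImportMemory*InfoKHR struct through pNext turns this
+ // vkAllocateMemory call into an import of the externally created (CUDA)
+ // allocation rather than a fresh device memory allocation.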
memAllocation.allocationSize = size; + memAllocation.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) != VK_SUCCESS) { + throw std::runtime_error("Failed to import allocation!"); + } + + vkBindBufferMemory(m_device, buffer, memory, 0); +} + +void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) +{ + + VkCommandBuffer commandBuffer = beginSingleTimeCommands(); + + VkBufferCopy copyRegion = {}; + copyRegion.size = size; + vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion); + + endSingleTimeCommands(commandBuffer); +} + +void VulkanBaseApp::drawFrame() +{ + size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; + vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, std::numeric_limits<uint64_t>::max()); + + uint32_t imageIndex; + VkResult result = vkAcquireNextImageKHR(m_device, m_swapChain, std::numeric_limits<uint64_t>::max(), m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex); + if (result == VK_ERROR_OUT_OF_DATE_KHR) { + recreateSwapChain(); + } + else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + updateUniformBuffer(imageIndex, m_currentFrame); + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + std::vector<VkSemaphore> waitSemaphores; + std::vector<VkPipelineStageFlags> waitStages; + + waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); + waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + getWaitFrameSemaphores(waitSemaphores, waitStages); + + submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); + submitInfo.pWaitSemaphores = waitSemaphores.data(); + submitInfo.pWaitDstStageMask = waitStages.data(); + + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; + + std::vector<VkSemaphore> signalSemaphores; + getSignalFrameSemaphores(signalSemaphores); + signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); + submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); + submitInfo.pSignalSemaphores = signalSemaphores.data(); + + vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]); + + if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; + + VkSwapchainKHR swapChains[] = { m_swapChain }; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + + result = vkQueuePresentKHR(m_presentQueue, &presentInfo); + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || m_framebufferResized) { + recreateSwapChain(); + m_framebufferResized = false; + } + else if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to present swap chain image!"); + } + + m_currentFrame++; +} + +void VulkanBaseApp::cleanupSwapChain() +{ + + if (m_depthImageView != VK_NULL_HANDLE) { + vkDestroyImageView(m_device, m_depthImageView, nullptr); + } + if (m_depthImage != VK_NULL_HANDLE) { + vkDestroyImage(m_device, m_depthImage, nullptr); + } + if (m_depthImageMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_depthImageMemory,
nullptr); + } + + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); + vkFreeMemory(m_device, m_uniformMemory[i], nullptr); + } + + if (m_descriptorPool != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); + } + + for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { + vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); + } + + if (m_graphicsPipeline != VK_NULL_HANDLE) { + vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); + } + + if (m_pipelineLayout != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); + } + + if (m_renderPass != VK_NULL_HANDLE) { + vkDestroyRenderPass(m_device, m_renderPass, nullptr); + } + + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); + } + + if (m_swapChain != VK_NULL_HANDLE) { + vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); + } +} + +void VulkanBaseApp::recreateSwapChain() +{ + int width, height; + + glfwGetFramebufferSize(m_window, &width, &height); + while (width == 0 || height == 0) { + glfwWaitEvents(); + glfwGetFramebufferSize(m_window, &width, &height); + } + + vkDeviceWaitIdle(m_device); + + cleanupSwapChain(); + + createSwapChain(); + createImageViews(); + createRenderPass(); + createGraphicsPipeline(); + createDepthResources(); + createFramebuffers(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); +} + +void VulkanBaseApp::mainLoop() +{ + while (!glfwWindowShouldClose(m_window)) { + glfwPollEvents(); + drawFrame(); + } + vkDeviceWaitIdle(m_device); +} + +void readFile(std::istream& s, std::vector<char>& data) +{ + s.seekg(0, std::ios_base::end); + data.resize(s.tellg()); + s.clear(); + s.seekg(0, std::ios_base::beg); + s.read(data.data(), data.size()); +} diff --git a/Samples/simpleVulkanMMAP/VulkanBaseApp.h b/Samples/simpleVulkanMMAP/VulkanBaseApp.h new file mode 100644 index 00000000..5c80dc8d --- /dev/null +++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once +#ifndef __VULKANBASEAPP_H__ +#define __VULKANBASEAPP_H__ + +#include <string> +#include <vector> +#include <vulkan/vulkan.h> +#ifdef _WIN64 +#define NOMINMAX +#include <windows.h> +#include <vulkan/vulkan_win32.h> +#endif /* _WIN64 */ + +struct GLFWwindow; + +class VulkanBaseApp +{ +public: + VulkanBaseApp(const std::string& appName, bool enableValidation = false); + static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); + static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); + virtual ~VulkanBaseApp(); + void init(); + void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); + void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + bool isVkPhysicalDeviceUuid(void *Uuid); + void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory); + void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory); + void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory); + void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); + VkCommandBuffer beginSingleTimeCommands(); + void endSingleTimeCommands(VkCommandBuffer commandBuffer); + void mainLoop(); +protected: + const std::string m_appName; + const bool m_enableValidation; + VkInstance m_instance; + VkDebugUtilsMessengerEXT m_debugMessenger; + VkSurfaceKHR m_surface; + VkPhysicalDevice m_physicalDevice; + uint8_t m_deviceUUID[VK_UUID_SIZE]; + VkDevice m_device; + VkQueue m_graphicsQueue; + VkQueue m_presentQueue; + VkSwapchainKHR m_swapChain; + std::vector<VkImage> m_swapChainImages; + VkFormat m_swapChainFormat; + VkExtent2D m_swapChainExtent; + std::vector<VkImageView> m_swapChainImageViews; + std::vector<std::pair<VkShaderStageFlagBits, std::string> > m_shaderFiles; + VkRenderPass m_renderPass; + VkPipelineLayout m_pipelineLayout; + VkPipeline m_graphicsPipeline; + std::vector<VkFramebuffer> m_swapChainFramebuffers; + VkCommandPool m_commandPool; + std::vector<VkCommandBuffer> m_commandBuffers; + std::vector<VkSemaphore> m_imageAvailableSemaphores; + std::vector<VkSemaphore> m_renderFinishedSemaphores; + std::vector<VkFence> m_inFlightFences; + std::vector<VkBuffer> m_uniformBuffers; + std::vector<VkDeviceMemory> m_uniformMemory; + VkDescriptorSetLayout m_descriptorSetLayout; + VkDescriptorPool m_descriptorPool; + std::vector<VkDescriptorSet> m_descriptorSets; + + VkImage m_depthImage; + VkDeviceMemory m_depthImageMemory; + VkImageView m_depthImageView; + size_t m_currentFrame; + bool m_framebufferResized; + + virtual void initVulkanApp() {} + virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} + virtual std::vector<const char *> getRequiredExtensions() const; + virtual std::vector<const char *> getRequiredDeviceExtensions() const; + virtual void
getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc); + virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info); + virtual void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const; + virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const; + virtual VkDeviceSize getUniformSize() const; + virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame); + virtual void drawFrame(); +private: + GLFWwindow *m_window; + + void initWindow(); + void initVulkan(); + void createInstance(); + void createSurface(); + void createDevice(); + void createSwapChain(); + void createImageViews(); + void createRenderPass(); + void createDescriptorSetLayout(); + void createGraphicsPipeline(); + void createFramebuffers(); + void createCommandPool(); + void createDepthResources(); + void createUniformBuffers(); + void createDescriptorPool(); + void createDescriptorSets(); + void createCommandBuffers(); + void createSyncObjects(); + + void cleanupSwapChain(); + void recreateSwapChain(); + + bool isSuitableDevice(VkPhysicalDevice dev) const; + static void resizeCallback(GLFWwindow *window, int width, int height); +}; + +void readFile(std::istream& s, std::vector<char>& data); + +#endif /* __VULKANBASEAPP_H__ */ diff --git a/Samples/simpleVulkanMMAP/VulkanCudaInterop.h b/Samples/simpleVulkanMMAP/VulkanCudaInterop.h new file mode 100644 index 00000000..a16ddab5 --- /dev/null +++ b/Samples/simpleVulkanMMAP/VulkanCudaInterop.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#pragma once +#ifndef __VKCUDA_H__ +#define __VKCUDA_H__ + +#include <cuda_runtime_api.h> +#include "cuda.h" +#define CUDA_DRIVER_API +#include <helper_cuda.h> + +bool isDeviceCompatible(void *Uuid, size_t size) { + + int cudaDevice = cudaInvalidDeviceId; + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + + for (int i = 0; i < deviceCount; ++i) { + cudaDeviceProp devProp = { }; + checkCudaErrors(cudaGetDeviceProperties(&devProp, i)); + if (!memcmp(&devProp.uuid, Uuid, size)) { + cudaDevice = i; + break; + } + } + if (cudaDevice == cudaInvalidDeviceId) { + return false; + } + + int deviceSupportsHandle = 0; + int attributeVal = 0; + int deviceComputeMode = 0; + + checkCudaErrors(cuDeviceGetAttribute(&deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice)); + checkCudaErrors(cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cudaDevice)); + +#if defined(__linux__) + checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, cudaDevice)); +#else + checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice)); +#endif + + if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal || !deviceSupportsHandle) { + return false; + } + return true; +} + +#endif // __VKCUDA_H__ + diff --git a/Samples/simpleVulkanMMAP/findvulkan.mk b/Samples/simpleVulkanMMAP/findvulkan.mk new file mode 100644 index 00000000..47016fd7 --- /dev/null +++ b/Samples/simpleVulkanMMAP/findvulkan.mk @@ -0,0 +1,146 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# +################################################################################ +# +# findvulkan.mk is used to find the necessary Vulkan Libraries for specific distributions +# this is supported on Linux +# +################################################################################ + +# Determine OS platform and unix distribution +ifeq ("$(TARGET_OS)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + ifeq ("$(DISTRO)","") + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + # ensure data from /etc/issue is valid + ifneq (,$(filter-out $(DISTRO),ubuntu fedora red rhel centos suse)) + DISTRO = + endif + ifeq ("$(DISTRO)","") + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + endif + endif +endif + +ifeq ("$(TARGET_OS)","linux") + # Each set of Linux Distros have different paths for where to find libraries + UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu >/dev/null 2>&1; echo $$?) + FEDORA = $(shell echo $(DISTRO) | grep -i fedora >/dev/null 2>&1; echo $$?) + RHEL = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?) + CENTOS = $(shell echo $(DISTRO) | grep -i centos >/dev/null 2>&1; echo $$?) + SUSE = $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?) + ifeq ("$(UBUNTU)","0") + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + GLPATH := /usr/arm-linux-gnueabihf/lib + GLLINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf + GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64) + GLPATH := /usr/aarch64-linux-gnu/lib + GLLINK := -L/usr/aarch64-linux-gnu/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib + GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu + GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + endif + else + UBUNTU_PKG_NAME = $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' | grep '^ii' | awk '{print $$2}' | head -1) + ifneq ("$(UBUNTU_PKG_NAME)","") + GLPATH ?= /usr/lib/$(UBUNTU_PKG_NAME) + GLLINK ?= -L/usr/lib/$(UBUNTU_PKG_NAME) + endif + + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(SUSE)","0") + GLPATH ?= /usr/X11R6/lib64 + GLLINK ?= -L/usr/X11R6/lib64 + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(FEDORA)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(RHEL)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(CENTOS)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + + VULKAN_SDK_PATH ?= ${VULKAN_SDK} + + ifeq ("$(VULKAN_SDK_PATH)","") + VULKAN_SDK_PATH := $(DFLT_PATH) + endif + + VULKAN_SDK_LIB := $(shell find -L $(VULKAN_SDK_PATH) -name libvulkan.so -print 2>/dev/null) + X11LIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libX11.so -print 2>/dev/null) + + ifeq ("$(VULKAN_SDK_LIB)","") + $(info >>> WARNING - libvulkan.so not found, please install Vulkan SDK and pass VULKAN_SDK_PATH= <<<) + SAMPLE_ENABLED := 0 + else + VULKAN_SDK_LIB := $(shell echo $(VULKAN_SDK_LIB) | sed "s/ .*//" | sed "s/\/libvulkan.so//" ) + endif + + ifeq ("$(X11LIB)","") + $(info >>> WARNING - libX11.so not 
found, please install libX11.so <<<) + SAMPLE_ENABLED := 0 + endif + + HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include + HEADER_SEARCH_PATH += $(TARGET_FS)/usr/local/include + ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux) + HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include + endif + + VULKAN_HEADER := $(shell find -L $(VULKAN_SDK_PATH) $(HEADER_SEARCH_PATH) -name vulkan.h -print 2>/dev/null) + + ifeq ("$(VULKAN_HEADER)","") + $(info >>> WARNING - vulkan.h not found, please install vulkan.h <<<) + SAMPLE_ENABLED := 0 + else + VULKAN_HEADER := $(shell echo $(VULKAN_HEADER) | sed "s/ .*//" | sed "s/\/vulkan\/vulkan.h//" ) + endif +else +endif + diff --git a/Samples/simpleVulkanMMAP/main.cpp b/Samples/simpleVulkanMMAP/main.cpp new file mode 100644 index 00000000..110e6111 --- /dev/null +++ b/Samples/simpleVulkanMMAP/main.cpp @@ -0,0 +1,312 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + /* + * This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs. + * Allocating device memory and updating values in those allocations are performed by CUDA + * and the contents of the allocation are visualized by Vulkan. 
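+ *
+ * Per frame, Vulkan draws the current points and signals a semaphore that
+ * CUDA has imported; CUDA waits on it, steps the simulation on its own
+ * stream, then signals a second imported semaphore that Vulkan waits on
+ * before the next draw (see drawFrame() and importCudaExternalSemaphore()
+ * below).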
+ */ + +#include "VulkanBaseApp.h" + +#include +#include +#include +#include + +#include "MonteCarloPi.h" +#include <cuda_runtime_api.h> +#include <helper_cuda.h> + +#include "helper_multiprocess.h" + +//#define DEBUG +#ifndef DEBUG +#define ENABLE_VALIDATION (false) +#else +#define ENABLE_VALIDATION (true) +#endif + +#define NUM_SIMULATION_POINTS 50000 + +class VulkanCudaPi : public VulkanBaseApp +{ + typedef struct UniformBufferObject_st { + float frame; + } UniformBufferObject; + + VkBuffer m_inCircleBuffer, m_xyPositionBuffer; + VkDeviceMemory m_inCircleMemory, m_xyPositionMemory; + VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; + MonteCarloPiSimulation m_sim; + UniformBufferObject m_ubo; + cudaStream_t m_stream; + cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore; + using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>; + chrono_tp m_lastTime; + size_t m_lastFrame; +public: + VulkanCudaPi(size_t num_points) : + VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION), + m_inCircleBuffer(VK_NULL_HANDLE), + m_xyPositionBuffer(VK_NULL_HANDLE), + m_inCircleMemory(VK_NULL_HANDLE), + m_xyPositionMemory(VK_NULL_HANDLE), + m_sim(num_points), + m_ubo(), + m_stream(0), + m_vkWaitSemaphore(VK_NULL_HANDLE), + m_vkSignalSemaphore(VK_NULL_HANDLE), + m_cudaWaitSemaphore(), + m_cudaSignalSemaphore(), + m_lastFrame(0) { + + // Add our compiled Vulkan shader files + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, "montecarlo.vert")); + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, "montecarlo.frag")); + } + + ~VulkanCudaPi() { + if (m_stream) { + // Make sure there's no pending work before we start tearing down + checkCudaErrors(cudaStreamSynchronize(m_stream)); + checkCudaErrors(cudaStreamDestroy(m_stream)); + } + + if (m_vkSignalSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); + vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); + } + if (m_vkWaitSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); + vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); + } + if (m_xyPositionBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_xyPositionBuffer, nullptr); + } + if (m_xyPositionMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_xyPositionMemory, nullptr); + } + if (m_inCircleBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_inCircleBuffer, nullptr); + } + if (m_inCircleMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_inCircleMemory, nullptr); + } + } + + void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) { + VkBuffer vertexBuffers[] = { m_inCircleBuffer, m_xyPositionBuffer }; + VkDeviceSize offsets[] = { 0, 0 }; + vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets); + vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0); + } + + void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc) { + bindingDesc.resize(2); + attribDesc.resize(2); + + bindingDesc[0].binding = 0; + bindingDesc[0].stride = sizeof(float); + bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + bindingDesc[1].binding = 1; + bindingDesc[1].stride = sizeof(vec2); + bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + attribDesc[0].binding = 0; + attribDesc[0].location = 0; + attribDesc[0].format = VK_FORMAT_R32_SFLOAT; + attribDesc[0].offset = 0; + + attribDesc[1].binding = 1; + attribDesc[1].location = 1; + attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; +
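+ // Two vertex streams are bound: binding 0 carries the per-point in-circle
+ // flag (a single float), binding 1 the xy position (a vec2); both are backed
+ // by the CUDA-allocated buffers imported in initVulkanApp().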
attribDesc[1].offset = 0; + } + + void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) { + info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + info.topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + info.primitiveRestartEnable = VK_FALSE; + } + + void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const { + if (m_currentFrame != 0) { + // Have vulkan wait until cuda is done with the vertex buffer before rendering + // We don't do this on the first frame, as the wait semaphore hasn't been initialized yet + wait.push_back(m_vkWaitSemaphore); + // We want to wait until all the pipeline commands are complete before letting cuda work + waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + } + } + + void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const { + // Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify + signal.push_back(m_vkSignalSemaphore); + } + + void initVulkanApp() { + const size_t nVerts = m_sim.getNumPoints(); + + // Obtain cuda device id for the device corresponding to the Vulkan physical device + int deviceCount; + int cudaDevice = cudaInvalidDeviceId; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + for (int dev = 0; dev < deviceCount; ++dev) { + cudaDeviceProp devProp = { }; + checkCudaErrors(cudaGetDeviceProperties(&devProp, dev)); + if (isVkPhysicalDeviceUuid(&devProp.uuid)) { + cudaDevice = dev; + break; + } + } + if (cudaDevice == cudaInvalidDeviceId) { + throw std::runtime_error("No suitable device found!"); + } + + // On the corresponding cuda device, create the cuda stream we'll be using + checkCudaErrors(cudaSetDevice(cudaDevice)); + checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); + m_sim.initSimulation(cudaDevice, m_stream); + + importExternalBuffer((void *)(uintptr_t)m_sim.getPositionShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(vec2), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer, m_xyPositionMemory); + + importExternalBuffer((void *)(uintptr_t)m_sim.getInCircleShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(float), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer, m_inCircleMemory); + + // Create the semaphore vulkan will signal when it's done with the vertex buffer + createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); + // Create the semaphore vulkan will wait for before using the vertex buffer + createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait + importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait + importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); + } + + void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) { + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; + + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { +
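+ // This chain maps the requested Vulkan external-semaphore handle type to
+ // its CUDA counterpart; the opaque handle payload itself is filled in
+ // under the #ifdef below.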
externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + } + else { + throw std::runtime_error("Unknown handle type requested!"); + } + +#ifdef _WIN64 + externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType); +#else + externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); +#endif + + externalSemaphoreHandleDesc.flags = 0; + + checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); + } + + VkDeviceSize getUniformSize() const { + return sizeof(UniformBufferObject); + } + + void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) { + m_ubo.frame = (float)globalFrame; + void *data; + vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data); + memcpy(data, &m_ubo, sizeof(m_ubo)); + vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); + } + + std::vector<const char *> getRequiredExtensions() const { + std::vector<const char *> extensions; + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); + return extensions; + } + + std::vector<const char *> getRequiredDeviceExtensions() const { + std::vector<const char *> extensions; + + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); +#ifdef _WIN64 + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); +#else + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); +#endif /* _WIN64 */ + return extensions; + } + + void drawFrame() { + static chrono_tp startTime = std::chrono::high_resolution_clock::now(); + + chrono_tp currentTime = std::chrono::high_resolution_clock::now(); + float time = std::chrono::duration<float>(currentTime - startTime).count(); + + if (m_currentFrame == 0) { + m_lastTime = startTime; + } + + cudaExternalSemaphoreWaitParams waitParams = {}; + waitParams.flags = 0; + waitParams.params.fence.value = 0; + + cudaExternalSemaphoreSignalParams signalParams = {}; + signalParams.flags = 0; + signalParams.params.fence.value = 0; + + // Have vulkan draw the current frame... + VulkanBaseApp::drawFrame(); + // Wait for vulkan to complete its work + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream)); + // Now step the simulation + m_sim.stepSimulation(time, m_stream); + + // Signal vulkan to continue with the updated buffers + checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream)); + } +}; + +int main() +{ + VulkanCudaPi app(NUM_SIMULATION_POINTS); + app.init(); + app.mainLoop(); + return 0; +} diff --git a/Samples/simpleVulkanMMAP/montecarlo.frag b/Samples/simpleVulkanMMAP/montecarlo.frag new file mode 100644 index 00000000..c408a3fc --- /dev/null +++ b/Samples/simpleVulkanMMAP/montecarlo.frag @@ -0,0 +1,37 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#version 450 +#extension GL_ARB_separate_shader_objects : enable + +layout(location = 0) in vec3 fragColor; + +layout(location = 0) out vec4 outColor; + +void main() { + outColor = vec4(fragColor, 1.0); +} diff --git a/Samples/simpleVulkanMMAP/montecarlo.vert b/Samples/simpleVulkanMMAP/montecarlo.vert new file mode 100644 index 00000000..2bc18f98 --- /dev/null +++ b/Samples/simpleVulkanMMAP/montecarlo.vert @@ -0,0 +1,55 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
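+ *
+ * montecarlo.vert: renders each simulation point, colored by its in-circle
+ * flag and tinted over time using the frame counter from the uniform buffer.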
+ */ + +#version 450 +#extension GL_ARB_separate_shader_objects : enable + +layout(binding = 0) uniform UniformBufferObject { + float frame; +} ubo; + +layout(location = 0) in float pointInsideCircle; +layout(location = 1) in vec2 xyPos; + +layout(location = 0) out vec3 fragColor; + +const float PI = 3.1415926; + +out gl_PerVertex +{ + vec4 gl_Position; + float gl_PointSize; +}; + +void main() { + gl_PointSize = 1.0; + gl_Position = vec4(xyPos.xy, 0.0f, 1.0f); + float color_r = 1.0f + 0.5f * sin(ubo.frame / 100.0f); + float color_g = 1.0f + 0.5f * sin((ubo.frame / 100.0f) + (2.0f*PI/3.0f)); + float color_b = 1.0f; + fragColor = vec3(pointInsideCircle.x * color_r, pointInsideCircle.x * color_g, color_b); +} diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2015.sln b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2015.sln new file mode 100644 index 00000000..fc58a805 --- /dev/null +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleVulkanMMAP", "simpleVulkanMMAP_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2015.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2015.vcxproj new file mode 100644 index 00000000..456e74bb --- /dev/null +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2015.vcxproj @@ -0,0 +1,123 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleVulkanMMAP_vs2015 + simpleVulkanMMAP + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(VULKAN_SDK)/include; + + + Console + cuda.lib;vulkan-1.lib;glfw3dll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir);../../common/lib/$(PlatformName);$(VULKAN_SDK)/Lib; + $(OutDir)/simpleVulkanMMAP.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.sln b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.sln new file mode 100644 index 00000000..b53b2481 
--- /dev/null +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleVulkanMMAP", "simpleVulkanMMAP_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj new file mode 100644 index 00000000..cf8935c7 --- /dev/null +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj @@ -0,0 +1,128 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleVulkanMMAP_vs2017 + simpleVulkanMMAP + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(VULKAN_SDK)/include; + + + Console + cuda.lib;vulkan-1.lib;glfw3dll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir);../../common/lib/$(PlatformName);$(VULKAN_SDK)/Lib; + $(OutDir)/simpleVulkanMMAP.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.sln b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.sln new file mode 100644 index 00000000..2b62146c --- /dev/null +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleVulkanMMAP", "simpleVulkanMMAP_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj new file mode 100644 index 00000000..38d94608 --- /dev/null +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj @@ -0,0 +1,124 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleVulkanMMAP_vs2019 + simpleVulkanMMAP + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(VULKAN_SDK)/include; + + + Console + cuda.lib;vulkan-1.lib;glfw3dll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir);../../common/lib/$(PlatformName);$(VULKAN_SDK)/Lib; + $(OutDir)/simpleVulkanMMAP.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Samples/systemWideAtomics/Makefile b/Samples/systemWideAtomics/Makefile index 14d6dff0..f80c167c 100644 --- a/Samples/systemWideAtomics/Makefile +++ b/Samples/systemWideAtomics/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -252,6 +273,12 @@ ifeq ($(TARGET_ARCH),aarch64) SAMPLE_ENABLED := 0 endif +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - systemWideAtomics is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -265,9 +292,9 @@ LIBRARIES := # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 60 61 70 72 75 +SMS ?= 60 61 70 72 75 80 else -SMS ?= 60 61 70 75 +SMS ?= 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/systemWideAtomics/NsightEclipse.xml b/Samples/systemWideAtomics/NsightEclipse.xml index 5108b3af..4bc50b92 100644 --- a/Samples/systemWideAtomics/NsightEclipse.xml +++ b/Samples/systemWideAtomics/NsightEclipse.xml @@ -41,6 +41,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/systemWideAtomics/README.md b/Samples/systemWideAtomics/README.md index 2cb2e9f9..3b008e59 100644 --- a/Samples/systemWideAtomics/README.md 
+++ b/Samples/systemWideAtomics/README.md @@ -10,7 +10,7 @@ Atomic Intrinsics, Unified Memory ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/tf32TensorCoreGemm/Makefile b/Samples/tf32TensorCoreGemm/Makefile new file mode 100644 index 00000000..9bfa32d2 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/Makefile @@ -0,0 +1,362 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L 
$(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - tf32TensorCoreGemm is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - tf32TensorCoreGemm is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 500) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 5.0.0 <<<) + else + $(info >>> Waiving build. 
Minimum GCC version required is 5.0.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +SMS ?= 80 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: tf32TensorCoreGemm + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +tf32TensorCoreGemm.o:tf32TensorCoreGemm.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +tf32TensorCoreGemm: tf32TensorCoreGemm.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./tf32TensorCoreGemm + +clean: + rm -f tf32TensorCoreGemm tf32TensorCoreGemm.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/tf32TensorCoreGemm + +clobber: clean diff --git a/Samples/tf32TensorCoreGemm/NsightEclipse.xml b/Samples/tf32TensorCoreGemm/NsightEclipse.xml new file mode 100644 index 00000000..14258688 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/NsightEclipse.xml @@ -0,0 +1,67 @@ + + + + tf32TensorCoreGemm + + --std=c++11 + + + cudaMalloc + cudaDeviceSynchronize + cudaFuncSetAttribute + cudaEventCreate + cudaEventRecord + cudaEventSynchronize + cudaEventElapsedTime + cudaFree + + + whole + + ./ + ../ + ../../common/inc + + + Matrix Multiply + WMMA + Tensor Cores + + + matrix multiply + Async copy + CPP11 + GCC 5.0.0 + + + + + + true + tf32TensorCoreGemm.cu + + 1:CUDA Basic Topics + + sm80 + + + x86_64 + linux + + + aarch64 + + + windows7 + + + ppc64le + linux + + + + 8.0 + + tf32 Tensor Core GEMM + exe + diff --git a/Samples/tf32TensorCoreGemm/README.md b/Samples/tf32TensorCoreGemm/README.md new file mode 100644 index 00000000..62b61f7f --- /dev/null +++ b/Samples/tf32TensorCoreGemm/README.md @@ -0,0 +1,70 @@ +# tf32TensorCoreGemm - tf32 Tensor Core GEMM + +## Description + +A CUDA sample demonstrating tf32 (e8m10) GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API introduced with CUDA 11, which targets the Tensor Cores of the Ampere chip family for faster matrix operations. This sample also uses the asynchronous copy feature of the CUDA pipeline interface for global-to-shared-memory loads, which improves kernel performance and reduces register pressure.
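+A minimal sketch of the primitives-style async copy pattern described above (the kernel name `copyTileAsync` and the 128-element tile are invented for this illustration; the sample itself stages whole C tiles and A/B chunks this way):
+```
+#include <cuda_pipeline.h>
+
+// Launch with 128-thread blocks on an sm_80 device. Each lane stages one
+// 16-byte element from global to shared memory without routing it through
+// registers, then waits for the committed batch to land before computing.
+__global__ void copyTileAsync(const float4 *gmem) {
+  __shared__ float4 tile[128];
+  __pipeline_memcpy_async(&tile[threadIdx.x], &gmem[threadIdx.x], sizeof(float4));
+  __pipeline_commit();        // close the current batch of async copies
+  __pipeline_wait_prior(0);   // block until every committed batch has arrived
+  __syncthreads();
+  // ... compute on tile ...
+}
+```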
+ +## Key Concepts + +Matrix Multiply, WMMA, Tensor Cores + +## Supported SM Architectures + +[SM 8.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMalloc, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEventRecord, cudaEventSynchronize, cudaEventElapsedTime, cudaFree + +## Prerequisites + +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples' makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm.cu b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm.cu new file mode 100644 index 00000000..dcc69155 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm.cu @@ -0,0 +1,863 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// CUDA sample demonstrating a tf32 (E8M10) GEMM computation using the Warp Matrix Multiply +// and Accumulate API introduced in CUDA 11.0. + +// In this program, the compute_gemm kernel computes the result of a matrix multiplication +// and addition: D = alpha * A * B + beta * C. The dimensions of both C and D matrices +// are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x K_GLOBAL (row-major), the B matrix +// is K_GLOBAL x N_GLOBAL (column-major). +// In that kernel, each CTA computes one 128 x 128 tile of the resulting matrix +// per iteration. When the tile is computed, the CTA stores it to the global memory +// and begins a new iteration, selecting a new 128 x 128 tile to compute. +// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes eight +// 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array. 
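+// (With the default configuration below this covers the whole tile: 8 warps
+// x 8 subtiles x 16 x 16 elements = 128 x 128 outputs per CTA per iteration.)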
+// Warps compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by +// moving through the K_GLOBAL dimension of the A and B matrices and accumulating +// the intermediate result in the local thread state. + +// There are a number of simple optimizations used in the algorithm: +// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to +// shared memory. After that is done, each warp loads the C matrix fragments from +// shared memory, thus avoiding a random global memory access. +// - On each internal iteration, the CTA copies a portion of the A and B matrices from +// global memory to shared memory. After that, all warps in the CTA reuse the A and B +// data from shared memory, thus reducing the number of data copies from global memory. +// - The portions of the A and B matrices are stored in shared memory with an additional +// padding (skew) to reduce the number of shared memory access bank conflicts. +// (See a detailed explanation near the SKEW_FLOAT macro definition.) +// - When the CTA finishes computing the tiles of the resulting matrix, each warp stores +// its subtiles to shared memory. The CTA then copies the shared memory contents to +// global memory, again avoiding redundant random global memory accesses. +// - Note that the CTA tile size is chosen to maximize the GPU register utilization, +// but carefully enough to avoid local memory use. + +#include <assert.h> +#include <cuda.h> +#include <mma.h> +#include <cuda_pipeline.h> +#include <stdio.h> + +// helper functions and utilities to work with CUDA +#include <helper_cuda.h> +#include <helper_functions.h> + +// Externally configurable parameters. + +// Switch for choosing the C++ interface for the CUDA pipeline +// vs. the primitives interface. +#define USE_CPP_API 0 + +#ifndef CPU_DEBUG +// Set this to 1 to verify the correctness of the GPU-computed matrix. +#define CPU_DEBUG 0 +#endif + +#ifndef SHARED_MEMORY_LIMIT_64K +// Set this to 0 to use more than 64 Kb of shared memory to cache data, to +// improve the performance of the computations on GPU. +// Note that you need a GPU that can have more than 64 Kb of shared memory +// per multiprocessor. +#define SHARED_MEMORY_LIMIT_64K 0 +#endif + +// GPU configuration. + +#define WARP_SIZE 32 + +// MMA matrix tile dimensions. + +#define M 16 +#define N 16 +#define K 8 + +// GEMM configuration. + +#define M_TILES 512 +#define N_TILES 512 +#define K_TILES 512 + +#define M_GLOBAL (M * M_TILES) +#define N_GLOBAL (N * N_TILES) +#define K_GLOBAL (K * K_TILES) + +#define C_LAYOUT wmma::mem_row_major + +// Implementation constants. + +#define WARPS_PER_BLOCK 8 +#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) + +#if SHARED_MEMORY_LIMIT_64K +// With only 64 Kb shared memory available, we can fit two 8-tile chunks of +// the A and B matrix data, that is (M = 16) * (K = 8) * 8 * (CHUNK_K = 8) +// * sizeof(float) = 32 Kb each. +// (i.e. two 8x8 arrays of tiles of 16x8 float-typed elements per CTA). +// But we cannot accommodate the 8 Kb total skew overhead, without which the performance +// would be severely impacted. So we choose to reduce the chunk size in half, +// i.e. the amount of A and B matrix data we cache in shared memory. +// Accordingly, this doubles the number of outer iterations across the global K +// dimension, which only slightly impacts the performance.
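+// (Footprint arithmetic, using the tile constants defined below: CHUNK_K = 4
+// caches 2 * 128 * (4 * 8 + 8) * 4 bytes = 40 KB of A/B data per CTA, versus
+// 2 * 128 * (8 * 8 + 8) * 4 bytes = 72 KB for CHUNK_K = 8.)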
+#define CHUNK_K 4 +#else +#define CHUNK_K 8 +#endif + +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(float)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) + +#define BLOCK_ROW_WARPS 2 +#define BLOCK_COL_WARPS 4 + +#define WARP_ROW_TILES 4 +#define WARP_COL_TILES 2 + +#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS) +#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS) + +#define GLOBAL_MEM_STRIDE N_GLOBAL + +#define SHMEM_STRIDE (N * BLOCK_ROW_TILES) +#define SHMEM_OFFSET (N * WARP_ROW_TILES) + +// The macro below is used to shift rows of the A matrix and columns of the B matrix +// in shared memory to minimize possible bank conflicts. +// Before performing the nvcuda::wmma::mma_sync operation, the warp must load the matrix +// data using the nvcuda::wmma::load_matrix_sync operation. Although the memory access pattern +// is not specified for that function, each lane in the warp can read one or multiple matrix +// elements from different matrix rows or columns. +// For shared memory, such access can result in bank conflicts if different rows / columns +// of the matrix map to the same bank. By shifting each row and column by a few bytes, we +// make sure that they map to different banks, thus reducing the number of possible bank +// conflicts. +// The number of 8 four-byte "float" elements is chosen as the minimum possible shift because +// we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. +#define SKEW_FLOAT 8 + +#define checkKernelErrors(expr) do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ +} while(0) + +enum kernels +{ + tf32mma_shmem_gemm_async_copy = 0, // tf32 MMA shmem using kernel with async_copy + tf32mma_shmem_gemm = 1, // tf32 MMA shmem using kernel normal copy (without async_copy). + simple_tf32mma_gemm = 2 // tf32 MMA non-shmem using simple kernel. +}; + +const char* kernelNames[] = {"compute_tf32gemm_async_copy", "compute_tf32gemm", + "simple_wmma_tf32gemm"}; + +using namespace nvcuda; +namespace nvcuda_namespace = nvcuda::experimental; + +__host__ void init_host_matrices(float *a, float *b, float *c) +{ + for (int i = 0; i < M_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + a[i*K_GLOBAL+j] = (float)(rand() % 3); + } + } + + for (int i = 0; i < N_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + b[i*K_GLOBAL+j] = (float)(rand() % 3); + } + } + + for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { + c[t] = (float)(rand() % 3); + } +} + +__global__ void compute_tf32gemm(const float *A, const float *B, const float *C, float *D, float alpha, float beta) +{ +#if __CUDA_ARCH__ >= 800 + extern __shared__ float shmem[][CHUNK_K * K + SKEW_FLOAT]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + // This pointer is used to access the C and D matrix tiles this warp computes. 
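+ // With the defaults (SHMEM_STRIDE = 128, SHMEM_OFFSET = 64, BLOCK_ROW_WARPS = 2),
+ // the 8 warps form a 4 x 2 grid of 32 x 64 sub-tiles of the CTA's 128 x 128 tile:
+ // warpId / 2 picks the 32-row band and warpId % 2 picks the 64-column half.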
+ float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. + float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + + // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy memory from to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((int4*)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + } + + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const float *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. 
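+ // With the defaults, CHUNK_LINE_BYTES = 256 and WARP_COPY_BYTES = 512, so
+ // CHUNK_COPY_LINES_PER_WARP = 2 and CHUNK_COPY_LINE_LANES = 16: lanes 0-15 move
+ // one 256-byte line as sixteen int4 transfers while lanes 16-31 move the next.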
+ const float *lane_ptr = (warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL); + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + // Copy 16 bytes at once in each lane. + *((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); + + // Advance the global memory pointer and the shared memory index. + lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId/BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); + const float *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_FLOAT); +#pragma unroll + for (int t = 0; t < a[i].num_elements; t++) { + a[i].x[t] = wmma::__float_to_tf32(a[i].x[t]); + } +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + const float *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_FLOAT); +#pragma unroll + for (int t = 0; t < b[j].num_elements; t++) { + b[j].x[t] = wmma::__float_to_tf32(b[j].x[t]); + } + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. + float *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +#endif +} + +__global__ void compute_tf32gemm_async_copy(const float *A, const float *B, const float *C, float *D, float alpha, float beta) +{ +#if __CUDA_ARCH__ >= 800 + extern __shared__ float shmem[][CHUNK_K * K + SKEW_FLOAT]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + // This pointer is used to access the C and D matrix tiles this warp computes. 
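+ // (This kernel mirrors compute_tf32gemm above; the only difference is that the
+ // C-tile and A/B-chunk copies go through the asynchronous pipeline, using either
+ // nvcuda::experimental::pipeline or the __pipeline_* primitives per USE_CPP_API.)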
+ float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. + float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + +#if USE_CPP_API + nvcuda_namespace::pipeline pipe; +#endif + // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy memory from to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < N; i++) { +#if USE_CPP_API + nvcuda_namespace::memcpy_async(*((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId), + *((int4*)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId), + pipe); + pipe.commit(); +#else + __pipeline_memcpy_async((reinterpret_cast(&shmem_warp_stream_ptr[(SHMEM_STRIDE * i)])) + laneId, + (reinterpret_cast(&src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i)])) + laneId, + sizeof(int4)); + __pipeline_commit(); +#endif + } + +#if USE_CPP_API + pipe.wait_prior<0>(); +#else + __pipeline_wait_prior(0); +#endif + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const float *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. 
+ // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. + const float *lane_ptr = (warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL); + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + // Copy 16 bytes at once in each lane. +#if USE_CPP_API + nvcuda_namespace::memcpy_async(*((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)), + *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)), pipe); + pipe.commit(); +#else + __pipeline_memcpy_async((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES), + (int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES), sizeof(int4)); + __pipeline_commit(); +#endif + // Advance the global memory pointer and the shared memory index. + lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + +#if USE_CPP_API + pipe.wait_prior<0>(); +#else + __pipeline_wait_prior(0); +#endif + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); + const float *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_FLOAT); + +#pragma unroll + for (int t = 0; t < a[i].num_elements; t++) { + a[i].x[t] = wmma::__float_to_tf32(a[i].x[t]); + } +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + const float *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_FLOAT); +#pragma unroll + for (int t = 0; t < b[j].num_elements; t++) { + b[j].x[t] = wmma::__float_to_tf32(b[j].x[t]); + } + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * N + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. 
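+ // As with the C-tile load, each warp streams its 16 rows of the 128-float-wide
+ // shared-memory tile back to global memory, one int4 (four floats) per lane per row.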
+ float *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < N; i++) { + *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +#endif +} + +// Performs an MxNxK tf32 GEMM (C = alpha * A * B + beta * C) assuming: +// 1) Matrices are packed in memory. +// 2) M, N and K are multiples of 16, 16 and 8 respectively. +// 3) A is a row-major and B is a column-major matrix. +// Note: This is a less performant version of the compute_tf32gemm kernel. It is designed for +// demonstration purposes only, to show CUDA WMMA API use without relying on the +// availability of shared memory. +__global__ void simple_wmma_tf32gemm(float *a, float *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) +{ +#if __CUDA_ARCH__ >= 800 + // Leading dimensions. Packed with no transpositions. + int lda = k_ld; + int ldb = k_ld; + int ldc = n_ld; + + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Declare the fragments + wmma::fragment<wmma::matrix_a, M, N, K, wmma::precision::tf32, wmma::row_major> a_frag; + wmma::fragment<wmma::matrix_b, M, N, K, wmma::precision::tf32, wmma::col_major> b_frag; + wmma::fragment<wmma::accumulator, M, N, K, float> acc_frag; + wmma::fragment<wmma::accumulator, M, N, K, float> c_frag; + + wmma::fill_fragment(acc_frag, 0.0f); + + // Loop over k + for (int i = 0; i < k_ld; i += K) { + int aCol = i; + int aRow = warpM * M; + + int bCol = warpN * N; + int bRow = i; + + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb); + + #pragma unroll + for (int t = 0; t < a_frag.num_elements; t++) { + a_frag.x[t] = wmma::__float_to_tf32(a_frag.x[t]); + } + + #pragma unroll + for (int t = 0; t < b_frag.num_elements; t++) { + b_frag.x[t] = wmma::__float_to_tf32(b_frag.x[t]); + } + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } + } + + // Load in the current value of c, scale it by beta, and add it to our result scaled by alpha + int cCol = warpN * N; + int cRow = warpM * M; + + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); + + for (int i = 0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } + + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major); + } +#endif +} + +__host__ void matMultiplyOnHost(float *A, float *B, float *C, + float alpha, float beta, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns) +{ + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + float temp = 0.0; + + for (int k = 0; k < numAColumns; k++) { + temp += A[i * numAColumns + k] * B[j * numBRows + k]; + } + + C[i*numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + } + } +} + +int main(int argc, char **argv) +{ + printf("Initializing...\n"); + + int dev = findCudaDevice(argc, (const char **)argv); + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + // tf32 Tensor Cores require a GPU of Ampere (SM 8.X) architecture or higher. + if (deviceProp.major < 8) { + printf("tf32TensorCoreGemm requires SM 8.0 or higher to use Tensor Cores. 
Exiting...\n"); + exit(EXIT_WAIVED); + } + + printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); + printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); + printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); + + float *A_h = NULL; + float *B_h = NULL; + float *C_h = NULL; +#if CPU_DEBUG + float *result_hD = NULL; + float *result_host = NULL; +#endif + + A_h = (float*) malloc(sizeof(float) * M_GLOBAL * K_GLOBAL); + B_h = (float*) malloc(sizeof(float) * K_GLOBAL * N_GLOBAL); + C_h = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#if CPU_DEBUG + result_hD = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); + result_host = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#endif + + float *A = NULL; + float *B = NULL; + float *C = NULL; + float *D = NULL; + + checkCudaErrors(cudaMalloc((void**)&A, sizeof(float) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&B, sizeof(float) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&C, sizeof(float) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void**)&D, sizeof(float) * M_GLOBAL * N_GLOBAL)); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + init_host_matrices(A_h, B_h, C_h); + + printf("Preparing data for GPU...\n"); + + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(float) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(float) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL)); + + enum { + // Compute the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the maximum of those + // two numbers. 
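+ // With the defaults (CHUNK_K = 8) this evaluates to MAX(72 KB, 64 KB) = 72 KB:
+ // the A/B chunk cache is 2 * 128 * (64 + 8) floats and the C/D tile is
+ // 128 * 128 floats.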
SHMEM_SZ = MAX(sizeof(float) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_FLOAT) * 2, + M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float)) + }; + + printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); + + const float alpha = 1.1f; + const float beta = 1.2f; + + cudaEvent_t start, stop; + + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start)); + + // kernel to run - default (tf32mma_shmem_gemm_async_copy == 0) + kernels selected_kernel = tf32mma_shmem_gemm_async_copy; + + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + if (kernel_number < 3) { + selected_kernel = (kernels)kernel_number; + } + else { + printf("Error: kernel number should be between 0 and 2, you have entered %d\n", kernel_number); + exit(EXIT_FAILURE); + } + } + + // If enough shared memory is available on the GPU, use the high-performance kernel. + if ((deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) && (selected_kernel != simple_tf32mma_gemm)) { + printf("Computing using high performance kernel = %d - %s\n", selected_kernel, kernelNames[selected_kernel]); + + switch (selected_kernel) + { + case tf32mma_shmem_gemm_async_copy : + default: + checkCudaErrors(cudaFuncSetAttribute(compute_tf32gemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_tf32gemm_async_copy<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta))); + break; + case tf32mma_shmem_gemm : + checkCudaErrors(cudaFuncSetAttribute(compute_tf32gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_tf32gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta))); + break; + } +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + else { + dim3 gridDim; + dim3 blockDim; + + // blockDim.x must be a multiple of warpSize + // 128x4 means we have 16 warps and a block computes a 64x64 output tile + blockDim.x = 128; + blockDim.y = 4; + + gridDim.x = (M_GLOBAL + (M * blockDim.x / 32 - 1)) / (M * blockDim.x / 32); + gridDim.y = (N_GLOBAL + N * blockDim.y - 1) / (N * blockDim.y); + + printf("Computing... 
using simple_wmma_gemm kernel\n"); + simple_wmma_tf32gemm<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + + checkCudaErrors(cudaEventRecord(stop)); + checkCudaErrors(cudaEventSynchronize(stop)); + +#if CPU_DEBUG + printf("Verifying correctness of the computations...\n"); + + memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL); + + matMultiplyOnHost(A_h, B_h, result_host, + alpha, beta, + M_GLOBAL, K_GLOBAL, + K_GLOBAL, N_GLOBAL, + M_GLOBAL, N_GLOBAL); + + for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { + if (fabs(result_hD[i] - result_host[i]) > 0.1f) { + printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]); + } + } + free(result_hD); + free(result_host); +#endif + + float milliseconds = 0; + + checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); + + printf("Time: %f ms\n", milliseconds); + printf("TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12); + + free(A_h); + free(B_h); + free(C_h); + checkCudaErrors(cudaFree((void*)A)); + checkCudaErrors(cudaFree((void*)B)); + checkCudaErrors(cudaFree((void*)C)); + checkCudaErrors(cudaFree((void*)D)); + + return 0; +} diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2015.sln b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2015.sln new file mode 100644 index 00000000..8cbb8758 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tf32TensorCoreGemm", "tf32TensorCoreGemm_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2015.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2015.vcxproj new file mode 100644 index 00000000..d0b84954 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + tf32TensorCoreGemm_vs2015 + tf32TensorCoreGemm + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/tf32TensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) 
+ ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.sln b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.sln new file mode 100644 index 00000000..2274f1dc --- /dev/null +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tf32TensorCoreGemm", "tf32TensorCoreGemm_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj new file mode 100644 index 00000000..1a09c012 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj @@ -0,0 +1,112 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + tf32TensorCoreGemm_vs2017 + tf32TensorCoreGemm + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/tf32TensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.sln b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.sln new file mode 100644 index 00000000..b269b647 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tf32TensorCoreGemm", "tf32TensorCoreGemm_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = 
postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj new file mode 100644 index 00000000..f85ffab2 --- /dev/null +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + tf32TensorCoreGemm_vs2019 + tf32TensorCoreGemm + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/tf32TensorCoreGemm.exe + + + compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/vectorAddMMAP/Makefile b/Samples/vectorAddMMAP/Makefile index 09a07c02..cf28edf0 100644 --- a/Samples/vectorAddMMAP/Makefile +++ b/Samples/vectorAddMMAP/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
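The vectorAddMMAP Makefile changes below raise the lowest gencode target from compute_30 to compute_35 while keeping a PTX-only (`code=compute_XX`) entry; that PTX entry is what keeps the binary forward-compatible, because the driver can JIT-compile the embedded PTX for GPUs newer than anything in $(SMS). A rough sketch of that load path through the driver API, assuming `ptxSource` is an in-memory PTX image and eliding error handling:

```
// Sketch: a PTX-only gencode target stays forward-compatible because the
// driver JIT-compiles the PTX for the installed GPU at module-load time.
// 'ptxSource' is an assumed null-terminated PTX image; errors are ignored.
#include <cuda.h>

CUmodule loadPtxModule(const char *ptxSource) {
  CUmodule module = NULL;
  // Zero JIT options: the driver targets the current device's real SM.
  cuModuleLoadDataEx(&module, ptxSource, 0, NULL, NULL);
  return module;
}
```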
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -254,6 +275,12 @@ ifeq ($(TARGET_ARCH),aarch64) SAMPLE_ENABLED := 0 endif +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - vectorAddMMAP is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -275,8 +302,8 @@ ifeq ($(GENCODE_FLAGS),) $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) ifeq ($(SMS),) -# Generate PTX code from SM 30 -GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +# Generate PTX code from SM 35 +GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35 endif # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility @@ -302,6 +329,10 @@ else CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif @@ -316,12 +347,19 @@ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) diff --git a/Samples/vectorAddMMAP/README.md b/Samples/vectorAddMMAP/README.md index 9e5ec83e..82c26f17 100644 --- a/Samples/vectorAddMMAP/README.md +++ b/Samples/vectorAddMMAP/README.md @@ -10,7 +10,7 @@ CUDA Driver API, Vector Addition, MMAP ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
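With SM 3.0 dropped and SM 8.0 added, the supported range above is now 3.5 through 8.0. For reference, a driver-API sample can confirm where the active device falls with a query like the following sketch (the helper name is illustrative, and an initialized driver context is assumed):

```
// Sketch: query the compute capability the SM list above refers to.
#include <cstdio>
#include <cuda.h>

void printComputeCapability(CUdevice device) {
  int major = 0, minor = 0;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  printf("Device is SM %d.%d\n", major, minor);  // e.g. 8.0 on Ampere GA100
}
```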
## Build and Run diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2012.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2012.vcxproj index 90fb8a9d..d923197d 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2012.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2012.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/vectorAddMMAP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2013.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2013.vcxproj index 005b1b94..84c52762 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2013.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2013.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/vectorAddMMAP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2015.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2015.vcxproj index 56c1e10f..7f38991d 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2015.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/vectorAddMMAP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj index ad90a3bb..6c610fd4 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/vectorAddMMAP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj index 95bb5e57..93a4921d 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/vectorAddMMAP.exe - compute_30,compute_30; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/vectorAdd_nvrtc/Makefile b/Samples/vectorAdd_nvrtc/Makefile index fddc3092..3d991fe9 100644 --- a/Samples/vectorAdd_nvrtc/Makefile +++ b/Samples/vectorAdd_nvrtc/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
+ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -280,6 +301,10 @@ else CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif @@ -294,12 +319,19 @@ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef 
TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) diff --git a/Samples/vectorAdd_nvrtc/README.md b/Samples/vectorAdd_nvrtc/README.md index 98deedce..c552178a 100644 --- a/Samples/vectorAdd_nvrtc/README.md +++ b/Samples/vectorAdd_nvrtc/README.md @@ -10,11 +10,11 @@ CUDA Driver API, Vector Addition, Runtime Compilation ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -30,7 +30,7 @@ cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -70,29 +70,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
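The driver-API calls the vectorAdd_nvrtc README lists (cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH) follow the usual allocate/upload/launch/download sequence around the NVRTC-built kernel. A condensed sketch, assuming `kernel` is the CUfunction extracted from the compiled module and eliding error checks:

```
// Sketch of the allocate/upload/launch/download flow behind the API list
// above. 'kernel' is assumed to be the CUfunction from the NVRTC-compiled
// module, with signature (const float*, const float*, float*, int).
#include <cuda.h>

void runVectorAdd(CUfunction kernel, const float *hA, const float *hB,
                  float *hC, int n) {
  CUdeviceptr dA, dB, dC;
  cuMemAlloc(&dA, n * sizeof(float));
  cuMemAlloc(&dB, n * sizeof(float));
  cuMemAlloc(&dC, n * sizeof(float));
  cuMemcpyHtoD(dA, hA, n * sizeof(float));
  cuMemcpyHtoD(dB, hB, n * sizeof(float));

  void *args[] = {&dA, &dB, &dC, &n};
  int threads = 256, blocks = (n + threads - 1) / threads;
  cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, args, NULL);

  cuMemcpyDtoH(hC, dC, n * sizeof(float));
  cuMemFree(dA);
  cuMemFree(dB);
  cuMemFree(dC);
}
```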
- ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/vulkanImageCUDA/Makefile b/Samples/vulkanImageCUDA/Makefile index 4e272deb..c229af9a 100644 --- a/Samples/vulkanImageCUDA/Makefile +++ b/Samples/vulkanImageCUDA/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 
+225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -268,12 +289,18 @@ include ./findvulkan.mk # Vulkan specific libraries ifeq ($(TARGET_OS),linux) - LIBRARIES += -L $(VULKAN_SDK_PATH)/lib + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + LIBRARIES += -L$(VULKAN_SDK_LIB) -lvulkan + LIBRARIES += -lglfw + INCLUDES += -I$(VULKAN_HEADER) + else + LIBRARIES += -L$(VULKAN_SDK_LIB) LIBRARIES += `pkg-config --static --libs glfw3` -lvulkan - INCLUDES += `pkg-config --static --cflags glfw3` -I$(VULKAN_SDK_PATH)/include + INCLUDES += `pkg-config --static --cflags glfw3` -I$(VULKAN_HEADER) + endif endif -#Detect if installed version of GCC supports C++11 +#Detect if installed version of GCC supports required C++11 ifeq ($(TARGET_OS),linux) empty := space := $(empty) $(empty) @@ -295,16 +322,16 @@ ifeq ($(TARGET_OS),linux) ifeq ($(IS_MIN_VERSION), 1) $(info >>> GCC Version is greater or equal to 4.7.0 <<<) else - $(info >>> Waiving build. Minimum GCC version required for C++11 is 4.7.0 <<<) + $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<) SAMPLE_ENABLED := 0 endif endif # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) diff --git a/Samples/vulkanImageCUDA/NsightEclipse.xml b/Samples/vulkanImageCUDA/NsightEclipse.xml index 409a488c..74d9e421 100644 --- a/Samples/vulkanImageCUDA/NsightEclipse.xml +++ b/Samples/vulkanImageCUDA/NsightEclipse.xml @@ -47,7 +47,6 @@ 1:CUDA Advanced Topics 1:CUDA Vulkan Interop - sm30 sm35 sm37 sm50 @@ -57,6 +56,7 @@ sm70 sm72 sm75 + sm80 x86_64 diff --git a/Samples/vulkanImageCUDA/README.md b/Samples/vulkanImageCUDA/README.md index e69cbf5e..75d2bc0c 100644 --- a/Samples/vulkanImageCUDA/README.md +++ b/Samples/vulkanImageCUDA/README.md @@ -10,7 +10,7 @@ Graphics Interop, CUDA Vulkan Interop, Data Parallel Algorithms ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray, 
cudaImportE ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/vulkanImageCUDA/findvulkan.mk b/Samples/vulkanImageCUDA/findvulkan.mk index 8946b6d5..47016fd7 100644 --- a/Samples/vulkanImageCUDA/findvulkan.mk +++ b/Samples/vulkanImageCUDA/findvulkan.mk @@ -51,7 +51,7 @@ ifeq ("$(TARGET_OS)","linux") endif ifeq ("$(TARGET_OS)","linux") - # Each set of Linux Distros have different paths for where to find their GLM/GLFW3 libraries reside + # Each set of Linux Distros have different paths for where to find libraries UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu >/dev/null 2>&1; echo $$?) FEDORA = $(shell echo $(DISTRO) | grep -i fedora >/dev/null 2>&1; echo $$?) RHEL = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?) @@ -107,16 +107,17 @@ ifeq ("$(TARGET_OS)","linux") VULKAN_SDK_PATH ?= ${VULKAN_SDK} ifeq ("$(VULKAN_SDK_PATH)","") - $(info >>> WARNING - Vulkan SDK not found, please install Vulkan SDK <<<) - SAMPLE_ENABLED := 0 + VULKAN_SDK_PATH := $(DFLT_PATH) endif - VULKAN_SDK_LIB := $(shell find -L $(VULKAN_SDK_PATH)/lib -name libvulkan.so -print 2>/dev/null) + VULKAN_SDK_LIB := $(shell find -L $(VULKAN_SDK_PATH) -name libvulkan.so -print 2>/dev/null) X11LIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libX11.so -print 2>/dev/null) ifeq ("$(VULKAN_SDK_LIB)","") - $(info >>> WARNING - libvulkan.so not found, please install libvulkan.so <<<) + $(info >>> WARNING - libvulkan.so not found, please install Vulkan SDK and pass VULKAN_SDK_PATH= <<<) SAMPLE_ENABLED := 0 + else + VULKAN_SDK_LIB := $(shell echo $(VULKAN_SDK_LIB) | sed "s/ .*//" | sed "s/\/libvulkan.so//" ) endif ifeq ("$(X11LIB)","") @@ -132,11 +133,13 @@ ifeq ("$(TARGET_OS)","linux") HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include endif - VULKANHEADER := $(shell find -L $(VULKAN_SDK_PATH)/include -name vulkan.h -print 2>/dev/null) + VULKAN_HEADER := $(shell find -L $(VULKAN_SDK_PATH) $(HEADER_SEARCH_PATH) -name vulkan.h -print 2>/dev/null) - ifeq ("$(VULKANHEADER)","") + ifeq ("$(VULKAN_HEADER)","") $(info >>> WARNING - vulkan.h not found, please install vulkan.h <<<) SAMPLE_ENABLED := 0 + else + VULKAN_HEADER := $(shell echo $(VULKAN_HEADER) | sed "s/ .*//" | sed "s/\/vulkan\/vulkan.h//" ) endif else endif diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2015.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2015.vcxproj index 40f87ae3..1a70a7e4 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2015.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/vulkanImageCUDA.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj index 6709823c..95fdc6f9 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj +++ 
b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/vulkanImageCUDA.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj index a3319f69..9a9d18ba 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/vulkanImageCUDA.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/warpAggregatedAtomicsCG/Makefile b/Samples/warpAggregatedAtomicsCG/Makefile index 51b4ad56..71942c7e 100644 --- a/Samples/warpAggregatedAtomicsCG/Makefile +++ b/Samples/warpAggregatedAtomicsCG/Makefile @@ -72,9 +72,9 @@ endif # architecture HOST_ARCH := $(shell uname -m) TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) TARGET_SIZE := 64 else ifneq (,$(filter $(TARGET_ARCH),armv7l)) TARGET_SIZE := 32 @@ -85,8 +85,17 @@ ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) else $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
endif endif @@ -141,6 +150,8 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) else ifeq ($(TARGET_OS), android) HOST_COMPILER ?= aarch64-linux-android-clang++ endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ endif @@ -185,19 +196,27 @@ ifneq ($(TARGET_ARCH),$(HOST_ARCH)) CCFLAGS += --sysroot=$(TARGET_FS) endif LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) endif endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket endif # Install directory of different arch @@ -206,6 +225,8 @@ ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) @@ -245,11 +266,38 @@ LIBRARIES := ################################################################################ +#Detect if installed version of GCC supports required C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. 
+ GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 470) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 4.7.0 <<<) + SAMPLE_ENABLED := 0 + endif +endif + # Gencode arguments ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 30 35 37 50 52 60 61 70 72 75 +SMS ?= 35 37 50 52 60 61 70 72 75 80 else -SMS ?= 30 35 37 50 52 60 61 70 75 +SMS ?= 35 37 50 52 60 61 70 75 80 endif ifeq ($(SMS),) @@ -268,6 +316,8 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) endif endif +ALL_CCFLAGS += --std=c++11 + ifeq ($(SAMPLE_ENABLED),0) EXEC ?= @echo "[@]" endif diff --git a/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml index 67bf4eed..a1e6f202 100644 --- a/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml +++ b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml @@ -2,7 +2,10 @@ warpAggregatedAtomicsCG - + + --std=c++11 + + ./ ../ @@ -16,6 +19,7 @@ GPGPU Cooperative Groups Atomic + CPP11 @@ -26,16 +30,6 @@ 1:CUDA Advanced Topics - sm30 - sm35 - sm37 - sm50 - sm52 - sm60 - sm61 - sm70 - sm72 - sm75 x86_64 diff --git a/Samples/warpAggregatedAtomicsCG/README.md b/Samples/warpAggregatedAtomicsCG/README.md index d2da39ce..5030b8f7 100644 --- a/Samples/warpAggregatedAtomicsCG/README.md +++ b/Samples/warpAggregatedAtomicsCG/README.md @@ -2,7 +2,7 @@ ## Description -This sample demonstrates how using Cooperative Groups (CG) to perform warp aggregated atomics, a useful technique to improve performance when many threads atomically add to a single counter. +This sample demonstrates how to use Cooperative Groups (CG) to perform warp-aggregated atomics on single and multiple counters, a useful technique to improve performance when many threads atomically add to one or more counters. ## Key Concepts @@ -10,11 +10,9 @@ Cooperative Groups, Atomic Intrinsics ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) - ## Supported OSes -Linux, Windows, MacOSX +Linux, Windows ## Supported CPU Architecture @@ -24,7 +22,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -63,29 +61,5 @@ The samples makefiles can take advantage of certain options: $ make HOST_COMPILER=g++ ``` -### Mac -The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: -``` -$ cd -$ make -``` - -The samples makefiles can take advantage of certain options: - -* **dbg=1** - build with debug symbols - ``` - $ make dbg=1 - ``` - -* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..."
is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". - ``` - $ make SMS="A B ..." - ``` - -* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. - ``` - $ make HOST_COMPILER=clang - ``` - ## References (for more details) diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu index eedac116..429c755b 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu @@ -43,21 +43,17 @@ namespace cg = cooperative_groups; __device__ int atomicAggInc(int *counter) { cg::coalesced_group active = cg::coalesced_threads(); - int mask = active.ballot(1); - // select the leader - int leader = __ffs(mask) - 1; - // leader does the update int res = 0; - if (active.thread_rank() == leader) { - res = atomicAdd(counter, __popc(mask)); + if (active.thread_rank() == 0) { + res = atomicAdd(counter, active.size()); } // broadcast result - res = active.shfl(res, leader); + res = active.shfl(res, 0); // each thread computes its own value - return res + __popc(mask & ((1 << active.thread_rank()) - 1)); + return res + active.thread_rank(); } __global__ void filter_arr(int *dst, int *nres, const int *src, int n) { @@ -68,18 +64,108 @@ __global__ void filter_arr(int *dst, int *nres, const int *src, int n) { } } +// warp-aggregated atomic multi bucket increment +#if __CUDA_ARCH__ >= 700 +__device__ int atomicAggIncMulti(const int bucket, int *counter) +{ + cg::coalesced_group active = cg::coalesced_threads(); + // group all threads with same bucket value. + auto labeledGroup = cg::labeled_partition(active, bucket); + + int res = 0; + if (labeledGroup.thread_rank() == 0) + { + res = atomicAdd(&counter[bucket], labeledGroup.size()); + } + + // broadcast result + res = labeledGroup.shfl(res, 0); + + // each thread computes its own value + return res + labeledGroup.thread_rank(); +} +#endif + +// Places individual value indices into its corresponding buckets. +__global__ void mapToBuckets(const int *srcArr, int *indicesBuckets, int *bucketCounters, const int srcSize, const int numOfBuckets) +{ +#if __CUDA_ARCH__ >= 700 + cg::grid_group grid = cg::this_grid(); + + for (int i=grid.thread_rank(); i < srcSize; i += grid.size()) + { + const int bucket = srcArr[i]; + if (bucket < numOfBuckets) + { + indicesBuckets[atomicAggIncMulti(bucket, bucketCounters)] = i; + } + } +#endif +} + +int mapIndicesToBuckets(int *h_srcArr, int *d_srcArr, int numOfBuckets) +{ + int *d_indicesBuckets, *d_bucketCounters; + int *cpuBucketCounters = new int[numOfBuckets]; + int *h_bucketCounters = new int[numOfBuckets]; + + memset(cpuBucketCounters, 0, sizeof(int)*numOfBuckets); + // Initialize each bucket counters. 
+ for (int i = 0; i < numOfBuckets; i++) + { + h_bucketCounters[i] = i*NUM_ELEMS; + } + + checkCudaErrors(cudaMalloc(&d_indicesBuckets, sizeof(int) * NUM_ELEMS * numOfBuckets)); + checkCudaErrors(cudaMalloc(&d_bucketCounters, sizeof(int) * numOfBuckets)); + + checkCudaErrors(cudaMemcpy(d_bucketCounters, h_bucketCounters, sizeof(int)*numOfBuckets, cudaMemcpyHostToDevice)); + + dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1); + dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK), 1, 1); + + mapToBuckets<<<dimGrid, dimBlock>>>(d_srcArr, d_indicesBuckets, d_bucketCounters, NUM_ELEMS, numOfBuckets); + + checkCudaErrors(cudaMemcpy(h_bucketCounters, d_bucketCounters, sizeof(int)*numOfBuckets, cudaMemcpyDeviceToHost)); + + for (int i=0; i < NUM_ELEMS; i++) + { + cpuBucketCounters[h_srcArr[i]]++; + } + + bool allMatch = true; + int finalElems = 0; + for (int i=0; i < numOfBuckets; i++) + { + finalElems += (h_bucketCounters[i] - i*NUM_ELEMS); + if (cpuBucketCounters[i] != (h_bucketCounters[i] - i*NUM_ELEMS)) + { + allMatch = false; + break; + } + } + + if (!allMatch || finalElems != NUM_ELEMS) + { + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + int main(int argc, char **argv) { int *data_to_filter, *filtered_data, nres = 0; int *d_data_to_filter, *d_filtered_data, *d_nres; + int numOfBuckets = 5; + data_to_filter = reinterpret_cast<int *>(malloc(sizeof(int) * NUM_ELEMS)); // Generate input data. for (int i = 0; i < NUM_ELEMS; i++) { - data_to_filter[i] = rand() % 20; + data_to_filter[i] = rand() % numOfBuckets; } - findCudaDevice(argc, (const char **)argv); + int devId = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaMalloc(&d_data_to_filter, sizeof(int) * NUM_ELEMS)); checkCudaErrors(cudaMalloc(&d_filtered_data, sizeof(int) * NUM_ELEMS)); @@ -114,8 +200,18 @@ int main(int argc, char **argv) { } } + int major = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId)); + + int mapIndicesToBucketsStatus = EXIT_SUCCESS; + // atomicAggIncMulti requires a GPU of Volta (SM7X) architecture or higher, + // so that it can take advantage of the new MATCH capability of Volta hardware + if (major >= 7) { + mapIndicesToBucketsStatus = mapIndicesToBuckets(data_to_filter, d_data_to_filter, numOfBuckets); + } + printf("\nWarp Aggregated Atomics %s \n", - host_flt_count == nres ? "PASSED" : "FAILED"); + (host_flt_count == nres) && (mapIndicesToBucketsStatus == EXIT_SUCCESS) ?
"PASSED" : "FAILED"); checkCudaErrors(cudaFree(d_data_to_filter)); checkCudaErrors(cudaFree(d_filtered_data)); diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj index 6b869272..3454681f 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj @@ -62,7 +62,7 @@ $(OutDir)/warpAggregatedAtomicsCG.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj index 982202a0..d88c61b7 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj @@ -67,7 +67,7 @@ $(OutDir)/warpAggregatedAtomicsCG.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj index d5f11bd1..5f865fd5 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj @@ -63,7 +63,7 @@ $(OutDir)/warpAggregatedAtomicsCG.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32