From 568b39bd5bea5924446d49d07a4c026a1ec945a5 Mon Sep 17 00:00:00 2001 From: Rutwik Choughule Date: Fri, 16 Apr 2021 11:54:26 +0530 Subject: [PATCH] add and update samples with CUDA 11.3 support --- README.md | 67 +- Samples/EGLStream_CUDA_Interop/Makefile | 6 + Samples/EGLStream_CUDA_Interop/README.md | 2 +- Samples/MersenneTwisterGP11213/Makefile | 29 +- .../MersenneTwister.cpp | 188 +- .../MersenneTwisterGP11213_vs2017.vcxproj | 4 +- .../MersenneTwisterGP11213_vs2019.vcxproj | 4 +- Samples/MersenneTwisterGP11213/README.md | 2 +- .../NV12toBGRandResize_vs2017.vcxproj | 4 +- .../NV12toBGRandResize_vs2019.vcxproj | 4 +- Samples/NV12toBGRandResize/README.md | 2 +- Samples/UnifiedMemoryPerf/README.md | 2 +- .../UnifiedMemoryPerf_vs2017.vcxproj | 4 +- .../UnifiedMemoryPerf_vs2019.vcxproj | 4 +- Samples/bandwidthTest/README.md | 2 +- .../bandwidthTest_vs2017.vcxproj | 4 +- .../bandwidthTest_vs2019.vcxproj | 4 +- .../Makefile | 6 + .../README.md | 2 +- ...chedLabelMarkersAndLabelCompressionNPP.cpp | 93 +- ...rkersAndLabelCompressionNPP_vs2017.vcxproj | 4 +- ...rkersAndLabelCompressionNPP_vs2019.vcxproj | 4 +- Samples/bf16TensorCoreGemm/README.md | 2 +- .../bf16TensorCoreGemm_vs2017.vcxproj | 4 +- .../bf16TensorCoreGemm_vs2019.vcxproj | 4 +- Samples/binaryPartitionCG/README.md | 2 +- .../binaryPartitionCG/binaryPartitionCG.cu | 172 +- .../binaryPartitionCG_vs2017.vcxproj | 4 +- .../binaryPartitionCG_vs2019.vcxproj | 4 +- Samples/boxFilterNPP/README.md | 2 +- .../boxFilterNPP/boxFilterNPP_vs2017.vcxproj | 4 +- .../boxFilterNPP/boxFilterNPP_vs2019.vcxproj | 4 +- Samples/cannyEdgeDetectorNPP/README.md | 2 +- .../cannyEdgeDetectorNPP_vs2017.vcxproj | 4 +- .../cannyEdgeDetectorNPP_vs2019.vcxproj | 4 +- Samples/concurrentKernels/README.md | 2 +- .../concurrentKernels_vs2017.vcxproj | 4 +- .../concurrentKernels_vs2019.vcxproj | 4 +- Samples/conjugateGradientCudaGraphs/Makefile | 6 + Samples/conjugateGradientCudaGraphs/README.md | 2 +- .../conjugateGradientCudaGraphs.cu | 78 +- ...conjugateGradientCudaGraphs_vs2017.vcxproj | 4 +- ...conjugateGradientCudaGraphs_vs2019.vcxproj | 4 +- .../conjugateGradientMultiBlockCG/README.md | 2 +- ...njugateGradientMultiBlockCG_vs2017.vcxproj | 4 +- ...njugateGradientMultiBlockCG_vs2019.vcxproj | 4 +- .../conjugateGradientMultiDeviceCG/README.md | 2 +- .../conjugateGradientMultiDeviceCG.cu | 138 +- ...jugateGradientMultiDeviceCG_vs2017.vcxproj | 4 +- ...jugateGradientMultiDeviceCG_vs2019.vcxproj | 4 +- Samples/cuSolverDn_LinearSolver/Makefile | 6 + Samples/cuSolverDn_LinearSolver/README.md | 2 +- .../cuSolverDn_LinearSolver_vs2017.vcxproj | 4 +- .../cuSolverDn_LinearSolver_vs2019.vcxproj | 4 +- Samples/cuSolverSp_LinearSolver/Makefile | 6 + Samples/cuSolverSp_LinearSolver/README.md | 2 +- .../cuSolverSp_LinearSolver.cpp | 6 +- .../cuSolverSp_LinearSolver_vs2017.vcxproj | 4 +- .../cuSolverSp_LinearSolver_vs2019.vcxproj | 4 +- Samples/cudaCompressibleMemory/README.md | 2 +- .../cudaCompressibleMemory_vs2017.vcxproj | 4 +- .../cudaCompressibleMemory_vs2019.vcxproj | 4 +- Samples/cudaNvSci/Makefile | 6 + Samples/cudaNvSci/README.md | 2 +- Samples/cudaNvSciNvMedia/README.md | 2 +- Samples/cudaOpenMP/README.md | 2 +- Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj | 4 +- Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj | 4 +- Samples/cudaTensorCoreGemm/README.md | 2 +- .../cudaTensorCoreGemm_vs2017.vcxproj | 4 +- .../cudaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/deviceQuery/README.md | 2 +- Samples/deviceQuery/deviceQuery.cpp | 21 +- .../deviceQuery/deviceQuery_vs2017.vcxproj | 
4 +- .../deviceQuery/deviceQuery_vs2019.vcxproj | 4 +- Samples/dmmaTensorCoreGemm/README.md | 2 +- .../dmmaTensorCoreGemm_vs2017.vcxproj | 4 +- .../dmmaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/globalToShmemAsyncCopy/README.md | 2 +- .../globalToShmemAsyncCopy.cu | 1681 ++++----- .../globalToShmemAsyncCopy_vs2017.vcxproj | 4 +- .../globalToShmemAsyncCopy_vs2019.vcxproj | 4 +- Samples/immaTensorCoreGemm/README.md | 2 +- .../immaTensorCoreGemm_vs2017.vcxproj | 4 +- .../immaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/jacobiCudaGraphs/README.md | 2 +- .../jacobiCudaGraphs_vs2017.vcxproj | 4 +- .../jacobiCudaGraphs_vs2019.vcxproj | 4 +- Samples/jacobiCudaGraphs/main.cpp | 9 +- Samples/matrixMul/README.md | 2 +- Samples/matrixMul/matrixMul_vs2017.vcxproj | 4 +- Samples/matrixMul/matrixMul_vs2019.vcxproj | 4 +- Samples/matrixMulDrv/README.md | 2 +- .../matrixMulDrv/matrixMulDrv_vs2017.vcxproj | 4 +- .../matrixMulDrv/matrixMulDrv_vs2019.vcxproj | 4 +- Samples/memMapIPCDrv/Makefile | 19 +- Samples/memMapIPCDrv/README.md | 2 +- .../memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj | 10 +- .../memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj | 10 +- Samples/memMapIPCDrv/memMapIpc.cpp | 87 +- Samples/nvJPEG/Makefile | 6 + Samples/nvJPEG/README.md | 2 +- Samples/nvJPEG/nvJPEG_vs2017.vcxproj | 4 +- Samples/nvJPEG/nvJPEG_vs2019.vcxproj | 4 +- Samples/nvJPEG_encoder/Makefile | 6 + Samples/nvJPEG_encoder/README.md | 2 +- .../nvJPEG_encoder_vs2017.vcxproj | 4 +- .../nvJPEG_encoder_vs2019.vcxproj | 4 +- Samples/p2pBandwidthLatencyTest/README.md | 2 +- .../p2pBandwidthLatencyTest_vs2017.vcxproj | 4 +- .../p2pBandwidthLatencyTest_vs2019.vcxproj | 4 +- Samples/reduction/README.md | 2 +- Samples/reduction/reduction_vs2017.vcxproj | 4 +- Samples/reduction/reduction_vs2019.vcxproj | 4 +- Samples/shfl_scan/README.md | 2 +- Samples/shfl_scan/shfl_scan_vs2017.vcxproj | 4 +- Samples/shfl_scan/shfl_scan_vs2019.vcxproj | 4 +- Samples/simpleAWBarrier/README.md | 2 +- Samples/simpleAWBarrier/simpleAWBarrier.cu | 356 +- .../simpleAWBarrier_vs2017.vcxproj | 4 +- .../simpleAWBarrier_vs2019.vcxproj | 4 +- Samples/simpleAttributes/README.md | 2 +- Samples/simpleAttributes/simpleAttributes.cu | 263 +- .../simpleAttributes_vs2017.vcxproj | 4 +- .../simpleAttributes_vs2019.vcxproj | 4 +- Samples/simpleCUBLAS/Makefile | 29 +- Samples/simpleCUBLAS/README.md | 2 +- .../simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj | 4 +- .../simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj | 4 +- Samples/simpleCUBLASXT/Makefile | 6 + Samples/simpleCUBLASXT/README.md | 2 +- .../simpleCUBLASXT_vs2017.vcxproj | 4 +- .../simpleCUBLASXT_vs2019.vcxproj | 4 +- Samples/simpleCUBLAS_LU/Makefile | 357 ++ Samples/simpleCUBLAS_LU/NsightEclipse.xml | 68 + Samples/simpleCUBLAS_LU/README.md | 71 + Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp | 417 +++ .../simpleCUBLAS_LU_vs2017.sln | 20 + .../simpleCUBLAS_LU_vs2017.vcxproj | 113 + .../simpleCUBLAS_LU_vs2019.sln | 20 + .../simpleCUBLAS_LU_vs2019.vcxproj | 109 + Samples/simpleCUFFT/Makefile | 6 + Samples/simpleCUFFT/README.md | 2 +- .../simpleCUFFT/simpleCUFFT_vs2017.vcxproj | 4 +- .../simpleCUFFT/simpleCUFFT_vs2019.vcxproj | 4 +- Samples/simpleCudaGraphs/README.md | 2 +- Samples/simpleCudaGraphs/simpleCudaGraphs.cu | 3 +- .../simpleCudaGraphs_vs2017.vcxproj | 4 +- .../simpleCudaGraphs_vs2019.vcxproj | 4 +- Samples/simpleD3D11/README.md | 2 +- .../simpleD3D11/simpleD3D11_vs2017.vcxproj | 4 +- .../simpleD3D11/simpleD3D11_vs2019.vcxproj | 4 +- Samples/simpleD3D12/README.md | 2 +- Samples/simpleD3D12/simpleD3D12.cpp | 15 +- 
.../simpleD3D12/simpleD3D12_vs2017.vcxproj | 4 +- .../simpleD3D12/simpleD3D12_vs2019.vcxproj | 4 +- Samples/simpleDrvRuntime/README.md | 2 +- Samples/simpleDrvRuntime/simpleDrvRuntime.cpp | 12 +- .../simpleDrvRuntime_vs2017.vcxproj | 4 +- .../simpleDrvRuntime_vs2019.vcxproj | 4 +- Samples/simpleGL/README.md | 2 +- Samples/simpleGL/simpleGL_vs2017.vcxproj | 4 +- Samples/simpleGL/simpleGL_vs2019.vcxproj | 4 +- Samples/simpleIPC/README.md | 2 +- Samples/simpleIPC/simpleIPC_vs2017.vcxproj | 4 +- Samples/simpleIPC/simpleIPC_vs2019.vcxproj | 4 +- Samples/simpleVoteIntrinsics/README.md | 2 +- .../simpleVoteIntrinsics_vs2017.vcxproj | 4 +- .../simpleVoteIntrinsics_vs2019.vcxproj | 4 +- Samples/simpleVulkan/README.md | 2 +- Samples/simpleVulkan/SineWaveSimulation.cu | 170 +- Samples/simpleVulkan/SineWaveSimulation.h | 34 +- Samples/simpleVulkan/VulkanBaseApp.cpp | 3201 +++++++++-------- Samples/simpleVulkan/VulkanBaseApp.h | 202 +- Samples/simpleVulkan/main.cpp | 749 ++-- .../simpleVulkan/simpleVulkan_vs2017.vcxproj | 4 +- .../simpleVulkan/simpleVulkan_vs2019.vcxproj | 4 +- Samples/simpleVulkanMMAP/README.md | 2 +- .../simpleVulkanMMAP_vs2017.vcxproj | 4 +- .../simpleVulkanMMAP_vs2019.vcxproj | 4 +- Samples/simpleZeroCopy/README.md | 2 +- .../simpleZeroCopy_vs2017.vcxproj | 4 +- .../simpleZeroCopy_vs2019.vcxproj | 4 +- Samples/streamOrderedAllocation/README.md | 2 +- .../streamOrderedAllocation_vs2017.vcxproj | 4 +- .../streamOrderedAllocation_vs2019.vcxproj | 4 +- Samples/streamOrderedAllocationIPC/Makefile | 423 +++ .../NsightEclipse.xml | 65 + Samples/streamOrderedAllocationIPC/README.md | 60 + .../streamOrderedAllocationIPC.cu | 440 +++ Samples/streamOrderedAllocationP2P/README.md | 2 +- .../streamOrderedAllocationP2P.cu | 35 +- .../streamOrderedAllocationP2P_vs2017.vcxproj | 4 +- .../streamOrderedAllocationP2P_vs2019.vcxproj | 4 +- Samples/systemWideAtomics/README.md | 2 +- Samples/tf32TensorCoreGemm/README.md | 2 +- .../tf32TensorCoreGemm_vs2017.vcxproj | 4 +- .../tf32TensorCoreGemm_vs2019.vcxproj | 4 +- Samples/vectorAddMMAP/README.md | 2 +- .../vectorAddMMAP_vs2017.vcxproj | 4 +- .../vectorAddMMAP_vs2019.vcxproj | 4 +- Samples/vectorAdd_nvrtc/README.md | 2 +- .../vectorAdd_nvrtc_vs2017.vcxproj | 4 +- .../vectorAdd_nvrtc_vs2019.vcxproj | 4 +- Samples/vulkanImageCUDA/README.md | 2 +- .../vulkanImageCUDA_vs2017.vcxproj | 4 +- .../vulkanImageCUDA_vs2019.vcxproj | 4 +- Samples/warpAggregatedAtomicsCG/README.md | 2 +- .../warpAggregatedAtomicsCG_vs2017.vcxproj | 4 +- .../warpAggregatedAtomicsCG_vs2019.vcxproj | 4 +- Samples/watershedSegmentationNPP/Makefile | 6 + Samples/watershedSegmentationNPP/README.md | 2 +- .../watershedSegmentationNPP_vs2017.vcxproj | 4 +- .../watershedSegmentationNPP_vs2019.vcxproj | 4 +- 214 files changed, 6590 insertions(+), 3856 deletions(-) create mode 100644 Samples/simpleCUBLAS_LU/Makefile create mode 100644 Samples/simpleCUBLAS_LU/NsightEclipse.xml create mode 100644 Samples/simpleCUBLAS_LU/README.md create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj create mode 100644 Samples/streamOrderedAllocationIPC/Makefile create mode 100644 Samples/streamOrderedAllocationIPC/NsightEclipse.xml create mode 100644 Samples/streamOrderedAllocationIPC/README.md create mode 100644 
Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu diff --git a/README.md b/README.md index 0db26681..a4fafa7f 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,17 @@ # CUDA Samples -Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads). +Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads). ## Release Notes This section describes the release notes for the CUDA Samples on GitHub only. +### CUDA 11.3 +* Added `streamOrderedAllocationIPC`. Demonstrates Inter Process Communication using one process per GPU for computation. +* Added `simpleCUBLAS_LU`. Demonstrates batched matrix LU decomposition using cuBLAS API `cublasgetrfBatched()` +* Updated `simpleVulkan`. Demonstrates use of timeline semaphore. +* Updated multiple samples to use pinned memory using `cudaMallocHost()`. + ### CUDA 11.2 * Added `streamOrderedAllocation`. Demonstrates stream ordered memory allocation on a GPU using cudaMallocAsync and cudaMemPool family of APIs. * Added `streamOrderedAllocationP2P`. Demonstrates peer-to-peer access of stream ordered memory allocated using cudaMallocAsync and cudaMemPool family of APIs. @@ -103,7 +109,7 @@ This is the first release of CUDA Samples on GitHub: ### Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). 
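For readers of the release notes above: `simpleCUBLAS_LU` builds on the batched LU factorization entry points of cuBLAS (`cublas<t>getrfBatched()`). The sketch below is not taken from the sample; it is a minimal, illustrative single-precision call with a made-up 3x3 matrix and batch size 1, just to show the pointer-array calling convention the sample exercises at larger scale.

```cpp
// Minimal sketch (not the sample itself): one 3x3 system, batch size 1.
// Assumes a CUDA Toolkit with cuBLAS; error handling is abbreviated.
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
  const int n = 3, lda = 3, batch = 1;
  // Column-major 3x3 matrix (illustrative values).
  float hA[n * n] = {4, 2, 1,   2, 5, 3,   1, 3, 6};

  float *dA = nullptr;
  cudaMalloc(&dA, sizeof(hA));
  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);

  // cublas<t>getrfBatched expects a device array of device pointers.
  float **dAarray = nullptr;
  cudaMalloc(&dAarray, batch * sizeof(float *));
  cudaMemcpy(dAarray, &dA, batch * sizeof(float *), cudaMemcpyHostToDevice);

  int *dPivot = nullptr, *dInfo = nullptr;
  cudaMalloc(&dPivot, n * batch * sizeof(int));
  cudaMalloc(&dInfo, batch * sizeof(int));

  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasSgetrfBatched(handle, n, dAarray, lda, dPivot, dInfo, batch);

  int hInfo = 0;
  cudaMemcpy(&hInfo, dInfo, sizeof(int), cudaMemcpyDeviceToHost);
  printf("getrfBatched info[0] = %d (0 means success)\n", hInfo);

  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dAarray); cudaFree(dPivot); cudaFree(dInfo);
  return 0;
}
```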
### Getting the CUDA Samples @@ -160,38 +166,39 @@ The samples makefiles can take advantage of certain options: ### Samples by OS #### Linux -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | +**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | ---|---|---|---| -**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | -**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | -**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[nvJPEG](./Samples/nvJPEG)** | -**[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | -**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | -**[cudaNvSci](./Samples/cudaNvSci)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | -**[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | -**[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | -**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | -**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | -**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | -**[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | -**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | 
+**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | +**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaNvSci](./Samples/cudaNvSci)** | +**[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | +**[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | +**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[matrixMul](./Samples/matrixMul)** | +**[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | +**[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[reduction](./Samples/reduction)** | +**[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | +**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +**[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | +**[simpleVulkan](./Samples/simpleVulkan)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | +**[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | +**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | #### Windows -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | +**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | ---|---|---|---| -**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** 
| **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | -**[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | -**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[nvJPEG](./Samples/nvJPEG)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[simpleD3D12](./Samples/simpleD3D12)** | -**[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[deviceQuery](./Samples/deviceQuery)** | -**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | -**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | -**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | -**[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | -**[simpleD3D11](./Samples/simpleD3D11)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | -**[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | -**[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | -**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | -**[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** | +**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | +**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | +**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | +**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | 
**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | +**[matrixMul](./Samples/matrixMul)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | +**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | +**[reduction](./Samples/reduction)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | +**[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | +**[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleVulkan](./Samples/simpleVulkan)** | +**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | +**[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | +**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | ## Dependencies diff --git a/Samples/EGLStream_CUDA_Interop/Makefile b/Samples/EGLStream_CUDA_Interop/Makefile index 010ce65c..1e901d99 100644 --- a/Samples/EGLStream_CUDA_Interop/Makefile +++ b/Samples/EGLStream_CUDA_Interop/Makefile @@ -285,6 +285,12 @@ ifeq ($(TARGET_OS),android) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/EGLStream_CUDA_Interop/README.md b/Samples/EGLStream_CUDA_Interop/README.md index b7420b73..2a0f654d 100644 --- a/Samples/EGLStream_CUDA_Interop/README.md +++ b/Samples/EGLStream_CUDA_Interop/README.md @@ -30,7 +30,7 @@ cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/MersenneTwisterGP11213/Makefile b/Samples/MersenneTwisterGP11213/Makefile index e40b5b99..fb3aa590 100644 --- a/Samples/MersenneTwisterGP11213/Makefile +++ b/Samples/MersenneTwisterGP11213/Makefile @@ -263,6 +263,14 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) +SAMPLE_ENABLED := 1 + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - MersenneTwisterGP11213 is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -297,6 +305,10 @@ ALL_CCFLAGS += --threads 0 LIBRARIES += -lcurand_static -lculibos +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + ################################################################################ # Target rules @@ -304,16 +316,23 @@ all: build build: MersenneTwisterGP11213 +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + MersenneTwister.o:MersenneTwister.cpp - $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< MersenneTwisterGP11213: MersenneTwister.o - $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) run: build - ./MersenneTwisterGP11213 + $(EXEC) ./MersenneTwisterGP11213 clean: rm -f MersenneTwisterGP11213 MersenneTwister.o diff --git a/Samples/MersenneTwisterGP11213/MersenneTwister.cpp b/Samples/MersenneTwisterGP11213/MersenneTwister.cpp index bb2916bc..3462512c 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwister.cpp +++ b/Samples/MersenneTwisterGP11213/MersenneTwister.cpp @@ -47,138 +47,134 @@ float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU); -const int DEFAULT_RAND_N = 2400000; +const int DEFAULT_RAND_N = 2400000; const unsigned int DEFAULT_SEED = 777; /////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) -{ - // Start logs - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) { + // Start logs + printf("%s Starting...\n\n", argv[0]); - // initialize the GPU, either identified by --device - // or by picking the device with highest flop rate. - int devID = findCudaDevice(argc, (const char **)argv); + // initialize the GPU, either identified by --device + // or by picking the device with highest flop rate. 
+ int devID = findCudaDevice(argc, (const char **)argv); - // parsing the number of random numbers to generate - int rand_n = DEFAULT_RAND_N; + // parsing the number of random numbers to generate + int rand_n = DEFAULT_RAND_N; - if (checkCmdLineFlag(argc, (const char **) argv, "count")) - { - rand_n = getCmdLineArgumentInt(argc, (const char **) argv, "count"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "count")) { + rand_n = getCmdLineArgumentInt(argc, (const char **)argv, "count"); + } - printf("Allocating data for %i samples...\n", rand_n); + printf("Allocating data for %i samples...\n", rand_n); - // parsing the seed - int seed = DEFAULT_SEED; + // parsing the seed + int seed = DEFAULT_SEED; - if (checkCmdLineFlag(argc, (const char **) argv, "seed")) - { - seed = getCmdLineArgumentInt(argc, (const char **) argv, "seed"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "seed")) { + seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed"); + } - printf("Seeding with %i ...\n", seed); + printf("Seeding with %i ...\n", seed); - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - float *d_Rand; - checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float))); + float *d_Rand; + checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float))); - curandGenerator_t prngGPU; - checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32)); - checkCudaErrors(curandSetStream(prngGPU, stream)); - checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed)); + curandGenerator_t prngGPU; + checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32)); + checkCudaErrors(curandSetStream(prngGPU, stream)); + checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed)); - curandGenerator_t prngCPU; - checkCudaErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32)); - checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed)); + curandGenerator_t prngCPU; + checkCudaErrors( + curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32)); + checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed)); - // - // Example 1: Compare random numbers generated on GPU and CPU - float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); + // + // Example 1: Compare random numbers generated on GPU and CPU + float *h_RandGPU; + checkCudaErrors(cudaMallocHost(&h_RandGPU, rand_n * sizeof(float))); - printf("Generating random numbers on GPU...\n\n"); - checkCudaErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n)); + printf("Generating random numbers on GPU...\n\n"); + checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); - printf("\nReading back the results...\n"); - checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost, stream)); + printf("\nReading back the results...\n"); + checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); - float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); + printf("Generating random numbers on CPU...\n\n"); + checkCudaErrors(curandGenerateUniform(prngCPU, (float *)h_RandCPU, rand_n)); - printf("Generating random numbers on CPU...\n\n"); - checkCudaErrors(curandGenerateUniform(prngCPU, (float *) h_RandCPU, rand_n)); + 
checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Comparing CPU/GPU random numbers...\n\n"); + float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Comparing CPU/GPU random numbers...\n\n"); - float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); + // + // Example 2: Timing of random number generation on GPU + const int numIterations = 10; + int i; + StopWatchInterface *hTimer; - // - // Example 2: Timing of random number generation on GPU - const int numIterations = 10; - int i; - StopWatchInterface *hTimer; + sdkCreateTimer(&hTimer); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - sdkCreateTimer(&hTimer); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + for (i = 0; i < numIterations; i++) { + checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); + } - for (i = 0; i < numIterations; i++) - { - checkCudaErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n)); - } + checkCudaErrors(cudaStreamSynchronize(stream)); + sdkStopTimer(&hTimer); - checkCudaErrors(cudaStreamSynchronize(stream)); - sdkStopTimer(&hTimer); + double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer) / (double)numIterations; - double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; + printf( + "MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, " + "Size = %u Numbers\n", + 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); - printf("MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", - 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); + printf("Shutting down...\n"); - printf("Shutting down...\n"); + checkCudaErrors(curandDestroyGenerator(prngGPU)); + checkCudaErrors(curandDestroyGenerator(prngCPU)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaFree(d_Rand)); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cudaFreeHost(h_RandGPU)); + free(h_RandCPU); - checkCudaErrors(curandDestroyGenerator(prngGPU)); - checkCudaErrors(curandDestroyGenerator(prngCPU)); - checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaFree(d_Rand)); - sdkDeleteTimer(&hTimer); - free(h_RandGPU); - free(h_RandCPU); - - exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); + exit(L1norm < 1e-6 ? 
EXIT_SUCCESS : EXIT_FAILURE); } +float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) { + int i; + float rCPU, rGPU, delta; + float max_delta = 0.; + float sum_delta = 0.; + float sum_ref = 0.; -float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) -{ - int i; - float rCPU, rGPU, delta; - float max_delta = 0.; - float sum_delta = 0.; - float sum_ref = 0.; + for (i = 0; i < rand_n; i++) { + rCPU = h_RandCPU[i]; + rGPU = h_RandGPU[i]; + delta = fabs(rCPU - rGPU); + sum_delta += delta; + sum_ref += fabs(rCPU); - for (i = 0; i < rand_n; i++) - { - rCPU = h_RandCPU[i]; - rGPU = h_RandGPU[i]; - delta = fabs(rCPU - rGPU); - sum_delta += delta; - sum_ref += fabs(rCPU); - - if (delta >= max_delta) - { - max_delta = delta; - } + if (delta >= max_delta) { + max_delta = delta; } + } - float L1norm = (float)(sum_delta / sum_ref); - printf("Max absolute error: %E\n", max_delta); - printf("L1 norm: %E\n\n", L1norm); + float L1norm = (float)(sum_delta / sum_ref); + printf("Max absolute error: %E\n", max_delta); + printf("L1 norm: %E\n\n", L1norm); - return L1norm; + return L1norm; } diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj index 73c7cc50..e39c60a4 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj index 01fcaae8..8648205f 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/MersenneTwisterGP11213/README.md b/Samples/MersenneTwisterGP11213/README.md index c3a82706..eb8bd797 100644 --- a/Samples/MersenneTwisterGP11213/README.md +++ b/Samples/MersenneTwisterGP11213/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
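The MersenneTwister.cpp changes above switch the GPU read-back buffer to pinned memory (`cudaMallocHost`) while keeping the existing MTGP32 generators. The condensed sketch below shows the same pattern end to end under illustrative sizes, not the sample's exact code: one device generator bound to a non-blocking stream, one host generator with the same seed, and an L1-norm comparison of the two result arrays.

```cpp
// Condensed illustration of the pattern in MersenneTwisterGP11213:
// the same MTGP32 generator on GPU and host, pinned buffer for the readback.
// Array size and tolerance handling are illustrative choices.
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cuda_runtime.h>
#include <curand.h>

int main() {
  const size_t n = 1 << 20;
  const unsigned long long seed = 777;

  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  float *dRand = nullptr, *hRandGPU = nullptr;
  cudaMalloc(&dRand, n * sizeof(float));
  cudaMallocHost(&hRandGPU, n * sizeof(float));  // pinned: allows async copy

  curandGenerator_t gpuGen, cpuGen;
  curandCreateGenerator(&gpuGen, CURAND_RNG_PSEUDO_MTGP32);
  curandSetStream(gpuGen, stream);
  curandSetPseudoRandomGeneratorSeed(gpuGen, seed);

  curandCreateGeneratorHost(&cpuGen, CURAND_RNG_PSEUDO_MTGP32);
  curandSetPseudoRandomGeneratorSeed(cpuGen, seed);

  curandGenerateUniform(gpuGen, dRand, n);                 // generate on GPU
  cudaMemcpyAsync(hRandGPU, dRand, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);

  float *hRandCPU = (float *)malloc(n * sizeof(float));
  curandGenerateUniform(cpuGen, hRandCPU, n);              // generate on host
  cudaStreamSynchronize(stream);

  double sumDelta = 0.0, sumRef = 0.0;
  for (size_t i = 0; i < n; i++) {
    sumDelta += fabs(hRandCPU[i] - hRandGPU[i]);
    sumRef   += fabs(hRandCPU[i]);
  }
  printf("L1 norm: %E\n", sumDelta / sumRef);

  curandDestroyGenerator(gpuGen);
  curandDestroyGenerator(cpuGen);
  cudaFree(dRand);
  cudaFreeHost(hRandGPU);
  free(hRandCPU);
  cudaStreamDestroy(stream);
  return 0;
}
```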
## Build and Run diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj index cc3bd1a6..3bbad98a 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -113,6 +113,6 @@ - + diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj index 61b93fa1..a5149390 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/NV12toBGRandResize/README.md b/Samples/NV12toBGRandResize/README.md index 70b2f5ea..8070aaf0 100644 --- a/Samples/NV12toBGRandResize/README.md +++ b/Samples/NV12toBGRandResize/README.md @@ -27,7 +27,7 @@ cudaMemcpy2D, cudaMallocManaged ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/UnifiedMemoryPerf/README.md b/Samples/UnifiedMemoryPerf/README.md index c3dd1857..43cecf11 100644 --- a/Samples/UnifiedMemoryPerf/README.md +++ b/Samples/UnifiedMemoryPerf/README.md @@ -28,7 +28,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj index a8c46f60..b767c25f 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -111,6 +111,6 @@ - + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj index 6b1b3383..cfcb126c 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -107,6 +107,6 @@ - + diff --git a/Samples/bandwidthTest/README.md b/Samples/bandwidthTest/README.md index 04db60d7..8f70b9c0 100644 --- a/Samples/bandwidthTest/README.md +++ b/Samples/bandwidthTest/README.md @@ -27,7 +27,7 @@ cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
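Several of the README updates above (UnifiedMemoryPerf, bandwidthTest) and the 11.3 release note about `cudaMallocHost()` revolve around pinned host memory. As a rough, self-contained illustration of why pinned staging buffers matter — not code from any of these samples — the sketch below times the same host-to-device copy from a pageable and a pinned buffer; the 64 MiB size and 20 iterations are arbitrary.

```cpp
// Rough illustration of the motivation for cudaMallocHost(): time the same
// host->device transfer from a pageable malloc() buffer and a pinned buffer.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static float timeCopy(void *dst, const void *src, size_t bytes) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  for (int i = 0; i < 20; i++)
    cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms / 20.f;  // average per copy
}

int main() {
  const size_t bytes = 64 << 20;  // 64 MiB, arbitrary
  void *dDst = nullptr, *hPageable = malloc(bytes), *hPinned = nullptr;
  cudaMalloc(&dDst, bytes);
  cudaMallocHost(&hPinned, bytes);

  printf("pageable: %.3f ms/copy\n", timeCopy(dDst, hPageable, bytes));
  printf("pinned:   %.3f ms/copy\n", timeCopy(dDst, hPinned, bytes));

  cudaFree(dDst);
  cudaFreeHost(hPinned);
  free(hPageable);
  return 0;
}
```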
## Build and Run diff --git a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj index 68998f73..c6979275 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj index 2c9afc01..40850f7e 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile b/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile index 00ee41fa..fccab0a1 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_OS),darwin) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md b/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md index 430e7aa8..16270de7 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md @@ -28,7 +28,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp index 6079a289..50a6b400 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp @@ -36,11 +36,13 @@ #include #include +#include #include +#include #include // Note: If you want to view these images we HIGHLY recommend using imagej -// which is free on the internet and works on most platforms +// which is free on the internet and works on most platforms // because it is one of the few image viewing apps that can display 32 // bit integer image data. 
While it normalizes the data to floating // point values for viewing it still provides a good representation of @@ -102,11 +104,12 @@ void tearDown() // Clean up and tear down if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev); if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev); if (pUFBatchPerImageCompressedCountListHost != 0) - free(pUFBatchPerImageCompressedCountListHost); + cudaFreeHost(pUFBatchPerImageCompressedCountListHost); if (pUFBatchSrcDstScratchBufferListHost != 0) - free(pUFBatchSrcDstScratchBufferListHost); - if (pUFBatchSrcDstImageListHost != 0) free(pUFBatchSrcDstImageListHost); - if (pUFBatchSrcImageListHost != 0) free(pUFBatchSrcImageListHost); + cudaFreeHost(pUFBatchSrcDstScratchBufferListHost); + if (pUFBatchSrcDstImageListHost != 0) + cudaFreeHost(pUFBatchSrcDstImageListHost); + if (pUFBatchSrcImageListHost != 0) cudaFreeHost(pUFBatchSrcImageListHost); for (int j = 0; j < NUMBER_OF_IMAGES; j++) { if (pUFCompressedLabelsScratchBufferDev[j] != 0) @@ -115,8 +118,8 @@ void tearDown() // Clean up and tear down cudaFree(pUFGenerateLabelsScratchBufferDev[j]); if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]); if (pInputImageDev[j] != 0) cudaFree(pInputImageDev[j]); - if (pUFLabelHost[j] != 0) free(pUFLabelHost[j]); - if (pInputImageHost[j] != 0) free(pInputImageHost[j]); + if (pUFLabelHost[j] != 0) cudaFreeHost(pUFLabelHost[j]); + if (pInputImageHost[j] != 0) cudaFreeHost(pInputImageHost[j]); } } @@ -177,7 +180,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 1) { if (nWidth != 512 || nHeight != 512) return -1; const char *fileName = "CT_skull_512x512_8u.raw"; @@ -187,7 +190,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 2) { if (nWidth != 509 || nHeight != 335) return -1; const char *fileName = "PCB_METAL_509x335_8u.raw"; @@ -197,7 +200,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 3) { if (nWidth != 1024 || nHeight != 683) return -1; const char *fileName = "PCB2_1024x683_8u.raw"; @@ -207,7 +210,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 4) { if (nWidth != 1280 || nHeight != 720) return -1; const char *fileName = "PCB_1280x720_8u.raw"; @@ -217,7 +220,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else { printf("Input file load failed.\n"); return -1; @@ -347,9 +350,11 @@ int main(int argc, char **argv) { oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - pInputImageHost[nImage] = reinterpret_cast(malloc( + checkCudaErrors(cudaMallocHost( + &(pInputImageHost[nImage]), oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height)); - pUFLabelHost[nImage] = reinterpret_cast(malloc( + checkCudaErrors(cudaMallocHost( + &(pUFLabelHost[nImage]), oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); // Use UF functions throughout this sample. 
@@ -409,15 +414,15 @@ int main(int argc, char **argv) { } if (nImage == 0) - bmpFile = fopen(LabelMarkersOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(LabelMarkersOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(LabelMarkersOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(LabelMarkersOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(LabelMarkersOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; @@ -478,15 +483,15 @@ int main(int argc, char **argv) { } if (nImage == 0) - bmpFile = fopen(CompressedMarkerLabelsOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(CompressedMarkerLabelsOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(CompressedMarkerLabelsOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(CompressedMarkerLabelsOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(CompressedMarkerLabelsOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; nSize = 0; @@ -554,10 +559,11 @@ int main(int argc, char **argv) { cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - pUFBatchSrcImageListHost = - reinterpret_cast(malloc(nBatchImageListBytes)); - pUFBatchSrcDstImageListHost = - reinterpret_cast(malloc(nBatchImageListBytes)); + checkCudaErrors( + cudaMallocHost((void **)&pUFBatchSrcImageListHost, nBatchImageListBytes)); + + checkCudaErrors(cudaMallocHost((void **)&pUFBatchSrcDstImageListHost, + nBatchImageListBytes)); NppiSize oMaxROISize = {0, 0}; @@ -620,15 +626,15 @@ int main(int argc, char **argv) { // Save output to files for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) - bmpFile = fopen(LabelMarkersBatchOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(LabelMarkersBatchOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(LabelMarkersBatchOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(LabelMarkersBatchOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(LabelMarkersBatchOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; @@ -652,12 +658,13 @@ int main(int argc, char **argv) { // Allocate host side scratch buffer point and size list and initialize with // device scratch buffer pointers - pUFBatchSrcDstScratchBufferListHost = - reinterpret_cast( - malloc(NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor))); + checkCudaErrors( + cudaMallocHost((void **)&pUFBatchSrcDstScratchBufferListHost, + 
NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor))); - pUFBatchPerImageCompressedCountListHost = - reinterpret_cast(malloc(NUMBER_OF_IMAGES * sizeof(Npp32u))); + checkCudaErrors( + cudaMallocHost((void **)&pUFBatchPerImageCompressedCountListHost, + +NUMBER_OF_IMAGES * sizeof(Npp32u))); // Start buffer pointer at beginning of full per image buffer list sized // pUFCompressedLabelsScratchBufferDev[0] @@ -728,15 +735,15 @@ int main(int argc, char **argv) { // Save compressed label images into files for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj index c931c860..0f34cd13 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj index 8c37e349..22205f6b 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/bf16TensorCoreGemm/README.md b/Samples/bf16TensorCoreGemm/README.md index 10775e50..5a51bb4e 100644 --- a/Samples/bf16TensorCoreGemm/README.md +++ b/Samples/bf16TensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
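The batchedLabelMarkersAndLabelCompressionNPP changes above replace direct `fopen` calls with a `FOPEN` macro and plain `malloc` with `cudaMallocHost`. The macro comes from the samples' common helper headers; its definition is not shown in this patch, so the version below is only a plausible sketch of such a portable wrapper (MSVC's `fopen_s` versus plain `fopen`), used with one of the raw input file names that does appear in the sample.

```cpp
// Hedged sketch of a portable FOPEN wrapper in the spirit of the samples'
// helper headers: fopen_s on Windows (where fopen is deprecated by MSVC),
// plain fopen elsewhere. The real macro may differ in detail.
#include <cstdio>

#if defined(_WIN32) || defined(_WIN64)
#define FOPEN(fp, filename, mode) fopen_s(&(fp), (filename), (mode))
#else
#define FOPEN(fp, filename, mode) ((fp) = fopen((filename), (mode)))
#endif

int main() {
  FILE *bmpFile = nullptr;
  // File name taken from the sample's input set.
  FOPEN(bmpFile, "CT_skull_512x512_8u.raw", "rb");
  if (bmpFile == nullptr) {
    printf("Input file could not be opened.\n");
    return -1;
  }
  fclose(bmpFile);
  return 0;
}
```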
## Build and Run diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj index 384240ed..156376ad 100644 --- a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj index a8f8eded..1146105a 100644 --- a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/binaryPartitionCG/README.md b/Samples/binaryPartitionCG/README.md index c24500fb..98c3418d 100644 --- a/Samples/binaryPartitionCG/README.md +++ b/Samples/binaryPartitionCG/README.md @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/binaryPartitionCG/binaryPartitionCG.cu b/Samples/binaryPartitionCG/binaryPartitionCG.cu index 53021c44..341fb4f1 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG.cu +++ b/Samples/binaryPartitionCG/binaryPartitionCG.cu @@ -31,14 +31,16 @@ * 1.) Each thread loads a value from random array. * 2.) then checks if it is odd or even. * 3.) create binary partition group based on the above predicate - * 4.) we count the number of odd/even in the group based on size of the binary groups + * 4.) we count the number of odd/even in the group based on size of the binary + groups * 5.) write it global counter of odd. - * 6.) sum the values loaded by individual threads(using reduce) and write it to global - * even & odd elements sum. + * 6.) sum the values loaded by individual threads(using reduce) and write it to + global even & odd elements sum. * - * **NOTE** : binary_partition results in splitting warp into divergent thread groups - this is not good from performance perspective, but in cases where warp - divergence is inevitable one can use binary_partition group. + * **NOTE** : + * binary_partition results in splitting warp into divergent thread groups + * this is not good from performance perspective, but in cases where warp + * divergence is inevitable one can use binary_partition group. */ #include @@ -48,108 +50,110 @@ namespace cg = cooperative_groups; -void initOddEvenArr(int *inputArr, unsigned int size) -{ - for (int i=0; i < size; i++) - { - inputArr[i] = rand() % 50; - } +void initOddEvenArr(int *inputArr, unsigned int size) { + for (int i = 0; i < size; i++) { + inputArr[i] = rand() % 50; + } } - /** * CUDA kernel device code - * + * * Creates cooperative groups and performs odd/even counting & summation. 
*/ -__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, int *sumOfOddAndEvens, unsigned int size) -{ - cg::thread_block cta = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid(); - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); +__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, + int *sumOfOddAndEvens, unsigned int size) { + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - for (int i = grid.thread_rank(); i < size; i += grid.size()) + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + int elem = inputArr[i]; + auto subTile = cg::binary_partition(tile32, elem & 1); + if (elem & 1) // Odd numbers group { - int elem = inputArr[i]; - auto subTile = cg::binary_partition(tile32, elem & 1); - if (elem & 1) // Odd numbers group - { - int oddGroupSum = cg::reduce(subTile, elem, cg::plus()); + int oddGroupSum = cg::reduce(subTile, elem, cg::plus()); - if (subTile.thread_rank() == 0) - { - // Add number of odds present in this group of Odds. - atomicAdd(numOfOdds, subTile.size()); + if (subTile.thread_rank() == 0) { + // Add number of odds present in this group of Odds. + atomicAdd(numOfOdds, subTile.size()); - // Add local reduction of odds present in this group of Odds. - atomicAdd(&sumOfOddAndEvens[0], oddGroupSum); + // Add local reduction of odds present in this group of Odds. + atomicAdd(&sumOfOddAndEvens[0], oddGroupSum); + } + } else // Even numbers group + { + int evenGroupSum = cg::reduce(subTile, elem, cg::plus()); - } - } - else // Even numbers group - { - int evenGroupSum = cg::reduce(subTile, elem, cg::plus()); - - if (subTile.thread_rank() == 0) - { - // Add local reduction of even present in this group of evens. - atomicAdd(&sumOfOddAndEvens[1], evenGroupSum); - } - } - // reconverge warp so for next loop iteration we ensure convergence of - // above diverged threads to perform coalesced loads of inputArr. - cg::sync(tile32); + if (subTile.thread_rank() == 0) { + // Add local reduction of even present in this group of evens. + atomicAdd(&sumOfOddAndEvens[1], evenGroupSum); + } } + // reconverge warp so for next loop iteration we ensure convergence of + // above diverged threads to perform coalesced loads of inputArr. 
+ cg::sync(tile32); + } } - /** * Host main routine */ -int main(int argc, const char **argv) -{ - int deviceId = findCudaDevice(argc, argv); - int *h_inputArr, *d_inputArr; - int *h_numOfOdds, *d_numOfOdds; - int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems; - unsigned int arrSize = 1024 * 100; +int main(int argc, const char **argv) { + int deviceId = findCudaDevice(argc, argv); + int *h_inputArr, *d_inputArr; + int *h_numOfOdds, *d_numOfOdds; + int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems; + unsigned int arrSize = 1024 * 100; - h_inputArr = new int[arrSize]; - h_numOfOdds = new int[1]; - h_sumOfOddEvenElems = new int[2]; - initOddEvenArr(h_inputArr, arrSize); - - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int)*arrSize)); - checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int))); - checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int)*2)); + checkCudaErrors(cudaMallocHost(&h_inputArr, sizeof(int) * arrSize)); + checkCudaErrors(cudaMallocHost(&h_numOfOdds, sizeof(int))); + checkCudaErrors(cudaMallocHost(&h_sumOfOddEvenElems, sizeof(int) * 2)); + initOddEvenArr(h_inputArr, arrSize); - checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int)*arrSize, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream)); - checkCudaErrors(cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2*sizeof(int), stream)); + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int) * arrSize)); + checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int))); + checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int) * 2)); - //Launch the kernel - int threadsPerBlock=1024; - int blocksPerGrid = arrSize / threadsPerBlock; + checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int) * arrSize, + cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream)); + checkCudaErrors( + cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2 * sizeof(int), stream)); - printf("\nLaunching %d blocks with %d threads...\n\n",blocksPerGrid, threadsPerBlock); + // Launch the kernel + int threadsPerBlock = 0; + int blocksPerGrid = 0; + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( + &blocksPerGrid, &threadsPerBlock, oddEvenCountAndSumCG, 0, 0)); - oddEvenCountAndSumCG<<>>(d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize); + printf("\nLaunching %d blocks with %d threads...\n\n", blocksPerGrid, + threadsPerBlock); - checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, 2*sizeof(int), cudaMemcpyDeviceToHost, stream)); - - printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n", arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0], h_sumOfOddEvenElems[1]); - printf("\n...Done.\n\n"); + oddEvenCountAndSumCG<<>>( + d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize); - delete[] h_inputArr; - delete[] h_numOfOdds; - delete[] h_sumOfOddEvenElems; + checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), + cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, + 2 * sizeof(int), cudaMemcpyDeviceToHost, + stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaFree(d_inputArr)); - 
checkCudaErrors(cudaFree(d_numOfOdds)); - checkCudaErrors(cudaFree(d_sumOfOddEvenElems)); + printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n", + arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0], + h_sumOfOddEvenElems[1]); + printf("\n...Done.\n\n"); - return EXIT_SUCCESS; + checkCudaErrors(cudaFreeHost(h_inputArr)); + checkCudaErrors(cudaFreeHost(h_numOfOdds)); + checkCudaErrors(cudaFreeHost(h_sumOfOddEvenElems)); + + checkCudaErrors(cudaFree(d_inputArr)); + checkCudaErrors(cudaFree(d_numOfOdds)); + checkCudaErrors(cudaFree(d_sumOfOddEvenElems)); + + return EXIT_SUCCESS; } diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj index d2c55039..2399c9ec 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj index 27c33226..fe7bb11f 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/boxFilterNPP/README.md b/Samples/boxFilterNPP/README.md index c14fe05e..54f26d6a 100644 --- a/Samples/boxFilterNPP/README.md +++ b/Samples/boxFilterNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj index b233a75e..580c3df5 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj index 435986b7..91f4db2d 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/cannyEdgeDetectorNPP/README.md b/Samples/cannyEdgeDetectorNPP/README.md index 2d67e1f0..0c969c8e 100644 --- a/Samples/cannyEdgeDetectorNPP/README.md +++ b/Samples/cannyEdgeDetectorNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
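In the binaryPartitionCG host code above, the hard-coded 1024-thread launch is replaced by cudaOccupancyMaxPotentialBlockSize, letting the runtime pick a block size that maximizes occupancy for the kernel. A short sketch of the same pattern, assuming a trivial grid-covering kernel rather than the sample's grid-stride one:

    // Sketch: occupancy-driven launch configuration.
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    __global__ void scaleKernel(float *data, float s, unsigned int n) {
      unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= s;
    }

    void launchScaled(float *d_data, float s, unsigned int n, cudaStream_t stream) {
      int minGridSize = 0, blockSize = 0;
      // Ask the runtime for the block size that maximizes occupancy for this kernel.
      checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
                                                         scaleKernel, 0, 0));
      int gridSize = (n + blockSize - 1) / blockSize;  // cover all n elements
      printf("Launching %d blocks with %d threads\n", gridSize, blockSize);
      scaleKernel<<<gridSize, blockSize, 0, stream>>>(d_data, s, n);
      checkCudaErrors(cudaGetLastError());
    }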
## Build and Run diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj index 78c395f6..f0140b6a 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj index 318815b6..f919b081 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/concurrentKernels/README.md b/Samples/concurrentKernels/README.md index f2933e27..b3a52d91 100644 --- a/Samples/concurrentKernels/README.md +++ b/Samples/concurrentKernels/README.md @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj index a0e1b67a..f8036198 100644 --- a/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj +++ b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj index 6c992bc0..f6224739 100644 --- a/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj +++ b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/Makefile b/Samples/conjugateGradientCudaGraphs/Makefile index 13d1e4ee..6609440a 100644 --- a/Samples/conjugateGradientCudaGraphs/Makefile +++ b/Samples/conjugateGradientCudaGraphs/Makefile @@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - conjugateGradientCudaGraphs is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/conjugateGradientCudaGraphs/README.md b/Samples/conjugateGradientCudaGraphs/README.md index f9a787c9..1e723476 100644 --- a/Samples/conjugateGradientCudaGraphs/README.md +++ b/Samples/conjugateGradientCudaGraphs/README.md @@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
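The conjugateGradientCudaGraphs changes that follow sit inside the capture/replay flow named in the README's API list (cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphLaunch). A hedged sketch of that flow with placeholder work standing in for the sample's cuBLAS/cuSPARSE iteration body:

    // Sketch: capture work issued to a stream into a graph, then replay it.
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    void runCaptured(float *d_buf, size_t bytes, cudaStream_t stream, int iterations) {
      cudaGraph_t graph;
      cudaGraphExec_t graphExec;

      checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
      // Placeholder work; the sample records a sequence of cuBLAS/cuSPARSE calls here.
      checkCudaErrors(cudaMemsetAsync(d_buf, 0, bytes, stream));
      checkCudaErrors(cudaStreamEndCapture(stream, &graph));

      checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
      for (int i = 0; i < iterations; i++) {
        checkCudaErrors(cudaGraphLaunch(graphExec, stream));  // replay without re-recording
      }
      checkCudaErrors(cudaStreamSynchronize(stream));

      checkCudaErrors(cudaGraphExecDestroy(graphExec));
      checkCudaErrors(cudaGraphDestroy(graph));
    }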
## Build and Run diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu index b1528438..7a4a5c8a 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu @@ -25,7 +25,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* * This sample implements a conjugate gradient solver on GPU * using CUBLAS and CUSPARSE with CUDA Graphs @@ -46,7 +45,6 @@ #include // helper function CUDA error checking and initialization #include // helper for shared functions common to CUDA Samples - const char *sSDKname = "conjugateGradientCudaGraphs"; #ifndef WITH_GRAPH @@ -145,12 +143,12 @@ int main(int argc, char **argv) { /* Generate a random tridiagonal symmetric matrix in CSR format */ N = 1048576; nz = (N - 2) * 3 + 4; - I = (int *)malloc(sizeof(int) * (N + 1)); - J = (int *)malloc(sizeof(int) * nz); - val = (float *)malloc(sizeof(float) * nz); + checkCudaErrors(cudaMallocHost(&I, sizeof(int) * (N + 1))); + checkCudaErrors(cudaMallocHost(&J, sizeof(int) * nz)); + checkCudaErrors(cudaMallocHost(&val, sizeof(float) * nz)); genTridiag(I, J, val, N, nz); - x = (float *)malloc(sizeof(float) * N); + checkCudaErrors(cudaMallocHost(&x, sizeof(float) * N)); rhs = (float *)malloc(sizeof(float) * N); for (int i = 0; i < N; i++) { @@ -192,9 +190,9 @@ int main(int argc, char **argv) { /* Wrap raw data into cuSPARSE generic API objects */ cusparseSpMatDescr_t matA = NULL; - checkCudaErrors(cusparseCreateCsr( - &matA, N, N, nz, d_row, d_col, d_val, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, d_row, d_col, d_val, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); cusparseDnVecDescr_t vecx = NULL; checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F)); cusparseDnVecDescr_t vecp = NULL; @@ -206,7 +204,7 @@ int main(int argc, char **argv) { size_t bufferSize = 0; checkCudaErrors(cusparseSpMV_bufferSize( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); + &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); void *buffer = NULL; checkCudaErrors(cudaMalloc(&buffer, bufferSize)); @@ -234,9 +232,9 @@ int main(int argc, char **argv) { beta = 0.0; checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cublasSetStream(cublasHandle, stream1)); checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1)); @@ -248,9 +246,9 @@ int main(int argc, char **argv) { k = 1; // First Iteration when k=1 starts checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1)); - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); 
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); @@ -290,9 +288,9 @@ int main(int argc, char **argv) { checkCudaErrors( cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST)); - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1)); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); @@ -335,8 +333,8 @@ int main(int argc, char **argv) { checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, + &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, buffer)); cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); @@ -395,23 +393,31 @@ int main(int argc, char **argv) { cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); - if (matA ) { checkCudaErrors(cusparseDestroySpMat(matA)); } - if (vecx ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); } - if (vecAx ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); } - if (vecp ) { checkCudaErrors(cusparseDestroyDnVec(vecp)); } + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + if (vecp) { + checkCudaErrors(cusparseDestroyDnVec(vecp)); + } - free(I); - free(J); - free(val); - free(x); + checkCudaErrors(cudaFreeHost(I)); + checkCudaErrors(cudaFreeHost(J)); + checkCudaErrors(cudaFreeHost(val)); + checkCudaErrors(cudaFreeHost(x)); free(rhs); - cudaFree(d_col); - cudaFree(d_row); - cudaFree(d_val); - cudaFree(d_x); - cudaFree(d_r); - cudaFree(d_p); - cudaFree(d_Ax); + checkCudaErrors(cudaFree(d_col)); + checkCudaErrors(cudaFree(d_row)); + checkCudaErrors(cudaFree(d_val)); + checkCudaErrors(cudaFree(d_x)); + checkCudaErrors(cudaFree(d_r)); + checkCudaErrors(cudaFree(d_p)); + checkCudaErrors(cudaFree(d_Ax)); printf("Test Summary: Error amount = %f\n", err); exit((k <= max_iter) ? 
0 : 1); diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj index ed755a2c..a662b455 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj index d14e10fe..5fe964d8 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/README.md b/Samples/conjugateGradientMultiBlockCG/README.md index 0728e9af..217fabf3 100644 --- a/Samples/conjugateGradientMultiBlockCG/README.md +++ b/Samples/conjugateGradientMultiBlockCG/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj index 1d447230..9692e5fe 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj index 67082acf..9952e93f 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/README.md b/Samples/conjugateGradientMultiDeviceCG/README.md index 4c6b7ac8..099a61bd 100644 --- a/Samples/conjugateGradientMultiDeviceCG/README.md +++ b/Samples/conjugateGradientMultiDeviceCG/README.md @@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
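The edits to conjugateGradientCudaGraphs.cu above move from the deprecated CUSPARSE_MV_ALG_DEFAULT enum to CUSPARSE_SPMV_ALG_DEFAULT and, just as importantly, pass the workspace pointer itself (buffer) to cusparseSpMV rather than its address (&buffer). Condensed, the intended call sequence, using the handle and descriptors the sample has already created, is:

    // Query the workspace size, allocate it, then pass the buffer pointer (not &buffer).
    size_t bufferSize = 0;
    checkCudaErrors(cusparseSpMV_bufferSize(
        cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
        &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));

    void *buffer = NULL;
    checkCudaErrors(cudaMalloc(&buffer, bufferSize));

    checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F,
                                 CUSPARSE_SPMV_ALG_DEFAULT, buffer));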
## Build and Run diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu index f7fad12d..d09c9da8 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu @@ -223,8 +223,10 @@ __device__ void gpuDotProduct(float *vecA, float *vecB, int size, cg::sync(cta); if (tile32.meta_group_rank() == 0) { - temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0; - temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); + temp_sum = tile32.thread_rank() < tile32.meta_group_size() + ? tmp[tile32.thread_rank()] + : 0.0; + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); if (tile32.thread_rank() == 0) { atomicAdd(&grid_dot_result, temp_sum); @@ -239,8 +241,9 @@ __device__ void gpuCopyVector(float *srcA, float *destB, int size, } } -__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, int size, - const cg::multi_grid_group &multi_grid) { +__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, + int size, + const cg::multi_grid_group &multi_grid) { for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) { y[i] = a * x[i] + scale * y[i]; } @@ -360,10 +363,11 @@ std::multimap, int> getIdenticalGPUs() { // Filter unsupported devices if (deviceProp.cooperativeMultiDeviceLaunch && deviceProp.concurrentManagedAccess) { - identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), i); + identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), + i); } printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, - deviceProp.name, deviceProp.major, deviceProp.minor); + deviceProp.name, deviceProp.major, deviceProp.minor); } return identicalGpus; @@ -387,15 +391,17 @@ int main(int argc, char **argv) { auto bestFit = std::make_pair(it, it); // use std::distance to find the largest number of GPUs amongst architectures - auto distance = [](decltype(bestFit) p){return std::distance(p.first, p.second);}; + auto distance = [](decltype(bestFit) p) { + return std::distance(p.first, p.second); + }; // Read each unique key/pair element in order for (; it != end; it = gpusByArch.upper_bound(it->first)) { // first and second are iterators bounded within the architecture group auto testFit = gpusByArch.equal_range(it->first); - // Always use devices with highest architecture version or whichever has the most devices available - if (distance(bestFit) <= distance(testFit)) - bestFit = testFit; + // Always use devices with highest architecture version or whichever has the + // most devices available + if (distance(bestFit) <= distance(testFit)) bestFit = testFit; } if (distance(bestFit) < kNumGpusRequired) { @@ -408,33 +414,35 @@ int main(int argc, char **argv) { std::set bestFitDeviceIds; - // check & select peer-to-peer access capable GPU devices as enabling p2p access between participating + // check & select peer-to-peer access capable GPU devices as enabling p2p + // access between participating // GPUs gives better performance for multi_grid sync. 
for (auto itr = bestFit.first; itr != bestFit.second; itr++) { int deviceId = itr->second; checkCudaErrors(cudaSetDevice(deviceId)); - std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds](decltype(*itr) mapPair) { - if (deviceId != mapPair.second) - { + std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds, + &kNumGpusRequired]( + decltype(*itr) mapPair) { + if (deviceId != mapPair.second) { int access = 0; - checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); - printf("Device=%d %s Access Peer Device=%d\n", deviceId, access ? "CAN" : "CANNOT", mapPair.second); + checkCudaErrors( + cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); + printf("Device=%d %s Access Peer Device=%d\n", deviceId, + access ? "CAN" : "CANNOT", mapPair.second); if (access && bestFitDeviceIds.size() < kNumGpusRequired) { bestFitDeviceIds.emplace(deviceId); bestFitDeviceIds.emplace(mapPair.second); - } - else { + } else { printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); } } }); - if (bestFitDeviceIds.size() >= kNumGpusRequired) - { + if (bestFitDeviceIds.size() >= kNumGpusRequired) { printf("Selected p2p capable devices - "); - for (auto devicesItr = bestFitDeviceIds.begin(); devicesItr != bestFitDeviceIds.end(); devicesItr++) - { + for (auto devicesItr = bestFitDeviceIds.begin(); + devicesItr != bestFitDeviceIds.end(); devicesItr++) { printf("deviceId = %d ", *devicesItr); } printf("\n"); @@ -442,33 +450,34 @@ int main(int argc, char **argv) { } } - // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p capable, + // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p + // capable, // hence we add it without p2p capability check. - if (!bestFitDeviceIds.size()) - { - printf("Devices involved are not p2p capable.. selecting %zu of them\n", kNumGpusRequired); - std::for_each(bestFit.first, bestFit.second, [&bestFitDeviceIds](decltype(*bestFit.first) mapPair) { - if (bestFitDeviceIds.size() < kNumGpusRequired) { - bestFitDeviceIds.emplace(mapPair.second); - } - else { - printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); - } - // Insert the sequence into the deviceIds set - }); - } - else - { - // perform cudaDeviceEnablePeerAccess in both directions for all participating devices - // of a cudaLaunchCooperativeKernelMultiDevice call this gives better performance for multi_grid sync. - for (auto p1_itr = bestFitDeviceIds.begin(); p1_itr != bestFitDeviceIds.end(); p1_itr++) - { + if (!bestFitDeviceIds.size()) { + printf("Devices involved are not p2p capable.. selecting %zu of them\n", + kNumGpusRequired); + std::for_each(bestFit.first, bestFit.second, + [&bestFitDeviceIds, + &kNumGpusRequired](decltype(*bestFit.first) mapPair) { + if (bestFitDeviceIds.size() < kNumGpusRequired) { + bestFitDeviceIds.emplace(mapPair.second); + } else { + printf("Ignoring device %i (max devices exceeded)\n", + mapPair.second); + } + // Insert the sequence into the deviceIds set + }); + } else { + // perform cudaDeviceEnablePeerAccess in both directions for all + // participating devices of a cudaLaunchCooperativeKernelMultiDevice call + // this gives better performance for multi_grid sync. 
+ for (auto p1_itr = bestFitDeviceIds.begin(); + p1_itr != bestFitDeviceIds.end(); p1_itr++) { checkCudaErrors(cudaSetDevice(*p1_itr)); - for (auto p2_itr = bestFitDeviceIds.begin(); p2_itr != bestFitDeviceIds.end(); p2_itr++) - { - if (*p1_itr != *p2_itr) - { - checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0 )); + for (auto p2_itr = bestFitDeviceIds.begin(); + p2_itr != bestFitDeviceIds.end(); p2_itr++) { + if (*p1_itr != *p2_itr) { + checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0)); checkCudaErrors(cudaSetDevice(*p1_itr)); } } @@ -518,7 +527,7 @@ int main(int argc, char **argv) { std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl; cudaStream_t nStreams[kNumGpusRequired]; - int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK/32) + 1); + int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK / 32) + 1); int numBlocksPerSm = INT_MAX; int numThreads = THREADS_PER_BLOCK; int numSms = INT_MAX; @@ -530,17 +539,16 @@ int main(int argc, char **argv) { checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId)); - int numBlocksPerSm_current=0; + int numBlocksPerSm_current = 0; checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, sMemSize)); + &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, + sMemSize)); - if (numBlocksPerSm > numBlocksPerSm_current) - { - numBlocksPerSm = numBlocksPerSm_current; + if (numBlocksPerSm > numBlocksPerSm_current) { + numBlocksPerSm = numBlocksPerSm_current; } - if (numSms > deviceProp.multiProcessorCount) - { - numSms = deviceProp.multiProcessorCount; + if (numSms > deviceProp.multiProcessorCount) { + numSms = deviceProp.multiProcessorCount; } deviceId++; } @@ -554,7 +562,7 @@ int main(int argc, char **argv) { int device_count = 0; int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK; - deviceId = bestFitDeviceIds.begin();; + deviceId = bestFitDeviceIds.begin(); while (deviceId != bestFitDeviceIds.end()) { checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaStreamCreate(&nStreams[device_count])); @@ -621,14 +629,15 @@ int main(int argc, char **argv) { printf("Total threads per GPU = %d numBlocksPerSm = %d\n", numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm); - dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1); + dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), + dimBlock(THREADS_PER_BLOCK, 1, 1); void *kernelArgs[] = { (void *)&I, (void *)&J, (void *)&val, (void *)&x, (void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result, (void *)&nz, (void *)&N, (void *)&tol, }; - cudaLaunchParams *launchParamsList = (cudaLaunchParams *)malloc( - sizeof(cudaLaunchParams) * kNumGpusRequired); + cudaLaunchParams *launchParamsList = + (cudaLaunchParams *)malloc(sizeof(cudaLaunchParams) * kNumGpusRequired); for (int i = 0; i < kNumGpusRequired; i++) { launchParamsList[i].func = (void *)multiGpuConjugateGradient; launchParamsList[i].gridDim = dimGrid; @@ -645,12 +654,11 @@ int main(int argc, char **argv) { cudaCooperativeLaunchMultiDeviceNoPreSync | cudaCooperativeLaunchMultiDeviceNoPostSync)); + checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); checkCudaErrors( - cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); - checkCudaErrors( - cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); + cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); - deviceId = bestFitDeviceIds.begin();; + deviceId 
= bestFitDeviceIds.begin(); device_count = 0; while (deviceId != bestFitDeviceIds.end()) { checkCudaErrors(cudaSetDevice(*deviceId)); @@ -658,7 +666,7 @@ int main(int argc, char **argv) { deviceId++; } - r1 = *dot_result; + r1 = (float)*dot_result; printf("GPU Final, residual = %e \n ", sqrt(r1)); diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj index f6ab8299..281b9f54 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj index bd54470f..da03363e 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cuSolverDn_LinearSolver/Makefile b/Samples/cuSolverDn_LinearSolver/Makefile index f8b34a31..61e55f47 100644 --- a/Samples/cuSolverDn_LinearSolver/Makefile +++ b/Samples/cuSolverDn_LinearSolver/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - cuSolverDn_LinearSolver is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ifeq ($(TARGET_OS),linux) ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" endif diff --git a/Samples/cuSolverDn_LinearSolver/README.md b/Samples/cuSolverDn_LinearSolver/README.md index 194c9e42..185f577b 100644 --- a/Samples/cuSolverDn_LinearSolver/README.md +++ b/Samples/cuSolverDn_LinearSolver/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
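The multi-device CG changes above keep the same peer-to-peer setup: each candidate pair is checked with cudaDeviceCanAccessPeer and, when access is possible, enabled in both directions with cudaDeviceEnablePeerAccess before the cooperative multi-device launch. A minimal sketch of that setup for one pair of device IDs (helper name is illustrative, not from the sample):

    // Sketch: enable P2P access in both directions between two GPUs.
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    void enablePeerAccessPair(int devA, int devB) {
      int aCanB = 0, bCanA = 0;
      checkCudaErrors(cudaDeviceCanAccessPeer(&aCanB, devA, devB));
      checkCudaErrors(cudaDeviceCanAccessPeer(&bCanA, devB, devA));
      if (!aCanB || !bCanA) return;  // caller falls back to the non-P2P path

      checkCudaErrors(cudaSetDevice(devA));
      checkCudaErrors(cudaDeviceEnablePeerAccess(devB, 0));  // flags must be 0

      checkCudaErrors(cudaSetDevice(devB));
      checkCudaErrors(cudaDeviceEnablePeerAccess(devA, 0));
    }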
## Build and Run diff --git a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj index 910879f6..8d77015c 100644 --- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj +++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -110,6 +110,6 @@ - + diff --git a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj index 194bb165..d3f1e05e 100644 --- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj +++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/cuSolverSp_LinearSolver/Makefile b/Samples/cuSolverSp_LinearSolver/Makefile index 59a043d9..cc002581 100644 --- a/Samples/cuSolverSp_LinearSolver/Makefile +++ b/Samples/cuSolverSp_LinearSolver/Makefile @@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - cuSolverSp_LinearSolver is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ifeq ($(TARGET_OS),linux) ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" endif diff --git a/Samples/cuSolverSp_LinearSolver/README.md b/Samples/cuSolverSp_LinearSolver/README.md index 45e2e442..35c105ee 100644 --- a/Samples/cuSolverSp_LinearSolver/README.md +++ b/Samples/cuSolverSp_LinearSolver/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp index fabb33fb..331c733b 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp @@ -495,13 +495,13 @@ int main(int argc, char *argv[]) { size_t bufferSize = 0; checkCudaErrors(cusparseSpMV_bufferSize( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); + &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); void *buffer = NULL; checkCudaErrors(cudaMalloc(&buffer, bufferSize)); checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_MV_ALG_DEFAULT, &buffer)); + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost, stream)); @@ -559,7 +559,7 @@ int main(int argc, char *argv[]) { checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_MV_ALG_DEFAULT, &buffer)); + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost, stream)); diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj index 57cdcc14..1bdf5779 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -110,6 +110,6 @@ - + diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj index 8f85e583..665f795e 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/cudaCompressibleMemory/README.md b/Samples/cudaCompressibleMemory/README.md index aaa62565..6492f737 100644 --- a/Samples/cudaCompressibleMemory/README.md +++ b/Samples/cudaCompressibleMemory/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj index 58de5026..4f450a2c 100644 --- a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj index 19b01a21..a932dd47 100644 --- a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cudaNvSci/Makefile b/Samples/cudaNvSci/Makefile index 1ef041a8..d7db232f 100644 --- a/Samples/cudaNvSci/Makefile +++ b/Samples/cudaNvSci/Makefile @@ -279,6 +279,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - cudaNvSci is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/cudaNvSci/README.md b/Samples/cudaNvSci/README.md index 8a4f56bf..58d95d19 100644 --- a/Samples/cudaNvSci/README.md +++ b/Samples/cudaNvSci/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cudaNvSciNvMedia/README.md b/Samples/cudaNvSciNvMedia/README.md index 2ff41323..a8e1a41c 100644 --- a/Samples/cudaNvSciNvMedia/README.md +++ b/Samples/cudaNvSciNvMedia/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cudaOpenMP/README.md b/Samples/cudaOpenMP/README.md index fb9c1a2b..c2f88493 100644 --- a/Samples/cudaOpenMP/README.md +++ b/Samples/cudaOpenMP/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
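The cudaOpenMP sample drives multiple GPUs from OpenMP host threads using the runtime calls listed above (cudaMalloc, cudaFree, cudaMemcpy). A hedged sketch of that thread-per-GPU pattern, not the sample's code:

    // Sketch: one OpenMP host thread per visible GPU, each issuing its own CUDA work.
    #include <omp.h>
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    void runPerGpu(size_t bytesPerGpu) {
      int numGpus = 0;
      checkCudaErrors(cudaGetDeviceCount(&numGpus));
    #pragma omp parallel num_threads(numGpus)
      {
        int dev = omp_get_thread_num();
        checkCudaErrors(cudaSetDevice(dev));   // bind this host thread to one GPU
        void *d_buf = NULL;
        checkCudaErrors(cudaMalloc(&d_buf, bytesPerGpu));
        checkCudaErrors(cudaMemset(d_buf, 0, bytesPerGpu));
        checkCudaErrors(cudaDeviceSynchronize());
        checkCudaErrors(cudaFree(d_buf));
        printf("host thread %d finished on device %d\n", omp_get_thread_num(), dev);
      }
    }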
## Build and Run diff --git a/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj b/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj index 28041d97..d3a04a75 100644 --- a/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj +++ b/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj b/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj index 51dfcf9b..59018165 100644 --- a/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj +++ b/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/README.md b/Samples/cudaTensorCoreGemm/README.md index 4f5b2152..502941ae 100644 --- a/Samples/cudaTensorCoreGemm/README.md +++ b/Samples/cudaTensorCoreGemm/README.md @@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj index 748796aa..622ffe8f 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj index cbb96dc4..fb649d4c 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/deviceQuery/README.md b/Samples/deviceQuery/README.md index bbbb7d18..76fb08d5 100644 --- a/Samples/deviceQuery/README.md +++ b/Samples/deviceQuery/README.md @@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
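The deviceQuery edits below are formatting-only (wider field widths for the multiprocessor counts and re-wrapped sprintf_s calls). For context, a hedged sketch of the per-device query loop those printed lines come from, using the _ConvertSMVer2Cores helper from helper_cuda.h:

    // Sketch: enumerate devices and report SM and CUDA-core counts.
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <helper_cuda.h>   // _ConvertSMVer2Cores, checkCudaErrors

    void printCoreCounts() {
      int deviceCount = 0;
      checkCudaErrors(cudaGetDeviceCount(&deviceCount));
      for (int dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp prop;
        checkCudaErrors(cudaGetDeviceProperties(&prop, dev));
        int coresPerSM = _ConvertSMVer2Cores(prop.major, prop.minor);
        printf("  (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
               prop.multiProcessorCount, coresPerSM,
               coresPerSM * prop.multiProcessorCount);
      }
    }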
## Build and Run diff --git a/Samples/deviceQuery/deviceQuery.cpp b/Samples/deviceQuery/deviceQuery.cpp index c002cc5c..4c6c3369 100644 --- a/Samples/deviceQuery/deviceQuery.cpp +++ b/Samples/deviceQuery/deviceQuery.cpp @@ -112,10 +112,10 @@ int main(int argc, char **argv) { char msg[256]; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(msg, sizeof(msg), - " Total amount of global memory: %.0f MBytes " - "(%llu bytes)\n", - static_cast(deviceProp.totalGlobalMem / 1048576.0f), - (unsigned long long)deviceProp.totalGlobalMem); + " Total amount of global memory: %.0f MBytes " + "(%llu bytes)\n", + static_cast(deviceProp.totalGlobalMem / 1048576.0f), + (unsigned long long)deviceProp.totalGlobalMem); #else snprintf(msg, sizeof(msg), " Total amount of global memory: %.0f MBytes " @@ -125,7 +125,7 @@ int main(int argc, char **argv) { #endif printf("%s", msg); - printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", + printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * @@ -250,8 +250,7 @@ int main(int argc, char **argv) { "device)", "Exclusive Process (many threads in one process is able to use " "::cudaSetDevice() with this device)", - "Unknown", - NULL}; + "Unknown", NULL}; printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); } @@ -272,7 +271,7 @@ int main(int argc, char **argv) { // must be enabled to support this && prop[i].tccDriver #endif - ) { + ) { // This is an array of P2P capable GPUs gpuid[gpu_p2p_count++] = i; } @@ -307,7 +306,8 @@ int main(int argc, char **argv) { // driver version sProfileString += ", CUDA Driver Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); + sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, + (driverVersion % 100) / 10); #else snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10); @@ -317,7 +317,8 @@ int main(int argc, char **argv) { // Runtime version sProfileString += ", CUDA Runtime Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); + sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, + (runtimeVersion % 100) / 10); #else snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10); diff --git a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj index 9fc4c3e9..5bd56297 100644 --- a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj index b8fd169a..f8532544 100644 --- a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/dmmaTensorCoreGemm/README.md b/Samples/dmmaTensorCoreGemm/README.md index 9730739a..aa6e6f16 100644 --- a/Samples/dmmaTensorCoreGemm/README.md +++ b/Samples/dmmaTensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA 
Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj index 7d8414e5..5ea929c1 100644 --- a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj index 4fe4bfc6..b415db92 100644 --- a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/globalToShmemAsyncCopy/README.md b/Samples/globalToShmemAsyncCopy/README.md index bd13a04d..233d5b50 100644 --- a/Samples/globalToShmemAsyncCopy/README.md +++ b/Samples/globalToShmemAsyncCopy/README.md @@ -30,7 +30,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu index ad0229a7..c1b70b46 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu @@ -28,12 +28,14 @@ /** * Matrix multiplication: C = A * B. * - * This sample demonstrates implements matrix multiplication which makes use of shared memory - * to ensure data reuse, the matrix multiplication is done using tiling approach. - * With compute capability 8.0 or higher the CUDA kernels involved uses asynchronously copy data - * from global to shared memory; a.k.a., async-copy. - * This sample has been written for clarity of exposition to illustrate various CUDA programming - * principles, not with the goal of providing the most performant generic kernel for matrix multiplication. + * This sample demonstrates implements matrix multiplication which makes use of + * shared memory to ensure data reuse, the matrix multiplication is done using + * tiling approach. + * With compute capability 8.0 or higher the CUDA kernels involved uses + * asynchronously copy data from global to shared memory; a.k.a., async-copy. + * This sample has been written for clarity of exposition to illustrate various + * CUDA programming principles, not with the goal of providing the most + * performant generic kernel for matrix multiplication. 
*/ // System includes @@ -47,7 +49,7 @@ #if __CUDA_ARCH__ >= 700 #include #endif -#include +#include namespace cg = cooperative_groups; @@ -55,966 +57,1015 @@ namespace cg = cooperative_groups; #include #include -enum kernels -{ - AsyncCopyMultiStageLargeChunk = 0, - AsyncCopyLargeChunk = 1, - AsyncCopyLargeChunkAWBarrier = 2, - AsyncCopyMultiStageSharedState = 3, - AsyncCopyMultiStage = 4, - AsyncCopySingleStage = 5, - Naive = 6, - NaiveLargeChunk = 7 +enum kernels { + AsyncCopyMultiStageLargeChunk = 0, + AsyncCopyLargeChunk = 1, + AsyncCopyLargeChunkAWBarrier = 2, + AsyncCopyMultiStageSharedState = 3, + AsyncCopyMultiStage = 4, + AsyncCopySingleStage = 5, + Naive = 6, + NaiveLargeChunk = 7 }; -const char* kernelNames[] = {"AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk", - "AsyncCopyLargeChunkAWBarrier", "AsyncCopyMultiStageSharedState", - "AsyncCopyMultiStage", "AsyncCopySingleStage", "Naive", "NaiveLargeChunk"}; +const char *kernelNames[] = {"AsyncCopyMultiStageLargeChunk", + "AsyncCopyLargeChunk", + "AsyncCopyLargeChunkAWBarrier", + "AsyncCopyMultiStageSharedState", + "AsyncCopyMultiStage", + "AsyncCopySingleStage", + "Naive", + "NaiveLargeChunk"}; constexpr int blockSize = 16; // Multi Stage memcpy_async pipeline with large chunk copy -template __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Requires BLOCK_SIZE % 4 == 0 +template +__global__ void MatrixMulAsyncCopyMultiStageLargeChunk( + float *__restrict__ C, const float *__restrict__ A, + const float *__restrict__ B, int wA, int wB) { + // Requires BLOCK_SIZE % 4 == 0 - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ alignas(alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ alignas( + alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ alignas(alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ alignas( + alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * (BLOCK_SIZE) * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * (BLOCK_SIZE)*blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + 
// Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - const int t4x = threadIdx.x * 4; - const auto shape4 = cuda::aligned_size_t(sizeof(float4)); + const int t4x = threadIdx.x * 4; + const auto shape4 = cuda::aligned_size_t(sizeof(float4)); - cuda::pipeline pipe = cuda::make_pipeline(); - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; a += aStep, b += bStep, ++i ) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix - for ( ; aStage <= a + aStep * maxPipelineStages ; aStage += aStep, bStage += bStep, ++iStage ) - { - pipe.producer_acquire(); - if ( aStage <= aEnd && t4x < BLOCK_SIZE ) - { - // Rotating buffer - const int j = iStage % maxPipelineStages; - cuda::memcpy_async(&As[j][threadIdx.y][t4x], &A[aStage + wA * threadIdx.y + t4x], shape4, pipe); - cuda::memcpy_async(&Bs[j][threadIdx.y][t4x], &B[aStage + wA * threadIdx.y + t4x], shape4, pipe); - } - pipe.producer_commit(); - } - - pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); + cuda::pipeline pipe = cuda::make_pipeline(); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, + iStage = 0; + a <= aEnd; a += aStep, b += bStep, ++i) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + for (; aStage <= a + aStep * maxPipelineStages; + aStage += aStep, bStage += bStep, ++iStage) { + pipe.producer_acquire(); + if (aStage <= aEnd && t4x < BLOCK_SIZE) { // Rotating buffer - const int j = i % maxPipelineStages; - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; - } - pipe.consumer_release(); - - // Don't have to synchronize because maxPipelineStages is greater than one - // therefore next iteration is loading to a different buffer. 
+ const int j = iStage % maxPipelineStages; + cuda::memcpy_async(&As[j][threadIdx.y][t4x], + &A[aStage + wA * threadIdx.y + t4x], shape4, pipe); + cuda::memcpy_async(&Bs[j][threadIdx.y][t4x], + &B[aStage + wA * threadIdx.y + t4x], shape4, pipe); + } + pipe.producer_commit(); } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; -} - - -// Single Stage memcpy_async pipeline with Large copy chunk (float4) -template __global__ void MatrixMulAsyncCopyLargeChunk(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Requires BLOCK_SIZE % 4 == 0 - - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; - - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; - - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; - - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; - - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; - - // Single-stage pipeline version - float Csub = 0.0; - - const int t4x = threadIdx.x * 4; - const auto shape4 = cuda::aligned_size_t(sizeof(float4)); - cuda::pipeline pipe = cuda::make_pipeline(); - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; - // a subset of threads loads a contiguous chunk of elements. - - // Previously, per-thread: - // As[ty][tx] = A[a + wA * ty + tx]; - // Bs[ty][tx] = B[b + wB * ty + tx]; - - // Now, one fourth of the threads load four elements of each matrix - if ( t4x < BLOCK_SIZE ) { - - pipe.producer_acquire(); - - cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x], shape4, pipe); - cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x], shape4, pipe); - - pipe.producer_commit(); - pipe.consumer_wait(); - } - - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - pipe.consumer_release(); - - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. 
- __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; -} - -// Single Stage memcpy_async pipeline with Large copy chunk (float4) using arrive-wait barrier -template __global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { -#if __CUDA_ARCH__ >= 700 -#pragma diag_suppress static_var_with_dynamic_init - // Requires BLOCK_SIZE % 4 == 0 - - __shared__ cuda::barrier bar; - - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - - if (threadIdx.x == 0) { - init(&bar, blockDim.x*blockDim.y); - } + pipe.consumer_wait(); + // Synchronize to make sure the matrices are loaded __syncthreads(); - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Rotating buffer + const int j = i % maxPipelineStages; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; - - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; - - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; - - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; - - float Csub = 0.0; - - const int t4x = threadIdx.x * 4; - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; - // a subset of threads loads a contiguous chunk of elements. - - // Now, one fourth of the threads load four elements of each matrix - if ( t4x < BLOCK_SIZE ) { - float4 * const A4s = reinterpret_cast(& As[threadIdx.y][t4x]); - float4 * const B4s = reinterpret_cast(& Bs[threadIdx.y][t4x]); - const float4 * const A4 = reinterpret_cast(& A[a + wA * threadIdx.y + t4x]); - const float4 * const B4 = reinterpret_cast(& B[a + wA * threadIdx.y + t4x]); - - cuda::memcpy_async(A4s, A4, sizeof(float4), bar); - cuda::memcpy_async(B4s, B4, sizeof(float4), bar); - } - - // Synchronize to make sure the matrices are loaded - bar.arrive_and_wait(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + pipe.consumer_release(); - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. - bar.arrive_and_wait(); + // Don't have to synchronize because maxPipelineStages is greater than one + // therefore next iteration is loading to a different buffer. 
+ } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +// Single Stage memcpy_async pipeline with Large copy chunk (float4) +template +__global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C, + const float *__restrict__ A, + const float *__restrict__ B, + int wA, int wB) { + // Requires BLOCK_SIZE % 4 == 0 + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Single-stage pipeline version + float Csub = 0.0; + + const int t4x = threadIdx.x * 4; + const auto shape4 = cuda::aligned_size_t(sizeof(float4)); + cuda::pipeline pipe = cuda::make_pipeline(); + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. + + // Previously, per-thread: + // As[ty][tx] = A[a + wA * ty + tx]; + // Bs[ty][tx] = B[b + wB * ty + tx]; + + // Now, one fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + pipe.producer_acquire(); + + cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x], + shape4, pipe); + cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x], + shape4, pipe); + + pipe.producer_commit(); + pipe.consumer_wait(); } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Synchronize to make sure the matrices are loaded + __syncthreads(); + +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + pipe.consumer_release(); + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. 
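The shape4 value threaded through these float4 kernels is a cuda::aligned_size_t: a byte count whose guaranteed alignment travels as a template argument, which lets memcpy_async choose wider (and, on hardware that supports it, asynchronous) copy instructions. A minimal sketch follows; the helper name stageRow is made up, and it assumes the pointers really do have the stated alignments.

```cuda
// Illustrative helper (not part of the sample): stage one 16-byte chunk and
// one single float through the same thread-scope pipeline.
#include <cuda/pipeline>

__device__ void stageRow(float *smem4, const float *gmem4,  // 16-byte aligned
                         float *smem1, const float *gmem1,  // 4-byte aligned
                         cuda::pipeline<cuda::thread_scope_thread> &pipe) {
  // Size and guaranteed alignment travel together in the "shape".
  const auto shape4 = cuda::aligned_size_t<alignof(float4)>(sizeof(float4));
  const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));

  pipe.producer_acquire();
  cuda::memcpy_async(smem4, gmem4, shape4, pipe);  // one float4-sized chunk
  cuda::memcpy_async(smem1, gmem1, shape1, pipe);  // one float element
  pipe.producer_commit();
}
```

Passing a plain size_t would also be correct, but the copy could then only assume minimal alignment; conversely, using aligned_size_t with pointers that are not actually aligned that way is undefined.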
+ __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +// Single Stage memcpy_async pipeline with Large copy chunk (float4) using +// arrive-wait barrier +template +__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier( + float *__restrict__ C, const float *__restrict__ A, + const float *__restrict__ B, int wA, int wB) { +#if __CUDA_ARCH__ >= 700 +#pragma diag_suppress static_var_with_dynamic_init + // Requires BLOCK_SIZE % 4 == 0 + + __shared__ cuda::barrier bar; + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + if (threadIdx.x == 0) { + init(&bar, blockDim.x * blockDim.y); + } + __syncthreads(); + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + float Csub = 0.0; + + const int t4x = threadIdx.x * 4; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. + + // Now, one fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + float4 *const A4s = reinterpret_cast(&As[threadIdx.y][t4x]); + float4 *const B4s = reinterpret_cast(&Bs[threadIdx.y][t4x]); + const float4 *const A4 = + reinterpret_cast(&A[a + wA * threadIdx.y + t4x]); + const float4 *const B4 = + reinterpret_cast(&B[a + wA * threadIdx.y + t4x]); + + cuda::memcpy_async(A4s, A4, sizeof(float4), bar); + cuda::memcpy_async(B4s, B4, sizeof(float4), bar); + } + + // Synchronize to make sure the matrices are loaded + bar.arrive_and_wait(); + +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. 
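The arrive-wait-barrier kernel above binds each memcpy_async to a block-scoped cuda::barrier instead of a pipeline. Stripped of the matrix indexing (and of the __CUDA_ARCH__ guard the sample uses), the pattern is roughly the hypothetical kernel below; tileScale, the 256-element tile and the scaling step are assumptions for illustration only.

```cuda
// Hypothetical kernel: copy one tile with memcpy_async bound to a barrier,
// then scale it. Assumes a 1-D block with blockDim.x <= 256.
#include <cuda/barrier>

__global__ void tileScale(float *out, const float *in, float s) {
#pragma diag_suppress static_var_with_dynamic_init
  __shared__ float tile[256];
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;

  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);  // expected arrivals: every thread in the block
  }
  __syncthreads();           // make the initialized barrier visible to all

  const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  // The copy is registered with the barrier rather than with a pipeline stage.
  cuda::memcpy_async(&tile[threadIdx.x], &in[i], sizeof(float), bar);

  // Completes once every thread has arrived and every registered copy is done.
  bar.arrive_and_wait();

  out[i] = s * tile[threadIdx.x];
}
```

One arrive_and_wait() stands in for both "the copies have landed" and "every thread has arrived"; the second arrive_and_wait() in the kernel above plays the role that the trailing __syncthreads() plays in the synchronous kernels.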
+ bar.arrive_and_wait(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; #endif } // Single Stage memcpy_async pipeline with float copy -template __global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A, - const float *B, int wA, - int wB) { +template +__global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A, + const float *B, int wA, int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Single-stage pipeline version + float Csub = 0.0; - // Single-stage pipeline version - float Csub = 0.0; + cuda::pipeline pipe = cuda::make_pipeline(); + const auto shape1 = cuda::aligned_size_t(sizeof(float)); - cuda::pipeline pipe = cuda::make_pipeline(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + { + pipe.producer_acquire(); + cuda::memcpy_async(&As[threadIdx.y][threadIdx.x], + &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe); + cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x], + &B[b + wB * threadIdx.y + threadIdx.x], shape1, pipe); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix - { - pipe.producer_acquire(); - - cuda::memcpy_async(&As[threadIdx.y][threadIdx.x], &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe); - cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x], &B[b + wB * threadIdx.y + threadIdx.x], shape1, pipe); - - pipe.producer_commit(); - } - - pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two 
matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. - __syncthreads(); + pipe.producer_commit(); } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + pipe.consumer_wait(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); + +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } -// Multi Stage memcpy_async thread_scope_thread pipeline with single-element async-copy -template __global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; +// Multi Stage memcpy_async thread_scope_thread pipeline with single-element +// async-copy +template +__global__ void MatrixMulAsyncCopyMultiStage(float *__restrict__ C, + const float *__restrict__ A, + const float *__restrict__ B, + int wA, int wB) { + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the 
sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - cuda::pipeline pipe = cuda::make_pipeline(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); + cuda::pipeline pipe = cuda::make_pipeline(); + const auto shape1 = cuda::aligned_size_t(sizeof(float)); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; a += aStep, b += bStep, ++i ) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, + iStage = 0; + a <= aEnd; a += aStep, b += bStep, ++i) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix - for ( ; aStage <= a + aStep * maxPipelineStages ; aStage += aStep, bStage += bStep, ++iStage ) - { - if ( aStage <= aEnd ) - { - // Rotating buffer - const int j = iStage % maxPipelineStages; + for (; aStage <= a + aStep * maxPipelineStages; + aStage += aStep, bStage += bStep, ++iStage) { + if (aStage <= aEnd) { + // Rotating buffer + const int j = iStage % maxPipelineStages; - pipe.producer_acquire(); + pipe.producer_acquire(); - cuda::memcpy_async(&As[j][threadIdx.y][threadIdx.x], &A[aStage + wA * threadIdx.y + threadIdx.x], shape1, pipe); - cuda::memcpy_async(&Bs[j][threadIdx.y][threadIdx.x], &B[bStage + wB * threadIdx.y + threadIdx.x], shape1, pipe); + cuda::memcpy_async(&As[j][threadIdx.y][threadIdx.x], + &A[aStage + wA * threadIdx.y + threadIdx.x], shape1, + pipe); + cuda::memcpy_async(&Bs[j][threadIdx.y][threadIdx.x], + &B[bStage + wB * threadIdx.y + threadIdx.x], shape1, + pipe); - pipe.producer_commit(); - } - } - pipe.consumer_wait(); + pipe.producer_commit(); + } + } + pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); - const int j = i % maxPipelineStages; + const int j = i % maxPipelineStages; - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; - } - - pipe.consumer_release(); - // Don't have to synchronize because maxPipelineStages is greater than one - // therefore next iteration is loading to a different buffer. + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + pipe.consumer_release(); + // Don't have to synchronize because maxPipelineStages is greater than one + // therefore next iteration is loading to a different buffer. 
+ } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } // Multi Stage shared state memcpy_async pipeline thread_scope_block // with parititioned producer & consumer, here we've 1 warp as producer // group which issues memcpy_async operations and rest all warps are part of -// consumer group which perform gemm computation on the loaded matrices by producer. -template __global__ void MatrixMulAsyncCopyMultiStageSharedState(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; +// consumer group which perform gemm computation on the loaded matrices by +// producer. +template +__global__ void MatrixMulAsyncCopyMultiStageSharedState( + float *__restrict__ C, const float *__restrict__ A, + const float *__restrict__ B, int wA, int wB) { + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - constexpr int aStep = BLOCK_SIZE_X; + // Step size used to iterate through the sub-matrices of A + constexpr int aStep = BLOCK_SIZE_X; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE_X * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE_X * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE_X * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE_X * wB; - auto cta = cg::this_thread_block(); + auto cta = cg::this_thread_block(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); - __shared__ cuda::pipeline_shared_state shared_state; - constexpr int consumer_row_count = BLOCK_SIZE_X; + const auto shape1 = cuda::aligned_size_t(sizeof(float)); + __shared__ cuda::pipeline_shared_state shared_state; + constexpr int consumer_row_count = BLOCK_SIZE_X; - const auto thread_role = (cta.thread_index().y < consumer_row_count) - ? cuda::pipeline_role::consumer - : cuda::pipeline_role::producer; - auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role); + const auto thread_role = (cta.thread_index().y < consumer_row_count) + ? 
cuda::pipeline_role::consumer + : cuda::pipeline_role::producer; + auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; - a <= aEnd; a += aStep, b += bStep, ++i) { - if (threadIdx.y >= consumer_row_count) { - // this is a whole producer warp because threadIdx.y >= 16 where 16 == consumer_row_count, - // which loads the matrices from device memory to shared memory; - for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) { - if (aStage <= aEnd) { - // Rotating buffer - const int j = iStage % maxPipelineStages; - const int strideRows = (blockDim.y - consumer_row_count); - pipe.producer_acquire(); - for (int rowId = threadIdx.y - consumer_row_count; rowId < BLOCK_SIZE_X; rowId += strideRows) { - cuda::memcpy_async(&As[j][rowId][threadIdx.x], - &A[aStage + wA * rowId + threadIdx.x], shape1, pipe); - cuda::memcpy_async(&Bs[j][rowId][threadIdx.x], - &B[bStage + wB * rowId + threadIdx.x], shape1, pipe); - } - pipe.producer_commit(); - } - } - } - else { - // this is a whole set of consumer group because threadIdx.y < consumer_row_count where consumer_row_count == 16, - // which computes gemm operation on matrices loaded in shared memory by producer warp. - const int j = i % maxPipelineStages; - // Synchronize consumer group to make sure the matrices are loaded by producer group. - pipe.consumer_wait(); - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - #pragma unroll - for (int k = 0; k < BLOCK_SIZE_X; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; - } - pipe.consumer_release(); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, + iStage = 0; + a <= aEnd; a += aStep, b += bStep, ++i) { + if (threadIdx.y >= consumer_row_count) { + // this is a whole producer warp because threadIdx.y >= 16 where 16 == + // consumer_row_count, + // which loads the matrices from device memory to shared memory; + for (; aStage <= a + aStep * maxPipelineStages; + aStage += aStep, bStage += bStep, ++iStage) { + if (aStage <= aEnd) { + // Rotating buffer + const int j = iStage % maxPipelineStages; + const int strideRows = (blockDim.y - consumer_row_count); + pipe.producer_acquire(); + for (int rowId = threadIdx.y - consumer_row_count; + rowId < BLOCK_SIZE_X; rowId += strideRows) { + cuda::memcpy_async(&As[j][rowId][threadIdx.x], + &A[aStage + wA * rowId + threadIdx.x], shape1, + pipe); + cuda::memcpy_async(&Bs[j][rowId][threadIdx.x], + &B[bStage + wB * rowId + threadIdx.x], shape1, + pipe); + } + pipe.producer_commit(); } + } + } else { + // this is a whole set of consumer group because threadIdx.y < + // consumer_row_count where consumer_row_count == 16, + // which computes gemm operation on matrices loaded in shared memory by + // producer warp. + const int j = i % maxPipelineStages; + // Synchronize consumer group to make sure the matrices are loaded by + // producer group. 
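MatrixMulAsyncCopyMultiStageSharedState splits one block into a producer warp and consumer rows that share a single block-scoped pipeline. The hypothetical kernel below (splitCopyScale; tile size, stage count and indexing are illustrative only) shows just that role split, exercising a single stage.

```cuda
// Illustrative only: the last rows of the block copy a tile, the first TILE
// rows consume it; both sides communicate through one pipeline_shared_state.
// Assumes blockDim == (TILE, TILE + 2) and one TILE*TILE tile per block.
#include <cooperative_groups.h>
#include <cuda/pipeline>

namespace cg = cooperative_groups;

template <int TILE, int STAGES>
__global__ void splitCopyScale(float *out, const float *in, float s) {
  __shared__ float tile[STAGES][TILE][TILE];
  __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, STAGES>
      state;

  auto block = cg::this_thread_block();
  const bool consumer = threadIdx.y < TILE;  // the extra rows are producers
  auto pipe = cuda::make_pipeline(block, &state,
                                  consumer ? cuda::pipeline_role::consumer
                                           : cuda::pipeline_role::producer);

  const float *src = in + (size_t)blockIdx.x * TILE * TILE;
  const int slot = 0;  // this sketch only exercises one stage

  if (!consumer) {
    // Producer rows cooperatively fill the whole TILE x TILE slot.
    pipe.producer_acquire();
    for (int r = threadIdx.y - TILE; r < TILE; r += blockDim.y - TILE) {
      cuda::memcpy_async(&tile[slot][r][threadIdx.x],
                         &src[r * TILE + threadIdx.x], sizeof(float), pipe);
    }
    pipe.producer_commit();
  } else {
    pipe.consumer_wait();  // returns once the producers' commit has landed
    float v = s * tile[slot][threadIdx.y][threadIdx.x];
    pipe.consumer_release();
    out[blockIdx.x * TILE * TILE + threadIdx.y * TILE + threadIdx.x] = v;
  }
}
```

The two groups meet only at the commit/wait handshake, which is what allows the full kernel above to copy later tiles while earlier tiles are still being multiplied.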
+ pipe.consumer_wait(); +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE_X; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + pipe.consumer_release(); } + } - // Write the block sub-matrix to device memory; - // each thread writes four element - if (threadIdx.y < consumer_row_count) - { - const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; - } + // Write the block sub-matrix to device memory; + // each thread writes four element + if (threadIdx.y < consumer_row_count) { + const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; + } } /** * Matrix multiplication (CUDA Kernel) on the device: C = A * B * wA is A's width and wB is B's width */ -template __global__ void MatrixMulNaive(float *C, float *A, - float *B, int wA, - int wB) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; +template +__global__ void MatrixMulNaive(float *C, float *A, float *B, int wA, int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; - a <= aEnd; - a += aStep, b += bStep) { + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x]; + Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x]; - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - 
As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x]; - Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x]; + // Synchronize to make sure the matrices are loaded + __syncthreads(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; } - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } -template __global__ void MatrixMulNaiveLargeChunk(float *C, float *A, - float *B, int wA, - int wB) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; +template +__global__ void MatrixMulNaiveLargeChunk(float *C, float *A, float *B, int wA, + int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - int t4x = threadIdx.x * 4 ; + int t4x = threadIdx.x * 4; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // 
Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; - a <= aEnd; - a += aStep, b += bStep) { + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory + // to shared memory; - // Load the matrices from device memory - // to shared memory; - - // One fourth of the threads load four elements of each matrix - if ( t4x < BLOCK_SIZE ) { - float4 * const A4s = reinterpret_cast(& As[threadIdx.y][t4x]); - float4 * const B4s = reinterpret_cast(& Bs[threadIdx.y][t4x]); - const float4 * const A4 = reinterpret_cast(& A[a + wA * threadIdx.y + t4x]); - const float4 * const B4 = reinterpret_cast(& B[a + wA * threadIdx.y + t4x]); - *A4s = *A4 ; - *B4s = *B4 ; - } - - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); + // One fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + float4 *const A4s = reinterpret_cast(&As[threadIdx.y][t4x]); + float4 *const B4s = reinterpret_cast(&Bs[threadIdx.y][t4x]); + const float4 *const A4 = + reinterpret_cast(&A[a + wA * threadIdx.y + t4x]); + const float4 *const B4 = + reinterpret_cast(&B[a + wA * threadIdx.y + t4x]); + *A4s = *A4; + *B4s = *B4; } - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; -} + // Synchronize to make sure the matrices are loaded + __syncthreads(); +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} void ConstantInit(float *data, int size, float val) { - for (int i = 0; i < size; ++i) { - data[i] = val; - } + for (int i = 0; i < size; ++i) { + data[i] = val; + } } /** * Run matrix multiplication using CUDA */ -int MatrixMultiply(int argc, char **argv, - const dim3 &dimsA, - const dim3 &dimsB, +int MatrixMultiply(int argc, char **argv, const dim3 &dimsA, const dim3 &dimsB, kernels kernel_number) { - // Allocate host memory for matrices A and B - unsigned int size_A = dimsA.x * dimsA.y; - unsigned int mem_size_A = sizeof(float) * size_A; - float* h_A; - checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); - unsigned int size_B = dimsB.x * dimsB.y; - unsigned int mem_size_B = sizeof(float) * size_B; - float* h_B; - checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); - cudaStream_t stream; + // Allocate host memory for matrices A and B + unsigned int size_A = dimsA.x * dimsA.y; + unsigned int mem_size_A = sizeof(float) * 
size_A; + float *h_A; + checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); + unsigned int size_B = dimsB.x * dimsB.y; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B; + checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); + cudaStream_t stream; - // Initialize host memory - const float valB = 2.10f; - ConstantInit(h_A, size_A, 1.0f); - ConstantInit(h_B, size_B, valB); + // Initialize host memory + const float valB = 2.10f; + ConstantInit(h_A, size_A, 1.0f); + ConstantInit(h_B, size_B, valB); - // Allocate device memory - float *d_A, *d_B, *d_C; + // Allocate device memory + float *d_A, *d_B, *d_C; - // Allocate host matrix C - dim3 dimsC(dimsB.x, dimsA.y, 1); - unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); - float* h_C; - checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); + // Allocate host matrix C + dim3 dimsC(dimsB.x, dimsA.y, 1); + unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); + float *h_C; + checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); - if (h_C == NULL) { - fprintf(stderr, "Failed to allocate host matrix C!\n"); - exit(EXIT_FAILURE); + if (h_C == NULL) { + fprintf(stderr, "Failed to allocate host matrix C!\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); + // Allocate CUDA events that we'll use for timing + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // copy host memory to device + checkCudaErrors( + cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); + checkCudaErrors( + cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream)); + + // Setup execution parameters + dim3 threads(blockSize, blockSize); + dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); + + // Here the block size is 16x18, where first 16 rows are consumer thread group + // and last 2 rows (1 warp) is producer thread group + dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1); + dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x, + dimsA.y / threadsSharedStateKernel.x); + + printf("Running kernel = %d - %s\n", kernel_number, + kernelNames[kernel_number]); + // Create and start timer + printf("Computing result using CUDA Kernel...\n"); + + // Performs warmup operation using matrixMul CUDA kernel + switch (kernel_number) { + case AsyncCopyMultiStageLargeChunk: + default: + MatrixMulAsyncCopyMultiStageLargeChunk< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyLargeChunk: + MatrixMulAsyncCopyLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunkAWBarrier: + MatrixMulAsyncCopyLargeChunkAWBarrier< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyMultiStageSharedState: + MatrixMulAsyncCopyMultiStageSharedState<<< + gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStage: + MatrixMulAsyncCopyMultiStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopySingleStage: + MatrixMulAsyncCopySingleStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case Naive: + MatrixMulNaive<<>>(d_C, d_A, d_B, + dimsA.x, dimsB.x); + break; + case 
NaiveLargeChunk: + MatrixMulNaiveLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + } + + printf("done\n"); + checkCudaErrors(cudaStreamSynchronize(stream)); + + // Execute the kernel + int nIter = 100; + + // Record the start event + checkCudaErrors(cudaEventRecord(start, stream)); + + for (int j = 0; j < nIter; j++) { + switch (kernel_number) { + case AsyncCopyMultiStageLargeChunk: + default: + MatrixMulAsyncCopyMultiStageLargeChunk< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyLargeChunk: + MatrixMulAsyncCopyLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunkAWBarrier: + MatrixMulAsyncCopyLargeChunkAWBarrier< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyMultiStageSharedState: + MatrixMulAsyncCopyMultiStageSharedState<<< + gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStage: + MatrixMulAsyncCopyMultiStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopySingleStage: + MatrixMulAsyncCopySingleStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case Naive: + MatrixMulNaive<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case NaiveLargeChunk: + MatrixMulNaiveLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; } + } - checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); - // Allocate CUDA events that we'll use for timing - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); + // Record the stop event + checkCudaErrors(cudaEventRecord(stop, stream)); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + // Wait for the stop event to complete + checkCudaErrors(cudaEventSynchronize(stop)); - // copy host memory to device - checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream)); + float msecTotal = 0.0f; + checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); - // Setup execution parameters - dim3 threads(blockSize, blockSize); - dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); + // Compute and print the performance + float msecPerMatrixMul = msecTotal / nIter; + double flopsPerMatrixMul = 2.0 * static_cast(dimsA.x) * + static_cast(dimsA.y) * + static_cast(dimsB.x); + double gigaFlops = + (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); + printf( + "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," + " WorkgroupSize= %u threads/block\n", + gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y); - // Here the block size is 16x18, where first 16 rows are consumer thread group - // and last 2 rows (1 warp) is producer thread group - dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1); - dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x, dimsA.y / threadsSharedStateKernel.x); + // Copy result from device to host + checkCudaErrors( + cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Running kernel = %d - %s\n", kernel_number, kernelNames[kernel_number]); - // Create and start timer - printf("Computing result 
using CUDA Kernel...\n"); + printf("Checking computed result for correctness: "); + bool correct = true; - // Performs warmup operation using matrixMul CUDA kernel - switch (kernel_number) - { - case AsyncCopyMultiStageLargeChunk : - default: - MatrixMulAsyncCopyMultiStageLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunk : - MatrixMulAsyncCopyLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunkAWBarrier : - MatrixMulAsyncCopyLargeChunkAWBarrier<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStageSharedState : - MatrixMulAsyncCopyMultiStageSharedState<<>> - (d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStage : - MatrixMulAsyncCopyMultiStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopySingleStage : - MatrixMulAsyncCopySingleStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case Naive : - MatrixMulNaive<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case NaiveLargeChunk: - MatrixMulNaiveLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; + // test relative error by the formula + // |_cpu - _gpu|/<|x|, |y|> < eps + double eps = 1.e-6; // machine zero + + for (int i = 0; i < static_cast(dimsC.x * dimsC.y); i++) { + double abs_err = fabs(h_C[i] - (dimsA.x * valB)); + double dot_length = dimsA.x; + double abs_val = fabs(h_C[i]); + double rel_err = abs_err / abs_val / dot_length; + + if (rel_err > eps) { + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, + h_C[i], dimsA.x * valB, eps); + correct = false; } + } - printf("done\n"); - checkCudaErrors(cudaStreamSynchronize(stream)); + printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + // Clean up memory + checkCudaErrors(cudaFreeHost(h_A)); + checkCudaErrors(cudaFreeHost(h_B)); + checkCudaErrors(cudaFreeHost(h_C)); + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_C)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + printf( + "\nNOTE: The CUDA Samples are not meant for performance " + "measurements. 
Results may vary when GPU Boost is enabled.\n"); - // Execute the kernel - int nIter = 100; - - // Record the start event - checkCudaErrors(cudaEventRecord(start, stream)); - - for (int j = 0; j < nIter; j++) { - switch (kernel_number) - { - case AsyncCopyMultiStageLargeChunk : - default: - MatrixMulAsyncCopyMultiStageLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunk : - MatrixMulAsyncCopyLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunkAWBarrier : - MatrixMulAsyncCopyLargeChunkAWBarrier<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStageSharedState : - MatrixMulAsyncCopyMultiStageSharedState<<>> - (d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStage : - MatrixMulAsyncCopyMultiStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopySingleStage : - MatrixMulAsyncCopySingleStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case Naive : - MatrixMulNaive<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case NaiveLargeChunk: - MatrixMulNaiveLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - } - } - - // Record the stop event - checkCudaErrors(cudaEventRecord(stop, stream)); - - // Wait for the stop event to complete - checkCudaErrors(cudaEventSynchronize(stop)); - - float msecTotal = 0.0f; - checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); - - // Compute and print the performance - float msecPerMatrixMul = msecTotal / nIter; - double flopsPerMatrixMul = 2.0 * static_cast(dimsA.x) * - static_cast(dimsA.y) * - static_cast(dimsB.x); - double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / - (msecPerMatrixMul / 1000.0f); - printf( - "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," \ - " WorkgroupSize= %u threads/block\n", - gigaFlops, - msecPerMatrixMul, - flopsPerMatrixMul, - threads.x * threads.y); - - // Copy result from device to host - checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - printf("Checking computed result for correctness: "); - bool correct = true; - - // test relative error by the formula - // |_cpu - _gpu|/<|x|, |y|> < eps - double eps = 1.e-6; // machine zero - - for (int i = 0; i < static_cast(dimsC.x * dimsC.y); i++) { - double abs_err = fabs(h_C[i] - (dimsA.x * valB)); - double dot_length = dimsA.x; - double abs_val = fabs(h_C[i]); - double rel_err = abs_err / abs_val / dot_length; - - if (rel_err > eps) { - printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", - i, h_C[i], dimsA.x * valB, eps); - correct = false; - } - } - - printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); - - // Clean up memory - checkCudaErrors(cudaFreeHost(h_A)); - checkCudaErrors(cudaFreeHost(h_B)); - checkCudaErrors(cudaFreeHost(h_C)); - checkCudaErrors(cudaFree(d_A)); - checkCudaErrors(cudaFree(d_B)); - checkCudaErrors(cudaFree(d_C)); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); - printf("\nNOTE: The CUDA Samples are not meant for performance "\ - "measurements. 
Results may vary when GPU Boost is enabled.\n"); - - if (correct) { - return EXIT_SUCCESS; - } else { - return EXIT_FAILURE; - } + if (correct) { + return EXIT_SUCCESS; + } else { + return EXIT_FAILURE; + } } - int main(int argc, char **argv) { - printf("[globalToShmemAsyncCopy] - Starting...\n"); + printf("[globalToShmemAsyncCopy] - Starting...\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "?")) { - printf("Usage -device=n (n >= 0 for deviceID)\n"); - printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); - printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); - printf(" -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)\n"); - printf(" (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)\n"); - printf(" (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)\n"); - printf(" (7 - NaiveLargeChunk without memcpy_async)\n"); - printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "help") || + checkCmdLineFlag(argc, (const char **)argv, "?")) { + printf("Usage -device=n (n >= 0 for deviceID)\n"); + printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); + printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); + printf( + " -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - " + "AsyncCopyLargeChunk)\n"); + printf( + " (2 - AsyncCopyLargeChunkAWBarrier; 3 - " + "AsyncCopyMultiStageSharedState)\n"); + printf( + " (4 - AsyncCopyMultiStage; 5 - " + "AsyncCopySingleStage; 6 - Naive without memcpy_async)\n"); + printf( + " (7 - NaiveLargeChunk without " + "memcpy_async)\n"); + printf( + " Note: Outer matrix dimensions of A & B matrices must be equal.\n"); - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); + } + + // This will pick the best possible CUDA capable device, otherwise + // override the device ID based on input provided at the command line + int dev = findCudaDevice(argc, (const char **)argv); + + int matrixBlock = 32; + dim3 dimsA(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); + dim3 dimsB(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); + + // width of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { + dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); + } + + // height of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { + dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); + } + + // width of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { + dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); + } + + // height of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { + dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); + } + + if (dimsA.x != dimsB.y) { + printf("Error: outer matrix dimensions must be equal. 
(%d != %d)\n", + dimsA.x, dimsB.y); + exit(EXIT_FAILURE); + } + + kernels selected_kernel = AsyncCopyMultiStageLargeChunk; + + // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + int kernel_number = + getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + if (kernel_number < 8) { + selected_kernel = (kernels)kernel_number; + } else { + printf( + "Error: kernel number should be between 0 to 6, you have entered " + "%d\n", + kernel_number); + exit(EXIT_FAILURE); } + } - // This will pick the best possible CUDA capable device, otherwise - // override the device ID based on input provided at the command line - int dev = findCudaDevice(argc, (const char **)argv); + int major = 0; + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + if (major < 7) { + printf("globalToShmemAsyncCopy requires SM 7.0 or higher. Exiting...\n"); + exit(EXIT_WAIVED); + } - int matrixBlock = 32; - dim3 dimsA(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); - dim3 dimsB(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); + printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, + dimsB.y); - // width of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { - dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); - } + int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel); - // height of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { - dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); - } - - // width of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { - dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); - } - - // height of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { - dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); - } - - if (dimsA.x != dimsB.y) { - printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", - dimsA.x, dimsB.y); - exit(EXIT_FAILURE); - } - - kernels selected_kernel = AsyncCopyMultiStageLargeChunk; - - // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) - if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { - int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); - if (kernel_number < 8) - { - selected_kernel = (kernels)kernel_number; - } - else - { - printf("Error: kernel number should be between 0 to 6, you have entered %d\n", kernel_number); - exit(EXIT_FAILURE); - } - } - - int major = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - if (major < 7) - { - printf("globalToShmemAsyncCopy requires SM 7.0 or higher. 
Exiting...\n"); - exit(EXIT_WAIVED); - } - - printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, - dimsB.x, dimsB.y); - - int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel); - - exit(matrix_result); + exit(matrix_result); } - diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj index 1fd5cdd3..added1d2 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj index c8c02a93..bf65f63a 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/immaTensorCoreGemm/README.md b/Samples/immaTensorCoreGemm/README.md index 21262589..3c07bb95 100644 --- a/Samples/immaTensorCoreGemm/README.md +++ b/Samples/immaTensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj index 5055f031..d6942bc2 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj index 6139830e..6ecb5d5f 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/README.md b/Samples/jacobiCudaGraphs/README.md index 78a2e97e..c6223ff2 100644 --- a/Samples/jacobiCudaGraphs/README.md +++ b/Samples/jacobiCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj index a811971b..c899fc38 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj index c02f5c88..c6158ebd 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/main.cpp b/Samples/jacobiCudaGraphs/main.cpp index 1257e32a..5cb7db72 100644 --- a/Samples/jacobiCudaGraphs/main.cpp +++ b/Samples/jacobiCudaGraphs/main.cpp @@ -100,8 +100,10 @@ int main(int argc, char **argv) { double *b = NULL; float *A = NULL; - b = (double *)calloc(N_ROWS, sizeof(double)); - A = (float *)calloc(N_ROWS * N_ROWS, sizeof(float)); + checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double))); + memset(b, 0, N_ROWS * sizeof(double)); + checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float))); + memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); createLinearSystem(A, b); double *x = NULL; @@ -170,6 +172,9 @@ int main(int argc, char **argv) { checkCudaErrors(cudaFree(d_x)); checkCudaErrors(cudaFree(d_x_new)); + checkCudaErrors(cudaFreeHost(A)); + checkCudaErrors(cudaFreeHost(b)); + printf("&&&& jacobiCudaGraphs %s\n", (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); diff --git a/Samples/matrixMul/README.md b/Samples/matrixMul/README.md index 85b1f138..5d9dba69 100644 --- a/Samples/matrixMul/README.md +++ b/Samples/matrixMul/README.md @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/matrixMul/matrixMul_vs2017.vcxproj b/Samples/matrixMul/matrixMul_vs2017.vcxproj index ca222d53..c362684f 100644 --- a/Samples/matrixMul/matrixMul_vs2017.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2019.vcxproj b/Samples/matrixMul/matrixMul_vs2019.vcxproj index b61bb8d7..084d32b0 100644 --- a/Samples/matrixMul/matrixMul_vs2019.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/matrixMulDrv/README.md b/Samples/matrixMulDrv/README.md index f4672258..248b61c5 100644 --- a/Samples/matrixMulDrv/README.md +++ b/Samples/matrixMulDrv/README.md @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
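The jacobiCudaGraphs change above swaps calloc for page-locked allocations so the later cudaMemcpyAsync calls can overlap with device work. A minimal sketch of that allocation pattern, with an illustrative size in place of the sample's N_ROWS:

```cpp
// Sketch of the pinned-host-memory pattern adopted above: cudaMallocHost in
// place of calloc, with an explicit memset because cudaMallocHost does not
// zero-initialize. Sizes are illustrative only.
#include <cstring>
#include <cuda_runtime.h>

int main() {
  const int n = 512;  // illustrative size
  double *b = nullptr;
  float *A = nullptr;

  if (cudaMallocHost(&b, n * sizeof(double)) != cudaSuccess) return 1;
  memset(b, 0, n * sizeof(double));
  if (cudaMallocHost(&A, n * n * sizeof(float)) != cudaSuccess) return 1;
  memset(A, 0, n * n * sizeof(float));

  // ... fill A and b, copy to device with cudaMemcpyAsync, run kernels ...

  cudaFreeHost(A);  // pinned memory is released with cudaFreeHost, not free()
  cudaFreeHost(b);
  return 0;
}
```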
## Build and Run diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj index 76200467..6360c07c 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj index fdff280d..69a91d3c 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/memMapIPCDrv/Makefile b/Samples/memMapIPCDrv/Makefile index 8a0c1c3c..fe711fd3 100644 --- a/Samples/memMapIPCDrv/Makefile +++ b/Samples/memMapIPCDrv/Makefile @@ -302,14 +302,10 @@ LIBRARIES := ################################################################################ -FATBIN_FILE := memMapIpc_kernel${TARGET_SIZE}.fatbin +PTX_FILE := memMapIpc_kernel${TARGET_SIZE}.ptx # Gencode arguments -ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 35 37 50 52 60 61 70 72 75 80 86 -else -SMS ?= 35 37 50 52 60 61 70 75 80 86 -endif +SMS ?= ifeq ($(GENCODE_FLAGS),) # Generate SASS code for each SM architecture listed in $(SMS) @@ -395,7 +391,7 @@ endif # Target rules all: build -build: memMapIPCDrv $(FATBIN_FILE) +build: memMapIPCDrv $(PTX_FILE) check.deps: ifeq ($(SAMPLE_ENABLED),0) @@ -404,8 +400,8 @@ else @echo "Sample is ready - all dependencies have been met" endif -$(FATBIN_FILE): memMapIpc_kernel.cu - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $< +$(PTX_FILE): memMapIpc_kernel.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -ptx $< $(EXEC) mkdir -p data $(EXEC) cp -f $@ ./data $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) @@ -426,9 +422,8 @@ run: build $(EXEC) ./memMapIPCDrv clean: - rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(FATBIN_FILE) $(FATBIN_FILE) + rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(PTX_FILE) $(PTX_FILE) rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/memMapIPCDrv - - rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(FATBIN_FILE) + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(PTX_FILE) clobber: clean diff --git a/Samples/memMapIPCDrv/README.md b/Samples/memMapIPCDrv/README.md index 04c7f0c4..1e343fd1 100644 --- a/Samples/memMapIPCDrv/README.md +++ b/Samples/memMapIPCDrv/README.md @@ -30,7 +30,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuLaunchKernel, cuMemcpyD ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj index c3107e07..86d80be6 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -67,7 +67,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -105,14 +105,14 @@ - data/%(Filename)64.fatbin - fatbin + data/%(Filename)64.ptx + ptx - + diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj index 78a96686..3c928e83 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -101,14 +101,14 @@ - data/%(Filename)64.fatbin - fatbin + data/%(Filename)64.ptx + ptx - + diff --git a/Samples/memMapIPCDrv/memMapIpc.cpp b/Samples/memMapIPCDrv/memMapIpc.cpp index ae4dee08..729cd231 100644 --- a/Samples/memMapIPCDrv/memMapIpc.cpp +++ b/Samples/memMapIPCDrv/memMapIpc.cpp @@ -64,9 +64,13 @@ typedef struct shmStruct_st { int sense; } shmStruct; -// define input fatbin file -#ifndef FATBIN_FILE -#define FATBIN_FILE "memMapIpc_kernel64.fatbin" +bool findModulePath(const char *, string &, char **, string &); + +// define input ptx file for different platforms +#if defined(_WIN64) || defined(__LP64__) +#define PTX_FILE "memMapIpc_kernel64.ptx" +#else +#define PTX_FILE "memMapIpc_kernel32.ptx" #endif // `ipcHandleTypeFlag` specifies the platform specific handle type this sample @@ -255,23 +259,44 @@ static void memMapUnmapAndFreeMemory(CUdeviceptr dptr, size_t size) { static void memMapGetDeviceFunction(char **argv) { // first search for the module path before we load the results - string module_path; - std::ostringstream fatbin; - - if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { - exit(EXIT_FAILURE); + string module_path, ptx_source; + if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) { + if (!findModulePath("memMapIpc_kernel.cubin", module_path, argv, + ptx_source)) { + printf( + "> findModulePath could not find ptx or cubin\n"); + exit(EXIT_FAILURE); + } } else { printf("> initCUDA loading module: <%s>\n", module_path.c_str()); } - if (!fatbin.str().size()) { - printf("fatbin file empty. 
exiting..\n"); - exit(EXIT_FAILURE); + // Create module from binary file (PTX or CUBIN) + if (module_path.rfind("ptx") != string::npos) { + // in this branch we use compilation with parameters + const unsigned int jitNumOptions = 3; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void *[jitNumOptions]; + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; + checkCudaErrors(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), + jitNumOptions, jitOptions, + (void **)jitOptVals)); + printf("> PTX JIT log:\n%s\n", jitLogBuffer); + } else { + checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str())); } - // Create module from binary file (FATBIN) - checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); - // Get function handle from module checkCudaErrors( cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel")); @@ -585,3 +610,37 @@ int main(int argc, char **argv) { return EXIT_SUCCESS; #endif } + +bool inline findModulePath(const char *module_file, string &module_path, + char **argv, string &ptx_source) { + char *actual_path = sdkFindFilePath(module_file, argv[0]); + + if (actual_path) { + module_path = actual_path; + } else { + printf("> findModulePath file not found: <%s> \n", module_file); + return false; + } + + if (module_path.empty()) { + printf("> findModulePath could not find file: <%s> \n", module_file); + return false; + } else { + printf("> findModulePath found file at <%s>\n", module_path.c_str()); + + if (module_path.rfind(".ptx") != string::npos) { + FILE *fp = fopen(module_path.c_str(), "rb"); + fseek(fp, 0, SEEK_END); + int file_size = ftell(fp); + char *buf = new char[file_size + 1]; + fseek(fp, 0, SEEK_SET); + fread(buf, sizeof(char), file_size, fp); + fclose(fp); + buf[file_size] = '\0'; + ptx_source = buf; + delete[] buf; + } + + return true; + } +} \ No newline at end of file diff --git a/Samples/nvJPEG/Makefile b/Samples/nvJPEG/Makefile index d8c228df..f3515c78 100644 --- a/Samples/nvJPEG/Makefile +++ b/Samples/nvJPEG/Makefile @@ -277,6 +277,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - nvJPEG is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG/README.md b/Samples/nvJPEG/README.md index 9a86bf7e..53c1b60d 100644 --- a/Samples/nvJPEG/README.md +++ b/Samples/nvJPEG/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
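The memMapIpc.cpp hunk above replaces the fatbin path with PTX that is JIT-compiled at module-load time through cuModuleLoadDataEx. A self-contained driver-API sketch of that loading sequence follows; the .ptx file name and kernel name are placeholders, not the sample's:

```cpp
// Sketch of driver-API PTX JIT loading: read a .ptx file into memory, load it
// with cuModuleLoadDataEx while capturing the JIT info log and capping the
// register count, then look up a kernel. Link against -lcuda.
#include <cstdio>
#include <cstdlib>
#include <string>
#include <cuda.h>

static std::string readFile(const char *path) {
  FILE *fp = fopen(path, "rb");
  if (!fp) { fprintf(stderr, "cannot open %s\n", path); exit(EXIT_FAILURE); }
  fseek(fp, 0, SEEK_END);
  long size = ftell(fp);
  fseek(fp, 0, SEEK_SET);
  std::string buf(size, '\0');
  fread(&buf[0], 1, size, fp);
  fclose(fp);
  return buf;
}

int main() {
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  std::string ptx = readFile("kernel64.ptx");  // placeholder file name

  // JIT options: an info-log buffer plus a register cap, as in the sample.
  char logBuffer[1024] = {0};
  CUjit_option opts[] = {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                         CU_JIT_INFO_LOG_BUFFER, CU_JIT_MAX_REGISTERS};
  void *vals[] = {(void *)(size_t)sizeof(logBuffer), (void *)logBuffer,
                  (void *)(size_t)32};

  CUmodule mod;
  if (cuModuleLoadDataEx(&mod, ptx.c_str(), 3, opts, vals) != CUDA_SUCCESS) {
    fprintf(stderr, "PTX JIT failed:\n%s\n", logBuffer);
    return EXIT_FAILURE;
  }
  printf("PTX JIT log:\n%s\n", logBuffer);

  CUfunction fn;
  cuModuleGetFunction(&fn, mod, "myKernel");  // placeholder kernel name
  // ... cuLaunchKernel(fn, ...), then clean up ...
  cuModuleUnload(mod);
  cuCtxDestroy(ctx);
  return EXIT_SUCCESS;
}
```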
## Build and Run diff --git a/Samples/nvJPEG/nvJPEG_vs2017.vcxproj b/Samples/nvJPEG/nvJPEG_vs2017.vcxproj index 94b76d62..7d16e568 100644 --- a/Samples/nvJPEG/nvJPEG_vs2017.vcxproj +++ b/Samples/nvJPEG/nvJPEG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/nvJPEG/nvJPEG_vs2019.vcxproj b/Samples/nvJPEG/nvJPEG_vs2019.vcxproj index 0bf2340d..378b9198 100644 --- a/Samples/nvJPEG/nvJPEG_vs2019.vcxproj +++ b/Samples/nvJPEG/nvJPEG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/nvJPEG_encoder/Makefile b/Samples/nvJPEG_encoder/Makefile index 05228d1d..da0b82b7 100644 --- a/Samples/nvJPEG_encoder/Makefile +++ b/Samples/nvJPEG_encoder/Makefile @@ -277,6 +277,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - nvJPEG_encoder is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG_encoder/README.md b/Samples/nvJPEG_encoder/README.md index ced6a27f..40f092b3 100644 --- a/Samples/nvJPEG_encoder/README.md +++ b/Samples/nvJPEG_encoder/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj index 9b102304..765f1f35 100644 --- a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj +++ b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj index 314cb390..76fcec11 100644 --- a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj +++ b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/README.md b/Samples/p2pBandwidthLatencyTest/README.md index 4adbc8ed..ab1f4685 100644 --- a/Samples/p2pBandwidthLatencyTest/README.md +++ b/Samples/p2pBandwidthLatencyTest/README.md @@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
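p2pBandwidthLatencyTest (README bumped above) is built around the peer-access APIs its key-concepts list names. A small sketch of the basic capability check and enable step, assuming devices 0 and 1 exist:

```cpp
// Sketch of the peer-access check behind p2pBandwidthLatencyTest, using
// cudaDeviceCanAccessPeer / cudaDeviceEnablePeerAccess. Device indices 0 and 1
// are assumptions for illustration.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int count = 0;
  cudaGetDeviceCount(&count);
  if (count < 2) { printf("need at least two GPUs, waiving\n"); return 0; }

  int canAccess01 = 0, canAccess10 = 0;
  cudaDeviceCanAccessPeer(&canAccess01, 0, 1);  // can device 0 map device 1?
  cudaDeviceCanAccessPeer(&canAccess10, 1, 0);

  if (canAccess01 && canAccess10) {
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);  // flags must be 0
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);
    printf("P2P enabled between devices 0 and 1\n");
  } else {
    printf("P2P not supported between devices 0 and 1\n");
  }
  return 0;
}
```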
## Build and Run diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj index d8832b17..28d5f5cd 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj index 7d04df50..8a2d5450 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/reduction/README.md b/Samples/reduction/README.md index 158abb00..172d748a 100644 --- a/Samples/reduction/README.md +++ b/Samples/reduction/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/reduction/reduction_vs2017.vcxproj b/Samples/reduction/reduction_vs2017.vcxproj index c1cf9fb3..7e14bc82 100644 --- a/Samples/reduction/reduction_vs2017.vcxproj +++ b/Samples/reduction/reduction_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/reduction/reduction_vs2019.vcxproj b/Samples/reduction/reduction_vs2019.vcxproj index 0fd929e9..74fb1d6a 100644 --- a/Samples/reduction/reduction_vs2019.vcxproj +++ b/Samples/reduction/reduction_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/shfl_scan/README.md b/Samples/shfl_scan/README.md index efa00f56..87b6872b 100644 --- a/Samples/shfl_scan/README.md +++ b/Samples/shfl_scan/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj index 6f91ea60..beaad3f8 100644 --- a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj index 5504852e..8757714e 100644 --- a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/simpleAWBarrier/README.md b/Samples/simpleAWBarrier/README.md index ca95266a..c4003183 100644 --- a/Samples/simpleAWBarrier/README.md +++ b/Samples/simpleAWBarrier/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpyAsync ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleAWBarrier/simpleAWBarrier.cu b/Samples/simpleAWBarrier/simpleAWBarrier.cu index 8beb371a..b36af811 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier.cu +++ b/Samples/simpleAWBarrier/simpleAWBarrier.cu @@ -25,7 +25,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - // Includes, system #include @@ -35,229 +34,222 @@ #include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check +#include // helper functions for CUDA error check namespace cg = cooperative_groups; - #if __CUDA_ARCH__ >= 700 -template __device__ void reduceBlockData(cuda::barrier &barrier, - cg::thread_block_tile<32> &tile32, double &threadSum, double *result) -{ - extern __shared__ double tmp[]; +template +__device__ void reduceBlockData( + cuda::barrier &barrier, + cg::thread_block_tile<32> &tile32, double &threadSum, double *result) { + extern __shared__ double tmp[]; - #pragma unroll - for (int offset = tile32.size()/2; offset > 0; offset /= 2) - { - threadSum += tile32.shfl_down(threadSum, offset); - } - if (tile32.thread_rank() == 0) - { - tmp[tile32.meta_group_rank()] = threadSum; +#pragma unroll + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + threadSum += tile32.shfl_down(threadSum, offset); + } + if (tile32.thread_rank() == 0) { + tmp[tile32.meta_group_rank()] = threadSum; + } + + auto token = barrier.arrive(); + + barrier.wait(std::move(token)); + + // The warp 0 will perform last round of reduction + if (tile32.meta_group_rank() == 0) { + double beta = tile32.thread_rank() < tile32.meta_group_size() + ? tmp[tile32.thread_rank()] + : 0.0; + +#pragma unroll + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + beta += tile32.shfl_down(beta, offset); } - auto token = barrier.arrive(); - - barrier.wait(std::move(token)); - - // The warp 0 will perform last round of reduction - if (tile32.meta_group_rank() == 0) { - - double beta = tile32.thread_rank() < tile32.meta_group_size() ? 
tmp[tile32.thread_rank()] : 0.0; - - #pragma unroll - for (int offset = tile32.size()/2; offset > 0; offset /= 2) - { - beta += tile32.shfl_down(beta, offset); - } - - if (tile32.thread_rank() == 0) - { - if (writeSquareRoot) - *result = sqrt(beta); - else - *result = beta; - } + if (tile32.thread_rank() == 0) { + if (writeSquareRoot) + *result = sqrt(beta); + else + *result = beta; } + } } #endif -__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size) -{ +__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, + double *partialResults, int size) { #if __CUDA_ARCH__ >= 700 #pragma diag_suppress static_var_with_dynamic_init - cg::thread_block cta = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid();; - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + ; + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - __shared__ cuda::barrier barrier; + __shared__ cuda::barrier barrier; - if (threadIdx.x == 0) { - init(&barrier, blockDim.x); + if (threadIdx.x == 0) { + init(&barrier, blockDim.x); + } + + cg::sync(cta); + + double threadSum = 0.0; + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + threadSum += (double)(vecA[i] * vecB[i]); + } + + // Each thread block performs reduction of partial dotProducts and writes to + // global mem. + reduceBlockData(barrier, tile32, threadSum, + &partialResults[blockIdx.x]); + + cg::sync(grid); + + // One block performs the final summation of partial dot products + // of all the thread blocks and writes the sqrt of final dot product. + if (blockIdx.x == 0) { + threadSum = 0.0; + for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) { + threadSum += partialResults[i]; } + reduceBlockData(barrier, tile32, threadSum, &partialResults[0]); + } - cg::sync(cta); + cg::sync(grid); - double threadSum = 0.0; - for (int i = grid.thread_rank(); i < size; i += grid.size()) - { - threadSum += (double) (vecA[i] * vecB[i]); - } + const double finalValue = partialResults[0]; - // Each thread block performs reduction of partial dotProducts and writes to - // global mem. - reduceBlockData(barrier, tile32, threadSum, &partialResults[blockIdx.x]); - - cg::sync(grid); - - // One block performs the final summation of partial dot products - // of all the thread blocks and writes the sqrt of final dot product. - if (blockIdx.x == 0) - { - threadSum = 0.0; - for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) - { - threadSum += partialResults[i]; - } - reduceBlockData(barrier, tile32, threadSum, &partialResults[0]); - } - - cg::sync(grid); - - const double finalValue = partialResults[0]; - - // Perform normalization of vecA & vecB. - for (int i = grid.thread_rank(); i < size; i += grid.size()) - { - vecA[i] = (float)vecA[i] / finalValue; - vecB[i] = (float)vecB[i] / finalValue; - } + // Perform normalization of vecA & vecB. 
+ for (int i = grid.thread_rank(); i < size; i += grid.size()) { + vecA[i] = (float)vecA[i] / finalValue; + vecB[i] = (float)vecB[i] / finalValue; + } #endif } - int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) -{ - printf("%s starting...\n", argv[0]); +int main(int argc, char **argv) { + printf("%s starting...\n", argv[0]); - // This will pick the best possible CUDA capable device - int dev = findCudaDevice(argc, (const char **)argv); + // This will pick the best possible CUDA capable device + int dev = findCudaDevice(argc, (const char **)argv); - int major = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + int major = 0; + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher. - if (major < 7) { - printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n"); - exit(EXIT_WAIVED); - } + // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher. + if (major < 7) { + printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n"); + exit(EXIT_WAIVED); + } - int supportsCooperativeLaunch = 0; - checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev)); + int supportsCooperativeLaunch = 0; + checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, + cudaDevAttrCooperativeLaunch, dev)); - if (!supportsCooperativeLaunch) - { - printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run\n", dev); - exit(EXIT_WAIVED); - } + if (!supportsCooperativeLaunch) { + printf( + "\nSelected GPU (%d) does not support Cooperative Kernel Launch, " + "Waiving the run\n", + dev); + exit(EXIT_WAIVED); + } - int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev); + int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev); - printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!"); + exit(testResult ? 
EXIT_SUCCESS : EXIT_FAILURE); } -int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) -{ - float *vecA, *d_vecA; - float *vecB, *d_vecB; - double *d_partialResults; - int size = 10000000; +int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) { + float *vecA, *d_vecA; + float *vecB, *d_vecB; + double *d_partialResults; + int size = 10000000; - vecA = new float[size]; - vecB = new float[size]; + checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size)); + checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size)); - checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float)*size)); - checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float)*size)); + checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size)); + checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size)); - float baseVal = 2.0; - for (int i = 0; i < size; i++) - { - vecA[i] = vecB[i] = baseVal; + float baseVal = 2.0; + for (int i = 0; i < size; i++) { + vecA[i] = vecB[i] = baseVal; + } + + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, + cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, + cudaMemcpyHostToDevice, stream)); + + // Kernel configuration, where a one-dimensional + // grid and one-dimensional blocks are configured. + int minGridSize = 0, blockSize = 0; + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( + &minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size)); + + int smemSize = ((blockSize / 32) + 1) * sizeof(double); + + int numBlocksPerSm = 0; + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); + + int multiProcessorCount = 0; + checkCudaErrors(cudaDeviceGetAttribute( + &multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); + + minGridSize = multiProcessorCount * numBlocksPerSm; + checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double))); + + printf( + "Launching normVecByDotProductAWBarrier kernel with numBlocks = %d " + "blockSize = %d\n", + minGridSize, blockSize); + + dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); + + void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, + (void *)&d_partialResults, (void *)&size}; + + checkCudaErrors( + cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid, + dimBlock, kernelArgs, smemSize, stream)); + + checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, + cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + float expectedResult = (baseVal / sqrt(size * baseVal * baseVal)); + unsigned int matches = 0; + for (int i = 0; i < size; i++) { + if ((vecA[i] - expectedResult) > 0.00001) { + printf("mismatch at i = %d\n", i); + break; + } else { + matches++; } + } - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + printf("Result = %s\n", matches == size ? "PASSED" : "FAILED"); + checkCudaErrors(cudaFree(d_vecA)); + checkCudaErrors(cudaFree(d_vecB)); + checkCudaErrors(cudaFree(d_partialResults)); - checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float)*size, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float)*size, cudaMemcpyHostToDevice, stream)); - - // Kernel configuration, where a one-dimensional - // grid and one-dimensional blocks are configured. 
- int minGridSize = 0, blockSize = 0; - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( - &minGridSize, - &blockSize, - (void*)normVecByDotProductAWBarrier, - 0, - size)); - - int smemSize = ((blockSize/32)+1) * sizeof(double); - - int numBlocksPerSm = 0; - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); - - int multiProcessorCount = 0; - checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); - - minGridSize = multiProcessorCount * numBlocksPerSm; - checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize*sizeof(double))); - - printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d blockSize = %d\n", minGridSize, blockSize); - - dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); - - void *kernelArgs[] = { - (void*)&d_vecA, - (void*)&d_vecB, - (void*)&d_partialResults, - (void*)&size - }; - - checkCudaErrors(cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream)); - - checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float)*size, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - float expectedResult = (baseVal / sqrt(size*baseVal*baseVal)); - unsigned int matches = 0; - for (int i=0; i < size; i++) - { - if ((vecA[i] - expectedResult) > 0.00001) - { - printf("mismatch at i = %d\n", i); - break; - } - else - { - matches++; - } - } - - printf("Result = %s\n", matches == size ? "PASSED" : "FAILED"); - checkCudaErrors(cudaFree(d_vecA)); - checkCudaErrors(cudaFree(d_vecB)); - checkCudaErrors(cudaFree(d_partialResults)); - - delete[] vecA; - delete[] vecB; - return matches == size; + checkCudaErrors(cudaFreeHost(vecA)); + checkCudaErrors(cudaFreeHost(vecB)); + return matches == size; } diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj index b83f8823..e03ef6e2 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj index 62e6fccd..b4be9610 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleAttributes/README.md b/Samples/simpleAttributes/README.md index f5947c5f..5d643c86 100644 --- a/Samples/simpleAttributes/README.md +++ b/Samples/simpleAttributes/README.md @@ -27,7 +27,7 @@ cudaCtxResetPersistingL2Cache, cudaDeviceSetLimit, cudaFree, cudaGetDeviceProper ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
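The reformatted simpleAWBarrier host code above sizes a cooperative launch from the occupancy APIs and the SM count. A standalone sketch of that sizing pattern with a stand-in kernel (not the sample's barrier reduction); it assumes the file is built with -rdc=true for an SM 7.0+ architecture:

```cpp
// Sketch of cooperative-launch sizing: ask the occupancy API for a block size,
// size the grid to exactly fill the device, then launch cooperatively so
// grid.sync() is legal inside the kernel.
#include <cstdio>
#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

__global__ void dummyGridSyncKernel(int *data, int n) {
  cg::grid_group grid = cg::this_grid();
  for (int i = grid.thread_rank(); i < n; i += grid.size()) data[i] += 1;
  grid.sync();  // whole-grid barrier, valid only under cooperative launch
  for (int i = grid.thread_rank(); i < n; i += grid.size()) data[i] *= 2;
}

int main() {
  int dev = 0, smCount = 0, coop = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&coop, cudaDevAttrCooperativeLaunch, dev);
  if (!coop) { printf("cooperative launch unsupported, waiving\n"); return 0; }
  cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, dev);

  int minGridSize = 0, blockSize = 0;
  cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
                                     dummyGridSyncKernel, 0, 0);
  int blocksPerSm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm,
                                                dummyGridSyncKernel,
                                                blockSize, 0);
  int gridSize = smCount * blocksPerSm;  // fill the device exactly

  int n = 1 << 20, *d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(int));
  cudaMemset(d_data, 0, n * sizeof(int));

  void *args[] = {(void *)&d_data, (void *)&n};
  cudaLaunchCooperativeKernel((void *)dummyGridSyncKernel, dim3(gridSize),
                              dim3(blockSize), args, 0, 0);
  cudaDeviceSynchronize();
  printf("launched %d blocks of %d threads\n", gridSize, blockSize);
  cudaFree(d_data);
  return 0;
}
```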
## Build and Run diff --git a/Samples/simpleAttributes/simpleAttributes.cu b/Samples/simpleAttributes/simpleAttributes.cu index 46f00aae..d71b4bb1 100644 --- a/Samples/simpleAttributes/simpleAttributes.cu +++ b/Samples/simpleAttributes/simpleAttributes.cu @@ -36,22 +36,22 @@ // includes, project #include -#include // helper functions for SDK examples +#include // helper functions for SDK examples //////////////////////////////////////////////////////////////////////////////// // declaration, forward void runTest(int argc, char **argv); -cudaAccessPolicyWindow -initAccessPolicyWindow(void) { - cudaAccessPolicyWindow accessPolicyWindow = { 0 }; - accessPolicyWindow.base_ptr = (void *)0; - accessPolicyWindow.num_bytes = 0; - accessPolicyWindow.hitRatio = 0.f; - accessPolicyWindow.hitProp = cudaAccessPropertyNormal; - accessPolicyWindow.missProp = cudaAccessPropertyStreaming; - return accessPolicyWindow; +cudaAccessPolicyWindow initAccessPolicyWindow(void) { + cudaAccessPolicyWindow accessPolicyWindow = {0}; + accessPolicyWindow.base_ptr = (void *)0; + accessPolicyWindow.num_bytes = 0; + accessPolicyWindow.hitRatio = 0.f; + accessPolicyWindow.hitProp = cudaAccessPropertyNormal; + accessPolicyWindow.missProp = cudaAccessPropertyStreaming; + return accessPolicyWindow; } + //////////////////////////////////////////////////////////////////////////////// //! Simple test kernel for device functionality //! @param data input data in global memory @@ -60,146 +60,155 @@ initAccessPolicyWindow(void) { //! @param bigDataSize input bigData size //! @param hitcount how many data access are done within block //////////////////////////////////////////////////////////////////////////////// -static __global__ void -kernCacheSegmentTest(int* data, int dataSize, int *trash, int bigDataSize, int hitCount) -{ - __shared__ unsigned int hit; - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - int tID = row * blockDim.y + col; - uint32_t psRand = tID; +static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, + int bigDataSize, int hitCount) { + __shared__ unsigned int hit; + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int tID = row * blockDim.y + col; + uint32_t psRand = tID; - atomicExch(&hit, 0); - __syncthreads(); - while (hit < hitCount) { - psRand ^= psRand << 13; - psRand ^= psRand >> 17; - psRand ^= psRand << 5; + atomicExch(&hit, 0); + __syncthreads(); + while (hit < hitCount) { + psRand ^= psRand << 13; + psRand ^= psRand >> 17; + psRand ^= psRand << 5; - int idx = tID - psRand; - if (idx < 0) { - idx = -idx; - } - - if((tID % 2) == 0) { - data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; - } else { - trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize]; - } - - atomicAdd(&hit, 1); + int idx = tID - psRand; + if (idx < 0) { + idx = -idx; } + + if ((tID % 2) == 0) { + data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; + } else { + trash[psRand % bigDataSize] = + trash[psRand % bigDataSize] + trash[idx % bigDataSize]; + } + + atomicAdd(&hit, 1); + } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int -main(int argc, char **argv) -{ - runTest(argc, argv); -} +int main(int argc, char **argv) { runTest(argc, argv); } 
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void -runTest(int argc, char **argv) -{ - bool bTestResult = true; - cudaAccessPolicyWindow accessPolicyWindow; - cudaDeviceProp deviceProp; - cudaStreamAttrValue streamAttrValue; - cudaStream_t stream; - cudaStreamAttrID streamAttrID; - dim3 threads(32, 32); - int *dataDevicePointer; - int *dataHostPointer; - int dataSize; - int *bigDataDevicePointer; - int *bigDataHostPointer; - int bigDataSize; - StopWatchInterface *timer = 0; +void runTest(int argc, char **argv) { + bool bTestResult = true; + cudaAccessPolicyWindow accessPolicyWindow; + cudaDeviceProp deviceProp; + cudaStreamAttrValue streamAttrValue; + cudaStream_t stream; + cudaStreamAttrID streamAttrID; + dim3 threads(32, 32); + int *dataDevicePointer; + int *dataHostPointer; + int dataSize; + int *bigDataDevicePointer; + int *bigDataHostPointer; + int bigDataSize; + StopWatchInterface *timer = 0; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest Gflops/s - int devID = findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - //Get device properties - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - dim3 blocks(deviceProp.maxGridSize[1], 1); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + // Get device properties + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + dim3 blocks(deviceProp.maxGridSize[1], 1); - //Make sure device the l2 optimization - if (deviceProp.persistingL2CacheMaxSize == 0) { - printf("Waiving execution as device %d does not support persisting L2 Caching\n", devID); - exit(EXIT_WAIVED); + // Make sure device the l2 optimization + if (deviceProp.persistingL2CacheMaxSize == 0) { + printf( + "Waiving execution as device %d does not support persisting L2 " + "Caching\n", + devID); + exit(EXIT_WAIVED); + } + + // Create stream to assiocate with window + checkCudaErrors(cudaStreamCreate(&stream)); + + // Set the amount of l2 cache that will be persisting to maximum the device + // can support + checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, + deviceProp.persistingL2CacheMaxSize)); + + // Stream attribute to set + streamAttrID = cudaStreamAttributeAccessPolicyWindow; + + // Default window + streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); + accessPolicyWindow = initAccessPolicyWindow(); + + // Allocate size of both buffers + bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int); + dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int); + + // Allocate data + checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int))); + checkCudaErrors( + cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int))); + + for (int i = 0; i < bigDataSize; ++i) { + if (i < dataSize) { + dataHostPointer[i] = i; } - //Create stream to assiocate with window - checkCudaErrors(cudaStreamCreate(&stream)); + bigDataHostPointer[bigDataSize - i - 1] = i; + } - //Set the amount of l2 cache that will be persisting to maximum the device can support - checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize)); + checkCudaErrors( + 
cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int))); + checkCudaErrors( + cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int))); + checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer, + dataSize * sizeof(int), + cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer, + bigDataSize * sizeof(int), + cudaMemcpyHostToDevice, stream)); - //Stream attribute to set - streamAttrID = cudaStreamAttributeAccessPolicyWindow; + // Make a window for the buffer of interest + accessPolicyWindow.base_ptr = (void *)dataDevicePointer; + accessPolicyWindow.num_bytes = dataSize * sizeof(int); + accessPolicyWindow.hitRatio = 1.f; + accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; + accessPolicyWindow.missProp = cudaAccessPropertyNormal; + streamAttrValue.accessPolicyWindow = accessPolicyWindow; - //Default window - streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); - accessPolicyWindow = initAccessPolicyWindow(); + // Assign window to stream + checkCudaErrors( + cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue)); - //Allocate size of both buffers - bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int); - dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int); + // Demote any previous persisting lines + checkCudaErrors(cudaCtxResetPersistingL2Cache()); - //Allocate data - dataHostPointer = (int *)malloc(dataSize * sizeof(int)); - bigDataHostPointer = (int *)malloc(bigDataSize * sizeof(int)); + checkCudaErrors(cudaStreamSynchronize(stream)); + kernCacheSegmentTest<<>>( + dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF); - for ( int i = 0; i < bigDataSize; ++i) { - if (i < dataSize) { - dataHostPointer[i] = i; - } + checkCudaErrors(cudaStreamSynchronize(stream)); + // check if kernel execution generated and error + getLastCudaError("Kernel execution failed"); - bigDataHostPointer[bigDataSize - i - 1] = i; - } + // Free memory + checkCudaErrors(cudaFreeHost(dataHostPointer)); + checkCudaErrors(cudaFreeHost(bigDataHostPointer)); + checkCudaErrors(cudaFree(dataDevicePointer)); + checkCudaErrors(cudaFree(bigDataDevicePointer)); - checkCudaErrors(cudaMalloc((void**) &dataDevicePointer, dataSize * sizeof(int))); - checkCudaErrors(cudaMalloc((void**) &bigDataDevicePointer, bigDataSize * sizeof(int))); - checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); - //Make a window for the buffer of interest - accessPolicyWindow.base_ptr = (void *)dataDevicePointer; - accessPolicyWindow.num_bytes = dataSize * sizeof(int); - accessPolicyWindow.hitRatio = 1.f; - accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; - accessPolicyWindow.missProp = cudaAccessPropertyNormal; - streamAttrValue.accessPolicyWindow = accessPolicyWindow; - - //Assign window to stream - checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue)); - - //Demote any previous persisting lines - checkCudaErrors(cudaCtxResetPersistingL2Cache()); - - checkCudaErrors(cudaStreamSynchronize(stream)); - kernCacheSegmentTest<<>>(dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF); - - checkCudaErrors(cudaStreamSynchronize(stream)); - // check if 
kernel execution generated and error - getLastCudaError("Kernel execution failed"); - - //Free memory - free(dataHostPointer); - free(bigDataHostPointer); - checkCudaErrors(cudaFree(dataDevicePointer)); - checkCudaErrors(cudaFree(bigDataDevicePointer)); - - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - sdkDeleteTimer(&timer); - - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj index 692be656..a31cf815 100644 --- a/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj +++ b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj index 47fdd889..b74d221e 100644 --- a/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj +++ b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS/Makefile b/Samples/simpleCUBLAS/Makefile index bbdaed39..516da194 100644 --- a/Samples/simpleCUBLAS/Makefile +++ b/Samples/simpleCUBLAS/Makefile @@ -263,6 +263,14 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) +SAMPLE_ENABLED := 1 + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUBLAS is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -297,6 +305,10 @@ ALL_CCFLAGS += --threads 0 LIBRARIES += -lcublas +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + ################################################################################ # Target rules @@ -304,16 +316,23 @@ all: build build: simpleCUBLAS +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + simpleCUBLAS.o:simpleCUBLAS.cpp - $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< simpleCUBLAS: simpleCUBLAS.o - $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) run: build - ./simpleCUBLAS + $(EXEC) ./simpleCUBLAS clean: rm -f simpleCUBLAS simpleCUBLAS.o diff --git a/Samples/simpleCUBLAS/README.md b/Samples/simpleCUBLAS/README.md index d4735374..67dd4ce5 100644 --- a/Samples/simpleCUBLAS/README.md +++ b/Samples/simpleCUBLAS/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
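The simpleAttributes hunk above moves its host buffers to cudaMallocHost and keeps the persisting-L2 access-policy-window setup. A compact sketch of that window configuration on a stream, with an illustrative buffer size; it requires a device reporting a non-zero persistingL2CacheMaxSize:

```cpp
// Sketch of persisting-L2 setup: reserve persisting L2, attach an access
// policy window for one device buffer to a stream, then demote the lines with
// cudaCtxResetPersistingL2Cache when done.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int dev = 0;
  cudaDeviceProp prop;
  cudaGetDevice(&dev);
  cudaGetDeviceProperties(&prop, dev);
  if (prop.persistingL2CacheMaxSize == 0) {
    printf("persisting L2 cache not supported, waiving\n");
    return 0;
  }

  // Reserve as much persisting L2 as the device allows.
  cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
                     prop.persistingL2CacheMaxSize);

  size_t numBytes = prop.l2CacheSize / 4;  // illustrative window size
  int *d_buf = nullptr;
  cudaMalloc(&d_buf, numBytes);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Mark the buffer as persisting for work submitted to this stream.
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.base_ptr = d_buf;
  attr.accessPolicyWindow.num_bytes = numBytes;
  attr.accessPolicyWindow.hitRatio = 1.0f;
  attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  attr.accessPolicyWindow.missProp = cudaAccessPropertyNormal;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);

  // ... launch kernels that repeatedly touch d_buf on this stream ...

  cudaCtxResetPersistingL2Cache();  // demote any persisting cache lines
  cudaStreamDestroy(stream);
  cudaFree(d_buf);
  return 0;
}
```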
## Build and Run diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj index 161fe55f..3a68d707 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj index b43537ff..6370f200 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLASXT/Makefile b/Samples/simpleCUBLASXT/Makefile index 24e5af89..b5759857 100644 --- a/Samples/simpleCUBLASXT/Makefile +++ b/Samples/simpleCUBLASXT/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUBLASXT is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/simpleCUBLASXT/README.md b/Samples/simpleCUBLASXT/README.md index 00b4ce9b..fd3decae 100644 --- a/Samples/simpleCUBLASXT/README.md +++ b/Samples/simpleCUBLASXT/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj index faaaba0e..3805ce26 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj index 7b0324fb..b0472a39 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS_LU/Makefile b/Samples/simpleCUBLAS_LU/Makefile new file mode 100644 index 00000000..2c49cc17 --- /dev/null +++ b/Samples/simpleCUBLAS_LU/Makefile @@ -0,0 +1,357 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
+ endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += 
--unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - simpleCUBLAS_LU is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - simpleCUBLAS_LU is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUBLAS_LU is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 86 +else +SMS ?= 35 37 50 52 60 61 70 75 80 86 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach 
sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --threads 0 + +LIBRARIES += -lcublas + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleCUBLAS_LU + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +simpleCUBLAS_LU.o:simpleCUBLAS_LU.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleCUBLAS_LU: simpleCUBLAS_LU.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleCUBLAS_LU + +clean: + rm -f simpleCUBLAS_LU simpleCUBLAS_LU.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleCUBLAS_LU + +clobber: clean diff --git a/Samples/simpleCUBLAS_LU/NsightEclipse.xml b/Samples/simpleCUBLAS_LU/NsightEclipse.xml new file mode 100644 index 00000000..dea35d1a --- /dev/null +++ b/Samples/simpleCUBLAS_LU/NsightEclipse.xml @@ -0,0 +1,68 @@ + + + + simpleCUBLAS_LU + + whole + + ./ + ../ + ../../common/inc + + + CUBLAS Library + LU decomposition + + + CUDA + CUBLAS + Linear Algebra + LU decomposition + + + cublas + + + + true + simpleCUBLAS_LU.cpp + + CUBLAS + + + 1:CUDA Basic Topics + 3:Linear Algebra + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + sm86 + + + x86_64 + linux + + + windows7 + + + aarch64 + + + ppc64le + linux + + + + all + + Simple CUBLAS LU + exe + diff --git a/Samples/simpleCUBLAS_LU/README.md b/Samples/simpleCUBLAS_LU/README.md new file mode 100644 index 00000000..9ef4764b --- /dev/null +++ b/Samples/simpleCUBLAS_LU/README.md @@ -0,0 +1,71 @@ +# simpleCUBLAS_LU - Simple CUBLAS LU + +## Description + +CUDA sample demonstrating cuBLAS API cublasDgetrfBatched() for lower-upper (LU) decomposition of a matrix. + +## Key Concepts + +CUBLAS Library, LU decomposition + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +## Dependencies needed to build/run +[CUBLAS](../../README.md#cublas) + +## Prerequisites + +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory:
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The sample makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
+ By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+ ```
+ $ make dbg=1
+ ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+ ```
+ $ make SMS="50 60"
+ ```
+
+* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+ $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp
new file mode 100644
index 00000000..7b7e4d7a
--- /dev/null
+++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp
@@ -0,0 +1,417 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This example demonstrates how to use the cuBLAS library API
+ * for lower-upper (LU) decomposition of a matrix. LU decomposition
+ * factors a matrix as the product of a lower triangular matrix and
+ * an upper triangular matrix.
+ *
+ * https://en.wikipedia.org/wiki/LU_decomposition
+ *
+ * This sample uses 10000 matrices of size 4x4 and performs
+ * LU decomposition of them using the batched decomposition API
+ * of the cuBLAS library. To test the correctness of the upper and lower
+ * matrices generated, they are multiplied and compared with the
+ * original input matrix.
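+ *
+ * As a small worked illustration of the factorization checked here
+ * (P * A = L * U with partial pivoting), take the 2x2 case
+ *     A = [4 3; 6 3]:  P = [0 1; 1 0], so P * A = [6 3; 4 3], and
+ *     L = [1 0; 2/3 1], U = [6 3; 0 1] give L * U = [6 3; 4 3] = P * A.
+ * (The 2x2 case is only for illustration; the sample itself uses 4x4
+ * matrices.)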
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// cuda libraries and helpers
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+// configurable parameters
+// dimension of matrix
+#define N 4
+#define BATCH_SIZE 10000
+
+// use double precision data type
+#define DOUBLE_PRECISION /* comment this to use single precision */
+#ifdef DOUBLE_PRECISION
+#define DATA_TYPE double
+#define MAX_ERROR 1e-15
+#else
+#define DATA_TYPE float
+#define MAX_ERROR 1e-6
+#endif /* DOUBLE_PRECISION */
+
+// use pivot vector while decomposing
+#define PIVOT /* comment this to disable pivot use */
+
+// helper functions
+
+// wrapper around cublas<t>getrfBatched()
+cublasStatus_t cublasXgetrfBatched(cublasHandle_t handle, int n,
+                                   DATA_TYPE* const A[], int lda, int* P,
+                                   int* info, int batchSize) {
+#ifdef DOUBLE_PRECISION
+  return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
+#else
+  return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
+#endif
+}
+
+// wrapper around malloc
+// clears the allocated memory to 0
+// terminates the program if malloc fails
+void* xmalloc(size_t size) {
+  void* ptr = malloc(size);
+  if (ptr == NULL) {
+    printf("> ERROR: malloc for size %zu failed..\n", size);
+    exit(EXIT_FAILURE);
+  }
+  memset(ptr, 0, size);
+  return ptr;
+}
+
+// initialize identity matrix
+void initIdentityMatrix(DATA_TYPE* mat) {
+  // clear the matrix
+  memset(mat, 0, N * N * sizeof(DATA_TYPE));
+
+  // set all diagonal elements to 1
+  for (int i = 0; i < N; i++) {
+    mat[(i * N) + i] = 1.0;
+  }
+}
+
+// initialize matrix with all elements as 0
+void initZeroMatrix(DATA_TYPE* mat) {
+  memset(mat, 0, N * N * sizeof(DATA_TYPE));
+}
+
+// fill random values into a column-major matrix
+void initRandomMatrix(DATA_TYPE* mat) {
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      mat[(j * N) + i] =
+          (DATA_TYPE)1.0 + ((DATA_TYPE)rand() / (DATA_TYPE)RAND_MAX);
+    }
+  }
+
+  // make the matrix diagonally dominant to ensure it is invertible
+  for (int i = 0; i < N; i++) {
+    mat[(i * N) + i] += (DATA_TYPE)N;
+  }
+}
+
+// print column-major matrix
+void printMatrix(DATA_TYPE* mat) {
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      printf("%20.16f ", mat[(j * N) + i]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+// matrix multiplication
+void matrixMultiply(DATA_TYPE* res, DATA_TYPE* mat1, DATA_TYPE* mat2) {
+  initZeroMatrix(res);
+
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+        res[(j * N) + i] += mat1[(k * N) + i] * mat2[(j * N) + k];
+      }
+    }
+  }
+}
+
+// check matrix equality within a relative error bound
+bool checkRelativeError(DATA_TYPE* mat1, DATA_TYPE* mat2, DATA_TYPE maxError) {
+  DATA_TYPE err = (DATA_TYPE)0.0;
+  DATA_TYPE refNorm = (DATA_TYPE)0.0;
+  DATA_TYPE relError = (DATA_TYPE)0.0;
+  DATA_TYPE relMaxError = (DATA_TYPE)0.0;
+
+  for (int i = 0; i < N * N; i++) {
+    refNorm = abs(mat1[i]);
+    err = abs(mat1[i] - mat2[i]);
+
+    if (refNorm != 0.0 && err > 0.0) {
+      relError = err / refNorm;
+      relMaxError = MAX(relMaxError, relError);
+    }
+
+    if (relMaxError > maxError) return false;
+  }
+  return true;
+}
+
+// decode lower and upper matrix from single matrix
+// returned by getrfBatched()
+void getLUdecoded(DATA_TYPE* mat, DATA_TYPE* L, DATA_TYPE* U) {
+  // init L as identity matrix
+  initIdentityMatrix(L);
+
+  // copy lower triangular values from mat to L (skip diagonal)
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < i; j++) {
+      L[(j * N) + i] = mat[(j * N) + i];
+    }
+  }
+
+  // init U as all zero
+  initZeroMatrix(U);
+
+  // copy upper triangular values from mat to U
+  for (int i = 0; i
< N; i++) { + for (int j = i; j < N; j++) { + U[(j * N) + i] = mat[(j * N) + i]; + } + } +} + +// generate permutation matrix from pivot vector +void getPmatFromPivot(DATA_TYPE* Pmat, int* P) { + int pivot[N]; + + // pivot vector in base-1 + // convert it to base-0 + for (int i = 0; i < N; i++) { + P[i]--; + } + + // generate permutation vector from pivot + // initialize pivot with identity sequence + for (int k = 0; k < N; k++) { + pivot[k] = k; + } + + // swap the indices according to pivot vector + for (int k = 0; k < N; k++) { + int q = P[k]; + + // swap pivot(k) and pivot(q) + int s = pivot[k]; + int t = pivot[q]; + pivot[k] = t; + pivot[q] = s; + } + + // generate permutation matrix from pivot vector + initZeroMatrix(Pmat); + for (int i = 0; i < N; i++) { + int j = pivot[i]; + Pmat[(j * N) + i] = (DATA_TYPE)1.0; + } +} + +int main(int argc, char** argv) { + // cuBLAS variables + cublasStatus_t status; + cublasHandle_t handle; + + // host variables + size_t matSize = N * N * sizeof(DATA_TYPE); + + DATA_TYPE* h_AarrayInput; + DATA_TYPE* h_AarrayOutput; + DATA_TYPE* h_ptr_array[BATCH_SIZE]; + + int* h_pivotArray; + int* h_infoArray; + + // device variables + DATA_TYPE* d_Aarray; + DATA_TYPE** d_ptr_array; + + int* d_pivotArray; + int* d_infoArray; + + int err_count = 0; + + // seed the rand() function with time + srand(12345); + + // find cuda device + printf("> initializing..\n"); + int dev = findCudaDevice(argc, (const char**)argv); + if (dev == -1) { + return (EXIT_FAILURE); + } + + // initialize cuBLAS + status = cublasCreate(&handle); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("> ERROR: cuBLAS initialization failed..\n"); + return (EXIT_FAILURE); + } + +#ifdef DOUBLE_PRECISION + printf("> using DOUBLE precision..\n"); +#else + printf("> using SINGLE precision..\n"); +#endif + +#ifdef PIVOT + printf("> pivot ENABLED..\n"); +#else + printf("> pivot DISABLED..\n"); +#endif + + // allocate memory for host variables + h_AarrayInput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize); + h_AarrayOutput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize); + + h_pivotArray = (int*)xmalloc(N * BATCH_SIZE * sizeof(int)); + h_infoArray = (int*)xmalloc(BATCH_SIZE * sizeof(int)); + + // allocate memory for device variables + checkCudaErrors(cudaMalloc((void**)&d_Aarray, BATCH_SIZE * matSize)); + checkCudaErrors( + cudaMalloc((void**)&d_pivotArray, N * BATCH_SIZE * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_infoArray, BATCH_SIZE * sizeof(int))); + checkCudaErrors( + cudaMalloc((void**)&d_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*))); + + // fill matrix with random data + printf("> generating random matrices..\n"); + for (int i = 0; i < BATCH_SIZE; i++) { + initRandomMatrix(h_AarrayInput + (i * N * N)); + } + + // copy data to device from host + printf("> copying data from host memory to GPU memory..\n"); + checkCudaErrors(cudaMemcpy(d_Aarray, h_AarrayInput, BATCH_SIZE * matSize, + cudaMemcpyHostToDevice)); + + // create pointer array for matrices + for (int i = 0; i < BATCH_SIZE; i++) h_ptr_array[i] = d_Aarray + (i * N * N); + + // copy pointer array to device memory + checkCudaErrors(cudaMemcpy(d_ptr_array, h_ptr_array, + BATCH_SIZE * sizeof(DATA_TYPE*), + cudaMemcpyHostToDevice)); + + // perform LU decomposition + printf("> performing LU decomposition..\n"); +#ifdef PIVOT + status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray, + d_infoArray, BATCH_SIZE); +#else + status = cublasXgetrfBatched(handle, N, d_ptr_array, N, NULL, d_infoArray, + BATCH_SIZE); +#endif /* PIVOT */ + if 
(status != CUBLAS_STATUS_SUCCESS) { + printf("> ERROR: cublasDgetrfBatched() failed with error %s..\n", + _cudaGetErrorEnum(status)); + return (EXIT_FAILURE); + } + + // copy data to host from device + printf("> copying data from GPU memory to host memory..\n"); + checkCudaErrors(cudaMemcpy(h_AarrayOutput, d_Aarray, BATCH_SIZE * matSize, + cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_infoArray, d_infoArray, BATCH_SIZE * sizeof(int), + cudaMemcpyDeviceToHost)); +#ifdef PIVOT + checkCudaErrors(cudaMemcpy(h_pivotArray, d_pivotArray, + N * BATCH_SIZE * sizeof(int), + cudaMemcpyDeviceToHost)); +#endif /* PIVOT */ + + // verify the result + printf("> verifying the result..\n"); + for (int i = 0; i < BATCH_SIZE; i++) { + if (h_infoArray[i] == 0) { + DATA_TYPE* A = h_AarrayInput + (i * N * N); + DATA_TYPE* LU = h_AarrayOutput + (i * N * N); + DATA_TYPE L[N * N]; + DATA_TYPE U[N * N]; + getLUdecoded(LU, L, U); + + // test P * A = L * U + int* P = h_pivotArray + (i * N); + DATA_TYPE Pmat[N * N]; +#ifdef PIVOT + getPmatFromPivot(Pmat, P); +#else + initIdentityMatrix(Pmat); +#endif /* PIVOT */ + + // perform matrix multiplication + DATA_TYPE PxA[N * N]; + DATA_TYPE LxU[N * N]; + matrixMultiply(PxA, Pmat, A); + matrixMultiply(LxU, L, U); + + // check for equality of matrices + if (!checkRelativeError(PxA, LxU, (DATA_TYPE)MAX_ERROR)) { + printf("> ERROR: accuracy check failed for matrix number %05d..\n", + i + 1); + err_count++; + } + + } else if (h_infoArray[i] > 0) { + printf( + "> execution for matrix %05d is successful, but U is singular and " + "U(%d,%d) = 0..\n", + i + 1, h_infoArray[i] - 1, h_infoArray[i] - 1); + } else // (h_infoArray[i] < 0) + { + printf("> ERROR: matrix %05d have an illegal value at index %d = %lf..\n", + i + 1, -h_infoArray[i], + *(h_AarrayInput + (i * N * N) + (-h_infoArray[i]))); + } + } + + // free device variables + checkCudaErrors(cudaFree(d_ptr_array)); + checkCudaErrors(cudaFree(d_infoArray)); + checkCudaErrors(cudaFree(d_pivotArray)); + checkCudaErrors(cudaFree(d_Aarray)); + + // free host variables + if (h_infoArray) free(h_infoArray); + if (h_pivotArray) free(h_pivotArray); + if (h_AarrayOutput) free(h_AarrayOutput); + if (h_AarrayInput) free(h_AarrayInput); + + // destroy cuBLAS handle + status = cublasDestroy(handle); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("> ERROR: cuBLAS uninitialization failed..\n"); + return (EXIT_FAILURE); + } + + if (err_count > 0) { + printf("> TEST FAILED for %d matrices, with precision: %g\n", err_count, + MAX_ERROR); + return (EXIT_FAILURE); + } + + printf("> TEST SUCCESSFUL, with precision: %g\n", MAX_ERROR); + return (EXIT_SUCCESS); +} diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln new file mode 100644 index 00000000..96272bc1 --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS_LU", "simpleCUBLAS_LU_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj new file mode 100644 index 00000000..7599aeda --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj @@ -0,0 +1,113 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_LU_vs2017 + simpleCUBLAS_LU + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS_LU.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + --threads 0 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln new file mode 100644 index 00000000..950f5c83 --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS_LU", "simpleCUBLAS_LU_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj new file mode 100644 index 00000000..154dac0a --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj @@ -0,0 +1,109 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_LU_vs2019 + simpleCUBLAS_LU + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + 
true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS_LU.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + --threads 0 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUFFT/Makefile b/Samples/simpleCUFFT/Makefile index 9e9475ee..c716cd0c 100644 --- a/Samples/simpleCUFFT/Makefile +++ b/Samples/simpleCUFFT/Makefile @@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUFFT is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/simpleCUFFT/README.md b/Samples/simpleCUFFT/README.md index ecf41085..67227805 100644 --- a/Samples/simpleCUFFT/README.md +++ b/Samples/simpleCUFFT/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj index 0914f439..914b65a7 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj index 075276a2..339d7959 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCudaGraphs/README.md b/Samples/simpleCudaGraphs/README.md index c6982a12..9e044f33 100644 --- a/Samples/simpleCudaGraphs/README.md +++ b/Samples/simpleCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaLaunchHostFunc, cudaGraphCreat ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
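For orientation, the pattern this sample exercises is capturing work submitted to a stream into a CUDA graph and then replaying it. The sketch below is illustrative rather than the sample's actual code; `myKernel` is a placeholder and `checkCudaErrors` comes from the samples' common `helper_cuda.h`.
```
#include <cuda_runtime.h>
#include <helper_cuda.h>

__global__ void myKernel(float *data, int n) {  // placeholder kernel
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

void runWithGraph(float *d_data, int n, cudaStream_t stream) {
  cudaGraph_t graph;
  cudaGraphExec_t graphExec;

  // Record the stream work into a graph instead of executing it immediately.
  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  myKernel<<<(n + 255) / 256, 256, 0, stream>>>(d_data, n);
  checkCudaErrors(cudaStreamEndCapture(stream, &graph));

  // Instantiate once (CUDA 11.x signature), then launch the graph on the stream.
  checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  checkCudaErrors(cudaGraphExecDestroy(graphExec));
  checkCudaErrors(cudaGraphDestroy(graph));
}
```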
## Build and Run diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs.cu b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu index b62d04e2..82b6c160 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs.cu +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu @@ -393,7 +393,7 @@ int main(int argc, char **argv) { float *inputVec_d = NULL, *inputVec_h = NULL; double *outputVec_d = NULL, *result_d; - inputVec_h = (float *)malloc(sizeof(float) * size); + checkCudaErrors(cudaMallocHost(&inputVec_h, sizeof(float) * size)); checkCudaErrors(cudaMalloc(&inputVec_d, sizeof(float) * size)); checkCudaErrors(cudaMalloc(&outputVec_d, sizeof(double) * maxBlocks)); checkCudaErrors(cudaMalloc(&result_d, sizeof(double))); @@ -408,5 +408,6 @@ int main(int argc, char **argv) { checkCudaErrors(cudaFree(inputVec_d)); checkCudaErrors(cudaFree(outputVec_d)); checkCudaErrors(cudaFree(result_d)); + checkCudaErrors(cudaFreeHost(inputVec_h)); return EXIT_SUCCESS; } diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj index 6cc0c2f1..5a48206e 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj index 3f977e78..e7aeecd2 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleD3D11/README.md b/Samples/simpleD3D11/README.md index 1262eeaa..eb8d6428 100644 --- a/Samples/simpleD3D11/README.md +++ b/Samples/simpleD3D11/README.md @@ -30,7 +30,7 @@ cudaD3D11GetDevice, cudaImportExternalSemaphore, cudaImportExternalMemory, cudaE ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj index acd1e849..651c1a4c 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj index fb464f50..c3bcdbe4 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleD3D12/README.md b/Samples/simpleD3D12/README.md index 0622cfdc..4c60ed5b 100644 --- a/Samples/simpleD3D12/README.md +++ b/Samples/simpleD3D12/README.md @@ -30,7 +30,7 @@ cudaWaitExternalSemaphoresAsync, cudaSignalExternalSemaphoresAsync, cudaImportEx ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
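Synchronization between the D3D12 queue and CUDA in this sample goes through an imported external semaphore. The following is a hedged sketch of the wait/signal calls involved, assuming `extSem` was created earlier with `cudaImportExternalSemaphore` and `fenceValue` mirrors the D3D12 fence; it is not the sample's exact code.
```
#include <cuda_runtime.h>
#include <helper_cuda.h>

void syncWithD3D12(cudaExternalSemaphore_t extSem,
                   unsigned long long &fenceValue, cudaStream_t stream) {
  // Wait until the D3D12 side has reached the current fence value.
  cudaExternalSemaphoreWaitParams waitParams = {};
  waitParams.params.fence.value = fenceValue;
  checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSem, &waitParams, 1, stream));

  // ... enqueue CUDA work on `stream` that uses the shared resource ...

  // Signal the next fence value so D3D12 can wait for the CUDA work.
  cudaExternalSemaphoreSignalParams signalParams = {};
  signalParams.params.fence.value = ++fenceValue;
  checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSem, &signalParams, 1, stream));
}
```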
## Build and Run diff --git a/Samples/simpleD3D12/simpleD3D12.cpp b/Samples/simpleD3D12/simpleD3D12.cpp index 5eb64764..fe072d30 100755 --- a/Samples/simpleD3D12/simpleD3D12.cpp +++ b/Samples/simpleD3D12/simpleD3D12.cpp @@ -266,17 +266,8 @@ void DX12CudaInterop::LoadAssets() { parameter.InitAsDescriptorTable(1, &range, D3D12_SHADER_VISIBILITY_VERTEX); D3D12_ROOT_SIGNATURE_FLAGS rootSignatureFlags = - D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT | // Only - // the - // input - // assembler - // stage - // needs - // access - // to - // the - // constant - // buffer. + // Only the input assembler stage needs access to the constant buffer. + D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT | D3D12_ROOT_SIGNATURE_FLAG_DENY_DOMAIN_SHADER_ROOT_ACCESS | D3D12_ROOT_SIGNATURE_FLAG_DENY_GEOMETRY_SHADER_ROOT_ACCESS | D3D12_ROOT_SIGNATURE_FLAG_DENY_HULL_SHADER_ROOT_ACCESS | @@ -390,6 +381,7 @@ void DX12CudaInterop::LoadAssets() { checkCudaErrors( cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc)); + CloseHandle(sharedHandle); cudaExternalMemoryBufferDesc externalMemoryBufferDesc; memset(&externalMemoryBufferDesc, 0, sizeof(externalMemoryBufferDesc)); @@ -468,6 +460,7 @@ void DX12CudaInterop::OnDestroy() { WaitForGpu(); checkCudaErrors(cudaDestroyExternalSemaphore(m_externalSemaphore)); checkCudaErrors(cudaDestroyExternalMemory(m_externalMemory)); + checkCudaErrors(cudaFree(m_cudaDevVertptr)); CloseHandle(m_fenceEvent); } diff --git a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj index 7932d3de..e0bee149 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -120,6 +120,6 @@ - + diff --git a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj index 72fe6656..ec724d05 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj @@ -39,7 +39,7 @@ - + @@ -121,6 +121,6 @@ - + diff --git a/Samples/simpleDrvRuntime/README.md b/Samples/simpleDrvRuntime/README.md index bce62a82..8c09f93b 100644 --- a/Samples/simpleDrvRuntime/README.md +++ b/Samples/simpleDrvRuntime/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaMalloc, cudaStreamCreateWithFlags ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
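One functional change in this update is that the host vectors are now allocated with `cudaMallocHost` instead of `malloc` (see the simpleDrvRuntime source diff further below). Pinned (page-locked) host memory allows genuinely asynchronous host/device copies. A minimal sketch of the pattern, assuming a device buffer `d_A` and a stream already exist:
```
#include <cuda_runtime.h>
#include <helper_cuda.h>

void copyPinned(float *d_A, size_t n, cudaStream_t stream) {
  float *h_A = NULL;
  size_t size = n * sizeof(float);

  checkCudaErrors(cudaMallocHost(&h_A, size));  // page-locked host allocation
  // ... fill h_A on the host ...
  checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaFreeHost(h_A));
}
```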
## Build and Run diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp b/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp index f2926be1..cd2c0c3b 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp @@ -117,9 +117,9 @@ int main(int argc, char **argv) { cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel")); // Allocate input vectors h_A and h_B in host memory - h_A = (float *)malloc(size); - h_B = (float *)malloc(size); - h_C = (float *)malloc(size); + checkCudaErrors(cudaMallocHost(&h_A, size)); + checkCudaErrors(cudaMallocHost(&h_B, size)); + checkCudaErrors(cudaMallocHost(&h_C, size)); // Initialize input vectors RandomInit(h_A, N); @@ -179,15 +179,15 @@ int CleanupNoFailure(CUcontext &cuContext) { // Free host memory if (h_A) { - free(h_A); + checkCudaErrors(cudaFreeHost(h_A)); } if (h_B) { - free(h_B); + checkCudaErrors(cudaFreeHost(h_B)); } if (h_C) { - free(h_C); + checkCudaErrors(cudaFreeHost(h_C)); } checkCudaDrvErrors(cuCtxDestroy(cuContext)); diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj index 862b0dc0..94e29419 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj index 879c0699..42556e0c 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleGL/README.md b/Samples/simpleGL/README.md index 35a42076..5176ee1b 100644 --- a/Samples/simpleGL/README.md +++ b/Samples/simpleGL/README.md @@ -30,7 +30,7 @@ cudaGraphicsMapResources, cudaGraphicsUnmapResources, cudaGraphicsResourceGetMap ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleGL/simpleGL_vs2017.vcxproj b/Samples/simpleGL/simpleGL_vs2017.vcxproj index 87d2f599..d096e815 100644 --- a/Samples/simpleGL/simpleGL_vs2017.vcxproj +++ b/Samples/simpleGL/simpleGL_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/simpleGL/simpleGL_vs2019.vcxproj b/Samples/simpleGL/simpleGL_vs2019.vcxproj index d5e1f2e3..527b22d0 100644 --- a/Samples/simpleGL/simpleGL_vs2019.vcxproj +++ b/Samples/simpleGL/simpleGL_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/simpleIPC/README.md b/Samples/simpleIPC/README.md index 445db7c9..3fcb740a 100644 --- a/Samples/simpleIPC/README.md +++ b/Samples/simpleIPC/README.md @@ -30,7 +30,7 @@ cudaIpcGetEventHandle, cudaIpcOpenMemHandle, cudaIpcCloseMemHandle, cudaMemcpyAs ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
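For orientation, the IPC flow this sample exercises is: the owning process exports a device allocation as a handle, and a second process maps it. A minimal sketch using the APIs named above (not the sample's exact code; transporting the handle between processes is left out, and `nbytes` is assumed):
```
// Process A: export a device allocation
float *d_buf = NULL;
checkCudaErrors(cudaMalloc(&d_buf, nbytes));
cudaIpcMemHandle_t handle;
checkCudaErrors(cudaIpcGetMemHandle(&handle, d_buf));
// ... send `handle` to process B (pipe, socket, shared memory, ...) ...

// Process B: map the same allocation
void *d_mapped = NULL;
checkCudaErrors(cudaIpcOpenMemHandle(&d_mapped, handle, cudaIpcMemLazyEnablePeerAccess));
// ... use d_mapped like any other device pointer ...
checkCudaErrors(cudaIpcCloseMemHandle(d_mapped));
```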
## Build and Run diff --git a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj index 47270332..640802b1 100644 --- a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj index 271da1f3..8c03b709 100644 --- a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/README.md b/Samples/simpleVoteIntrinsics/README.md index cbf4ec3f..314de841 100644 --- a/Samples/simpleVoteIntrinsics/README.md +++ b/Samples/simpleVoteIntrinsics/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj index 0170b401..62b48298 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj index eaf28122..630c35e5 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleVulkan/README.md b/Samples/simpleVulkan/README.md index de1bbe69..4cbb0122 100644 --- a/Samples/simpleVulkan/README.md +++ b/Samples/simpleVulkan/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaImportExternalS ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
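The CUDA side of the interop imports the Vulkan-allocated height-map buffer through the external memory APIs listed above. A hedged sketch of that import on Linux (Windows uses an opaque Win32 handle instead of a file descriptor; `fd` and `size` are assumed to come from the Vulkan exporter and are not defined here):
```
cudaExternalMemoryHandleDesc memDesc = {};
memDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
memDesc.handle.fd = fd;   // exported from Vulkan via VK_KHR_external_memory_fd
memDesc.size = size;

cudaExternalMemory_t extMem;
checkCudaErrors(cudaImportExternalMemory(&extMem, &memDesc));

cudaExternalMemoryBufferDesc bufDesc = {};
bufDesc.offset = 0;
bufDesc.size = size;

void *d_heightMap = NULL;
checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&d_heightMap, extMem, &bufDesc));
// d_heightMap can now be passed to CUDA kernels such as the sinewave kernel below
```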
## Build and Run diff --git a/Samples/simpleVulkan/SineWaveSimulation.cu b/Samples/simpleVulkan/SineWaveSimulation.cu index 68e63d7a..f0289733 100644 --- a/Samples/simpleVulkan/SineWaveSimulation.cu +++ b/Samples/simpleVulkan/SineWaveSimulation.cu @@ -29,110 +29,106 @@ #include #include -__global__ void sinewave(float *heightMap, unsigned int width, unsigned int height, float time) -{ - const float freq = 4.0f; - const size_t stride = gridDim.x * blockDim.x; +__global__ void sinewave(float *heightMap, unsigned int width, + unsigned int height, float time) { + const float freq = 4.0f; + const size_t stride = gridDim.x * blockDim.x; - // Iterate through the entire array in a way that is - // independent of the grid configuration - for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < width * height; tid += stride) { - // Calculate the x, y coordinates - const size_t y = tid / width; - const size_t x = tid - y * width; - // Normalize x, y to [0,1] - const float u = ((2.0f * x) / width) - 1.0f; - const float v = ((2.0f * y) / height) - 1.0f; - // Calculate the new height value - const float w = 0.5f * sinf(u * freq + time) * cosf(v * freq + time); - // Store this new height value - heightMap[tid] = w; - } + // Iterate through the entire array in a way that is + // independent of the grid configuration + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < width * height; + tid += stride) { + // Calculate the x, y coordinates + const size_t y = tid / width; + const size_t x = tid - y * width; + // Normalize x, y to [0,1] + const float u = ((2.0f * x) / width) - 1.0f; + const float v = ((2.0f * y) / height) - 1.0f; + // Calculate the new height value + const float w = 0.5f * sinf(u * freq + time) * cosf(v * freq + time); + // Store this new height value + heightMap[tid] = w; + } } -SineWaveSimulation::SineWaveSimulation(size_t width, size_t height) - : m_heightMap(nullptr), m_width(width), m_height(height) -{ +SineWaveSimulation::SineWaveSimulation(size_t width, size_t height) + : m_heightMap(nullptr), m_width(width), m_height(height) {} + +void SineWaveSimulation::initCudaLaunchConfig(int device) { + cudaDeviceProp prop = {}; + checkCudaErrors(cudaSetDevice(device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); + + // We don't need large block sizes, since there's not much inter-thread + // communication + m_threads = prop.warpSize; + + // Use the occupancy calculator and fill the gpu as best as we can + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &m_blocks, sinewave, prop.warpSize, 0)); + m_blocks *= prop.multiProcessorCount; + + // Go ahead and the clamp the blocks to the minimum needed for this + // height/width + m_blocks = std::min(m_blocks, + (int)((m_width * m_height + m_threads - 1) / m_threads)); } -void SineWaveSimulation::initCudaLaunchConfig(int device) -{ - cudaDeviceProp prop = {}; - checkCudaErrors(cudaSetDevice(device)); - checkCudaErrors(cudaGetDeviceProperties(&prop, device)); +int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE) { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; - // We don't need large block sizes, since there's not much inter-thread communication - m_threads = prop.warpSize; + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); - // Use the occupancy calculator and fill the gpu as best as we can - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, sinewave, prop.warpSize, 0)); - m_blocks *= 
prop.multiProcessorCount; + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } - // Go ahead and the clamp the blocks to the minimum needed for this height/width - m_blocks = std::min(m_blocks, (int)((m_width * m_height + m_threads - 1) / m_threads)); -} + // Find the GPU which is selected by Vulkan + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); -int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE) -{ - int current_device = 0; - int device_count = 0; - int devices_prohibited = 0; + if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + // Compare the cuda device UUID with vulkan UUID + int ret = memcmp((void *)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE); + if (ret == 0) { + checkCudaErrors(cudaSetDevice(current_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, deviceProp.name, deviceProp.major, + deviceProp.minor); - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceCount(&device_count)); + return current_device; + } - if (device_count == 0) { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); + } else { + devices_prohibited++; } - // Find the GPU which is selected by Vulkan - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); + current_device++; + } - if ((deviceProp.computeMode != cudaComputeModeProhibited)) { - // Compare the cuda device UUID with vulkan UUID - int ret = memcmp((void*)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE); - if (ret == 0) - { - checkCudaErrors(cudaSetDevice(current_device)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, deviceProp.name, deviceProp.major, - deviceProp.minor); + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No Vulkan-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } - return current_device; - } - - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, - "CUDA error:" - " No Vulkan-CUDA Interop capable GPU found.\n"); - exit(EXIT_FAILURE); - } - - return -1; + return -1; } -SineWaveSimulation::~SineWaveSimulation() -{ - m_heightMap = NULL; +SineWaveSimulation::~SineWaveSimulation() { m_heightMap = NULL; } + +void SineWaveSimulation::initSimulation(float *heights) { + m_heightMap = heights; } -void SineWaveSimulation::initSimulation(float *heights) -{ - m_heightMap = heights; -} - -void SineWaveSimulation::stepSimulation(float time, cudaStream_t stream) -{ - sinewave <<< m_blocks, m_threads, 0, stream >>> (m_heightMap, m_width, m_height, time); - getLastCudaError("Failed to launch CUDA simulation"); +void SineWaveSimulation::stepSimulation(float time, cudaStream_t stream) { + sinewave<<>>(m_heightMap, m_width, m_height, + time); + getLastCudaError("Failed to launch CUDA simulation"); } diff --git a/Samples/simpleVulkan/SineWaveSimulation.h b/Samples/simpleVulkan/SineWaveSimulation.h index dc889b4b..aadc4828 100644 --- a/Samples/simpleVulkan/SineWaveSimulation.h +++ b/Samples/simpleVulkan/SineWaveSimulation.h @@ -34,25 +34,21 @@ #include #include "linmath.h" -class SineWaveSimulation -{ - float *m_heightMap; - size_t m_width, m_height; - int m_blocks, m_threads; -public: - 
SineWaveSimulation(size_t width, size_t height); - ~SineWaveSimulation(); - void initSimulation(float *heightMap); - void stepSimulation(float time, cudaStream_t stream = 0); - void initCudaLaunchConfig(int device); - int initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE); +class SineWaveSimulation { + float *m_heightMap; + size_t m_width, m_height; + int m_blocks, m_threads; - size_t getWidth() const { - return m_width; - } - size_t getHeight() const { - return m_height; - } + public: + SineWaveSimulation(size_t width, size_t height); + ~SineWaveSimulation(); + void initSimulation(float *heightMap); + void stepSimulation(float time, cudaStream_t stream = 0); + void initCudaLaunchConfig(int device); + int initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE); + + size_t getWidth() const { return m_width; } + size_t getHeight() const { return m_height; } }; -#endif // __SINESIM_H__ +#endif // __SINESIM_H__ diff --git a/Samples/simpleVulkan/VulkanBaseApp.cpp b/Samples/simpleVulkan/VulkanBaseApp.cpp index 05dece53..fbc4049c 100644 --- a/Samples/simpleVulkan/VulkanBaseApp.cpp +++ b/Samples/simpleVulkan/VulkanBaseApp.cpp @@ -55,1665 +55,1886 @@ #define countof(x) (sizeof(x) / sizeof(*(x))) #endif -static const char *validationLayers[] = { "VK_LAYER_KHRONOS_validation" }; +static const char *validationLayers[] = {"VK_LAYER_KHRONOS_validation"}; static const size_t MAX_FRAMES_IN_FLIGHT = 5; -void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) -{ - VulkanBaseApp *app = reinterpret_cast(glfwGetWindowUserPointer(window)); - app->m_framebufferResized = true; +void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) { + VulkanBaseApp *app = + reinterpret_cast(glfwGetWindowUserPointer(window)); + app->m_framebufferResized = true; } -static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageType, const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, void *pUserData) -{ - std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; +static VKAPI_ATTR VkBool32 VKAPI_CALL +debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + VkDebugUtilsMessageTypeFlagsEXT messageType, + const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, + void *pUserData) { + std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; - return VK_FALSE; + return VK_FALSE; } -VulkanBaseApp::VulkanBaseApp(const std::string& appName, bool enableValidation) : - m_appName(appName), - m_enableValidation(enableValidation), - m_instance(VK_NULL_HANDLE), - m_window(nullptr), - m_debugMessenger(VK_NULL_HANDLE), - m_surface(VK_NULL_HANDLE), - m_physicalDevice(VK_NULL_HANDLE), - m_device(VK_NULL_HANDLE), - m_graphicsQueue(VK_NULL_HANDLE), - m_presentQueue(VK_NULL_HANDLE), - m_swapChain(VK_NULL_HANDLE), - m_vkDeviceUUID(), - m_swapChainImages(), - m_swapChainFormat(), - m_swapChainExtent(), - m_swapChainImageViews(), - m_shaderFiles(), - m_renderPass(), - m_pipelineLayout(VK_NULL_HANDLE), - m_graphicsPipeline(VK_NULL_HANDLE), - m_swapChainFramebuffers(), - m_commandPool(VK_NULL_HANDLE), - m_commandBuffers(), - m_imageAvailableSemaphores(), - m_renderFinishedSemaphores(), - m_inFlightFences(), - m_uniformBuffers(), - m_uniformMemory(), - m_descriptorSetLayout(VK_NULL_HANDLE), - m_descriptorPool(VK_NULL_HANDLE), - m_descriptorSets(), - m_depthImage(VK_NULL_HANDLE), - m_depthImageMemory(VK_NULL_HANDLE), - m_depthImageView(VK_NULL_HANDLE), - m_currentFrame(0), - 
m_framebufferResized(false) -{ -} +VulkanBaseApp::VulkanBaseApp(const std::string &appName, bool enableValidation) + : m_appName(appName), + m_enableValidation(enableValidation), + m_instance(VK_NULL_HANDLE), + m_window(nullptr), + m_debugMessenger(VK_NULL_HANDLE), + m_surface(VK_NULL_HANDLE), + m_physicalDevice(VK_NULL_HANDLE), + m_device(VK_NULL_HANDLE), + m_graphicsQueue(VK_NULL_HANDLE), + m_presentQueue(VK_NULL_HANDLE), + m_swapChain(VK_NULL_HANDLE), + m_vkDeviceUUID(), + m_swapChainImages(), + m_swapChainFormat(), + m_swapChainExtent(), + m_swapChainImageViews(), + m_shaderFiles(), + m_renderPass(), + m_pipelineLayout(VK_NULL_HANDLE), + m_graphicsPipeline(VK_NULL_HANDLE), + m_swapChainFramebuffers(), + m_commandPool(VK_NULL_HANDLE), + m_commandBuffers(), + m_imageAvailableSemaphores(), + m_renderFinishedSemaphores(), + m_inFlightFences(), + m_uniformBuffers(), + m_uniformMemory(), + m_descriptorSetLayout(VK_NULL_HANDLE), + m_descriptorPool(VK_NULL_HANDLE), + m_descriptorSets(), + m_depthImage(VK_NULL_HANDLE), + m_depthImageMemory(VK_NULL_HANDLE), + m_depthImageView(VK_NULL_HANDLE), + m_currentFrame(0), + m_framebufferResized(false) {} -VkExternalSemaphoreHandleTypeFlagBits VulkanBaseApp::getDefaultSemaphoreHandleType() -{ +VkExternalSemaphoreHandleTypeFlagBits +VulkanBaseApp::getDefaultSemaphoreHandleType() { #ifdef _WIN64 - return IsWindows8OrGreater() ? - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8OrGreater() + ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ } -VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() -{ +VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() { #ifdef _WIN64 - return IsWindows8Point1OrGreater() ? - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT : - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8Point1OrGreater() + ? 
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ } -VulkanBaseApp::~VulkanBaseApp() -{ - cleanupSwapChain(); +VulkanBaseApp::~VulkanBaseApp() { + cleanupSwapChain(); - if (m_descriptorSetLayout != VK_NULL_HANDLE) { - vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); - } + if (m_descriptorSetLayout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); + } - for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { - vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); - vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); - vkDestroyFence(m_device, m_inFlightFences[i], nullptr); - } - if (m_commandPool != VK_NULL_HANDLE) { - vkDestroyCommandPool(m_device, m_commandPool, nullptr); - } +#ifdef _VK_TIMELINE_SEMAPHORE + if (m_vkPresentationSemaphore != VK_NULL_HANDLE) { + vkDestroySemaphore(m_device, m_vkPresentationSemaphore, nullptr); + } +#endif /* _VK_TIMELINE_SEMAPHORE */ - if (m_device != VK_NULL_HANDLE) { - vkDestroyDevice(m_device, nullptr); - } + for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { + vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); + vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); + vkDestroyFence(m_device, m_inFlightFences[i], nullptr); + } + if (m_commandPool != VK_NULL_HANDLE) { + vkDestroyCommandPool(m_device, m_commandPool, nullptr); + } - if (m_enableValidation) { - PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkDestroyDebugUtilsMessengerEXT"); - if (func != nullptr) { - func(m_instance, m_debugMessenger, nullptr); - } - } + if (m_device != VK_NULL_HANDLE) { + vkDestroyDevice(m_device, nullptr); + } - if (m_surface != VK_NULL_HANDLE) { - vkDestroySurfaceKHR(m_instance, m_surface, nullptr); + if (m_enableValidation) { + PFN_vkDestroyDebugUtilsMessengerEXT func = + (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr( + m_instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func != nullptr) { + func(m_instance, m_debugMessenger, nullptr); } + } - if (m_instance != VK_NULL_HANDLE) { - vkDestroyInstance(m_instance, nullptr); - } + if (m_surface != VK_NULL_HANDLE) { + vkDestroySurfaceKHR(m_instance, m_surface, nullptr); + } - if (m_window) { - glfwDestroyWindow(m_window); - } + if (m_instance != VK_NULL_HANDLE) { + vkDestroyInstance(m_instance, nullptr); + } - glfwTerminate(); + if (m_window) { + glfwDestroyWindow(m_window); + } + + glfwTerminate(); } -void VulkanBaseApp::init() -{ - initWindow(); - initVulkan(); +void VulkanBaseApp::init() { + initWindow(); + initVulkan(); } -VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() -{ - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandPool = m_commandPool; - allocInfo.commandBufferCount = 1; +VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() { + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandPool = m_commandPool; + allocInfo.commandBufferCount = 1; - VkCommandBuffer commandBuffer; - vkAllocateCommandBuffers(m_device, 
&allocInfo, &commandBuffer); + VkCommandBuffer commandBuffer; + vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer); - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - vkBeginCommandBuffer(commandBuffer, &beginInfo); + vkBeginCommandBuffer(commandBuffer, &beginInfo); - return commandBuffer; + return commandBuffer; } -void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) -{ - vkEndCommandBuffer(commandBuffer); +void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) { + vkEndCommandBuffer(commandBuffer); - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffer; - vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); - vkQueueWaitIdle(m_graphicsQueue); + vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); + vkQueueWaitIdle(m_graphicsQueue); - vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); + vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); } -void VulkanBaseApp::initWindow() -{ - glfwInit(); +void VulkanBaseApp::initWindow() { + glfwInit(); - glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); - glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); - m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); - glfwSetWindowUserPointer(m_window, this); - glfwSetFramebufferSizeCallback(m_window, resizeCallback); + m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); + glfwSetWindowUserPointer(m_window, this); + glfwSetFramebufferSizeCallback(m_window, resizeCallback); } - -std::vector VulkanBaseApp::getRequiredExtensions() const -{ - return std::vector(); +std::vector VulkanBaseApp::getRequiredExtensions() const { + return std::vector(); } -std::vector VulkanBaseApp::getRequiredDeviceExtensions() const -{ - return std::vector(); +std::vector VulkanBaseApp::getRequiredDeviceExtensions() const { + return std::vector(); } -void VulkanBaseApp::initVulkan() -{ - createInstance(); - createSurface(); - createDevice(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createDescriptorSetLayout(); - createGraphicsPipeline(); - createCommandPool(); - createDepthResources(); - createFramebuffers(); - initVulkanApp(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); - createSyncObjects(); +void VulkanBaseApp::initVulkan() { + createInstance(); + createSurface(); + createDevice(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createCommandPool(); + createDepthResources(); + createFramebuffers(); + initVulkanApp(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + createSyncObjects(); } #ifdef _WIN64 -class WindowsSecurityAttributes -{ -protected: - SECURITY_ATTRIBUTES m_winSecurityAttributes; - PSECURITY_DESCRIPTOR 
m_winPSecurityDescriptor; +class WindowsSecurityAttributes { + protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; -public: - WindowsSecurityAttributes(); - SECURITY_ATTRIBUTES *operator&(); - ~WindowsSecurityAttributes(); + public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES *operator&(); + ~WindowsSecurityAttributes(); }; -WindowsSecurityAttributes::WindowsSecurityAttributes() -{ - m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); - if (!m_winPSecurityDescriptor) { - throw std::runtime_error("Failed to allocate memory for security descriptor"); - } +WindowsSecurityAttributes::WindowsSecurityAttributes() { + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( + 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); + if (!m_winPSecurityDescriptor) { + throw std::runtime_error( + "Failed to allocate memory for security descriptor"); + } - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); + InitializeSecurityDescriptor(m_winPSecurityDescriptor, + SECURITY_DESCRIPTOR_REVISION); - SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; - AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = + SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, + 0, 0, 0, 0, 0, ppSID); - EXPLICIT_ACCESS explicitAccess; - ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); - explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; - explicitAccess.grfAccessMode = SET_ACCESS; - explicitAccess.grfInheritance = INHERIT_ONLY; - explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; - explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; - explicitAccess.Trustee.ptstrName = (LPTSTR) * ppSID; + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = + STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; - SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); - SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); - m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); - m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; - m_winSecurityAttributes.bInheritHandle = TRUE; + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; } -SECURITY_ATTRIBUTES * -WindowsSecurityAttributes::operator&() -{ - return &m_winSecurityAttributes; +SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { + return 
&m_winSecurityAttributes; } -WindowsSecurityAttributes::~WindowsSecurityAttributes() -{ - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); +WindowsSecurityAttributes::~WindowsSecurityAttributes() { + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - if (*ppSID) { - FreeSid(*ppSID); - } - if (*ppACL) { - LocalFree(*ppACL); - } - free(m_winPSecurityDescriptor); + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); } #endif /* _WIN64 */ - -static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, const std::vector& candidates, VkImageTiling tiling, VkFormatFeatureFlags features) -{ - for (VkFormat format : candidates) { - VkFormatProperties props; - vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); - if (tiling == VK_IMAGE_TILING_LINEAR && (props.linearTilingFeatures & features) == features) { - return format; - } - else if (tiling == VK_IMAGE_TILING_OPTIMAL && (props.optimalTilingFeatures & features) == features) { - return format; - } +static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, + const std::vector &candidates, + VkImageTiling tiling, + VkFormatFeatureFlags features) { + for (VkFormat format : candidates) { + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); + if (tiling == VK_IMAGE_TILING_LINEAR && + (props.linearTilingFeatures & features) == features) { + return format; + } else if (tiling == VK_IMAGE_TILING_OPTIMAL && + (props.optimalTilingFeatures & features) == features) { + return format; } - throw std::runtime_error("Failed to find supported format!"); + } + throw std::runtime_error("Failed to find supported format!"); } -static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, uint32_t typeFilter, VkMemoryPropertyFlags properties) -{ - VkPhysicalDeviceMemoryProperties memProperties; - vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); - for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { - if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { - return i; - } +static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, + uint32_t typeFilter, + VkMemoryPropertyFlags properties) { + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if (typeFilter & (1 << i) && + (memProperties.memoryTypes[i].propertyFlags & properties) == + properties) { + return i; } - return ~0; + } + return ~0; } -static bool supportsValidationLayers() -{ - std::vector availableLayers; - uint32_t layerCount; +static bool supportsValidationLayers() { + std::vector availableLayers; + uint32_t layerCount; - vkEnumerateInstanceLayerProperties(&layerCount, nullptr); - availableLayers.resize(layerCount); - vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + availableLayers.resize(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); - for (const char * layerName : validationLayers) { - bool layerFound = false; + for (const char *layerName : validationLayers) { + bool layerFound = false; - for (const auto & layerProperties : 
availableLayers) { - if (strcmp(layerName, layerProperties.layerName) == 0) { - layerFound = true; - break; - } - } - - if (!layerFound) { - return false; - } + for (const auto &layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } } - return true; + if (!layerFound) { + return false; + } + } + + return true; } -void VulkanBaseApp::createInstance() -{ - if (m_enableValidation && !supportsValidationLayers()) { - throw std::runtime_error("Validation requested, but not supported!"); - } +void VulkanBaseApp::createInstance() { + if (m_enableValidation && !supportsValidationLayers()) { + throw std::runtime_error("Validation requested, but not supported!"); + } - VkApplicationInfo appInfo = {}; - appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - appInfo.pApplicationName = m_appName.c_str(); - appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.pEngineName = "No Engine"; - appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.apiVersion = VK_API_VERSION_1_0; + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = m_appName.c_str(); + appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_2; - VkInstanceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - createInfo.pApplicationInfo = &appInfo; + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; - std::vector exts = getRequiredExtensions(); + std::vector exts = getRequiredExtensions(); - { - uint32_t glfwExtensionCount = 0; - const char **glfwExtensions; + { + uint32_t glfwExtensionCount = 0; + const char **glfwExtensions; - glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); - exts.insert(exts.begin(), glfwExtensions, glfwExtensions + glfwExtensionCount); - - if (m_enableValidation) { - exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); - } - } - - createInfo.enabledExtensionCount = static_cast(exts.size()); - createInfo.ppEnabledExtensionNames = exts.data(); - VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; - if (m_enableValidation) { - createInfo.enabledLayerCount = static_cast(countof(validationLayers)); - createInfo.ppEnabledLayerNames = validationLayers; - - debugCreateInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; - debugCreateInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; - debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; - debugCreateInfo.pfnUserCallback = debugCallback; - - createInfo.pNext = &debugCreateInfo; - } - else { - createInfo.enabledLayerCount = 0; - createInfo.pNext = nullptr; - } - - if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { - throw std::runtime_error("Failed to create Vulkan instance!"); - } + exts.insert(exts.begin(), glfwExtensions, + glfwExtensions + glfwExtensionCount); if (m_enableValidation) { - PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, 
"vkCreateDebugUtilsMessengerEXT"); - if (func == nullptr || func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != VK_SUCCESS) { - throw std::runtime_error("Failed to set up debug messenger!"); - } + exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); } + } + + createInfo.enabledExtensionCount = static_cast(exts.size()); + createInfo.ppEnabledExtensionNames = exts.data(); + VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; + if (m_enableValidation) { + createInfo.enabledLayerCount = + static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + + debugCreateInfo.sType = + VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + debugCreateInfo.messageSeverity = + VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + debugCreateInfo.messageType = + VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + debugCreateInfo.pfnUserCallback = debugCallback; + + createInfo.pNext = &debugCreateInfo; + } else { + createInfo.enabledLayerCount = 0; + createInfo.pNext = nullptr; + } + + if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { + throw std::runtime_error("Failed to create Vulkan instance!"); + } + + if (m_enableValidation) { + PFN_vkCreateDebugUtilsMessengerEXT func = + (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr( + m_instance, "vkCreateDebugUtilsMessengerEXT"); + if (func == nullptr || + func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != + VK_SUCCESS) { + throw std::runtime_error("Failed to set up debug messenger!"); + } + } } -void VulkanBaseApp::createSurface() -{ - if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != VK_SUCCESS) { - throw std::runtime_error("failed to create window surface!"); - } +void VulkanBaseApp::createSurface() { + if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != + VK_SUCCESS) { + throw std::runtime_error("failed to create window surface!"); + } } -static bool findGraphicsQueueIndicies(VkPhysicalDevice device, VkSurfaceKHR surface, uint32_t& graphicsFamily, uint32_t& presentFamily) -{ - uint32_t queueFamilyCount = 0; +static bool findGraphicsQueueIndicies(VkPhysicalDevice device, + VkSurfaceKHR surface, + uint32_t &graphicsFamily, + uint32_t &presentFamily) { + uint32_t queueFamilyCount = 0; - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); - std::vector queueFamilies(queueFamilyCount); - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); + std::vector queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, + queueFamilies.data()); - graphicsFamily = presentFamily = ~0; + graphicsFamily = presentFamily = ~0; - for (uint32_t i = 0; i < queueFamilyCount; i++) { - - if (queueFamilies[i].queueCount > 0) { - if (graphicsFamily == ~0 && queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { - graphicsFamily = i; - } - uint32_t presentSupport = 0; - vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); - if (presentFamily == ~0 && presentSupport) { - presentFamily = i; - } - if (presentFamily != ~0 && graphicsFamily != ~0) { - break; - } - } + for (uint32_t i = 0; i < queueFamilyCount; i++) { + if 
(queueFamilies[i].queueCount > 0) { + if (graphicsFamily == ~0 && + queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + graphicsFamily = i; + } + uint32_t presentSupport = 0; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + if (presentFamily == ~0 && presentSupport) { + presentFamily = i; + } + if (presentFamily != ~0 && graphicsFamily != ~0) { + break; + } } + } - return graphicsFamily != ~0 && presentFamily != ~0; + return graphicsFamily != ~0 && presentFamily != ~0; } -static bool hasAllExtensions(VkPhysicalDevice device, const std::vector& deviceExtensions) -{ - uint32_t extensionCount; - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); - std::vector availableExtensions(extensionCount); - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); +static bool hasAllExtensions( + VkPhysicalDevice device, + const std::vector &deviceExtensions) { + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, + nullptr); + std::vector availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, + availableExtensions.data()); - std::set requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); + std::set requiredExtensions(deviceExtensions.begin(), + deviceExtensions.end()); - for (const auto & extension : availableExtensions) { - requiredExtensions.erase(extension.extensionName); - } + for (const auto &extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); + } - return requiredExtensions.empty(); + return requiredExtensions.empty(); } -static void getSwapChainProperties(VkPhysicalDevice device, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR& capabilities, std::vector& formats, std::vector& presentModes) -{ - vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); - uint32_t formatCount; - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); - if (formatCount != 0) { - formats.resize(formatCount); - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, formats.data()); - } - uint32_t presentModeCount; - vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); - if (presentModeCount != 0) { - presentModes.resize(presentModeCount); - vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, presentModes.data()); - } +static void getSwapChainProperties( + VkPhysicalDevice device, VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR &capabilities, + std::vector &formats, + std::vector &presentModes) { + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + if (formatCount != 0) { + formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, + formats.data()); + } + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, + nullptr); + if (presentModeCount != 0) { + presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR( + device, surface, &presentModeCount, presentModes.data()); + } } -bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const -{ - uint32_t graphicsQueueIndex, presentQueueIndex; - std::vector deviceExtensions = getRequiredDeviceExtensions(); - VkSurfaceCapabilitiesKHR caps; +bool 
VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const { + uint32_t graphicsQueueIndex, presentQueueIndex; + std::vector deviceExtensions = getRequiredDeviceExtensions(); + VkSurfaceCapabilitiesKHR caps; + std::vector formats; + std::vector presentModes; + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + getSwapChainProperties(dev, m_surface, caps, formats, presentModes); + return hasAllExtensions(dev, deviceExtensions) && !formats.empty() && + !presentModes.empty() && + findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, + presentQueueIndex); +} + +void VulkanBaseApp::createDevice() { + { + uint32_t deviceCount = 0; + vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); + if (deviceCount == 0) { + throw std::runtime_error("Failed to find Vulkan capable GPUs!"); + } + std::vector phyDevs(deviceCount); + vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); + std::vector::iterator it = + std::find_if(phyDevs.begin(), phyDevs.end(), + std::bind(&VulkanBaseApp::isSuitableDevice, this, + std::placeholders::_1)); + if (it == phyDevs.end()) { + throw std::runtime_error("No suitable device found!"); + } + m_physicalDevice = *it; + } + + uint32_t graphicsQueueIndex, presentQueueIndex; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, + presentQueueIndex); + + std::vector queueCreateInfos; + std::set uniqueFamilyIndices = {graphicsQueueIndex, + presentQueueIndex}; + + float queuePriority = 1.0f; + + for (uint32_t queueFamily : uniqueFamilyIndices) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = queueFamily; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + deviceFeatures.fillModeNonSolid = true; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = + static_cast(queueCreateInfos.size()); + + createInfo.pEnabledFeatures = &deviceFeatures; + + std::vector deviceExtensions = getRequiredDeviceExtensions(); + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + + createInfo.enabledExtensionCount = + static_cast(deviceExtensions.size()); + createInfo.ppEnabledExtensionNames = deviceExtensions.data(); + + if (m_enableValidation) { + createInfo.enabledLayerCount = + static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + } else { + createInfo.enabledLayerCount = 0; + } + + if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != + VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); + } + + vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); + vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); + + VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; + vkPhysicalDeviceIDProperties.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + vkPhysicalDeviceIDProperties.pNext = NULL; + + VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; + vkPhysicalDeviceProperties2.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; + + PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; + fpGetPhysicalDeviceProperties2 = + 
(PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr( + m_instance, "vkGetPhysicalDeviceProperties2"); + if (fpGetPhysicalDeviceProperties2 == NULL) { + throw std::runtime_error( + "Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not " + "found.\n"); + } + + fpGetPhysicalDeviceProperties2(m_physicalDevice, + &vkPhysicalDeviceProperties2); + + memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, VK_UUID_SIZE); +} + +static VkSurfaceFormatKHR chooseSwapSurfaceFormat( + const std::vector &availableFormats) { + if (availableFormats.size() == 1 && + availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; + } + + for (const auto &availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && + availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } + + return availableFormats[0]; +} + +static VkPresentModeKHR chooseSwapPresentMode( + const std::vector &availablePresentModes) { + VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto &availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } + } + + return bestMode; +} + +static VkExtent2D chooseSwapExtent( + GLFWwindow *window, const VkSurfaceCapabilitiesKHR &capabilities) { + if (capabilities.currentExtent.width != + std::numeric_limits::max()) { + return capabilities.currentExtent; + } else { + int width, height; + glfwGetFramebufferSize(window, &width, &height); + VkExtent2D actualExtent = {static_cast(width), + static_cast(height)}; + + actualExtent.width = std::max( + capabilities.minImageExtent.width, + std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max( + capabilities.minImageExtent.height, + std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } +} + +void VulkanBaseApp::createSwapChain() { + VkSurfaceCapabilitiesKHR capabilities; + VkSurfaceFormatKHR format; + VkPresentModeKHR presentMode; + VkExtent2D extent; + uint32_t imageCount; + + { std::vector formats; std::vector presentModes; - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - getSwapChainProperties(dev, m_surface, caps, formats, presentModes); - return hasAllExtensions(dev, deviceExtensions) - && !formats.empty() && !presentModes.empty() - && findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, presentQueueIndex); + + getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, + presentModes); + format = chooseSwapSurfaceFormat(formats); + presentMode = chooseSwapPresentMode(presentModes); + extent = chooseSwapExtent(m_window, capabilities); + imageCount = capabilities.minImageCount + 1; + if (capabilities.maxImageCount > 0 && + imageCount > capabilities.maxImageCount) { + imageCount = capabilities.maxImageCount; + } + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = m_surface; + + createInfo.minImageCount = imageCount; + createInfo.imageFormat = format.format; + createInfo.imageColorSpace = format.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + uint32_t queueFamilyIndices[2]; + 
findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], + queueFamilyIndices[1]); + + if (queueFamilyIndices[0] != queueFamilyIndices[1]) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + } + + createInfo.preTransform = capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + + createInfo.oldSwapchain = VK_NULL_HANDLE; + + if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != + VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } + + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); + m_swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, + m_swapChainImages.data()); + + m_swapChainFormat = format.format; + m_swapChainExtent = extent; } -void VulkanBaseApp::createDevice() -{ - { - uint32_t deviceCount = 0; - vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); - if (deviceCount == 0) { - throw std::runtime_error("Failed to find Vulkan capable GPUs!"); - } - std::vector phyDevs(deviceCount); - vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); - std::vector::iterator it = std::find_if(phyDevs.begin(), phyDevs.end(), - std::bind(&VulkanBaseApp::isSuitableDevice, this, std::placeholders::_1)); - if (it == phyDevs.end()) { - throw std::runtime_error("No suitable device found!"); - } - m_physicalDevice = *it; - } +static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, + VkImageAspectFlags aspectFlags) { + VkImageView imageView; + VkImageViewCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + createInfo.image = image; + createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + createInfo.format = format; + createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.subresourceRange.aspectMask = aspectFlags; + createInfo.subresourceRange.baseMipLevel = 0; + createInfo.subresourceRange.levelCount = 1; + createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image views!"); + } - uint32_t graphicsQueueIndex, presentQueueIndex; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, presentQueueIndex); - - std::vector queueCreateInfos; - std::set uniqueFamilyIndices = { graphicsQueueIndex, presentQueueIndex }; - - float queuePriority = 1.0f; - - for (uint32_t queueFamily : uniqueFamilyIndices) { - VkDeviceQueueCreateInfo queueCreateInfo = {}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueCreateInfo.queueFamilyIndex = graphicsQueueIndex; - queueCreateInfo.queueCount = 1; - queueCreateInfo.pQueuePriorities = &queuePriority; - queueCreateInfos.push_back(queueCreateInfo); - } - - VkPhysicalDeviceFeatures deviceFeatures = {}; - deviceFeatures.fillModeNonSolid = true; - - VkDeviceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - - 
createInfo.pQueueCreateInfos = queueCreateInfos.data(); - createInfo.queueCreateInfoCount = static_cast(queueCreateInfos.size()); - - createInfo.pEnabledFeatures = &deviceFeatures; - - std::vector deviceExtensions = getRequiredDeviceExtensions(); - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - - createInfo.enabledExtensionCount = static_cast(deviceExtensions.size()); - createInfo.ppEnabledExtensionNames = deviceExtensions.data(); - - if (m_enableValidation) { - createInfo.enabledLayerCount = static_cast(countof(validationLayers)); - createInfo.ppEnabledLayerNames = validationLayers; - } - else { - createInfo.enabledLayerCount = 0; - } - - if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != VK_SUCCESS) { - throw std::runtime_error("failed to create logical device!"); - } - - vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); - vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); - - VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; - vkPhysicalDeviceIDProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; - vkPhysicalDeviceIDProperties.pNext = NULL; - - VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; - vkPhysicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; - - PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; - fpGetPhysicalDeviceProperties2 = (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr(m_instance, "vkGetPhysicalDeviceProperties2"); - if (fpGetPhysicalDeviceProperties2 == NULL) { - throw std::runtime_error("Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not found.\n"); - } - - fpGetPhysicalDeviceProperties2(m_physicalDevice, &vkPhysicalDeviceProperties2); - - memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, VK_UUID_SIZE); + return imageView; } -static VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector& availableFormats) -{ - if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) { - return { VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR }; - } +static void createImage(VkPhysicalDevice physicalDevice, VkDevice device, + uint32_t width, uint32_t height, VkFormat format, + VkImageTiling tiling, VkImageUsageFlags usage, + VkMemoryPropertyFlags properties, VkImage &image, + VkDeviceMemory &imageMemory) { + VkImageCreateInfo imageInfo = {}; + imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + imageInfo.imageType = VK_IMAGE_TYPE_2D; + imageInfo.extent.width = width; + imageInfo.extent.height = height; + imageInfo.extent.depth = 1; + imageInfo.mipLevels = 1; + imageInfo.arrayLayers = 1; + imageInfo.format = format; + imageInfo.tiling = tiling; + imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageInfo.usage = usage; + imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - for (const auto & availableFormat : availableFormats) { - if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { - return availableFormat; - } - } + if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { + throw std::runtime_error("failed to create image!"); + } - return availableFormats[0]; + VkMemoryRequirements memRequirements; + vkGetImageMemoryRequirements(device, image, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = 
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType( + physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate image memory!"); + } + + vkBindImageMemory(device, image, imageMemory, 0); } -static VkPresentModeKHR chooseSwapPresentMode(const std::vector& availablePresentModes) -{ - VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; +void VulkanBaseApp::createImageViews() { + m_swapChainImageViews.resize(m_swapChainImages.size()); - for (const auto & availablePresentMode : availablePresentModes) { - if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { - return availablePresentMode; - } - else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { - bestMode = availablePresentMode; - } - } - - return bestMode; + for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { + m_swapChainImageViews[i] = + createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, + VK_IMAGE_ASPECT_COLOR_BIT); + } } -static VkExtent2D chooseSwapExtent(GLFWwindow *window, const VkSurfaceCapabilitiesKHR& capabilities) -{ - if (capabilities.currentExtent.width != std::numeric_limits::max()) { - return capabilities.currentExtent; - } - else { - int width, height; - glfwGetFramebufferSize(window, &width, &height); - VkExtent2D actualExtent = { static_cast(width), static_cast(height) }; +void VulkanBaseApp::createRenderPass() { + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = m_swapChainFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - actualExtent.width = std::max(capabilities.minImageExtent.width, std::min(capabilities.maxImageExtent.width, actualExtent.width)); - actualExtent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, actualExtent.height)); + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - return actualExtent; - } + VkAttachmentDescription depthAttachment = {}; + depthAttachment.format = findSupportedFormat( + m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + depthAttachment.finalLayout = + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkAttachmentReference depthAttachmentRef = {}; + depthAttachmentRef.attachment = 1; + depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + 
subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + subpass.pDepthStencilAttachment = &depthAttachmentRef; + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + + VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = countof(attachments); + renderPassInfo.pAttachments = attachments; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != + VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } } -void VulkanBaseApp::createSwapChain() -{ - VkSurfaceCapabilitiesKHR capabilities; - VkSurfaceFormatKHR format; - VkPresentModeKHR presentMode; - VkExtent2D extent; - uint32_t imageCount; +void VulkanBaseApp::createDescriptorSetLayout() { + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.pImmutableSamplers = nullptr; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - { - std::vector formats; - std::vector presentModes; + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &uboLayoutBinding; - getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, presentModes); - format = chooseSwapSurfaceFormat(formats); - presentMode = chooseSwapPresentMode(presentModes); - extent = chooseSwapExtent(m_window, capabilities); - imageCount = capabilities.minImageCount + 1; - if (capabilities.maxImageCount > 0 && imageCount > capabilities.maxImageCount) { - imageCount = capabilities.maxImageCount; - } - } - - VkSwapchainCreateInfoKHR createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - createInfo.surface = m_surface; - - createInfo.minImageCount = imageCount; - createInfo.imageFormat = format.format; - createInfo.imageColorSpace = format.colorSpace; - createInfo.imageExtent = extent; - createInfo.imageArrayLayers = 1; - createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - - uint32_t queueFamilyIndices[2]; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], queueFamilyIndices[1]); - - if (queueFamilyIndices[0] != queueFamilyIndices[1]) { - createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; - createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); - createInfo.pQueueFamilyIndices = queueFamilyIndices; - } - else { - createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - } - - createInfo.preTransform = capabilities.currentTransform; - createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - createInfo.presentMode = presentMode; - createInfo.clipped = VK_TRUE; - - createInfo.oldSwapchain = VK_NULL_HANDLE; - - if (vkCreateSwapchainKHR(m_device, 
&createInfo, nullptr, &m_swapChain) != VK_SUCCESS) { - throw std::runtime_error("failed to create swap chain!"); - } - - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); - m_swapChainImages.resize(imageCount); - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, m_swapChainImages.data()); - - m_swapChainFormat = format.format; - m_swapChainExtent = extent; + if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, + &m_descriptorSetLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor set layout!"); + } } -static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, VkImageAspectFlags aspectFlags) -{ - VkImageView imageView; - VkImageViewCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - createInfo.image = image; - createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; - createInfo.format = format; - createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.subresourceRange.aspectMask = aspectFlags; - createInfo.subresourceRange.baseMipLevel = 0; - createInfo.subresourceRange.levelCount = 1; - createInfo.subresourceRange.baseArrayLayer = 0; - createInfo.subresourceRange.layerCount = 1; - if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image views!"); - } +VkShaderModule createShaderModule(VkDevice device, const char *filename) { + std::vector shaderContents; + std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); + VkShaderModuleCreateInfo createInfo = {}; + VkShaderModule shaderModule; - return imageView; + if (!shaderFile.good()) { + throw std::runtime_error("Failed to load shader contents"); + } + readFile(shaderFile, shaderContents); + + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = shaderContents.size(); + createInfo.pCode = reinterpret_cast(shaderContents.data()); + + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != + VK_SUCCESS) { + throw std::runtime_error("Failed to create shader module!"); + } + + return shaderModule; } -static void createImage(VkPhysicalDevice physicalDevice, VkDevice device, uint32_t width, uint32_t height, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage, VkMemoryPropertyFlags properties, VkImage& image, VkDeviceMemory& imageMemory) -{ - VkImageCreateInfo imageInfo = {}; - imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - imageInfo.imageType = VK_IMAGE_TYPE_2D; - imageInfo.extent.width = width; - imageInfo.extent.height = height; - imageInfo.extent.depth = 1; - imageInfo.mipLevels = 1; - imageInfo.arrayLayers = 1; - imageInfo.format = format; - imageInfo.tiling = tiling; - imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageInfo.usage = usage; - imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; - imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; +void VulkanBaseApp::getVertexDescriptions( + std::vector &bindingDesc, + std::vector &attribDesc) {} - if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { - throw std::runtime_error("failed to create image!"); - } +void VulkanBaseApp::getAssemblyStateInfo( + VkPipelineInputAssemblyStateCreateInfo &info) {} - VkMemoryRequirements memRequirements; - vkGetImageMemoryRequirements(device, image, &memRequirements); 
+void VulkanBaseApp::createGraphicsPipeline() { + std::vector shaderStageInfos( + m_shaderFiles.size()); + for (size_t i = 0; i < m_shaderFiles.size(); i++) { + shaderStageInfos[i] = {}; + shaderStageInfos[i].sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shaderStageInfos[i].stage = m_shaderFiles[i].first; + shaderStageInfos[i].module = + createShaderModule(m_device, m_shaderFiles[i].second.c_str()); + shaderStageInfos[i].pName = "main"; + } - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType(physicalDevice, memRequirements.memoryTypeBits, properties); + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate image memory!"); - } + std::vector vertexBindingDescriptions; + std::vector vertexAttributeDescriptions; - vkBindImageMemory(device, image, imageMemory, 0); + getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); + + vertexInputInfo.sType = + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertexInputInfo.vertexBindingDescriptionCount = + static_cast(vertexBindingDescriptions.size()); + vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); + vertexInputInfo.vertexAttributeDescriptionCount = + static_cast(vertexAttributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = + vertexAttributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + getAssemblyStateInfo(inputAssembly); + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)m_swapChainExtent.width; + viewport.height = (float)m_swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset = {0, 0}; + scissor.extent = m_swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_LINE; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_NONE; + rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineDepthStencilStateCreateInfo depthStencil = {}; + depthStencil.sType = + VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depthStencil.depthTestEnable = VK_TRUE; + depthStencil.depthWriteEnable = VK_TRUE; + depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; + 
depthStencil.depthBoundsTestEnable = VK_FALSE; + depthStencil.stencilTestEnable = VK_FALSE; + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; + colorBlending.blendConstants[1] = 0.0f; + colorBlending.blendConstants[2] = 0.0f; + colorBlending.blendConstants[3] = 0.0f; + + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, + &m_pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); + pipelineInfo.pStages = shaderStageInfos.data(); + + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = &depthStencil; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + + pipelineInfo.layout = m_pipelineLayout; + + pipelineInfo.renderPass = m_renderPass; + pipelineInfo.subpass = 0; + + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, + nullptr, &m_graphicsPipeline) != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + + for (size_t i = 0; i < shaderStageInfos.size(); i++) { + vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); + } } -void VulkanBaseApp::createImageViews() -{ - m_swapChainImageViews.resize(m_swapChainImages.size()); +void VulkanBaseApp::createFramebuffers() { + m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + VkImageView attachments[] = {m_swapChainImageViews[i], m_depthImageView}; - for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { - m_swapChainImageViews[i] = createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, VK_IMAGE_ASPECT_COLOR_BIT); + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = m_renderPass; + framebufferInfo.attachmentCount = countof(attachments); + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = m_swapChainExtent.width; + framebufferInfo.height = m_swapChainExtent.height; + 
framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, + &m_swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); } + } } -void VulkanBaseApp::createRenderPass() -{ - VkAttachmentDescription colorAttachment = {}; - colorAttachment.format = m_swapChainFormat; - colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; +void VulkanBaseApp::createCommandPool() { + VkCommandPoolCreateInfo poolInfo = {}; + uint32_t graphicsIndex, presentIndex; - VkAttachmentReference colorAttachmentRef = {}; - colorAttachmentRef.attachment = 0; - colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, + presentIndex); - VkAttachmentDescription depthAttachment = {}; - depthAttachment.format = findSupportedFormat(m_physicalDevice, - { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - depthAttachment.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = graphicsIndex; + poolInfo.flags = 0; // Optional - VkAttachmentReference depthAttachmentRef = {}; - depthAttachmentRef.attachment = 1; - depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass = {}; - subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass.colorAttachmentCount = 1; - subpass.pColorAttachments = &colorAttachmentRef; - subpass.pDepthStencilAttachment = &depthAttachmentRef; - - - VkSubpassDependency dependency = {}; - dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - - VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; - VkRenderPassCreateInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderPassInfo.attachmentCount = countof(attachments); - renderPassInfo.pAttachments = attachments; - renderPassInfo.subpassCount = 1; - renderPassInfo.pSubpasses = &subpass; - renderPassInfo.dependencyCount = 1; - renderPassInfo.pDependencies = &dependency; - - if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != VK_SUCCESS) { - throw std::runtime_error("failed to create render pass!"); - } + if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != + VK_SUCCESS) { + throw 
std::runtime_error("Failed to create command pool!"); + } } -void VulkanBaseApp::createDescriptorSetLayout() -{ - VkDescriptorSetLayoutBinding uboLayoutBinding = {}; - uboLayoutBinding.binding = 0; - uboLayoutBinding.descriptorCount = 1; - uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - uboLayoutBinding.pImmutableSamplers = nullptr; - uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; +static void transitionImageLayout(VulkanBaseApp *app, VkImage image, + VkFormat format, VkImageLayout oldLayout, + VkImageLayout newLayout) { + VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); - VkDescriptorSetLayoutCreateInfo layoutInfo = {}; - layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - layoutInfo.bindingCount = 1; - layoutInfo.pBindings = &uboLayoutBinding; + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; - if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, &m_descriptorSetLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor set layout!"); + if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || + format == VK_FORMAT_D24_UNORM_S8_UINT) { + barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; } + } else { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + } + + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags sourceStage; + VkPipelineStageFlags destinationStage; + + if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && + newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + } else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && + newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + } else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && + newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; + } else { + throw std::invalid_argument("unsupported layout transition!"); + } + + vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, + nullptr, 0, nullptr, 1, &barrier); + + app->endSingleTimeCommands(commandBuffer); } -VkShaderModule createShaderModule(VkDevice device, const char *filename) -{ - std::vector shaderContents; - std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); - VkShaderModuleCreateInfo createInfo = {}; - VkShaderModule shaderModule; - - if (!shaderFile.good()) { - throw std::runtime_error("Failed to 
load shader contents"); - } - readFile(shaderFile, shaderContents); - - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.codeSize = shaderContents.size(); - createInfo.pCode = reinterpret_cast(shaderContents.data()); - - if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { - throw std::runtime_error("Failed to create shader module!"); - } - - return shaderModule; +void VulkanBaseApp::createDepthResources() { + VkFormat depthFormat = findSupportedFormat( + m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + createImage(m_physicalDevice, m_device, m_swapChainExtent.width, + m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, + m_depthImageMemory); + m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, + VK_IMAGE_ASPECT_DEPTH_BIT); + transitionImageLayout(this, m_depthImage, depthFormat, + VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); } -void VulkanBaseApp::getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc) -{ -} - -void VulkanBaseApp::getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) -{ - -} - -void VulkanBaseApp::createGraphicsPipeline() -{ - std::vector shaderStageInfos(m_shaderFiles.size()); - for (size_t i = 0; i < m_shaderFiles.size(); i++) { - shaderStageInfos[i] = {}; - shaderStageInfos[i].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shaderStageInfos[i].stage = m_shaderFiles[i].first; - shaderStageInfos[i].module = createShaderModule(m_device, m_shaderFiles[i].second.c_str()); - shaderStageInfos[i].pName = "main"; - } - - VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - - std::vector vertexBindingDescriptions; - std::vector vertexAttributeDescriptions; - - getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); - - vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertexInputInfo.vertexBindingDescriptionCount = static_cast(vertexBindingDescriptions.size()); - vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); - vertexInputInfo.vertexAttributeDescriptionCount = static_cast(vertexAttributeDescriptions.size()); - vertexInputInfo.pVertexAttributeDescriptions = vertexAttributeDescriptions.data(); - - VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; - getAssemblyStateInfo(inputAssembly); - - VkViewport viewport = {}; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = (float)m_swapChainExtent.width; - viewport.height = (float)m_swapChainExtent.height; - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - - VkRect2D scissor = {}; - scissor.offset = { 0, 0 }; - scissor.extent = m_swapChainExtent; - - VkPipelineViewportStateCreateInfo viewportState = {}; - viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewportState.viewportCount = 1; - viewportState.pViewports = &viewport; - viewportState.scissorCount = 1; - viewportState.pScissors = &scissor; - - VkPipelineRasterizationStateCreateInfo rasterizer = {}; - rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterizer.depthClampEnable = VK_FALSE; - rasterizer.rasterizerDiscardEnable = VK_FALSE; - rasterizer.polygonMode = VK_POLYGON_MODE_LINE; - 
rasterizer.lineWidth = 1.0f; - rasterizer.cullMode = VK_CULL_MODE_NONE; - rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; - rasterizer.depthBiasEnable = VK_FALSE; - - VkPipelineMultisampleStateCreateInfo multisampling = {}; - multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling.sampleShadingEnable = VK_FALSE; - multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisampling.minSampleShading = 1.0f; // Optional - multisampling.pSampleMask = nullptr; // Optional - multisampling.alphaToCoverageEnable = VK_FALSE; // Optional - multisampling.alphaToOneEnable = VK_FALSE; // Optional - - VkPipelineDepthStencilStateCreateInfo depthStencil = {}; - depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depthStencil.depthTestEnable = VK_TRUE; - depthStencil.depthWriteEnable = VK_TRUE; - depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; - depthStencil.depthBoundsTestEnable = VK_FALSE; - depthStencil.stencilTestEnable = VK_FALSE; - - VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; - colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - colorBlendAttachment.blendEnable = VK_FALSE; - - VkPipelineColorBlendStateCreateInfo colorBlending = {}; - colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - colorBlending.logicOpEnable = VK_FALSE; - colorBlending.logicOp = VK_LOGIC_OP_COPY; - colorBlending.attachmentCount = 1; - colorBlending.pAttachments = &colorBlendAttachment; - colorBlending.blendConstants[0] = 0.0f; - colorBlending.blendConstants[1] = 0.0f; - colorBlending.blendConstants[2] = 0.0f; - colorBlending.blendConstants[3] = 0.0f; - - VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; - pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutInfo.setLayoutCount = 1; // Optional - pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional - pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional - pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional - - if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, &m_pipelineLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create pipeline layout!"); - } - - VkGraphicsPipelineCreateInfo pipelineInfo = {}; - pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); - pipelineInfo.pStages = shaderStageInfos.data(); - - pipelineInfo.pVertexInputState = &vertexInputInfo; - pipelineInfo.pInputAssemblyState = &inputAssembly; - pipelineInfo.pViewportState = &viewportState; - pipelineInfo.pRasterizationState = &rasterizer; - pipelineInfo.pMultisampleState = &multisampling; - pipelineInfo.pDepthStencilState = &depthStencil; // Optional - pipelineInfo.pColorBlendState = &colorBlending; - pipelineInfo.pDynamicState = nullptr; // Optional - - pipelineInfo.layout = m_pipelineLayout; - - pipelineInfo.renderPass = m_renderPass; - pipelineInfo.subpass = 0; - - pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional - pipelineInfo.basePipelineIndex = -1; // Optional - - if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &m_graphicsPipeline) != VK_SUCCESS) { - throw std::runtime_error("failed to create graphics pipeline!"); - } - - for (size_t i = 0; i < shaderStageInfos.size(); i++) { - vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); - } -} - 
-void VulkanBaseApp::createFramebuffers() -{ - m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); - for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { - VkImageView attachments[] = { - m_swapChainImageViews[i], - m_depthImageView - }; - - VkFramebufferCreateInfo framebufferInfo = {}; - framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebufferInfo.renderPass = m_renderPass; - framebufferInfo.attachmentCount = countof(attachments); - framebufferInfo.pAttachments = attachments; - framebufferInfo.width = m_swapChainExtent.width; - framebufferInfo.height = m_swapChainExtent.height; - framebufferInfo.layers = 1; - - if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, &m_swapChainFramebuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to create framebuffer!"); - } - } -} - -void VulkanBaseApp::createCommandPool() -{ - VkCommandPoolCreateInfo poolInfo = {}; - uint32_t graphicsIndex, presentIndex; - - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, presentIndex); - - poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - poolInfo.queueFamilyIndex = graphicsIndex; - poolInfo.flags = 0; // Optional - - if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != VK_SUCCESS) { - throw std::runtime_error("Failed to create command pool!"); - } -} - -static void transitionImageLayout(VulkanBaseApp *app, VkImage image, VkFormat format, VkImageLayout oldLayout, VkImageLayout newLayout) -{ - VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); - - VkImageMemoryBarrier barrier = {}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - - if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - - if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_D24_UNORM_S8_UINT) { - barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; - } - } - else { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - } - - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - - VkPipelineStageFlags sourceStage; - VkPipelineStageFlags destinationStage; - - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - - sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; - } - else 
{ - throw std::invalid_argument("unsupported layout transition!"); - } - - vkCmdPipelineBarrier( - commandBuffer, - sourceStage, destinationStage, - 0, - 0, nullptr, - 0, nullptr, - 1, &barrier - ); - - app->endSingleTimeCommands(commandBuffer); -} - -void VulkanBaseApp::createDepthResources() -{ - VkFormat depthFormat = findSupportedFormat(m_physicalDevice, - { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - createImage(m_physicalDevice, m_device, m_swapChainExtent.width, m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, m_depthImageMemory); - m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, VK_IMAGE_ASPECT_DEPTH_BIT); - transitionImageLayout(this, m_depthImage, depthFormat, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); -} - -void VulkanBaseApp::createUniformBuffers() -{ - VkDeviceSize size = getUniformSize(); - if (size > 0) { - m_uniformBuffers.resize(m_swapChainImages.size()); - m_uniformMemory.resize(m_swapChainImages.size()); - for (size_t i = 0; i < m_uniformBuffers.size(); i++) { - createBuffer(getUniformSize(), - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - m_uniformBuffers[i], m_uniformMemory[i]); - } - } -} - -void VulkanBaseApp::createDescriptorPool() -{ - VkDescriptorPoolSize poolSize = {}; - poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - poolSize.descriptorCount = static_cast(m_swapChainImages.size()); - VkDescriptorPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - poolInfo.poolSizeCount = 1; - poolInfo.pPoolSizes = &poolSize; - poolInfo.maxSets = static_cast(m_swapChainImages.size());; - if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor pool!"); - } -} - -void VulkanBaseApp::createDescriptorSets() -{ - std::vector layouts(m_swapChainImages.size(), m_descriptorSetLayout); - VkDescriptorSetAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocInfo.descriptorPool = m_descriptorPool; - allocInfo.descriptorSetCount = static_cast(m_swapChainImages.size()); - allocInfo.pSetLayouts = layouts.data(); - m_descriptorSets.resize(m_swapChainImages.size()); - - if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate descriptor sets!"); - } - - VkDescriptorBufferInfo bufferInfo = {}; - bufferInfo.offset = 0; - bufferInfo.range = VK_WHOLE_SIZE; - VkWriteDescriptorSet descriptorWrite = {}; - descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrite.dstBinding = 0; - descriptorWrite.dstArrayElement = 0; - descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - descriptorWrite.descriptorCount = 1; - descriptorWrite.pBufferInfo = &bufferInfo; - descriptorWrite.pImageInfo = nullptr; // Optional - descriptorWrite.pTexelBufferView = nullptr; // Optional - - for (size_t i = 0; i < m_swapChainImages.size(); i++) { - bufferInfo.buffer = m_uniformBuffers[i]; - descriptorWrite.dstSet = m_descriptorSets[i]; - vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); - } -} - -void VulkanBaseApp::createCommandBuffers() -{ - 
m_commandBuffers.resize(m_swapChainFramebuffers.size()); - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.commandPool = m_commandPool; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size(); - - if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate command buffers!"); - } - - for (size_t i = 0; i < m_commandBuffers.size(); i++) { - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - beginInfo.pInheritanceInfo = nullptr; // Optional - - if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { - throw std::runtime_error("failed to begin recording command buffer!"); - } - - VkRenderPassBeginInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderPassInfo.renderPass = m_renderPass; - renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; - - renderPassInfo.renderArea.offset = { 0, 0 }; - renderPassInfo.renderArea.extent = m_swapChainExtent; - - VkClearValue clearColors[2]; - clearColors[0].color = { 0.0f, 0.0f, 0.0f, 1.0f }; - clearColors[1].depthStencil = { 1.0f, 0 }; - renderPassInfo.clearValueCount = countof(clearColors); - renderPassInfo.pClearValues = clearColors; - - vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); - - vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_graphicsPipeline); - - vkCmdBindDescriptorSets(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout, 0, 1, &m_descriptorSets[i], 0, nullptr); - - fillRenderingCommandBuffer(m_commandBuffers[i]); - - vkCmdEndRenderPass(m_commandBuffers[i]); - - if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to record command buffer!"); - } - } -} - -void VulkanBaseApp::createSyncObjects() -{ - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - VkFenceCreateInfo fenceInfo = {}; - fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; - - m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); - m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); - m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); - - for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - } -} - -void VulkanBaseApp::getWaitFrameSemaphores(std::vector& wait, std::vector& waitStages) const -{ -} - -void VulkanBaseApp::getSignalFrameSemaphores(std::vector& signal) const -{ -} - -VkDeviceSize VulkanBaseApp::getUniformSize() const -{ - return VkDeviceSize(0); -} - -void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex) -{ -} - -void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags 
usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory) -{ - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); - - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate buffer memory!"); - } - - vkBindBufferMemory(m_device, buffer, bufferMemory, 0); -} - -void VulkanBaseApp::createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory) -{ - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; - vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; - vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; -#endif - VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; - vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; -#ifdef _WIN64 - vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR ? 
&vulkanExportMemoryWin32HandleInfoKHR : NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; -#else - vulkanExportMemoryAllocateInfoKHR.pNext = NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate external buffer memory!"); - } - - vkBindBufferMemory(m_device, buffer, bufferMemory, 0); -} - -void *VulkanBaseApp::getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) -{ -#ifdef _WIN64 - HANDLE handle = 0; - - VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; - vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; - vkMemoryGetWin32HandleInfoKHR.pNext = NULL; - vkMemoryGetWin32HandleInfoKHR.memory = memory; - vkMemoryGetWin32HandleInfoKHR.handleType = handleType; - - PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; - fpGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryWin32HandleKHR"); - if (!fpGetMemoryWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)handle; -#else - int fd = -1; - - VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; - vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; - vkMemoryGetFdInfoKHR.pNext = NULL; - vkMemoryGetFdInfoKHR.memory = memory; - vkMemoryGetFdInfoKHR.handleType = handleType; - - PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; - fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); - if (!fpGetMemoryFdKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)(uintptr_t)fd; -#endif /* _WIN64 */ -} - -void *VulkanBaseApp::getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) -{ -#ifdef _WIN64 - HANDLE handle; - - VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; - semaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; - semaphoreGetWin32HandleInfoKHR.pNext = NULL; - semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; - semaphoreGetWin32HandleInfoKHR.handleType = handleType; - - PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; - fpGetSemaphoreWin32HandleKHR = (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreWin32HandleKHR"); - if (!fpGetSemaphoreWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - - return (void *)handle; -#else - int fd; - - VkSemaphoreGetFdInfoKHR 
semaphoreGetFdInfoKHR = {}; - semaphoreGetFdInfoKHR.sType =VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; - semaphoreGetFdInfoKHR.pNext = NULL; - semaphoreGetFdInfoKHR.semaphore = semaphore; - semaphoreGetFdInfoKHR.handleType = handleType; - - PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; - fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreFdKHR"); - if (!fpGetSemaphoreFdKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - - return (void *)(uintptr_t)fd; -#endif -} - -void VulkanBaseApp::createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) -{ - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {}; - exportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {}; - exportSemaphoreWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; - exportSemaphoreWin32HandleInfoKHR.pNext = NULL; - exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - exportSemaphoreWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; - exportSemaphoreCreateInfo.pNext = (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) ? &exportSemaphoreWin32HandleInfoKHR : NULL; -#else - exportSemaphoreCreateInfo.pNext = NULL; -#endif - exportSemaphoreCreateInfo.handleTypes = handleType; - semaphoreInfo.pNext = &exportSemaphoreCreateInfo; - - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) != VK_SUCCESS) { - throw std::runtime_error("failed to create synchronization objects for a CUDA-Vulkan!"); - } -} - -void VulkanBaseApp::importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory) -{ - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); - -#ifdef _WIN64 - VkImportMemoryWin32HandleInfoKHR handleInfo = {}; - handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - handleInfo.pNext = NULL; - handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; - handleInfo.handle = handle; - handleInfo.name = NULL; -#else - VkImportMemoryFdInfoKHR handleInfo = {}; - handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR; - handleInfo.pNext = NULL; - handleInfo.fd = (int)(uintptr_t)handle; - handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif /* _WIN64 */ - - VkMemoryAllocateInfo memAllocation = {}; - memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memAllocation.pNext = (void *)&handleInfo; - 
memAllocation.allocationSize = size; - memAllocation.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) != VK_SUCCESS) { - throw std::runtime_error("Failed to import allocation!"); - } - - vkBindBufferMemory(m_device, buffer, memory, 0); -} - -void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) -{ - - VkCommandBuffer commandBuffer = beginSingleTimeCommands(); - - VkBufferCopy copyRegion = {}; - copyRegion.size = size; - vkCmdCopyBuffer(commandBuffer, src, dst, 1, ©Region); - - endSingleTimeCommands(commandBuffer); -} - -void VulkanBaseApp::drawFrame() -{ - size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; - vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, std::numeric_limits::max()); - - uint32_t imageIndex; - VkResult result = vkAcquireNextImageKHR(m_device, m_swapChain, std::numeric_limits::max(), m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex); - if (result == VK_ERROR_OUT_OF_DATE_KHR) { - recreateSwapChain(); - } - else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { - throw std::runtime_error("Failed to acquire swap chain image!"); - } - - updateUniformBuffer(imageIndex); - - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - - std::vector waitSemaphores; - std::vector waitStages; - - waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); - waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); - getWaitFrameSemaphores(waitSemaphores, waitStages); - - submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); - submitInfo.pWaitSemaphores = waitSemaphores.data(); - submitInfo.pWaitDstStageMask = waitStages.data(); - - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; - - std::vector signalSemaphores; - getSignalFrameSemaphores(signalSemaphores); - signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); - submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); - submitInfo.pSignalSemaphores = signalSemaphores.data(); - - vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]); - - if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { - throw std::runtime_error("failed to submit draw command buffer!"); - } - - VkPresentInfoKHR presentInfo = {}; - presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - presentInfo.waitSemaphoreCount = 1; - presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; - - VkSwapchainKHR swapChains[] = { m_swapChain }; - presentInfo.swapchainCount = 1; - presentInfo.pSwapchains = swapChains; - presentInfo.pImageIndices = &imageIndex; - - result = vkQueuePresentKHR(m_presentQueue, &presentInfo); - if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || m_framebufferResized) { - recreateSwapChain(); - m_framebufferResized = false; - } - else if (result != VK_SUCCESS) { - throw std::runtime_error("Failed to acquire swap chain image!"); - } - - m_currentFrame++; -} - -void VulkanBaseApp::cleanupSwapChain() -{ - - if (m_depthImageView != VK_NULL_HANDLE) { - vkDestroyImageView(m_device, m_depthImageView, nullptr); - } - if (m_depthImage != VK_NULL_HANDLE) { - vkDestroyImage(m_device, m_depthImage, nullptr); - } - if (m_depthImageMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_depthImageMemory, nullptr); - } - 
+void VulkanBaseApp::createUniformBuffers() { + VkDeviceSize size = getUniformSize(); + if (size > 0) { + m_uniformBuffers.resize(m_swapChainImages.size()); + m_uniformMemory.resize(m_swapChainImages.size()); for (size_t i = 0; i < m_uniformBuffers.size(); i++) { - vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); - vkFreeMemory(m_device, m_uniformMemory[i], nullptr); - } - - if (m_descriptorPool != VK_NULL_HANDLE) { - vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); - } - - for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { - vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); - } - - if (m_graphicsPipeline != VK_NULL_HANDLE) { - vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); - } - - if (m_pipelineLayout != VK_NULL_HANDLE) { - vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); - } - - if (m_renderPass != VK_NULL_HANDLE) { - vkDestroyRenderPass(m_device, m_renderPass, nullptr); - } - - for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { - vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); - } - - if (m_swapChain != VK_NULL_HANDLE) { - vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); + createBuffer(getUniformSize(), VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + m_uniformBuffers[i], m_uniformMemory[i]); } + } } -void VulkanBaseApp::recreateSwapChain() -{ - int width, height; +void VulkanBaseApp::createDescriptorPool() { + VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = static_cast(m_swapChainImages.size()); + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = static_cast(m_swapChainImages.size()); + if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != + VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } +} +void VulkanBaseApp::createDescriptorSets() { + std::vector layouts(m_swapChainImages.size(), + m_descriptorSetLayout); + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = m_descriptorPool; + allocInfo.descriptorSetCount = + static_cast(m_swapChainImages.size()); + allocInfo.pSetLayouts = layouts.data(); + m_descriptorSets.resize(m_swapChainImages.size()); + + if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor sets!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + for (size_t i = 0; i < m_swapChainImages.size(); i++) { + bufferInfo.buffer = m_uniformBuffers[i]; + descriptorWrite.dstSet = m_descriptorSets[i]; + vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); + } +} + +void VulkanBaseApp::createCommandBuffers() { + 
m_commandBuffers.resize(m_swapChainFramebuffers.size());
+  VkCommandBufferAllocateInfo allocInfo = {};
+  allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  allocInfo.commandPool = m_commandPool;
+  allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+  allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size();
+
+  if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) !=
+      VK_SUCCESS) {
+    throw std::runtime_error("failed to allocate command buffers!");
+  }
+
+  for (size_t i = 0; i < m_commandBuffers.size(); i++) {
+    VkCommandBufferBeginInfo beginInfo = {};
+    beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+    beginInfo.pInheritanceInfo = nullptr;  // Optional
+
+    if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) {
+      throw std::runtime_error("failed to begin recording command buffer!");
+    }
+
+    VkRenderPassBeginInfo renderPassInfo = {};
+    renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
+    renderPassInfo.renderPass = m_renderPass;
+    renderPassInfo.framebuffer = m_swapChainFramebuffers[i];
+
+    renderPassInfo.renderArea.offset = {0, 0};
+    renderPassInfo.renderArea.extent = m_swapChainExtent;
+
+    VkClearValue clearColors[2];
+    clearColors[0].color = {0.0f, 0.0f, 0.0f, 1.0f};
+    clearColors[1].depthStencil = {1.0f, 0};
+    renderPassInfo.clearValueCount = countof(clearColors);
+    renderPassInfo.pClearValues = clearColors;
+
+    vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo,
+                         VK_SUBPASS_CONTENTS_INLINE);
+
+    vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS,
+                      m_graphicsPipeline);
+
+    vkCmdBindDescriptorSets(m_commandBuffers[i],
+                            VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout,
+                            0, 1, &m_descriptorSets[i], 0, nullptr);
+
+    fillRenderingCommandBuffer(m_commandBuffers[i]);
+
+    vkCmdEndRenderPass(m_commandBuffers[i]);
+
+    if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) {
+      throw std::runtime_error("failed to record command buffer!");
+    }
+  }
+}
+
+void VulkanBaseApp::createSyncObjects() {
+  VkSemaphoreCreateInfo semaphoreInfo = {};
+  semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  VkFenceCreateInfo fenceInfo = {};
+  fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+  fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT;
+
+  m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT);
+  m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT);
+  m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT);
+
+  for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
+    if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr,
+                          &m_imageAvailableSemaphores[i]) != VK_SUCCESS) {
+      throw std::runtime_error("Failed to create image available semaphore!");
+    }
+    if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr,
+                          &m_renderFinishedSemaphores[i]) != VK_SUCCESS) {
+      throw std::runtime_error("Failed to create render finished semaphore!");
+    }
+    if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) !=
+        VK_SUCCESS) {
+      throw std::runtime_error("Failed to create in-flight fence!");
+    }
+  }
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+  if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr,
+                        &m_vkPresentationSemaphore) != VK_SUCCESS) {
+    throw std::runtime_error("Failed to create binary semaphore!");
+  }
+#endif /* _VK_TIMELINE_SEMAPHORE */
+}
+
+void VulkanBaseApp::getWaitFrameSemaphores(
+    std::vector<VkSemaphore> &wait,
+    std::vector<VkPipelineStageFlags> &waitStages) const {}
+
+void VulkanBaseApp::getSignalFrameSemaphores(
+
std::vector &signal) const {} + +VkDeviceSize VulkanBaseApp::getUniformSize() const { return VkDeviceSize(0); } + +void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex) {} + +void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkBuffer &buffer, + VkDeviceMemory &bufferMemory) { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType( + m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void VulkanBaseApp::createExternalBuffer( + VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer &buffer, + VkDeviceMemory &bufferMemory) { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; + externalMemoryBufferInfo.sType = + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; + externalMemoryBufferInfo.handleTypes = extMemHandleType; + bufferInfo.pNext = &externalMemoryBufferInfo; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif /* _WIN64 */ + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportMemoryAllocateInfoKHR.pNext = + extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR + ? 
&vulkanExportMemoryWin32HandleInfoKHR + : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; +#else + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType( + m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void *VulkanBaseApp::getMemHandle( + VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) { +#ifdef _WIN64 + HANDLE handle = 0; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = memory; + vkMemoryGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + fpGetMemoryWin32HandleKHR = + (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr( + m_device, "vkGetMemoryWin32HandleKHR"); + if (!fpGetMemoryWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, + &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)handle; +#else + int fd = -1; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = memory; + vkMemoryGetFdInfoKHR.handleType = handleType; + + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + fpGetMemoryFdKHR = + (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); + if (!fpGetMemoryFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)(uintptr_t)fd; +#endif /* _WIN64 */ +} + +void *VulkanBaseApp::getSemaphoreHandle( + VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) { +#ifdef _WIN64 + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; + semaphoreGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + semaphoreGetWin32HandleInfoKHR.pNext = NULL; + semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; + semaphoreGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; + fpGetSemaphoreWin32HandleKHR = + (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr( + m_device, "vkGetSemaphoreWin32HandleKHR"); + if (!fpGetSemaphoreWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, + &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + + return (void *)handle; +#else 
+  int fd;
+
+  VkSemaphoreGetFdInfoKHR semaphoreGetFdInfoKHR = {};
+  semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
+  semaphoreGetFdInfoKHR.pNext = NULL;
+  semaphoreGetFdInfoKHR.semaphore = semaphore;
+  semaphoreGetFdInfoKHR.handleType = handleType;
+
+  PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR;
+  fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(
+      m_device, "vkGetSemaphoreFdKHR");
+  if (!fpGetSemaphoreFdKHR) {
+    throw std::runtime_error("Failed to retrieve vkGetSemaphoreFdKHR!");
+  }
+  if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) !=
+      VK_SUCCESS) {
+    throw std::runtime_error("Failed to retrieve handle for semaphore!");
+  }
+
+  return (void *)(uintptr_t)fd;
+#endif /* _WIN64 */
+}
+
+void VulkanBaseApp::createExternalSemaphore(
+    VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) {
+  VkSemaphoreCreateInfo semaphoreInfo = {};
+  semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {};
+  exportSemaphoreCreateInfo.sType =
+      VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR;
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+  VkSemaphoreTypeCreateInfo timelineCreateInfo;
+  timelineCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO;
+  timelineCreateInfo.pNext = NULL;
+  timelineCreateInfo.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE;
+  timelineCreateInfo.initialValue = 0;
+#endif /* _VK_TIMELINE_SEMAPHORE */
+
+#ifdef _WIN64
+  WindowsSecurityAttributes winSecurityAttributes;
+
+  VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {};
+  exportSemaphoreWin32HandleInfoKHR.sType =
+      VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR;
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+  exportSemaphoreWin32HandleInfoKHR.pNext = &timelineCreateInfo;
+#else
+  exportSemaphoreWin32HandleInfoKHR.pNext = NULL;
+#endif /* _VK_TIMELINE_SEMAPHORE */
+
+  exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes;
+  exportSemaphoreWin32HandleInfoKHR.dwAccess =
+      DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;
+  exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL;
+  exportSemaphoreCreateInfo.pNext =
+      (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT)
+          ? &exportSemaphoreWin32HandleInfoKHR
+          : NULL;
+#else
+#ifdef _VK_TIMELINE_SEMAPHORE
+  exportSemaphoreCreateInfo.pNext = &timelineCreateInfo;
+#else
+  exportSemaphoreCreateInfo.pNext = NULL;
+#endif /* _VK_TIMELINE_SEMAPHORE */
+#endif /* _WIN64 */
+  exportSemaphoreCreateInfo.handleTypes = handleType;
+  semaphoreInfo.pNext = &exportSemaphoreCreateInfo;
+
+  if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) !=
+      VK_SUCCESS) {
+    throw std::runtime_error(
+        "failed to create synchronization objects for CUDA-Vulkan interop!");
+  }
+}
+
+void VulkanBaseApp::importExternalBuffer(
+    void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size,
+    VkBufferUsageFlags usage, VkMemoryPropertyFlags properties,
+    VkBuffer &buffer, VkDeviceMemory &memory) {
+  VkBufferCreateInfo bufferInfo = {};
+  bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+  bufferInfo.size = size;
+  bufferInfo.usage = usage;
+  bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+
+  if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
+    throw std::runtime_error("failed to create buffer!");
+  }
+
+  VkMemoryRequirements memRequirements;
+  vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements);
+
+#ifdef _WIN64
+  VkImportMemoryWin32HandleInfoKHR handleInfo = {};
+  handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR;
+  handleInfo.pNext = NULL;
+  handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+  handleInfo.handle = handle;
+  handleInfo.name = NULL;
+#else
+  VkImportMemoryFdInfoKHR handleInfo = {};
+  handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR;
+  handleInfo.pNext = NULL;
+  handleInfo.fd = (int)(uintptr_t)handle;
+  handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+#endif /* _WIN64 */
+
+  VkMemoryAllocateInfo memAllocation = {};
+  memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+  memAllocation.pNext = (void *)&handleInfo;
+  memAllocation.allocationSize = size;
+  memAllocation.memoryTypeIndex = findMemoryType(
+      m_physicalDevice, memRequirements.memoryTypeBits, properties);
+
+  if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) !=
+      VK_SUCCESS) {
+    throw std::runtime_error("Failed to import allocation!");
+  }
+
+  vkBindBufferMemory(m_device, buffer, memory, 0);
+}
+
+void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) {
+  VkCommandBuffer commandBuffer = beginSingleTimeCommands();
+
+  VkBufferCopy copyRegion = {};
+  copyRegion.size = size;
+  vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion);
+
+  endSingleTimeCommands(commandBuffer);
+}
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+void VulkanBaseApp::drawFrame() {
+  const uint64_t waitValue = 0;
+  const uint64_t signalValue = 1;
+
+  VkSemaphoreWaitInfo semaphoreWaitInfo = {};
+  semaphoreWaitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
+  semaphoreWaitInfo.pSemaphores = &m_vkTimelineSemaphore;
+  semaphoreWaitInfo.semaphoreCount = 1;
+  semaphoreWaitInfo.pValues = &waitValue;
+  vkWaitSemaphores(m_device, &semaphoreWaitInfo,
+                   std::numeric_limits<uint64_t>::max());
+
+  uint32_t imageIndex;
+  VkResult result = vkAcquireNextImageKHR(
+      m_device, m_swapChain, std::numeric_limits<uint64_t>::max(),
+      m_vkPresentationSemaphore, VK_NULL_HANDLE, &imageIndex);
+  if (result == VK_ERROR_OUT_OF_DATE_KHR) {
+    recreateSwapChain();
+  } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
+    throw std::runtime_error("Failed to acquire swap chain image!");
+  }
+
+  updateUniformBuffer(imageIndex);
+
+  VkSubmitInfo submitInfo
= {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + std::vector waitSemaphores; + std::vector waitStages; + + waitSemaphores.push_back(m_vkTimelineSemaphore); + waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + + submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); + submitInfo.pWaitSemaphores = waitSemaphores.data(); + submitInfo.pWaitDstStageMask = waitStages.data(); + + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; + + std::vector signalSemaphores; + signalSemaphores.push_back(m_vkTimelineSemaphore); + submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); + submitInfo.pSignalSemaphores = signalSemaphores.data(); + + VkTimelineSemaphoreSubmitInfo timelineInfo = {}; + timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO; + timelineInfo.waitSemaphoreValueCount = 1; + timelineInfo.pWaitSemaphoreValues = &waitValue; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + + submitInfo.pNext = &timelineInfo; + + if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != + VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = &m_vkPresentationSemaphore; + + VkSwapchainKHR swapChains[] = {m_swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + + result = vkQueuePresentKHR(m_presentQueue, &presentInfo); + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || + m_framebufferResized) { + recreateSwapChain(); + m_framebufferResized = false; + } else if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + m_currentFrame++; +} +#else +void VulkanBaseApp::drawFrame() { + size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; + vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, + std::numeric_limits::max()); + + uint32_t imageIndex; + VkResult result = vkAcquireNextImageKHR( + m_device, m_swapChain, std::numeric_limits::max(), + m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex); + if (result == VK_ERROR_OUT_OF_DATE_KHR) { + recreateSwapChain(); + } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + updateUniformBuffer(imageIndex); + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + std::vector waitSemaphores; + std::vector waitStages; + + waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); + waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + getWaitFrameSemaphores(waitSemaphores, waitStages); + + submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); + submitInfo.pWaitSemaphores = waitSemaphores.data(); + submitInfo.pWaitDstStageMask = waitStages.data(); + + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; + + std::vector signalSemaphores; + getSignalFrameSemaphores(signalSemaphores); + signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); + submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); + submitInfo.pSignalSemaphores = signalSemaphores.data(); + + vkResetFences(m_device, 
1, &m_inFlightFences[currentFrameIdx]); + + if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, + m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; + + VkSwapchainKHR swapChains[] = {m_swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + + result = vkQueuePresentKHR(m_presentQueue, &presentInfo); + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || + m_framebufferResized) { + recreateSwapChain(); + m_framebufferResized = false; + } else if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + m_currentFrame++; +} +#endif /* _VK_TIMELINE_SEMAPHORE */ + +void VulkanBaseApp::cleanupSwapChain() { + if (m_depthImageView != VK_NULL_HANDLE) { + vkDestroyImageView(m_device, m_depthImageView, nullptr); + } + if (m_depthImage != VK_NULL_HANDLE) { + vkDestroyImage(m_device, m_depthImage, nullptr); + } + if (m_depthImageMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_depthImageMemory, nullptr); + } + + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); + vkFreeMemory(m_device, m_uniformMemory[i], nullptr); + } + + if (m_descriptorPool != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); + } + + for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { + vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); + } + + if (m_graphicsPipeline != VK_NULL_HANDLE) { + vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); + } + + if (m_pipelineLayout != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); + } + + if (m_renderPass != VK_NULL_HANDLE) { + vkDestroyRenderPass(m_device, m_renderPass, nullptr); + } + + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); + } + + if (m_swapChain != VK_NULL_HANDLE) { + vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); + } +} + +void VulkanBaseApp::recreateSwapChain() { + int width, height; + + glfwGetFramebufferSize(m_window, &width, &height); + while (width == 0 || height == 0) { + glfwWaitEvents(); glfwGetFramebufferSize(m_window, &width, &height); - while (width == 0 || height == 0) { - glfwWaitEvents(); - glfwGetFramebufferSize(m_window, &width, &height); - } + } - vkDeviceWaitIdle(m_device); + vkDeviceWaitIdle(m_device); - cleanupSwapChain(); + cleanupSwapChain(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createGraphicsPipeline(); - createDepthResources(); - createFramebuffers(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createGraphicsPipeline(); + createDepthResources(); + createFramebuffers(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); } -void VulkanBaseApp::mainLoop() -{ - while (!glfwWindowShouldClose(m_window)) { - glfwPollEvents(); - drawFrame(); - } - vkDeviceWaitIdle(m_device); +void VulkanBaseApp::mainLoop() { + while (!glfwWindowShouldClose(m_window)) { + glfwPollEvents(); + drawFrame(); + 
} + vkDeviceWaitIdle(m_device); } -void readFile(std::istream& s, std::vector& data) -{ - s.seekg(0, std::ios_base::end); - data.resize(s.tellg()); - s.clear(); - s.seekg(0, std::ios_base::beg); - s.read(data.data(), data.size()); +void readFile(std::istream &s, std::vector &data) { + s.seekg(0, std::ios_base::end); + data.resize(s.tellg()); + s.clear(); + s.seekg(0, std::ios_base::beg); + s.read(data.data(), data.size()); } diff --git a/Samples/simpleVulkan/VulkanBaseApp.h b/Samples/simpleVulkan/VulkanBaseApp.h index 5cb7396d..5609acd7 100644 --- a/Samples/simpleVulkan/VulkanBaseApp.h +++ b/Samples/simpleVulkan/VulkanBaseApp.h @@ -38,101 +38,125 @@ #include #endif /* _WIN64 */ +/* remove _VK_TIMELINE_SEMAPHORE to use binary semaphores */ +// use vulkan timeline semaphore +#define _VK_TIMELINE_SEMAPHORE + struct GLFWwindow; -class VulkanBaseApp -{ -public: - VulkanBaseApp(const std::string& appName, bool enableValidation = false); - static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); - static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); - virtual ~VulkanBaseApp(); - void init(); - void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); - void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); - void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); - void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory); - void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory); - void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory); - void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); - VkCommandBuffer beginSingleTimeCommands(); - void endSingleTimeCommands(VkCommandBuffer commandBuffer); - void mainLoop(); -protected: - const std::string m_appName; - const bool m_enableValidation; - VkInstance m_instance; - VkDebugUtilsMessengerEXT m_debugMessenger; - VkSurfaceKHR m_surface; - VkPhysicalDevice m_physicalDevice; - VkDevice m_device; - VkQueue m_graphicsQueue; - VkQueue m_presentQueue; - VkSwapchainKHR m_swapChain; - std::vector m_swapChainImages; - VkFormat m_swapChainFormat; - VkExtent2D m_swapChainExtent; - std::vector m_swapChainImageViews; - std::vector > m_shaderFiles; - VkRenderPass m_renderPass; - VkPipelineLayout m_pipelineLayout; - VkPipeline m_graphicsPipeline; - std::vector m_swapChainFramebuffers; - VkCommandPool m_commandPool; - std::vector m_commandBuffers; - std::vector m_imageAvailableSemaphores; - std::vector m_renderFinishedSemaphores; - std::vector m_inFlightFences; - std::vector m_uniformBuffers; - std::vector m_uniformMemory; - VkDescriptorSetLayout m_descriptorSetLayout; - VkDescriptorPool m_descriptorPool; - std::vector m_descriptorSets; - VkImage m_depthImage; - VkDeviceMemory m_depthImageMemory; - VkImageView m_depthImageView; - size_t m_currentFrame; - bool m_framebufferResized; - uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; +class VulkanBaseApp { + public: + VulkanBaseApp(const std::string& appName, bool enableValidation = false); + static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); + static 
VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); + virtual ~VulkanBaseApp(); + void init(); + void* getMemHandle(VkDeviceMemory memory, + VkExternalMemoryHandleTypeFlagBits handleType); + void* getSemaphoreHandle(VkSemaphore semaphore, + VkExternalSemaphoreHandleTypeFlagBits handleType); + void createExternalSemaphore( + VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, VkBuffer& buffer, + VkDeviceMemory& bufferMemory); + void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, + VkBuffer& buffer, VkDeviceMemory& bufferMemory); + void importExternalBuffer(void* handle, + VkExternalMemoryHandleTypeFlagBits handleType, + size_t size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, VkBuffer& buffer, + VkDeviceMemory& memory); + void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); + VkCommandBuffer beginSingleTimeCommands(); + void endSingleTimeCommands(VkCommandBuffer commandBuffer); + void mainLoop(); - virtual void initVulkanApp() {} - virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} - virtual std::vector getRequiredExtensions() const; - virtual std::vector getRequiredDeviceExtensions() const; - virtual void getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc); - virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info); - virtual void getWaitFrameSemaphores(std::vector& wait, std::vector< VkPipelineStageFlags>& waitStages) const; - virtual void getSignalFrameSemaphores(std::vector& signal) const; - virtual VkDeviceSize getUniformSize() const; - virtual void updateUniformBuffer(uint32_t imageIndex); - virtual void drawFrame(); -private: - GLFWwindow *m_window; + protected: + const std::string m_appName; + const bool m_enableValidation; + VkInstance m_instance; + VkDebugUtilsMessengerEXT m_debugMessenger; + VkSurfaceKHR m_surface; + VkPhysicalDevice m_physicalDevice; + VkDevice m_device; + VkQueue m_graphicsQueue; + VkQueue m_presentQueue; + VkSwapchainKHR m_swapChain; + std::vector m_swapChainImages; + VkFormat m_swapChainFormat; + VkExtent2D m_swapChainExtent; + std::vector m_swapChainImageViews; + std::vector > m_shaderFiles; + VkRenderPass m_renderPass; + VkPipelineLayout m_pipelineLayout; + VkPipeline m_graphicsPipeline; + std::vector m_swapChainFramebuffers; + VkCommandPool m_commandPool; + std::vector m_commandBuffers; + std::vector m_imageAvailableSemaphores; + std::vector m_renderFinishedSemaphores; + std::vector m_inFlightFences; + std::vector m_uniformBuffers; + std::vector m_uniformMemory; + VkSemaphore m_vkPresentationSemaphore; + VkSemaphore m_vkTimelineSemaphore; + VkDescriptorSetLayout m_descriptorSetLayout; + VkDescriptorPool m_descriptorPool; + std::vector m_descriptorSets; + VkImage m_depthImage; + VkDeviceMemory m_depthImageMemory; + VkImageView m_depthImageView; + size_t m_currentFrame; + bool m_framebufferResized; + uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; - void initWindow(); - void initVulkan(); - void createInstance(); - void createSurface(); - void createDevice(); - void createSwapChain(); - void createImageViews(); - void createRenderPass(); - void createDescriptorSetLayout(); - void createGraphicsPipeline(); - void createFramebuffers(); - void createCommandPool(); - void createDepthResources(); - void createUniformBuffers(); - void 
createDescriptorPool(); - void createDescriptorSets(); - void createCommandBuffers(); - void createSyncObjects(); + virtual void initVulkanApp() {} + virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} + virtual std::vector getRequiredExtensions() const; + virtual std::vector getRequiredDeviceExtensions() const; + virtual void getVertexDescriptions( + std::vector& bindingDesc, + std::vector& attribDesc); + virtual void getAssemblyStateInfo( + VkPipelineInputAssemblyStateCreateInfo& info); + virtual void getWaitFrameSemaphores( + std::vector& wait, + std::vector& waitStages) const; + virtual void getSignalFrameSemaphores(std::vector& signal) const; + virtual VkDeviceSize getUniformSize() const; + virtual void updateUniformBuffer(uint32_t imageIndex); + virtual void drawFrame(); - void cleanupSwapChain(); - void recreateSwapChain(); + private: + GLFWwindow* m_window; - bool isSuitableDevice(VkPhysicalDevice dev) const; - static void resizeCallback(GLFWwindow *window, int width, int height); + void initWindow(); + void initVulkan(); + void createInstance(); + void createSurface(); + void createDevice(); + void createSwapChain(); + void createImageViews(); + void createRenderPass(); + void createDescriptorSetLayout(); + void createGraphicsPipeline(); + void createFramebuffers(); + void createCommandPool(); + void createDepthResources(); + void createUniformBuffers(); + void createDescriptorPool(); + void createDescriptorSets(); + void createCommandBuffers(); + void createSyncObjects(); + + void cleanupSwapChain(); + void recreateSwapChain(); + + bool isSuitableDevice(VkPhysicalDevice dev) const; + static void resizeCallback(GLFWwindow* window, int width, int height); }; void readFile(std::istream& s, std::vector& data); diff --git a/Samples/simpleVulkan/main.cpp b/Samples/simpleVulkan/main.cpp index 303361b1..3277cb7b 100644 --- a/Samples/simpleVulkan/main.cpp +++ b/Samples/simpleVulkan/main.cpp @@ -46,28 +46,28 @@ std::string execution_path; #define ENABLE_VALIDATION (true) #endif -class VulkanCudaSineWave : public VulkanBaseApp -{ +class VulkanCudaSineWave : public VulkanBaseApp { + typedef struct UniformBufferObject_st { + mat4x4 modelViewProj; + } UniformBufferObject; - typedef struct UniformBufferObject_st { - mat4x4 modelViewProj; - } UniformBufferObject; + VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer; + VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory; + UniformBufferObject m_ubo; + VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; + SineWaveSimulation m_sim; + cudaStream_t m_stream; + cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore, + m_cudaTimelineSemaphore; + cudaExternalMemory_t m_cudaVertMem; + float *m_cudaHeightMap; + using chrono_tp = std::chrono::time_point; + chrono_tp m_lastTime; + size_t m_lastFrame; - VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer; - VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory; - UniformBufferObject m_ubo; - VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; - SineWaveSimulation m_sim; - cudaStream_t m_stream; - cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore; - cudaExternalMemory_t m_cudaVertMem; - float *m_cudaHeightMap; - using chrono_tp = std::chrono::time_point; - chrono_tp m_lastTime; - size_t m_lastFrame; -public: - VulkanCudaSineWave(size_t width, size_t height) : - VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION), + public: + VulkanCudaSineWave(size_t width, size_t height) + : VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION), 
m_heightBuffer(VK_NULL_HANDLE), m_xyBuffer(VK_NULL_HANDLE), m_indexBuffer(VK_NULL_HANDLE), @@ -81,361 +81,458 @@ public: m_vkSignalSemaphore(VK_NULL_HANDLE), m_cudaWaitSemaphore(), m_cudaSignalSemaphore(), + m_cudaTimelineSemaphore(), m_cudaVertMem(), m_cudaHeightMap(nullptr), m_lastFrame(0) { - // Our index buffer can only index 32-bits of the vertex buffer - if ((width * height) > (1ULL << 32ULL)) { - throw std::runtime_error("Requested height and width is too large for this sample!"); - } - // Add our compiled vulkan shader files - char* vertex_shader_path = sdkFindFilePath("sinewave.vert", execution_path.c_str()); - char* fragment_shader_path = sdkFindFilePath("sinewave.frag", execution_path.c_str()); - m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); - m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); - + // Our index buffer can only index 32-bits of the vertex buffer + if ((width * height) > (1ULL << 32ULL)) { + throw std::runtime_error( + "Requested height and width is too large for this sample!"); } - ~VulkanCudaSineWave() { - // Make sure there's no pending work before we start tearing down - checkCudaErrors(cudaStreamSynchronize(m_stream)); + // Add our compiled vulkan shader files + char *vertex_shader_path = + sdkFindFilePath("sinewave.vert", execution_path.c_str()); + char *fragment_shader_path = + sdkFindFilePath("sinewave.frag", execution_path.c_str()); + m_shaderFiles.push_back( + std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); + m_shaderFiles.push_back( + std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); + } + ~VulkanCudaSineWave() { + // Make sure there's no pending work before we start tearing down + checkCudaErrors(cudaStreamSynchronize(m_stream)); - if (m_vkSignalSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); - vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); - } - if (m_vkWaitSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); - vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); - } +#ifdef _VK_TIMELINE_SEMAPHORE + if (m_vkTimelineSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaTimelineSemaphore)); + vkDestroySemaphore(m_device, m_vkTimelineSemaphore, nullptr); + } +#endif /* _VK_TIMELINE_SEMAPHORE */ - if (m_xyBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_xyBuffer, nullptr); - } - if (m_xyMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_xyMemory, nullptr); - } - - if (m_heightBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_heightBuffer, nullptr); - } - if (m_heightMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_heightMemory, nullptr); - } - if (m_cudaHeightMap) { - checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem)); - } - - if (m_indexBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_indexBuffer, nullptr); - } - if (m_indexMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_indexMemory, nullptr); - } + if (m_vkSignalSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); + vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); + } + if (m_vkWaitSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); + vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); } - void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) { - VkBuffer 
vertexBuffers[] = { m_heightBuffer, m_xyBuffer }; - VkDeviceSize offsets[] = { 0, 0 }; - vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets); - vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32); - vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6), 1, 0, 0, 0); + if (m_xyBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_xyBuffer, nullptr); + } + if (m_xyMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_xyMemory, nullptr); } - void getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc) { - bindingDesc.resize(2); - attribDesc.resize(2); - - bindingDesc[0].binding = 0; - bindingDesc[0].stride = sizeof(float); - bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - bindingDesc[1].binding = 1; - bindingDesc[1].stride = sizeof(vec2); - bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - attribDesc[0].binding = 0; - attribDesc[0].location = 0; - attribDesc[0].format = VK_FORMAT_R32_SFLOAT; - attribDesc[0].offset = 0; - - attribDesc[1].binding = 1; - attribDesc[1].location = 1; - attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; - attribDesc[1].offset = 0; + if (m_heightBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_heightBuffer, nullptr); + } + if (m_heightMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_heightMemory, nullptr); + } + if (m_cudaHeightMap) { + checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem)); } - void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) { - info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; - info.primitiveRestartEnable = VK_FALSE; + if (m_indexBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_indexBuffer, nullptr); + } + if (m_indexMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_indexMemory, nullptr); + } + } + + void fillRenderingCommandBuffer(VkCommandBuffer &commandBuffer) { + VkBuffer vertexBuffers[] = {m_heightBuffer, m_xyBuffer}; + VkDeviceSize offsets[] = {0, 0}; + vkCmdBindVertexBuffers(commandBuffer, 0, + sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), + vertexBuffers, offsets); + vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32); + vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * + (m_sim.getHeight() - 1) * 6), + 1, 0, 0, 0); + } + + void getVertexDescriptions( + std::vector &bindingDesc, + std::vector &attribDesc) { + bindingDesc.resize(2); + attribDesc.resize(2); + + bindingDesc[0].binding = 0; + bindingDesc[0].stride = sizeof(float); + bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + bindingDesc[1].binding = 1; + bindingDesc[1].stride = sizeof(vec2); + bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + attribDesc[0].binding = 0; + attribDesc[0].location = 0; + attribDesc[0].format = VK_FORMAT_R32_SFLOAT; + attribDesc[0].offset = 0; + + attribDesc[1].binding = 1; + attribDesc[1].location = 1; + attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; + attribDesc[1].offset = 0; + } + + void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info) { + info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + info.primitiveRestartEnable = VK_FALSE; + } + + void getWaitFrameSemaphores( + std::vector &wait, + std::vector &waitStages) const { + if (m_currentFrame != 0) { + // Have vulkan wait until cuda is done 
with the vertex buffer before + // rendering, We don't do this on the first frame, as the wait semaphore + // hasn't been initialized yet + wait.push_back(m_vkWaitSemaphore); + // We want to wait until all the pipeline commands are complete before + // letting cuda work + waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + } + } + + void getSignalFrameSemaphores(std::vector &signal) const { + // Add this semaphore for vulkan to signal once the vertex buffer is ready + // for cuda to modify + signal.push_back(m_vkSignalSemaphore); + } + + void initVulkanApp() { + int cuda_device = -1; + + // Select cuda device where vulkan is running. + cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE); + if (cuda_device == -1) { + printf("Error: No CUDA-Vulkan interop capable device found\n"); + exit(EXIT_FAILURE); } - void getWaitFrameSemaphores(std::vector& wait, std::vector< VkPipelineStageFlags>& waitStages) const { - if (m_currentFrame != 0) { - // Have vulkan wait until cuda is done with the vertex buffer before rendering - // We don't do this on the first frame, as the wait semaphore hasn't been initialized yet - wait.push_back(m_vkWaitSemaphore); - // We want to wait until all the pipeline commands are complete before letting cuda work - waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + m_sim.initCudaLaunchConfig(cuda_device); + + // Create the cuda stream we'll be using + checkCudaErrors( + cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); + + const size_t nVerts = m_sim.getWidth() * m_sim.getHeight(); + const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6; + + // Create the height map cuda will write to + createExternalBuffer( + nVerts * sizeof(float), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, getDefaultMemHandleType(), + m_heightBuffer, m_heightMemory); + + // Create the vertex buffer that will hold the xy coordinates for the grid + createBuffer(nVerts * sizeof(vec2), VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyBuffer, m_xyMemory); + + // Create the index buffer that references from both buffers above + createBuffer( + nInds * sizeof(uint32_t), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_indexBuffer, m_indexMemory); + + // Import the height map into cuda and retrieve a device pointer to use + importCudaExternalMemory((void **)&m_cudaHeightMap, m_cudaVertMem, + m_heightMemory, nVerts * sizeof(*m_cudaHeightMap), + getDefaultMemHandleType()); + // Set the height map to use in the simulation + m_sim.initSimulation(m_cudaHeightMap); + + { + // Set up the initial values for the vertex buffers with Vulkan + void *stagingBase; + VkBuffer stagingBuffer; + VkDeviceMemory stagingMemory; + VkDeviceSize stagingSz = + std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t)); + createBuffer(stagingSz, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + stagingBuffer, stagingMemory); + + vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase); + + memset(stagingBase, 0, nVerts * sizeof(float)); + copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float)); + + for (size_t y = 0; y < m_sim.getHeight(); y++) { + for (size_t x = 0; x < m_sim.getWidth(); x++) { + vec2 *stagedVert = (vec2 *)stagingBase; + stagedVert[y * m_sim.getWidth() + x][0] = + (2.0f * x) / 
(m_sim.getWidth() - 1) - 1; + stagedVert[y * m_sim.getWidth() + x][1] = + (2.0f * y) / (m_sim.getHeight() - 1) - 1; } + } + copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2)); + + { + uint32_t *indices = (uint32_t *)stagingBase; + for (size_t y = 0; y < m_sim.getHeight() - 1; y++) { + for (size_t x = 0; x < m_sim.getWidth() - 1; x++) { + indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0)); + indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); + indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); + indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); + indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1)); + indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); + indices += 6; + } + } + } + copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t)); + + vkUnmapMemory(m_device, stagingMemory); + vkDestroyBuffer(m_device, stagingBuffer, nullptr); + vkFreeMemory(m_device, stagingMemory, nullptr); } - void getSignalFrameSemaphores(std::vector& signal) const { - // Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify - signal.push_back(m_vkSignalSemaphore); +#ifdef _VK_TIMELINE_SEMAPHORE + // Create the timeline semaphore to sync cuda and vulkan access to vertex + // buffer + createExternalSemaphore(m_vkTimelineSemaphore, + getDefaultSemaphoreHandleType()); + // Import the timeline semaphore cuda will use to sync cuda and vulkan + // access to vertex buffer + importCudaExternalSemaphore(m_cudaTimelineSemaphore, m_vkTimelineSemaphore, + getDefaultSemaphoreHandleType()); +#else + + // Create the semaphore vulkan will signal when it's done with the vertex + // buffer + createExternalSemaphore(m_vkSignalSemaphore, + getDefaultSemaphoreHandleType()); + // Create the semaphore vulkan will wait for before using the vertex buffer + createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait + importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, + getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait + importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, + getDefaultSemaphoreHandleType()); +#endif /* _VK_TIMELINE_SEMAPHORE */ + } + + void importCudaExternalMemory(void **cudaPtr, cudaExternalMemory_t &cudaMem, + VkDeviceMemory &vkMem, VkDeviceSize size, + VkExternalMemoryHandleTypeFlagBits handleType) { + cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; + + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; + } else if (handleType & + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalMemoryHandleDesc.type = + cudaExternalMemoryHandleTypeOpaqueWin32Kmt; + } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; + } else { + throw std::runtime_error("Unknown handle type requested!"); } - void initVulkanApp() { - int cuda_device = -1; - - // Select cuda device where vulkan is running. 
- cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE); - if (cuda_device == -1) - { - printf("Error: No CUDA-Vulkan interop capable device found\n"); - exit(EXIT_FAILURE); - } - - m_sim.initCudaLaunchConfig(cuda_device); - - // Create the cuda stream we'll be using - checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); - - const size_t nVerts = m_sim.getWidth() * m_sim.getHeight(); - const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6; - - // Create the height map cuda will write to - createExternalBuffer(nVerts * sizeof(float), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - getDefaultMemHandleType(), - m_heightBuffer, m_heightMemory); - - // Create the vertex buffer that will hold the xy coordinates for the grid - createBuffer(nVerts * sizeof(vec2), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - m_xyBuffer, m_xyMemory); - - // Create the index buffer that references from both buffers above - createBuffer(nInds * sizeof(uint32_t), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - m_indexBuffer, m_indexMemory); - - // Import the height map into cuda and retrieve a device pointer to use - importCudaExternalMemory((void **)&m_cudaHeightMap, m_cudaVertMem, m_heightMemory, nVerts * sizeof(*m_cudaHeightMap), getDefaultMemHandleType()); - // Set the height map to use in the simulation - m_sim.initSimulation(m_cudaHeightMap); - - { - // Set up the initial values for the vertex buffers with Vulkan - void *stagingBase; - VkBuffer stagingBuffer; - VkDeviceMemory stagingMemory; - VkDeviceSize stagingSz = std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t)); - createBuffer(stagingSz, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBuffer, stagingMemory); - - vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase); - - memset(stagingBase, 0, nVerts * sizeof(float)); - copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float)); - - for (size_t y = 0; y < m_sim.getHeight(); y++) { - for (size_t x = 0; x < m_sim.getWidth(); x++) { - vec2 *stagedVert = (vec2 *)stagingBase; - stagedVert[y * m_sim.getWidth() + x][0] = (2.0f * x) / (m_sim.getWidth() - 1) - 1; - stagedVert[y * m_sim.getWidth() + x][1] = (2.0f * y) / (m_sim.getHeight() - 1) - 1; - } - } - copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2)); - - { - uint32_t *indices = (uint32_t *)stagingBase; - for (size_t y = 0; y < m_sim.getHeight() - 1; y++) { - for (size_t x = 0; x < m_sim.getWidth() - 1; x++) { - indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0)); - indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); - indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); - indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); - indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1)); - indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); - indices += 6; - } - } - } - copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t)); - - vkUnmapMemory(m_device, stagingMemory); - vkDestroyBuffer(m_device, stagingBuffer, nullptr); - vkFreeMemory(m_device, stagingMemory, nullptr); - } - - // Create the semaphore vulkan will signal when it's done with the vertex buffer - createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); - // Create 
the semaphore vulkan will wait for before using the vertex buffer - createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait - importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait - importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); - } - - void importCudaExternalMemory(void **cudaPtr, cudaExternalMemory_t& cudaMem, VkDeviceMemory& vkMem, VkDeviceSize size, VkExternalMemoryHandleTypeFlagBits handleType) { - cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; - - if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32Kmt; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; - } - else { - throw std::runtime_error("Unknown handle type requested!"); - } - - externalMemoryHandleDesc.size = size; + externalMemoryHandleDesc.size = size; #ifdef _WIN64 - externalMemoryHandleDesc.handle.win32.handle = (HANDLE)getMemHandle(vkMem, handleType); + externalMemoryHandleDesc.handle.win32.handle = + (HANDLE)getMemHandle(vkMem, handleType); #else - externalMemoryHandleDesc.handle.fd = (int)(uintptr_t)getMemHandle(vkMem, handleType); + externalMemoryHandleDesc.handle.fd = + (int)(uintptr_t)getMemHandle(vkMem, handleType); #endif - checkCudaErrors(cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc)); + checkCudaErrors( + cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc)); - cudaExternalMemoryBufferDesc externalMemBufferDesc = {}; - externalMemBufferDesc.offset = 0; - externalMemBufferDesc.size = size; - externalMemBufferDesc.flags = 0; + cudaExternalMemoryBufferDesc externalMemBufferDesc = {}; + externalMemBufferDesc.offset = 0; + externalMemBufferDesc.size = size; + externalMemBufferDesc.flags = 0; - checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem, &externalMemBufferDesc)); + checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem, + &externalMemBufferDesc)); + } + + void importCudaExternalSemaphore( + cudaExternalSemaphore_t &cudaSem, VkSemaphore &vkSem, + VkExternalSemaphoreHandleTypeFlagBits handleType) { + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; + +#ifdef _VK_TIMELINE_SEMAPHORE + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; + } else if (handleType & + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; + } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd; + } +#else + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeOpaqueWin32; + } else if (handleType & + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalSemaphoreHandleDesc.type = + 
cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeOpaqueFd; + } +#endif /* _VK_TIMELINE_SEMAPHORE */ + else { + throw std::runtime_error("Unknown handle type requested!"); } - void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) { - cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; - - if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; - } - else { - throw std::runtime_error("Unknown handle type requested!"); - } - #ifdef _WIN64 - externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.win32.handle = + (HANDLE)getSemaphoreHandle(vkSem, handleType); #else - externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.fd = + (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); #endif - externalSemaphoreHandleDesc.flags = 0; + externalSemaphoreHandleDesc.flags = 0; - checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); + checkCudaErrors( + cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); + } + + VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); } + + void updateUniformBuffer(uint32_t imageIndex) { + { + mat4x4 view, proj; + vec3 eye = {1.75f, 1.75f, 1.25f}; + vec3 center = {0.0f, 0.0f, -0.25f}; + vec3 up = {0.0f, 0.0f, 1.0f}; + + mat4x4_perspective( + proj, (float)degreesToRadians(45.0f), + m_swapChainExtent.width / (float)m_swapChainExtent.height, 0.1f, + 10.0f); + proj[1][1] *= -1.0f; // Flip y axis + + mat4x4_look_at(view, eye, center, up); + mat4x4_mul(m_ubo.modelViewProj, proj, view); } - VkDeviceSize getUniformSize() const { - return sizeof(UniformBufferObject); - } + void *data; + vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, + &data); + memcpy(data, &m_ubo, sizeof(m_ubo)); + vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); + } - void updateUniformBuffer(uint32_t imageIndex) { - { - mat4x4 view, proj; - vec3 eye = { 1.75f, 1.75f, 1.25f }; - vec3 center = { 0.0f, 0.0f, -0.25f }; - vec3 up = { 0.0f, 0.0f, 1.0f }; + std::vector getRequiredExtensions() const { + std::vector extensions; + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); + return extensions; + } - mat4x4_perspective(proj, (float)degreesToRadians(45.0f), m_swapChainExtent.width / (float)m_swapChainExtent.height, 0.1f, 10.0f); - proj[1][1] *= -1.0f; // Flip y axis - - mat4x4_look_at(view, eye, center, up); - mat4x4_mul(m_ubo.modelViewProj, proj, view); - } - - void *data; - vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data); - memcpy(data, &m_ubo, sizeof(m_ubo)); - vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); - } - - std::vector getRequiredExtensions() const { - 
std::vector extensions; - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); - return extensions; - } - - std::vector getRequiredDeviceExtensions() const { - std::vector extensions; - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); + std::vector getRequiredDeviceExtensions() const { + std::vector extensions; + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); + extensions.push_back(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME); #ifdef _WIN64 - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); #else - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); #endif /* _WIN64 */ - return extensions; + return extensions; + } + + void drawFrame() { + static chrono_tp startTime = std::chrono::high_resolution_clock::now(); + + chrono_tp currentTime = std::chrono::high_resolution_clock::now(); + float time = std::chrono::duration( + currentTime - startTime) + .count(); + + if (m_currentFrame == 0) { + m_lastTime = startTime; } - void drawFrame() { - static chrono_tp startTime = std::chrono::high_resolution_clock::now(); + float frame_time = + std::chrono::duration(currentTime - + m_lastTime) + .count(); - chrono_tp currentTime = std::chrono::high_resolution_clock::now(); - float time = std::chrono::duration(currentTime - startTime).count(); + // Have vulkan draw the current frame... 
+ VulkanBaseApp::drawFrame(); - if (m_currentFrame == 0) { - m_lastTime = startTime; - } +#ifdef _VK_TIMELINE_SEMAPHORE + cudaExternalSemaphoreWaitParams waitParams = {}; + waitParams.flags = 0; + waitParams.params.fence.value = 1; - float frame_time = std::chrono::duration(currentTime - m_lastTime).count(); + cudaExternalSemaphoreSignalParams signalParams = {}; + signalParams.flags = 0; + signalParams.params.fence.value = 0; + // Wait for vulkan to complete it's work + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaTimelineSemaphore, + &waitParams, 1, m_stream)); + // Now step the simulation + m_sim.stepSimulation(time, m_stream); + // Signal vulkan to continue with the updated buffers + checkCudaErrors(cudaSignalExternalSemaphoresAsync( + &m_cudaTimelineSemaphore, &signalParams, 1, m_stream)); +#else + cudaExternalSemaphoreWaitParams waitParams = {}; + waitParams.flags = 0; + waitParams.params.fence.value = 0; - cudaExternalSemaphoreWaitParams waitParams = {}; - waitParams.flags = 0; - waitParams.params.fence.value = 0; + cudaExternalSemaphoreSignalParams signalParams = {}; + signalParams.flags = 0; + signalParams.params.fence.value = 0; - cudaExternalSemaphoreSignalParams signalParams = {}; - signalParams.flags = 0; - signalParams.params.fence.value = 0; + // Wait for vulkan to complete it's work + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, + &waitParams, 1, m_stream)); + // Now step the simulation + m_sim.stepSimulation(time, m_stream); + // Signal vulkan to continue with the updated buffers + checkCudaErrors(cudaSignalExternalSemaphoresAsync( + &m_cudaSignalSemaphore, &signalParams, 1, m_stream)); +#endif /* _VK_TIMELINE_SEMAPHORE */ - // Have vulkan draw the current frame... - VulkanBaseApp::drawFrame(); - // Wait for vulkan to complete it's work - checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream)); - // Now step the simulation - m_sim.stepSimulation(time, m_stream); - // Signal vulkan to continue with the updated buffers - checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream)); - - // Output a naive measurement of the frames per second every five seconds - if (frame_time > 5) { - std::cout << "Average FPS (over " - << std::fixed << std::setprecision(2) << frame_time - << " seconds): " - << std::fixed << std::setprecision(2) - << ((m_currentFrame - m_lastFrame) / frame_time) - << std::endl; - m_lastFrame = m_currentFrame; - m_lastTime = currentTime; - } + // Output a naive measurement of the frames per second every five seconds + if (frame_time > 5) { + std::cout << "Average FPS (over " << std::fixed << std::setprecision(2) + << frame_time << " seconds): " << std::fixed + << std::setprecision(2) + << ((m_currentFrame - m_lastFrame) / frame_time) << std::endl; + m_lastFrame = m_currentFrame; + m_lastTime = currentTime; } + } }; -int main(int argc, char **argv) -{ - execution_path = argv[0]; - VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL)); - app.init(); - app.mainLoop(); - return 0; +int main(int argc, char **argv) { + execution_path = argv[0]; + VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL)); + app.init(); + app.mainLoop(); + return 0; } diff --git a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj index b44dbe13..713ae122 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -122,6 +122,6 @@ - + diff 
--git a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj index 0f6b767a..a03ea3de 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/simpleVulkanMMAP/README.md b/Samples/simpleVulkanMMAP/README.md index 9a51b048..bd3aeb63 100644 --- a/Samples/simpleVulkanMMAP/README.md +++ b/Samples/simpleVulkanMMAP/README.md @@ -33,7 +33,7 @@ cudaGetDeviceProperties, cudaImportExternalMemory, cudaExternalMemoryGetMappedBu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj index b9d1658c..95ec4011 100644 --- a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -124,6 +124,6 @@ - + diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj index f2232967..c15cb955 100644 --- a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -120,6 +120,6 @@ - + diff --git a/Samples/simpleZeroCopy/README.md b/Samples/simpleZeroCopy/README.md index 128cdf31..12919ca0 100644 --- a/Samples/simpleZeroCopy/README.md +++ b/Samples/simpleZeroCopy/README.md @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj b/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj index a40d2464..72ad3aaa 100644 --- a/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj +++ b/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj b/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj index 20f55821..40b06783 100644 --- a/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj +++ b/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/streamOrderedAllocation/README.md b/Samples/streamOrderedAllocation/README.md index 2dd455bd..be8d5602 100644 --- a/Samples/streamOrderedAllocation/README.md +++ b/Samples/streamOrderedAllocation/README.md @@ -27,7 +27,7 @@ cudaMallocAsync, cudaFreeAsync, cudaMemPoolSetAttribute, cudaDeviceGetDefaultMem ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj index 402a041c..1113cafe 100644 --- a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj +++ b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj index d5f8b08d..5e884ceb 100644 --- a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj +++ b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/streamOrderedAllocationIPC/Makefile b/Samples/streamOrderedAllocationIPC/Makefile new file mode 100644 index 00000000..910b6064 --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/Makefile @@ -0,0 +1,423 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + 
ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on aarch64 +ifeq ($(TARGET_ARCH),aarch64) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on aarch64 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 86 +else +SMS ?= 35 37 50 52 60 61 70 75 80 86 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(TARGET_OS),darwin) + ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA +else + ifeq ($(TARGET_ARCH),x86_64) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif + endif + + ifeq ($(TARGET_ARCH),ppc64le) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs + endif + + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) + ifeq ("$(CUDALIB)","") + $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. 
<<<) + SAMPLE_ENABLED := 0 + else + CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) + LIBRARIES += -L$(CUDALIB) -lcuda + endif +endif + +ALL_CCFLAGS += --std=c++11 --threads 0 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: streamOrderedAllocationIPC + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +helper_multiprocess.o:../../common/src/helper_multiprocess.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +streamOrderedAllocationIPC.o:streamOrderedAllocationIPC.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +streamOrderedAllocationIPC: helper_multiprocess.o streamOrderedAllocationIPC.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./streamOrderedAllocationIPC + +clean: + rm -f streamOrderedAllocationIPC helper_multiprocess.o streamOrderedAllocationIPC.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/streamOrderedAllocationIPC + +clobber: clean diff --git a/Samples/streamOrderedAllocationIPC/NsightEclipse.xml b/Samples/streamOrderedAllocationIPC/NsightEclipse.xml new file mode 100644 index 00000000..713a3d8f --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/NsightEclipse.xml @@ -0,0 +1,65 @@ + + + + streamOrderedAllocationIPC + + --std=c++11 + + + cudaMallocAsync + cudaFreeAsync + cudaMemPoolCreate + cudaMemPoolImportPointer + cudaMemPoolSetAccess + cudaMemPoolGetAccess + cudaMemPoolExportToShareableHandle + cudaMemPoolExportPointer + cudaMemPoolDestroy + + + whole + + ./ + ../ + ../../common/inc + + + Performance Strategies + + + + + cuda + CUDA + + + + true + streamOrderedAllocationIPC.cu + + 1:CUDA Basic Topics + 1:Performance Strategies + + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + sm86 + + ../../common/src/helper_multiprocess.cpp + ../../common/inc/helper_multiprocess.h + + + + x86_64 + linux + + + + 6.0 + + stream Ordered Allocation IPC Pools + exe + diff --git a/Samples/streamOrderedAllocationIPC/README.md b/Samples/streamOrderedAllocationIPC/README.md new file mode 100644 index 00000000..04948fae --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/README.md @@ -0,0 +1,60 @@ +# streamOrderedAllocationIPC - stream Ordered Allocation IPC Pools + +## Description + +This sample demonstrates IPC pools of stream ordered memory allocated using cudaMallocAsync and cudaMemPool family of APIs. 
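+
+In outline, the exporting (parent) process creates an IPC-capable pool on a device, allocates from it with cudaMallocAsync, and shares the pool's shareable handle plus the allocation's export data with the importing (child) processes, which rebuild the pool and pointer on their side. The sketch below mirrors the calls made in this sample; the variable names are illustrative, and error checking and the socket/shared-memory transport that actually carries the handle and export data are omitted.
+
+```
+// Exporting process: create an IPC-capable pool on one device and share an allocation.
+int device = 0;                      // illustrative; the sample does this for every eligible device
+size_t size = 64ULL << 20;
+cudaStream_t stream;
+cudaSetDevice(device);
+cudaStreamCreate(&stream);
+
+cudaMemPoolProps props = {};
+props.allocType = cudaMemAllocationTypePinned;
+props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
+props.location.type = cudaMemLocationTypeDevice;
+props.location.id = device;
+
+cudaMemPool_t pool;
+cudaMemPoolCreate(&pool, &props);
+
+void *ptr;
+cudaMallocAsync(&ptr, size, pool, stream);   // pool-specific overload used by this sample
+
+int poolFd;                                   // sent to the peer, e.g. over a local socket
+cudaMemPoolExportToShareableHandle(&poolFd, pool, cudaMemHandleTypePosixFileDescriptor, 0);
+
+cudaMemPoolPtrExportData exportData;          // opaque blob, placed in shared memory
+cudaMemPoolExportPointer(&exportData, ptr);
+
+// Importing process: rebuild the pool from the received descriptor and recover the pointer.
+cudaMemPool_t importedPool;
+cudaMemPoolImportFromShareableHandle(&importedPool, (void *)(uintptr_t)poolFd,
+                                     cudaMemHandleTypePosixFileDescriptor, 0);
+void *importedPtr;
+cudaMemPoolImportPointer(&importedPtr, importedPool, &exportData);
+
+// ... use importedPtr in stream-ordered work, then free it before the exporter frees it.
+cudaFreeAsync(importedPtr, stream);
+```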
+ +## Key Concepts + +Performance Strategies + +## Supported SM Architectures + +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux + +## Supported CPU Architecture + +x86_64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMallocAsync, cudaFreeAsync, cudaMemPoolCreate, cudaMemPoolImportPointer, cudaMemPoolSetAccess, cudaMemPoolGetAccess, cudaMemPoolExportToShareableHandle, cudaMemPoolExportPointer, cudaMemPoolDestroy + +## Prerequisites + +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples' makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu b/Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu new file mode 100644 index 00000000..f8f783b8 --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu @@ -0,0 +1,440 @@ +/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This sample demonstrates Inter Process Communication + * using one process per GPU for computation. + */ + +#include +#include +#include +#include +#define CUDA_DRIVER_API 1 +#include "helper_cuda.h" +#include "helper_cuda_drvapi.h" +#include "helper_multiprocess.h" + +static const char shmName[] = "streamOrderedAllocationIPCshm"; +static const char ipcName[] = "streamOrderedAllocationIPC_pipe"; +// For direct NVLINK and PCI-E peers, at max 8 simultaneous peers are allowed +// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited +// in the same way. 
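+
+// Overall flow: the parent process creates an IPC-capable memory pool on each
+// eligible device, allocates one buffer per pool with cudaMallocAsync, and
+// publishes the pool handles over a socket created by the helper_multiprocess
+// utilities and the per-allocation export data in shared memory. Each child
+// process imports every pool and pointer, repeatedly launches simpleKernel on
+// the peer-mapped buffers (coordinating with its siblings through a
+// shared-memory barrier), verifies the buffer assigned to it, and frees its
+// imported pointers before the parent destroys the pools.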
+#define MAX_DEVICES (32) +#define DATA_SIZE (64ULL << 20ULL) // 64MB + +#if defined(__linux__) +#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x) +#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#define cpu_atomic_add32(a, x) InterlockedAdd((volatile LONG *)a, x) +#else +#error Unsupported system +#endif + +typedef struct shmStruct_st { + size_t nprocesses; + int barrier; + int sense; + int devices[MAX_DEVICES]; + cudaMemPoolPtrExportData exportPtrData[MAX_DEVICES]; +} shmStruct; + +__global__ void simpleKernel(char *ptr, int sz, char val) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (; idx < sz; idx += (gridDim.x * blockDim.x)) { + ptr[idx] = val; + } +} + +static void barrierWait(volatile int *barrier, volatile int *sense, + unsigned int n) { + int count; + + // Check-in + count = cpu_atomic_add32(barrier, 1); + if (count == n) // Last one in + *sense = 1; + while (!*sense) + ; + + // Check-out + count = cpu_atomic_add32(barrier, -1); + if (count == 0) // Last one out + *sense = 0; + while (*sense) + ; +} + +static void childProcess(int id) { + volatile shmStruct *shm = NULL; + cudaStream_t stream; + sharedMemoryInfo info; + size_t procCount, i; + int blocks = 0; + int threads = 128; + cudaDeviceProp prop; + std::vector<void *> ptrs; + + std::vector<char> verification_buffer(DATA_SIZE); + + ipcHandle *ipcChildHandle = NULL; + checkIpcErrors(ipcOpenSocket(ipcChildHandle)); + + if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } + shm = (volatile shmStruct *)info.addr; + procCount = shm->nprocesses; + + barrierWait(&shm->barrier, &shm->sense, (unsigned int)(procCount + 1)); + + // Receive all allocation handles shared by Parent. + std::vector<ShareableHandle> shHandle(shm->nprocesses); + checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle)); + + checkCudaErrors(cudaSetDevice(shm->devices[id])); + checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id])); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks, simpleKernel, threads, 0)); + blocks *= prop.multiProcessorCount; + + std::vector<cudaMemPool_t> pools(shm->nprocesses); + + cudaMemAllocationHandleType handleType = cudaMemHandleTypePosixFileDescriptor; + + // Import mem pools from all the devices created in the master + // process using shareable handles received via socket + // and import the pointer to the allocated buffer using + // exportData filled in shared memory by the master process. + for (i = 0; i < procCount; i++) { + checkCudaErrors(cudaMemPoolImportFromShareableHandle( + &pools[i], (void *)shHandle[i], handleType, 0)); + + cudaMemAccessFlags accessFlags; + cudaMemLocation location; + location.type = cudaMemLocationTypeDevice; + location.id = shm->devices[id]; + checkCudaErrors(cudaMemPoolGetAccess(&accessFlags, pools[i], &location)); + if (accessFlags != cudaMemAccessFlagsProtReadWrite) { + cudaMemAccessDesc desc; + memset(&desc, 0, sizeof(cudaMemAccessDesc)); + desc.location.type = cudaMemLocationTypeDevice; + desc.location.id = shm->devices[id]; + desc.flags = cudaMemAccessFlagsProtReadWrite; + checkCudaErrors(cudaMemPoolSetAccess(pools[i], &desc, 1)); + } + + // Import the allocation from each memory pool by iterating over exportData + // until an import succeeds.
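+ // (cudaMemPoolImportPointer only succeeds when the export data belongs to an
+ // allocation made from the pool it is given, so trying each entry in turn
+ // recovers the one pointer the parent exported from this pool.)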
+ for (int j = 0; j < procCount; j++) { + void *ptr = NULL; + // Import the allocation using the opaque export data retrieved through + // the shared memory. + cudaError_t ret = cudaMemPoolImportPointer( + &ptr, pools[i], (cudaMemPoolPtrExportData *)&shm->exportPtrData[j]); + + if (ret == cudaSuccess) { + // The pointer import succeeded, so add it to the ptrs bag. + ptrs.push_back(ptr); + break; + } else { + // Clear the error returned by the failed cudaMemPoolImportPointer call + // before trying the next export data entry. + cudaGetLastError(); + } + } + // Since we have imported allocations shared by the parent with us, we can + // close this ShareableHandle. + checkIpcErrors(ipcCloseShareableHandle(shHandle[i])); + } + + // Since we have imported allocations shared by the parent with us, we can + // close the socket. + checkIpcErrors(ipcCloseSocket(ipcChildHandle)); + + // At each iteration of the loop, each sibling process will push work on + // their respective devices accessing the next peer mapped buffer allocated + // by the master process (these can come from other sibling processes as + // well). To coordinate each process' access, we force the stream to wait for + // the work already accessing this buffer. + for (i = 0; i < procCount; i++) { + size_t bufferId = (i + id) % procCount; + + // Push a simple kernel on it + simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], + DATA_SIZE, id); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaStreamSynchronize(stream)); + + // Wait for all my sibling processes to push this stage of their work + // before proceeding to the next. This prevents siblings from racing + // ahead and clobbering a buffer that another process is still using. + barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount); + if (id == 0) { + printf("Step %lld done\n", (unsigned long long)i); + } + } + + // Now wait for my buffer to be ready so I can copy it locally and verify it + checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, + cudaMemcpyDeviceToHost, stream)); + + // And wait for all the queued up work to complete + checkCudaErrors(cudaStreamSynchronize(stream)); + + printf("Process %d: verifying...\n", id); + + // The contents should have the id of the sibling just after me + char compareId = (char)((id + 1) % procCount); + for (unsigned long long j = 0; j < DATA_SIZE; j++) { + if (verification_buffer[j] != compareId) { + printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j, + (int)verification_buffer[j], (int)compareId); + } + } + + // Clean up!
+ for (i = 0; i < procCount; i++) { + // Free the memory before the exporter process frees it + checkCudaErrors(cudaFreeAsync(ptrs[i], stream)); + } + + // And wait for all the queued up work to complete + checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaStreamDestroy(stream)); + + printf("Process %d complete!\n", id); +} + +static void parentProcess(char *app) { + sharedMemoryInfo info; + int devCount, i; + volatile shmStruct *shm = NULL; + std::vector<void *> ptrs; + std::vector<Process> processes; + + checkCudaErrors(cudaGetDeviceCount(&devCount)); + std::vector<CUdevice> devices(devCount); + for (i = 0; i < devCount; i++) { + cuDeviceGet(&devices[i], i); + } + + if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } + shm = (volatile shmStruct *)info.addr; + memset((void *)shm, 0, sizeof(*shm)); + + // Pick all the devices that can access each other's memory for this test + // Keep in mind that CUDA has minimal support for fork() without a + // corresponding exec() in the child process, but in this case our + // spawnProcess will always exec, so no need to worry. + for (i = 0; i < devCount; i++) { + bool allPeers = true; + cudaDeviceProp prop; + checkCudaErrors(cudaGetDeviceProperties(&prop, i)); + + int isMemPoolSupported = 0; + checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported, + cudaDevAttrMemoryPoolsSupported, i)); + // This sample requires devices that support CUDA memory pools + if (!isMemPoolSupported) { + printf("Device %d does not support CUDA memory pools, skipping...\n", i); + continue; + } + int deviceSupportsIpcHandle = 0; +#if defined(__linux__) + checkCudaErrors(cuDeviceGetAttribute( + &deviceSupportsIpcHandle, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, + devices[i])); +#else + cuDeviceGetAttribute(&deviceSupportsIpcHandle, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, + devices[i]); +#endif + + if (!deviceSupportsIpcHandle) { + printf("Device %d does not support CUDA IPC Handle, skipping...\n", i); + continue; + } + // This sample requires two processes accessing each device, so we need + // to ensure exclusive or prohibited mode is not set + if (prop.computeMode != cudaComputeModeDefault) { + printf("Device %d is in an unsupported compute mode for this sample\n", + i); + continue; + } +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // CUDA IPC on Windows is only supported on TCC + if (!prop.tccDriver) { + printf("Device %d is not in TCC mode\n", i); + continue; + } +#endif + + for (int j = 0; j < shm->nprocesses; j++) { + int canAccessPeerIJ, canAccessPeerJI; + checkCudaErrors( + cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i)); + checkCudaErrors( + cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j])); + if (!canAccessPeerIJ || !canAccessPeerJI) { + allPeers = false; + break; + } + } + if (allPeers) { + // Enable peers here. This isn't necessary for IPC, but it will + // set up the peers for the device.
For systems that only allow 8 + // peers per GPU at a time, this acts to remove devices from CanAccessPeer + for (int j = 0; j < shm->nprocesses; j++) { + checkCudaErrors(cudaSetDevice(i)); + checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0)); + checkCudaErrors(cudaSetDevice(shm->devices[j])); + checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0)); + } + shm->devices[shm->nprocesses++] = i; + if (shm->nprocesses >= MAX_DEVICES) break; + } else { + printf( + "Device %d is not peer capable with some other selected peers, " + "skipping\n", + i); + } + } + + if (shm->nprocesses == 0) { + printf("No CUDA devices support IPC\n"); + exit(EXIT_WAIVED); + } + + std::vector<ShareableHandle> shareableHandles(shm->nprocesses); + std::vector<cudaStream_t> streams(shm->nprocesses); + std::vector<cudaMemPool_t> pools(shm->nprocesses); + + // Now allocate memory for each process, fill the shared memory buffer with + // the export data, and collect the memPool shareable handles to communicate. + for (i = 0; i < shm->nprocesses; i++) { + void *ptr = NULL; + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors( + cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking)); + // Allocate an explicit pool with IPC capabilities + cudaMemPoolProps poolProps; + memset(&poolProps, 0, sizeof(cudaMemPoolProps)); + poolProps.allocType = cudaMemAllocationTypePinned; + poolProps.handleTypes = cudaMemHandleTypePosixFileDescriptor; + + poolProps.location.type = cudaMemLocationTypeDevice; + poolProps.location.id = shm->devices[i]; + + checkCudaErrors(cudaMemPoolCreate(&pools[i], &poolProps)); + + // Query the shareable handle for the pool + cudaMemAllocationHandleType handleType = + cudaMemHandleTypePosixFileDescriptor; + // Allocate memory in a stream from the pool just created + checkCudaErrors(cudaMallocAsync(&ptr, DATA_SIZE, pools[i], streams[i])); + + checkCudaErrors(cudaMemPoolExportToShareableHandle( + &shareableHandles[i], pools[i], handleType, 0)); + + // Get the opaque 'bag-of-bits' representing the allocation + memset((void *)&shm->exportPtrData[i], 0, sizeof(cudaMemPoolPtrExportData)); + checkCudaErrors(cudaMemPoolExportPointer( + (cudaMemPoolPtrExportData *)&shm->exportPtrData[i], ptr)); + ptrs.push_back(ptr); + } + + // Launch the child processes! + for (i = 0; i < shm->nprocesses; i++) { + char devIdx[10]; + char *const args[] = {app, devIdx, NULL}; + Process process; + + SPRINTF(devIdx, "%d", i); + + if (spawnProcess(&process, app, args)) { + printf("Failed to create process\n"); + exit(EXIT_FAILURE); + } + + processes.push_back(process); + } + + barrierWait(&shm->barrier, &shm->sense, (unsigned int)(shm->nprocesses + 1)); + + ipcHandle *ipcParentHandle = NULL; + checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes)); + checkIpcErrors( + ipcSendShareableHandles(ipcParentHandle, shareableHandles, processes)); + + // Close the shareable handles as they are not needed anymore. + for (int i = 0; i < shm->nprocesses; i++) { + checkIpcErrors(ipcCloseShareableHandle(shareableHandles[i])); + } + checkIpcErrors(ipcCloseSocket(ipcParentHandle)); + + // And wait for them to finish + for (i = 0; i < processes.size(); i++) { + if (waitProcess(&processes[i]) != EXIT_SUCCESS) { + printf("Process %d failed!\n", i); + exit(EXIT_FAILURE); + } + } + + // Clean up!
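+ // All children have exited at this point (waitProcess above), so the parent
+ // can free its own allocation from each pool and then destroy the pools.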
+ for (i = 0; i < shm->nprocesses; i++) { + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors(cudaFreeAsync(ptrs[i], streams[i])); + checkCudaErrors(cudaStreamSynchronize(streams[i])); + checkCudaErrors(cudaMemPoolDestroy(pools[i])); + } + + sharedMemoryClose(&info); +} + +// Host code +int main(int argc, char **argv) { +#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || \ + defined(_WIN32) || defined(WIN64) || defined(_WIN64) + printf("Not supported on ARM\n"); + return EXIT_WAIVED; +#else + if (argc == 1) { + parentProcess(argv[0]); + } else { + childProcess(atoi(argv[1])); + } + return EXIT_SUCCESS; +#endif +} diff --git a/Samples/streamOrderedAllocationP2P/README.md b/Samples/streamOrderedAllocationP2P/README.md index ce4c20a6..164284a9 100644 --- a/Samples/streamOrderedAllocationP2P/README.md +++ b/Samples/streamOrderedAllocationP2P/README.md @@ -27,7 +27,7 @@ cudaMallocAsync, cudaFreeAsync, cudaMemPoolSetAccess, cudaDeviceGetDefaultMemPoo ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu index e289de4e..3c6cffb5 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu @@ -122,24 +122,23 @@ std::pair getP2PCapableGpuPair() { int deviceId = itr->second; checkCudaErrors(cudaSetDevice(deviceId)); - std::for_each( - itr, bestFit.second, - [&deviceId, &bestFitDeviceIds](decltype(*itr) mapPair) { - if (deviceId != mapPair.second) { - int access = 0; - checkCudaErrors( - cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); - printf("Device=%d %s Access Peer Device=%d\n", deviceId, - access ? "CAN" : "CANNOT", mapPair.second); - if (access && bestFitDeviceIds.size() < kNumGpusRequired) { - bestFitDeviceIds.emplace(deviceId); - bestFitDeviceIds.emplace(mapPair.second); - } else { - printf("Ignoring device %i (max devices exceeded)\n", - mapPair.second); - } - } - }); + std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds, + &kNumGpusRequired]( + decltype(*itr) mapPair) { + if (deviceId != mapPair.second) { + int access = 0; + checkCudaErrors( + cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); + printf("Device=%d %s Access Peer Device=%d\n", deviceId, + access ? 
"CAN" : "CANNOT", mapPair.second); + if (access && bestFitDeviceIds.size() < kNumGpusRequired) { + bestFitDeviceIds.emplace(deviceId); + bestFitDeviceIds.emplace(mapPair.second); + } else { + printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); + } + } + }); if (bestFitDeviceIds.size() >= kNumGpusRequired) { printf("Selected p2p capable devices - "); diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj index 724e2228..116d3c9c 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj index abe713cb..50529ea0 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/systemWideAtomics/README.md b/Samples/systemWideAtomics/README.md index bfd9d101..530df9d3 100644 --- a/Samples/systemWideAtomics/README.md +++ b/Samples/systemWideAtomics/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/tf32TensorCoreGemm/README.md b/Samples/tf32TensorCoreGemm/README.md index 517eb9bf..c1513be1 100644 --- a/Samples/tf32TensorCoreGemm/README.md +++ b/Samples/tf32TensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEv ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj index 29078f91..4cd44a20 100644 --- a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj index 6d7501ec..5ed41711 100644 --- a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/vectorAddMMAP/README.md b/Samples/vectorAddMMAP/README.md index 3d8bfeeb..c385d627 100644 --- a/Samples/vectorAddMMAP/README.md +++ b/Samples/vectorAddMMAP/README.md @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj index 6a2619bd..39f9ba5b 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -113,6 +113,6 @@ - + diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj index 366a3747..ccc98fe0 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/README.md b/Samples/vectorAdd_nvrtc/README.md index 24c16b06..64132b80 100644 --- a/Samples/vectorAdd_nvrtc/README.md +++ b/Samples/vectorAdd_nvrtc/README.md @@ -30,7 +30,7 @@ cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj index bf1d6e6a..3fecdd25 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj index eafec39f..697e47aa 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/vulkanImageCUDA/README.md b/Samples/vulkanImageCUDA/README.md index b970b993..f37e25f3 100644 --- a/Samples/vulkanImageCUDA/README.md +++ b/Samples/vulkanImageCUDA/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray, cudaImportE ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj index 84fedd67..f14a0515 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj index 0d444252..42673869 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/README.md b/Samples/warpAggregatedAtomicsCG/README.md index fe0541f3..c4c351a7 100644 --- a/Samples/warpAggregatedAtomicsCG/README.md +++ b/Samples/warpAggregatedAtomicsCG/README.md @@ -22,7 +22,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj index c1087bb9..6a6c8655 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj index 141ebb15..6e83354a 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/watershedSegmentationNPP/Makefile b/Samples/watershedSegmentationNPP/Makefile index a65719dc..c03a879d 100644 --- a/Samples/watershedSegmentationNPP/Makefile +++ b/Samples/watershedSegmentationNPP/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_OS),darwin) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - watershedSegmentationNPP is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/watershedSegmentationNPP/README.md b/Samples/watershedSegmentationNPP/README.md index 496cb03b..0b320280 100644 --- a/Samples/watershedSegmentationNPP/README.md +++ b/Samples/watershedSegmentationNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj index 48a54fee..548b5361 100644 --- a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj +++ b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj index 8956ca3b..ee297d72 100644 --- a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj +++ b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - +