From 568b39bd5bea5924446d49d07a4c026a1ec945a5 Mon Sep 17 00:00:00 2001 From: Rutwik Choughule Date: Fri, 16 Apr 2021 11:54:26 +0530 Subject: [PATCH] add and update samples with CUDA 11.3 support --- README.md | 67 +- Samples/EGLStream_CUDA_Interop/Makefile | 6 + Samples/EGLStream_CUDA_Interop/README.md | 2 +- Samples/MersenneTwisterGP11213/Makefile | 29 +- .../MersenneTwister.cpp | 188 +- .../MersenneTwisterGP11213_vs2017.vcxproj | 4 +- .../MersenneTwisterGP11213_vs2019.vcxproj | 4 +- Samples/MersenneTwisterGP11213/README.md | 2 +- .../NV12toBGRandResize_vs2017.vcxproj | 4 +- .../NV12toBGRandResize_vs2019.vcxproj | 4 +- Samples/NV12toBGRandResize/README.md | 2 +- Samples/UnifiedMemoryPerf/README.md | 2 +- .../UnifiedMemoryPerf_vs2017.vcxproj | 4 +- .../UnifiedMemoryPerf_vs2019.vcxproj | 4 +- Samples/bandwidthTest/README.md | 2 +- .../bandwidthTest_vs2017.vcxproj | 4 +- .../bandwidthTest_vs2019.vcxproj | 4 +- .../Makefile | 6 + .../README.md | 2 +- ...chedLabelMarkersAndLabelCompressionNPP.cpp | 93 +- ...rkersAndLabelCompressionNPP_vs2017.vcxproj | 4 +- ...rkersAndLabelCompressionNPP_vs2019.vcxproj | 4 +- Samples/bf16TensorCoreGemm/README.md | 2 +- .../bf16TensorCoreGemm_vs2017.vcxproj | 4 +- .../bf16TensorCoreGemm_vs2019.vcxproj | 4 +- Samples/binaryPartitionCG/README.md | 2 +- .../binaryPartitionCG/binaryPartitionCG.cu | 172 +- .../binaryPartitionCG_vs2017.vcxproj | 4 +- .../binaryPartitionCG_vs2019.vcxproj | 4 +- Samples/boxFilterNPP/README.md | 2 +- .../boxFilterNPP/boxFilterNPP_vs2017.vcxproj | 4 +- .../boxFilterNPP/boxFilterNPP_vs2019.vcxproj | 4 +- Samples/cannyEdgeDetectorNPP/README.md | 2 +- .../cannyEdgeDetectorNPP_vs2017.vcxproj | 4 +- .../cannyEdgeDetectorNPP_vs2019.vcxproj | 4 +- Samples/concurrentKernels/README.md | 2 +- .../concurrentKernels_vs2017.vcxproj | 4 +- .../concurrentKernels_vs2019.vcxproj | 4 +- Samples/conjugateGradientCudaGraphs/Makefile | 6 + Samples/conjugateGradientCudaGraphs/README.md | 2 +- .../conjugateGradientCudaGraphs.cu | 78 +- ...conjugateGradientCudaGraphs_vs2017.vcxproj | 4 +- ...conjugateGradientCudaGraphs_vs2019.vcxproj | 4 +- .../conjugateGradientMultiBlockCG/README.md | 2 +- ...njugateGradientMultiBlockCG_vs2017.vcxproj | 4 +- ...njugateGradientMultiBlockCG_vs2019.vcxproj | 4 +- .../conjugateGradientMultiDeviceCG/README.md | 2 +- .../conjugateGradientMultiDeviceCG.cu | 138 +- ...jugateGradientMultiDeviceCG_vs2017.vcxproj | 4 +- ...jugateGradientMultiDeviceCG_vs2019.vcxproj | 4 +- Samples/cuSolverDn_LinearSolver/Makefile | 6 + Samples/cuSolverDn_LinearSolver/README.md | 2 +- .../cuSolverDn_LinearSolver_vs2017.vcxproj | 4 +- .../cuSolverDn_LinearSolver_vs2019.vcxproj | 4 +- Samples/cuSolverSp_LinearSolver/Makefile | 6 + Samples/cuSolverSp_LinearSolver/README.md | 2 +- .../cuSolverSp_LinearSolver.cpp | 6 +- .../cuSolverSp_LinearSolver_vs2017.vcxproj | 4 +- .../cuSolverSp_LinearSolver_vs2019.vcxproj | 4 +- Samples/cudaCompressibleMemory/README.md | 2 +- .../cudaCompressibleMemory_vs2017.vcxproj | 4 +- .../cudaCompressibleMemory_vs2019.vcxproj | 4 +- Samples/cudaNvSci/Makefile | 6 + Samples/cudaNvSci/README.md | 2 +- Samples/cudaNvSciNvMedia/README.md | 2 +- Samples/cudaOpenMP/README.md | 2 +- Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj | 4 +- Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj | 4 +- Samples/cudaTensorCoreGemm/README.md | 2 +- .../cudaTensorCoreGemm_vs2017.vcxproj | 4 +- .../cudaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/deviceQuery/README.md | 2 +- Samples/deviceQuery/deviceQuery.cpp | 21 +- .../deviceQuery/deviceQuery_vs2017.vcxproj | 
4 +- .../deviceQuery/deviceQuery_vs2019.vcxproj | 4 +- Samples/dmmaTensorCoreGemm/README.md | 2 +- .../dmmaTensorCoreGemm_vs2017.vcxproj | 4 +- .../dmmaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/globalToShmemAsyncCopy/README.md | 2 +- .../globalToShmemAsyncCopy.cu | 1681 ++++----- .../globalToShmemAsyncCopy_vs2017.vcxproj | 4 +- .../globalToShmemAsyncCopy_vs2019.vcxproj | 4 +- Samples/immaTensorCoreGemm/README.md | 2 +- .../immaTensorCoreGemm_vs2017.vcxproj | 4 +- .../immaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/jacobiCudaGraphs/README.md | 2 +- .../jacobiCudaGraphs_vs2017.vcxproj | 4 +- .../jacobiCudaGraphs_vs2019.vcxproj | 4 +- Samples/jacobiCudaGraphs/main.cpp | 9 +- Samples/matrixMul/README.md | 2 +- Samples/matrixMul/matrixMul_vs2017.vcxproj | 4 +- Samples/matrixMul/matrixMul_vs2019.vcxproj | 4 +- Samples/matrixMulDrv/README.md | 2 +- .../matrixMulDrv/matrixMulDrv_vs2017.vcxproj | 4 +- .../matrixMulDrv/matrixMulDrv_vs2019.vcxproj | 4 +- Samples/memMapIPCDrv/Makefile | 19 +- Samples/memMapIPCDrv/README.md | 2 +- .../memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj | 10 +- .../memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj | 10 +- Samples/memMapIPCDrv/memMapIpc.cpp | 87 +- Samples/nvJPEG/Makefile | 6 + Samples/nvJPEG/README.md | 2 +- Samples/nvJPEG/nvJPEG_vs2017.vcxproj | 4 +- Samples/nvJPEG/nvJPEG_vs2019.vcxproj | 4 +- Samples/nvJPEG_encoder/Makefile | 6 + Samples/nvJPEG_encoder/README.md | 2 +- .../nvJPEG_encoder_vs2017.vcxproj | 4 +- .../nvJPEG_encoder_vs2019.vcxproj | 4 +- Samples/p2pBandwidthLatencyTest/README.md | 2 +- .../p2pBandwidthLatencyTest_vs2017.vcxproj | 4 +- .../p2pBandwidthLatencyTest_vs2019.vcxproj | 4 +- Samples/reduction/README.md | 2 +- Samples/reduction/reduction_vs2017.vcxproj | 4 +- Samples/reduction/reduction_vs2019.vcxproj | 4 +- Samples/shfl_scan/README.md | 2 +- Samples/shfl_scan/shfl_scan_vs2017.vcxproj | 4 +- Samples/shfl_scan/shfl_scan_vs2019.vcxproj | 4 +- Samples/simpleAWBarrier/README.md | 2 +- Samples/simpleAWBarrier/simpleAWBarrier.cu | 356 +- .../simpleAWBarrier_vs2017.vcxproj | 4 +- .../simpleAWBarrier_vs2019.vcxproj | 4 +- Samples/simpleAttributes/README.md | 2 +- Samples/simpleAttributes/simpleAttributes.cu | 263 +- .../simpleAttributes_vs2017.vcxproj | 4 +- .../simpleAttributes_vs2019.vcxproj | 4 +- Samples/simpleCUBLAS/Makefile | 29 +- Samples/simpleCUBLAS/README.md | 2 +- .../simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj | 4 +- .../simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj | 4 +- Samples/simpleCUBLASXT/Makefile | 6 + Samples/simpleCUBLASXT/README.md | 2 +- .../simpleCUBLASXT_vs2017.vcxproj | 4 +- .../simpleCUBLASXT_vs2019.vcxproj | 4 +- Samples/simpleCUBLAS_LU/Makefile | 357 ++ Samples/simpleCUBLAS_LU/NsightEclipse.xml | 68 + Samples/simpleCUBLAS_LU/README.md | 71 + Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp | 417 +++ .../simpleCUBLAS_LU_vs2017.sln | 20 + .../simpleCUBLAS_LU_vs2017.vcxproj | 113 + .../simpleCUBLAS_LU_vs2019.sln | 20 + .../simpleCUBLAS_LU_vs2019.vcxproj | 109 + Samples/simpleCUFFT/Makefile | 6 + Samples/simpleCUFFT/README.md | 2 +- .../simpleCUFFT/simpleCUFFT_vs2017.vcxproj | 4 +- .../simpleCUFFT/simpleCUFFT_vs2019.vcxproj | 4 +- Samples/simpleCudaGraphs/README.md | 2 +- Samples/simpleCudaGraphs/simpleCudaGraphs.cu | 3 +- .../simpleCudaGraphs_vs2017.vcxproj | 4 +- .../simpleCudaGraphs_vs2019.vcxproj | 4 +- Samples/simpleD3D11/README.md | 2 +- .../simpleD3D11/simpleD3D11_vs2017.vcxproj | 4 +- .../simpleD3D11/simpleD3D11_vs2019.vcxproj | 4 +- Samples/simpleD3D12/README.md | 2 +- Samples/simpleD3D12/simpleD3D12.cpp | 15 +- 
.../simpleD3D12/simpleD3D12_vs2017.vcxproj | 4 +- .../simpleD3D12/simpleD3D12_vs2019.vcxproj | 4 +- Samples/simpleDrvRuntime/README.md | 2 +- Samples/simpleDrvRuntime/simpleDrvRuntime.cpp | 12 +- .../simpleDrvRuntime_vs2017.vcxproj | 4 +- .../simpleDrvRuntime_vs2019.vcxproj | 4 +- Samples/simpleGL/README.md | 2 +- Samples/simpleGL/simpleGL_vs2017.vcxproj | 4 +- Samples/simpleGL/simpleGL_vs2019.vcxproj | 4 +- Samples/simpleIPC/README.md | 2 +- Samples/simpleIPC/simpleIPC_vs2017.vcxproj | 4 +- Samples/simpleIPC/simpleIPC_vs2019.vcxproj | 4 +- Samples/simpleVoteIntrinsics/README.md | 2 +- .../simpleVoteIntrinsics_vs2017.vcxproj | 4 +- .../simpleVoteIntrinsics_vs2019.vcxproj | 4 +- Samples/simpleVulkan/README.md | 2 +- Samples/simpleVulkan/SineWaveSimulation.cu | 170 +- Samples/simpleVulkan/SineWaveSimulation.h | 34 +- Samples/simpleVulkan/VulkanBaseApp.cpp | 3201 +++++++++-------- Samples/simpleVulkan/VulkanBaseApp.h | 202 +- Samples/simpleVulkan/main.cpp | 749 ++-- .../simpleVulkan/simpleVulkan_vs2017.vcxproj | 4 +- .../simpleVulkan/simpleVulkan_vs2019.vcxproj | 4 +- Samples/simpleVulkanMMAP/README.md | 2 +- .../simpleVulkanMMAP_vs2017.vcxproj | 4 +- .../simpleVulkanMMAP_vs2019.vcxproj | 4 +- Samples/simpleZeroCopy/README.md | 2 +- .../simpleZeroCopy_vs2017.vcxproj | 4 +- .../simpleZeroCopy_vs2019.vcxproj | 4 +- Samples/streamOrderedAllocation/README.md | 2 +- .../streamOrderedAllocation_vs2017.vcxproj | 4 +- .../streamOrderedAllocation_vs2019.vcxproj | 4 +- Samples/streamOrderedAllocationIPC/Makefile | 423 +++ .../NsightEclipse.xml | 65 + Samples/streamOrderedAllocationIPC/README.md | 60 + .../streamOrderedAllocationIPC.cu | 440 +++ Samples/streamOrderedAllocationP2P/README.md | 2 +- .../streamOrderedAllocationP2P.cu | 35 +- .../streamOrderedAllocationP2P_vs2017.vcxproj | 4 +- .../streamOrderedAllocationP2P_vs2019.vcxproj | 4 +- Samples/systemWideAtomics/README.md | 2 +- Samples/tf32TensorCoreGemm/README.md | 2 +- .../tf32TensorCoreGemm_vs2017.vcxproj | 4 +- .../tf32TensorCoreGemm_vs2019.vcxproj | 4 +- Samples/vectorAddMMAP/README.md | 2 +- .../vectorAddMMAP_vs2017.vcxproj | 4 +- .../vectorAddMMAP_vs2019.vcxproj | 4 +- Samples/vectorAdd_nvrtc/README.md | 2 +- .../vectorAdd_nvrtc_vs2017.vcxproj | 4 +- .../vectorAdd_nvrtc_vs2019.vcxproj | 4 +- Samples/vulkanImageCUDA/README.md | 2 +- .../vulkanImageCUDA_vs2017.vcxproj | 4 +- .../vulkanImageCUDA_vs2019.vcxproj | 4 +- Samples/warpAggregatedAtomicsCG/README.md | 2 +- .../warpAggregatedAtomicsCG_vs2017.vcxproj | 4 +- .../warpAggregatedAtomicsCG_vs2019.vcxproj | 4 +- Samples/watershedSegmentationNPP/Makefile | 6 + Samples/watershedSegmentationNPP/README.md | 2 +- .../watershedSegmentationNPP_vs2017.vcxproj | 4 +- .../watershedSegmentationNPP_vs2019.vcxproj | 4 +- 214 files changed, 6590 insertions(+), 3856 deletions(-) create mode 100644 Samples/simpleCUBLAS_LU/Makefile create mode 100644 Samples/simpleCUBLAS_LU/NsightEclipse.xml create mode 100644 Samples/simpleCUBLAS_LU/README.md create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln create mode 100644 Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj create mode 100644 Samples/streamOrderedAllocationIPC/Makefile create mode 100644 Samples/streamOrderedAllocationIPC/NsightEclipse.xml create mode 100644 Samples/streamOrderedAllocationIPC/README.md create mode 100644 
Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu diff --git a/README.md b/README.md index 0db26681..a4fafa7f 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,17 @@ # CUDA Samples -Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads). +Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads). ## Release Notes This section describes the release notes for the CUDA Samples on GitHub only. +### CUDA 11.3 +* Added `streamOrderedAllocationIPC`. Demonstrates Inter Process Communication using one process per GPU for computation. +* Added `simpleCUBLAS_LU`. Demonstrates batched matrix LU decomposition using cuBLAS API `cublasgetrfBatched()` +* Updated `simpleVulkan`. Demonstrates use of timeline semaphore. +* Updated multiple samples to use pinned memory using `cudaMallocHost()`. + ### CUDA 11.2 * Added `streamOrderedAllocation`. Demonstrates stream ordered memory allocation on a GPU using cudaMallocAsync and cudaMemPool family of APIs. * Added `streamOrderedAllocationP2P`. Demonstrates peer-to-peer access of stream ordered memory allocated using cudaMallocAsync and cudaMemPool family of APIs. @@ -103,7 +109,7 @@ This is the first release of CUDA Samples on GitHub: ### Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). 
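For readers of the release notes above: `simpleCUBLAS_LU` builds on the batched LU factorization entry points of cuBLAS (`cublas<t>getrfBatched()`). The sketch below is not taken from the sample; it is a minimal, illustrative single-precision call with a made-up 3x3 matrix and batch size 1, just to show the pointer-array calling convention the sample exercises at larger scale.

```cpp
// Minimal sketch (not the sample itself): one 3x3 system, batch size 1.
// Assumes a CUDA Toolkit with cuBLAS; error handling is abbreviated.
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
  const int n = 3, lda = 3, batch = 1;
  // Column-major 3x3 matrix (illustrative values).
  float hA[n * n] = {4, 2, 1,   2, 5, 3,   1, 3, 6};

  float *dA = nullptr;
  cudaMalloc(&dA, sizeof(hA));
  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);

  // cublas<t>getrfBatched expects a device array of device pointers.
  float **dAarray = nullptr;
  cudaMalloc(&dAarray, batch * sizeof(float *));
  cudaMemcpy(dAarray, &dA, batch * sizeof(float *), cudaMemcpyHostToDevice);

  int *dPivot = nullptr, *dInfo = nullptr;
  cudaMalloc(&dPivot, n * batch * sizeof(int));
  cudaMalloc(&dInfo, batch * sizeof(int));

  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasSgetrfBatched(handle, n, dAarray, lda, dPivot, dInfo, batch);

  int hInfo = 0;
  cudaMemcpy(&hInfo, dInfo, sizeof(int), cudaMemcpyDeviceToHost);
  printf("getrfBatched info[0] = %d (0 means success)\n", hInfo);

  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dAarray); cudaFree(dPivot); cudaFree(dInfo);
  return 0;
}
```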
### Getting the CUDA Samples @@ -160,38 +166,39 @@ The samples makefiles can take advantage of certain options: ### Samples by OS #### Linux -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | +**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | ---|---|---|---| -**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | -**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | -**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[nvJPEG](./Samples/nvJPEG)** | -**[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | -**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | -**[cudaNvSci](./Samples/cudaNvSci)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | -**[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | -**[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | -**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | -**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | -**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | -**[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | -**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | 
+**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | +**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaNvSci](./Samples/cudaNvSci)** | +**[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | +**[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | +**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[matrixMul](./Samples/matrixMul)** | +**[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | +**[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[reduction](./Samples/reduction)** | +**[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | +**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +**[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | +**[simpleVulkan](./Samples/simpleVulkan)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | +**[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | +**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | #### Windows -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | +**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | ---|---|---|---| -**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** 
| **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | -**[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | -**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[nvJPEG](./Samples/nvJPEG)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[simpleD3D12](./Samples/simpleD3D12)** | -**[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[deviceQuery](./Samples/deviceQuery)** | -**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | -**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | -**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | -**[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | -**[simpleD3D11](./Samples/simpleD3D11)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | -**[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | -**[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | -**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | -**[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** | +**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | +**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | +**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | +**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | 
**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | +**[matrixMul](./Samples/matrixMul)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | +**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | +**[reduction](./Samples/reduction)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | +**[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | +**[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleVulkan](./Samples/simpleVulkan)** | +**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | +**[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | +**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | ## Dependencies diff --git a/Samples/EGLStream_CUDA_Interop/Makefile b/Samples/EGLStream_CUDA_Interop/Makefile index 010ce65c..1e901d99 100644 --- a/Samples/EGLStream_CUDA_Interop/Makefile +++ b/Samples/EGLStream_CUDA_Interop/Makefile @@ -285,6 +285,12 @@ ifeq ($(TARGET_OS),android) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/EGLStream_CUDA_Interop/README.md b/Samples/EGLStream_CUDA_Interop/README.md index b7420b73..2a0f654d 100644 --- a/Samples/EGLStream_CUDA_Interop/README.md +++ b/Samples/EGLStream_CUDA_Interop/README.md @@ -30,7 +30,7 @@ cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/MersenneTwisterGP11213/Makefile b/Samples/MersenneTwisterGP11213/Makefile index e40b5b99..fb3aa590 100644 --- a/Samples/MersenneTwisterGP11213/Makefile +++ b/Samples/MersenneTwisterGP11213/Makefile @@ -263,6 +263,14 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) +SAMPLE_ENABLED := 1 + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - MersenneTwisterGP11213 is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -297,6 +305,10 @@ ALL_CCFLAGS += --threads 0 LIBRARIES += -lcurand_static -lculibos +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + ################################################################################ # Target rules @@ -304,16 +316,23 @@ all: build build: MersenneTwisterGP11213 +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + MersenneTwister.o:MersenneTwister.cpp - $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< MersenneTwisterGP11213: MersenneTwister.o - $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) run: build - ./MersenneTwisterGP11213 + $(EXEC) ./MersenneTwisterGP11213 clean: rm -f MersenneTwisterGP11213 MersenneTwister.o diff --git a/Samples/MersenneTwisterGP11213/MersenneTwister.cpp b/Samples/MersenneTwisterGP11213/MersenneTwister.cpp index bb2916bc..3462512c 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwister.cpp +++ b/Samples/MersenneTwisterGP11213/MersenneTwister.cpp @@ -47,138 +47,134 @@ float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU); -const int DEFAULT_RAND_N = 2400000; +const int DEFAULT_RAND_N = 2400000; const unsigned int DEFAULT_SEED = 777; /////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) -{ - // Start logs - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) { + // Start logs + printf("%s Starting...\n\n", argv[0]); - // initialize the GPU, either identified by --device - // or by picking the device with highest flop rate. - int devID = findCudaDevice(argc, (const char **)argv); + // initialize the GPU, either identified by --device + // or by picking the device with highest flop rate. 
+ int devID = findCudaDevice(argc, (const char **)argv); - // parsing the number of random numbers to generate - int rand_n = DEFAULT_RAND_N; + // parsing the number of random numbers to generate + int rand_n = DEFAULT_RAND_N; - if (checkCmdLineFlag(argc, (const char **) argv, "count")) - { - rand_n = getCmdLineArgumentInt(argc, (const char **) argv, "count"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "count")) { + rand_n = getCmdLineArgumentInt(argc, (const char **)argv, "count"); + } - printf("Allocating data for %i samples...\n", rand_n); + printf("Allocating data for %i samples...\n", rand_n); - // parsing the seed - int seed = DEFAULT_SEED; + // parsing the seed + int seed = DEFAULT_SEED; - if (checkCmdLineFlag(argc, (const char **) argv, "seed")) - { - seed = getCmdLineArgumentInt(argc, (const char **) argv, "seed"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "seed")) { + seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed"); + } - printf("Seeding with %i ...\n", seed); + printf("Seeding with %i ...\n", seed); - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - float *d_Rand; - checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float))); + float *d_Rand; + checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float))); - curandGenerator_t prngGPU; - checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32)); - checkCudaErrors(curandSetStream(prngGPU, stream)); - checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed)); + curandGenerator_t prngGPU; + checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32)); + checkCudaErrors(curandSetStream(prngGPU, stream)); + checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed)); - curandGenerator_t prngCPU; - checkCudaErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32)); - checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed)); + curandGenerator_t prngCPU; + checkCudaErrors( + curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32)); + checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed)); - // - // Example 1: Compare random numbers generated on GPU and CPU - float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); + // + // Example 1: Compare random numbers generated on GPU and CPU + float *h_RandGPU; + checkCudaErrors(cudaMallocHost(&h_RandGPU, rand_n * sizeof(float))); - printf("Generating random numbers on GPU...\n\n"); - checkCudaErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n)); + printf("Generating random numbers on GPU...\n\n"); + checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); - printf("\nReading back the results...\n"); - checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost, stream)); + printf("\nReading back the results...\n"); + checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); - float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); + printf("Generating random numbers on CPU...\n\n"); + checkCudaErrors(curandGenerateUniform(prngCPU, (float *)h_RandCPU, rand_n)); - printf("Generating random numbers on CPU...\n\n"); - checkCudaErrors(curandGenerateUniform(prngCPU, (float *) h_RandCPU, rand_n)); + 
checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Comparing CPU/GPU random numbers...\n\n"); + float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Comparing CPU/GPU random numbers...\n\n"); - float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); + // + // Example 2: Timing of random number generation on GPU + const int numIterations = 10; + int i; + StopWatchInterface *hTimer; - // - // Example 2: Timing of random number generation on GPU - const int numIterations = 10; - int i; - StopWatchInterface *hTimer; + sdkCreateTimer(&hTimer); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - sdkCreateTimer(&hTimer); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + for (i = 0; i < numIterations; i++) { + checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); + } - for (i = 0; i < numIterations; i++) - { - checkCudaErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n)); - } + checkCudaErrors(cudaStreamSynchronize(stream)); + sdkStopTimer(&hTimer); - checkCudaErrors(cudaStreamSynchronize(stream)); - sdkStopTimer(&hTimer); + double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer) / (double)numIterations; - double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; + printf( + "MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, " + "Size = %u Numbers\n", + 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); - printf("MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", - 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); + printf("Shutting down...\n"); - printf("Shutting down...\n"); + checkCudaErrors(curandDestroyGenerator(prngGPU)); + checkCudaErrors(curandDestroyGenerator(prngCPU)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaFree(d_Rand)); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cudaFreeHost(h_RandGPU)); + free(h_RandCPU); - checkCudaErrors(curandDestroyGenerator(prngGPU)); - checkCudaErrors(curandDestroyGenerator(prngCPU)); - checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaFree(d_Rand)); - sdkDeleteTimer(&hTimer); - free(h_RandGPU); - free(h_RandCPU); - - exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); + exit(L1norm < 1e-6 ? 
EXIT_SUCCESS : EXIT_FAILURE); } +float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) { + int i; + float rCPU, rGPU, delta; + float max_delta = 0.; + float sum_delta = 0.; + float sum_ref = 0.; -float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) -{ - int i; - float rCPU, rGPU, delta; - float max_delta = 0.; - float sum_delta = 0.; - float sum_ref = 0.; + for (i = 0; i < rand_n; i++) { + rCPU = h_RandCPU[i]; + rGPU = h_RandGPU[i]; + delta = fabs(rCPU - rGPU); + sum_delta += delta; + sum_ref += fabs(rCPU); - for (i = 0; i < rand_n; i++) - { - rCPU = h_RandCPU[i]; - rGPU = h_RandGPU[i]; - delta = fabs(rCPU - rGPU); - sum_delta += delta; - sum_ref += fabs(rCPU); - - if (delta >= max_delta) - { - max_delta = delta; - } + if (delta >= max_delta) { + max_delta = delta; } + } - float L1norm = (float)(sum_delta / sum_ref); - printf("Max absolute error: %E\n", max_delta); - printf("L1 norm: %E\n\n", L1norm); + float L1norm = (float)(sum_delta / sum_ref); + printf("Max absolute error: %E\n", max_delta); + printf("L1 norm: %E\n\n", L1norm); - return L1norm; + return L1norm; } diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj index 73c7cc50..e39c60a4 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj index 01fcaae8..8648205f 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/MersenneTwisterGP11213/README.md b/Samples/MersenneTwisterGP11213/README.md index c3a82706..eb8bd797 100644 --- a/Samples/MersenneTwisterGP11213/README.md +++ b/Samples/MersenneTwisterGP11213/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
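The MersenneTwister.cpp changes above switch the GPU read-back buffer to pinned memory (`cudaMallocHost`) while keeping the existing MTGP32 generators. The condensed sketch below shows the same pattern end to end under illustrative sizes, not the sample's exact code: one device generator bound to a non-blocking stream, one host generator with the same seed, and an L1-norm comparison of the two result arrays.

```cpp
// Condensed illustration of the pattern in MersenneTwisterGP11213:
// the same MTGP32 generator on GPU and host, pinned buffer for the readback.
// Array size and tolerance handling are illustrative choices.
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cuda_runtime.h>
#include <curand.h>

int main() {
  const size_t n = 1 << 20;
  const unsigned long long seed = 777;

  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  float *dRand = nullptr, *hRandGPU = nullptr;
  cudaMalloc(&dRand, n * sizeof(float));
  cudaMallocHost(&hRandGPU, n * sizeof(float));  // pinned: allows async copy

  curandGenerator_t gpuGen, cpuGen;
  curandCreateGenerator(&gpuGen, CURAND_RNG_PSEUDO_MTGP32);
  curandSetStream(gpuGen, stream);
  curandSetPseudoRandomGeneratorSeed(gpuGen, seed);

  curandCreateGeneratorHost(&cpuGen, CURAND_RNG_PSEUDO_MTGP32);
  curandSetPseudoRandomGeneratorSeed(cpuGen, seed);

  curandGenerateUniform(gpuGen, dRand, n);                 // generate on GPU
  cudaMemcpyAsync(hRandGPU, dRand, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);

  float *hRandCPU = (float *)malloc(n * sizeof(float));
  curandGenerateUniform(cpuGen, hRandCPU, n);              // generate on host
  cudaStreamSynchronize(stream);

  double sumDelta = 0.0, sumRef = 0.0;
  for (size_t i = 0; i < n; i++) {
    sumDelta += fabs(hRandCPU[i] - hRandGPU[i]);
    sumRef   += fabs(hRandCPU[i]);
  }
  printf("L1 norm: %E\n", sumDelta / sumRef);

  curandDestroyGenerator(gpuGen);
  curandDestroyGenerator(cpuGen);
  cudaFree(dRand);
  cudaFreeHost(hRandGPU);
  free(hRandCPU);
  cudaStreamDestroy(stream);
  return 0;
}
```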
## Build and Run diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj index cc3bd1a6..3bbad98a 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -113,6 +113,6 @@ - + diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj index 61b93fa1..a5149390 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/NV12toBGRandResize/README.md b/Samples/NV12toBGRandResize/README.md index 70b2f5ea..8070aaf0 100644 --- a/Samples/NV12toBGRandResize/README.md +++ b/Samples/NV12toBGRandResize/README.md @@ -27,7 +27,7 @@ cudaMemcpy2D, cudaMallocManaged ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/UnifiedMemoryPerf/README.md b/Samples/UnifiedMemoryPerf/README.md index c3dd1857..43cecf11 100644 --- a/Samples/UnifiedMemoryPerf/README.md +++ b/Samples/UnifiedMemoryPerf/README.md @@ -28,7 +28,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj index a8c46f60..b767c25f 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -111,6 +111,6 @@ - + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj index 6b1b3383..cfcb126c 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -107,6 +107,6 @@ - + diff --git a/Samples/bandwidthTest/README.md b/Samples/bandwidthTest/README.md index 04db60d7..8f70b9c0 100644 --- a/Samples/bandwidthTest/README.md +++ b/Samples/bandwidthTest/README.md @@ -27,7 +27,7 @@ cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
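Several of the README updates above (UnifiedMemoryPerf, bandwidthTest) and the 11.3 release note about `cudaMallocHost()` revolve around pinned host memory. As a rough, self-contained illustration of why pinned staging buffers matter — not code from any of these samples — the sketch below times the same host-to-device copy from a pageable and a pinned buffer; the 64 MiB size and 20 iterations are arbitrary.

```cpp
// Rough illustration of the motivation for cudaMallocHost(): time the same
// host->device transfer from a pageable malloc() buffer and a pinned buffer.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static float timeCopy(void *dst, const void *src, size_t bytes) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  for (int i = 0; i < 20; i++)
    cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms / 20.f;  // average per copy
}

int main() {
  const size_t bytes = 64 << 20;  // 64 MiB, arbitrary
  void *dDst = nullptr, *hPageable = malloc(bytes), *hPinned = nullptr;
  cudaMalloc(&dDst, bytes);
  cudaMallocHost(&hPinned, bytes);

  printf("pageable: %.3f ms/copy\n", timeCopy(dDst, hPageable, bytes));
  printf("pinned:   %.3f ms/copy\n", timeCopy(dDst, hPinned, bytes));

  cudaFree(dDst);
  cudaFreeHost(hPinned);
  free(hPageable);
  return 0;
}
```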
## Build and Run diff --git a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj index 68998f73..c6979275 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj index 2c9afc01..40850f7e 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile b/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile index 00ee41fa..fccab0a1 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_OS),darwin) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md b/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md index 430e7aa8..16270de7 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md @@ -28,7 +28,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp index 6079a289..50a6b400 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP.cpp @@ -36,11 +36,13 @@ #include #include +#include #include +#include #include // Note: If you want to view these images we HIGHLY recommend using imagej -// which is free on the internet and works on most platforms +// which is free on the internet and works on most platforms // because it is one of the few image viewing apps that can display 32 // bit integer image data. 
While it normalizes the data to floating // point values for viewing it still provides a good representation of @@ -102,11 +104,12 @@ void tearDown() // Clean up and tear down if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev); if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev); if (pUFBatchPerImageCompressedCountListHost != 0) - free(pUFBatchPerImageCompressedCountListHost); + cudaFreeHost(pUFBatchPerImageCompressedCountListHost); if (pUFBatchSrcDstScratchBufferListHost != 0) - free(pUFBatchSrcDstScratchBufferListHost); - if (pUFBatchSrcDstImageListHost != 0) free(pUFBatchSrcDstImageListHost); - if (pUFBatchSrcImageListHost != 0) free(pUFBatchSrcImageListHost); + cudaFreeHost(pUFBatchSrcDstScratchBufferListHost); + if (pUFBatchSrcDstImageListHost != 0) + cudaFreeHost(pUFBatchSrcDstImageListHost); + if (pUFBatchSrcImageListHost != 0) cudaFreeHost(pUFBatchSrcImageListHost); for (int j = 0; j < NUMBER_OF_IMAGES; j++) { if (pUFCompressedLabelsScratchBufferDev[j] != 0) @@ -115,8 +118,8 @@ void tearDown() // Clean up and tear down cudaFree(pUFGenerateLabelsScratchBufferDev[j]); if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]); if (pInputImageDev[j] != 0) cudaFree(pInputImageDev[j]); - if (pUFLabelHost[j] != 0) free(pUFLabelHost[j]); - if (pInputImageHost[j] != 0) free(pInputImageHost[j]); + if (pUFLabelHost[j] != 0) cudaFreeHost(pUFLabelHost[j]); + if (pInputImageHost[j] != 0) cudaFreeHost(pInputImageHost[j]); } } @@ -177,7 +180,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 1) { if (nWidth != 512 || nHeight != 512) return -1; const char *fileName = "CT_skull_512x512_8u.raw"; @@ -187,7 +190,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 2) { if (nWidth != 509 || nHeight != 335) return -1; const char *fileName = "PCB_METAL_509x335_8u.raw"; @@ -197,7 +200,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 3) { if (nWidth != 1024 || nHeight != 683) return -1; const char *fileName = "PCB2_1024x683_8u.raw"; @@ -207,7 +210,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else if (nImage == 4) { if (nWidth != 1280 || nHeight != 720) return -1; const char *fileName = "PCB_1280x720_8u.raw"; @@ -217,7 +220,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { exit(EXIT_WAIVED); } - bmpFile = fopen(InputFile, "rb"); + FOPEN(bmpFile, InputFile, "rb"); } else { printf("Input file load failed.\n"); return -1; @@ -347,9 +350,11 @@ int main(int argc, char **argv) { oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - pInputImageHost[nImage] = reinterpret_cast(malloc( + checkCudaErrors(cudaMallocHost( + &(pInputImageHost[nImage]), oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height)); - pUFLabelHost[nImage] = reinterpret_cast(malloc( + checkCudaErrors(cudaMallocHost( + &(pUFLabelHost[nImage]), oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); // Use UF functions throughout this sample. 
@@ -409,15 +414,15 @@ int main(int argc, char **argv) { } if (nImage == 0) - bmpFile = fopen(LabelMarkersOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(LabelMarkersOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(LabelMarkersOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(LabelMarkersOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(LabelMarkersOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; @@ -478,15 +483,15 @@ int main(int argc, char **argv) { } if (nImage == 0) - bmpFile = fopen(CompressedMarkerLabelsOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(CompressedMarkerLabelsOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(CompressedMarkerLabelsOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(CompressedMarkerLabelsOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(CompressedMarkerLabelsOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; nSize = 0; @@ -554,10 +559,11 @@ int main(int argc, char **argv) { cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - pUFBatchSrcImageListHost = - reinterpret_cast(malloc(nBatchImageListBytes)); - pUFBatchSrcDstImageListHost = - reinterpret_cast(malloc(nBatchImageListBytes)); + checkCudaErrors( + cudaMallocHost((void **)&pUFBatchSrcImageListHost, nBatchImageListBytes)); + + checkCudaErrors(cudaMallocHost((void **)&pUFBatchSrcDstImageListHost, + nBatchImageListBytes)); NppiSize oMaxROISize = {0, 0}; @@ -620,15 +626,15 @@ int main(int argc, char **argv) { // Save output to files for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) - bmpFile = fopen(LabelMarkersBatchOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(LabelMarkersBatchOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(LabelMarkersBatchOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(LabelMarkersBatchOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(LabelMarkersBatchOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, LabelMarkersBatchOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; @@ -652,12 +658,13 @@ int main(int argc, char **argv) { // Allocate host side scratch buffer point and size list and initialize with // device scratch buffer pointers - pUFBatchSrcDstScratchBufferListHost = - reinterpret_cast( - malloc(NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor))); + checkCudaErrors( + cudaMallocHost((void **)&pUFBatchSrcDstScratchBufferListHost, + 
NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor))); - pUFBatchPerImageCompressedCountListHost = - reinterpret_cast(malloc(NUMBER_OF_IMAGES * sizeof(Npp32u))); + checkCudaErrors( + cudaMallocHost((void **)&pUFBatchPerImageCompressedCountListHost, + +NUMBER_OF_IMAGES * sizeof(Npp32u))); // Start buffer pointer at beginning of full per image buffer list sized // pUFCompressedLabelsScratchBufferDev[0] @@ -728,15 +735,15 @@ int main(int argc, char **argv) { // Save compressed label images into files for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { if (nImage == 0) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb"); else if (nImage == 1) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb"); else if (nImage == 2) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb"); else if (nImage == 3) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb"); else if (nImage == 4) - bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb"); + FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb"); if (bmpFile == NULL) return -1; size_t nSize = 0; diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj index c931c860..0f34cd13 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj index 8c37e349..22205f6b 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/bf16TensorCoreGemm/README.md b/Samples/bf16TensorCoreGemm/README.md index 10775e50..5a51bb4e 100644 --- a/Samples/bf16TensorCoreGemm/README.md +++ b/Samples/bf16TensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
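The batchedLabelMarkersAndLabelCompressionNPP changes above replace direct `fopen` calls with a `FOPEN` macro and plain `malloc` with `cudaMallocHost`. The macro comes from the samples' common helper headers; its definition is not shown in this patch, so the version below is only a plausible sketch of such a portable wrapper (MSVC's `fopen_s` versus plain `fopen`), used with one of the raw input file names that does appear in the sample.

```cpp
// Hedged sketch of a portable FOPEN wrapper in the spirit of the samples'
// helper headers: fopen_s on Windows (where fopen is deprecated by MSVC),
// plain fopen elsewhere. The real macro may differ in detail.
#include <cstdio>

#if defined(_WIN32) || defined(_WIN64)
#define FOPEN(fp, filename, mode) fopen_s(&(fp), (filename), (mode))
#else
#define FOPEN(fp, filename, mode) ((fp) = fopen((filename), (mode)))
#endif

int main() {
  FILE *bmpFile = nullptr;
  // File name taken from the sample's input set.
  FOPEN(bmpFile, "CT_skull_512x512_8u.raw", "rb");
  if (bmpFile == nullptr) {
    printf("Input file could not be opened.\n");
    return -1;
  }
  fclose(bmpFile);
  return 0;
}
```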
## Build and Run diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj index 384240ed..156376ad 100644 --- a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj index a8f8eded..1146105a 100644 --- a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/binaryPartitionCG/README.md b/Samples/binaryPartitionCG/README.md index c24500fb..98c3418d 100644 --- a/Samples/binaryPartitionCG/README.md +++ b/Samples/binaryPartitionCG/README.md @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/binaryPartitionCG/binaryPartitionCG.cu b/Samples/binaryPartitionCG/binaryPartitionCG.cu index 53021c44..341fb4f1 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG.cu +++ b/Samples/binaryPartitionCG/binaryPartitionCG.cu @@ -31,14 +31,16 @@ * 1.) Each thread loads a value from random array. * 2.) then checks if it is odd or even. * 3.) create binary partition group based on the above predicate - * 4.) we count the number of odd/even in the group based on size of the binary groups + * 4.) we count the number of odd/even in the group based on size of the binary + groups * 5.) write it global counter of odd. - * 6.) sum the values loaded by individual threads(using reduce) and write it to global - * even & odd elements sum. + * 6.) sum the values loaded by individual threads(using reduce) and write it to + global even & odd elements sum. * - * **NOTE** : binary_partition results in splitting warp into divergent thread groups - this is not good from performance perspective, but in cases where warp - divergence is inevitable one can use binary_partition group. + * **NOTE** : + * binary_partition results in splitting warp into divergent thread groups + * this is not good from performance perspective, but in cases where warp + * divergence is inevitable one can use binary_partition group. */ #include @@ -48,108 +50,110 @@ namespace cg = cooperative_groups; -void initOddEvenArr(int *inputArr, unsigned int size) -{ - for (int i=0; i < size; i++) - { - inputArr[i] = rand() % 50; - } +void initOddEvenArr(int *inputArr, unsigned int size) { + for (int i = 0; i < size; i++) { + inputArr[i] = rand() % 50; + } } - /** * CUDA kernel device code - * + * * Creates cooperative groups and performs odd/even counting & summation. 
*/ -__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, int *sumOfOddAndEvens, unsigned int size) -{ - cg::thread_block cta = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid(); - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); +__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, + int *sumOfOddAndEvens, unsigned int size) { + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - for (int i = grid.thread_rank(); i < size; i += grid.size()) + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + int elem = inputArr[i]; + auto subTile = cg::binary_partition(tile32, elem & 1); + if (elem & 1) // Odd numbers group { - int elem = inputArr[i]; - auto subTile = cg::binary_partition(tile32, elem & 1); - if (elem & 1) // Odd numbers group - { - int oddGroupSum = cg::reduce(subTile, elem, cg::plus()); + int oddGroupSum = cg::reduce(subTile, elem, cg::plus()); - if (subTile.thread_rank() == 0) - { - // Add number of odds present in this group of Odds. - atomicAdd(numOfOdds, subTile.size()); + if (subTile.thread_rank() == 0) { + // Add number of odds present in this group of Odds. + atomicAdd(numOfOdds, subTile.size()); - // Add local reduction of odds present in this group of Odds. - atomicAdd(&sumOfOddAndEvens[0], oddGroupSum); + // Add local reduction of odds present in this group of Odds. + atomicAdd(&sumOfOddAndEvens[0], oddGroupSum); + } + } else // Even numbers group + { + int evenGroupSum = cg::reduce(subTile, elem, cg::plus()); - } - } - else // Even numbers group - { - int evenGroupSum = cg::reduce(subTile, elem, cg::plus()); - - if (subTile.thread_rank() == 0) - { - // Add local reduction of even present in this group of evens. - atomicAdd(&sumOfOddAndEvens[1], evenGroupSum); - } - } - // reconverge warp so for next loop iteration we ensure convergence of - // above diverged threads to perform coalesced loads of inputArr. - cg::sync(tile32); + if (subTile.thread_rank() == 0) { + // Add local reduction of even present in this group of evens. + atomicAdd(&sumOfOddAndEvens[1], evenGroupSum); + } } + // reconverge warp so for next loop iteration we ensure convergence of + // above diverged threads to perform coalesced loads of inputArr. 
+ cg::sync(tile32); + } } - /** * Host main routine */ -int main(int argc, const char **argv) -{ - int deviceId = findCudaDevice(argc, argv); - int *h_inputArr, *d_inputArr; - int *h_numOfOdds, *d_numOfOdds; - int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems; - unsigned int arrSize = 1024 * 100; +int main(int argc, const char **argv) { + int deviceId = findCudaDevice(argc, argv); + int *h_inputArr, *d_inputArr; + int *h_numOfOdds, *d_numOfOdds; + int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems; + unsigned int arrSize = 1024 * 100; - h_inputArr = new int[arrSize]; - h_numOfOdds = new int[1]; - h_sumOfOddEvenElems = new int[2]; - initOddEvenArr(h_inputArr, arrSize); - - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int)*arrSize)); - checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int))); - checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int)*2)); + checkCudaErrors(cudaMallocHost(&h_inputArr, sizeof(int) * arrSize)); + checkCudaErrors(cudaMallocHost(&h_numOfOdds, sizeof(int))); + checkCudaErrors(cudaMallocHost(&h_sumOfOddEvenElems, sizeof(int) * 2)); + initOddEvenArr(h_inputArr, arrSize); - checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int)*arrSize, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream)); - checkCudaErrors(cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2*sizeof(int), stream)); + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int) * arrSize)); + checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int))); + checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int) * 2)); - //Launch the kernel - int threadsPerBlock=1024; - int blocksPerGrid = arrSize / threadsPerBlock; + checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int) * arrSize, + cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream)); + checkCudaErrors( + cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2 * sizeof(int), stream)); - printf("\nLaunching %d blocks with %d threads...\n\n",blocksPerGrid, threadsPerBlock); + // Launch the kernel + int threadsPerBlock = 0; + int blocksPerGrid = 0; + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( + &blocksPerGrid, &threadsPerBlock, oddEvenCountAndSumCG, 0, 0)); - oddEvenCountAndSumCG<<>>(d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize); + printf("\nLaunching %d blocks with %d threads...\n\n", blocksPerGrid, + threadsPerBlock); - checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, 2*sizeof(int), cudaMemcpyDeviceToHost, stream)); - - printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n", arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0], h_sumOfOddEvenElems[1]); - printf("\n...Done.\n\n"); + oddEvenCountAndSumCG<<>>( + d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize); - delete[] h_inputArr; - delete[] h_numOfOdds; - delete[] h_sumOfOddEvenElems; + checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), + cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, + 2 * sizeof(int), cudaMemcpyDeviceToHost, + stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaFree(d_inputArr)); - 
checkCudaErrors(cudaFree(d_numOfOdds)); - checkCudaErrors(cudaFree(d_sumOfOddEvenElems)); + printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n", + arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0], + h_sumOfOddEvenElems[1]); + printf("\n...Done.\n\n"); - return EXIT_SUCCESS; + checkCudaErrors(cudaFreeHost(h_inputArr)); + checkCudaErrors(cudaFreeHost(h_numOfOdds)); + checkCudaErrors(cudaFreeHost(h_sumOfOddEvenElems)); + + checkCudaErrors(cudaFree(d_inputArr)); + checkCudaErrors(cudaFree(d_numOfOdds)); + checkCudaErrors(cudaFree(d_sumOfOddEvenElems)); + + return EXIT_SUCCESS; } diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj index d2c55039..2399c9ec 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj index 27c33226..fe7bb11f 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/boxFilterNPP/README.md b/Samples/boxFilterNPP/README.md index c14fe05e..54f26d6a 100644 --- a/Samples/boxFilterNPP/README.md +++ b/Samples/boxFilterNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj index b233a75e..580c3df5 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj index 435986b7..91f4db2d 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/cannyEdgeDetectorNPP/README.md b/Samples/cannyEdgeDetectorNPP/README.md index 2d67e1f0..0c969c8e 100644 --- a/Samples/cannyEdgeDetectorNPP/README.md +++ b/Samples/cannyEdgeDetectorNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
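In the binaryPartitionCG host code above, the hard-coded 1024-thread launch is replaced by cudaOccupancyMaxPotentialBlockSize, letting the runtime pick a block size that maximizes occupancy for the kernel. A short sketch of the same pattern, assuming a trivial grid-covering kernel rather than the sample's grid-stride one:

    // Sketch: occupancy-driven launch configuration.
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    __global__ void scaleKernel(float *data, float s, unsigned int n) {
      unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= s;
    }

    void launchScaled(float *d_data, float s, unsigned int n, cudaStream_t stream) {
      int minGridSize = 0, blockSize = 0;
      // Ask the runtime for the block size that maximizes occupancy for this kernel.
      checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
                                                         scaleKernel, 0, 0));
      int gridSize = (n + blockSize - 1) / blockSize;  // cover all n elements
      printf("Launching %d blocks with %d threads\n", gridSize, blockSize);
      scaleKernel<<<gridSize, blockSize, 0, stream>>>(d_data, s, n);
      checkCudaErrors(cudaGetLastError());
    }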
## Build and Run diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj index 78c395f6..f0140b6a 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj index 318815b6..f919b081 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/concurrentKernels/README.md b/Samples/concurrentKernels/README.md index f2933e27..b3a52d91 100644 --- a/Samples/concurrentKernels/README.md +++ b/Samples/concurrentKernels/README.md @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj index a0e1b67a..f8036198 100644 --- a/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj +++ b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj index 6c992bc0..f6224739 100644 --- a/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj +++ b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/Makefile b/Samples/conjugateGradientCudaGraphs/Makefile index 13d1e4ee..6609440a 100644 --- a/Samples/conjugateGradientCudaGraphs/Makefile +++ b/Samples/conjugateGradientCudaGraphs/Makefile @@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - conjugateGradientCudaGraphs is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/conjugateGradientCudaGraphs/README.md b/Samples/conjugateGradientCudaGraphs/README.md index f9a787c9..1e723476 100644 --- a/Samples/conjugateGradientCudaGraphs/README.md +++ b/Samples/conjugateGradientCudaGraphs/README.md @@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
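The conjugateGradientCudaGraphs changes that follow sit inside the capture/replay flow named in the README's API list (cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphLaunch). A hedged sketch of that flow with placeholder work standing in for the sample's cuBLAS/cuSPARSE iteration body:

    // Sketch: capture work issued to a stream into a graph, then replay it.
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    void runCaptured(float *d_buf, size_t bytes, cudaStream_t stream, int iterations) {
      cudaGraph_t graph;
      cudaGraphExec_t graphExec;

      checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
      // Placeholder work; the sample records a sequence of cuBLAS/cuSPARSE calls here.
      checkCudaErrors(cudaMemsetAsync(d_buf, 0, bytes, stream));
      checkCudaErrors(cudaStreamEndCapture(stream, &graph));

      checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
      for (int i = 0; i < iterations; i++) {
        checkCudaErrors(cudaGraphLaunch(graphExec, stream));  // replay without re-recording
      }
      checkCudaErrors(cudaStreamSynchronize(stream));

      checkCudaErrors(cudaGraphExecDestroy(graphExec));
      checkCudaErrors(cudaGraphDestroy(graph));
    }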
## Build and Run diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu index b1528438..7a4a5c8a 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu @@ -25,7 +25,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* * This sample implements a conjugate gradient solver on GPU * using CUBLAS and CUSPARSE with CUDA Graphs @@ -46,7 +45,6 @@ #include // helper function CUDA error checking and initialization #include // helper for shared functions common to CUDA Samples - const char *sSDKname = "conjugateGradientCudaGraphs"; #ifndef WITH_GRAPH @@ -145,12 +143,12 @@ int main(int argc, char **argv) { /* Generate a random tridiagonal symmetric matrix in CSR format */ N = 1048576; nz = (N - 2) * 3 + 4; - I = (int *)malloc(sizeof(int) * (N + 1)); - J = (int *)malloc(sizeof(int) * nz); - val = (float *)malloc(sizeof(float) * nz); + checkCudaErrors(cudaMallocHost(&I, sizeof(int) * (N + 1))); + checkCudaErrors(cudaMallocHost(&J, sizeof(int) * nz)); + checkCudaErrors(cudaMallocHost(&val, sizeof(float) * nz)); genTridiag(I, J, val, N, nz); - x = (float *)malloc(sizeof(float) * N); + checkCudaErrors(cudaMallocHost(&x, sizeof(float) * N)); rhs = (float *)malloc(sizeof(float) * N); for (int i = 0; i < N; i++) { @@ -192,9 +190,9 @@ int main(int argc, char **argv) { /* Wrap raw data into cuSPARSE generic API objects */ cusparseSpMatDescr_t matA = NULL; - checkCudaErrors(cusparseCreateCsr( - &matA, N, N, nz, d_row, d_col, d_val, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, d_row, d_col, d_val, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); cusparseDnVecDescr_t vecx = NULL; checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F)); cusparseDnVecDescr_t vecp = NULL; @@ -206,7 +204,7 @@ int main(int argc, char **argv) { size_t bufferSize = 0; checkCudaErrors(cusparseSpMV_bufferSize( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); + &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); void *buffer = NULL; checkCudaErrors(cudaMalloc(&buffer, bufferSize)); @@ -234,9 +232,9 @@ int main(int argc, char **argv) { beta = 0.0; checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cublasSetStream(cublasHandle, stream1)); checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1)); @@ -248,9 +246,9 @@ int main(int argc, char **argv) { k = 1; // First Iteration when k=1 starts checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1)); - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); 
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); @@ -290,9 +288,9 @@ int main(int argc, char **argv) { checkCudaErrors( cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST)); - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1)); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); @@ -335,8 +333,8 @@ int main(int argc, char **argv) { checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, + &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, buffer)); cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); @@ -395,23 +393,31 @@ int main(int argc, char **argv) { cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); - if (matA ) { checkCudaErrors(cusparseDestroySpMat(matA)); } - if (vecx ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); } - if (vecAx ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); } - if (vecp ) { checkCudaErrors(cusparseDestroyDnVec(vecp)); } + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + if (vecp) { + checkCudaErrors(cusparseDestroyDnVec(vecp)); + } - free(I); - free(J); - free(val); - free(x); + checkCudaErrors(cudaFreeHost(I)); + checkCudaErrors(cudaFreeHost(J)); + checkCudaErrors(cudaFreeHost(val)); + checkCudaErrors(cudaFreeHost(x)); free(rhs); - cudaFree(d_col); - cudaFree(d_row); - cudaFree(d_val); - cudaFree(d_x); - cudaFree(d_r); - cudaFree(d_p); - cudaFree(d_Ax); + checkCudaErrors(cudaFree(d_col)); + checkCudaErrors(cudaFree(d_row)); + checkCudaErrors(cudaFree(d_val)); + checkCudaErrors(cudaFree(d_x)); + checkCudaErrors(cudaFree(d_r)); + checkCudaErrors(cudaFree(d_p)); + checkCudaErrors(cudaFree(d_Ax)); printf("Test Summary: Error amount = %f\n", err); exit((k <= max_iter) ? 
0 : 1); diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj index ed755a2c..a662b455 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj index d14e10fe..5fe964d8 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/README.md b/Samples/conjugateGradientMultiBlockCG/README.md index 0728e9af..217fabf3 100644 --- a/Samples/conjugateGradientMultiBlockCG/README.md +++ b/Samples/conjugateGradientMultiBlockCG/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj index 1d447230..9692e5fe 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj index 67082acf..9952e93f 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/README.md b/Samples/conjugateGradientMultiDeviceCG/README.md index 4c6b7ac8..099a61bd 100644 --- a/Samples/conjugateGradientMultiDeviceCG/README.md +++ b/Samples/conjugateGradientMultiDeviceCG/README.md @@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
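The edits to conjugateGradientCudaGraphs.cu above move from the deprecated CUSPARSE_MV_ALG_DEFAULT enum to CUSPARSE_SPMV_ALG_DEFAULT and, just as importantly, pass the workspace pointer itself (buffer) to cusparseSpMV rather than its address (&buffer). Condensed, the intended call sequence, using the handle and descriptors the sample has already created, is:

    // Query the workspace size, allocate it, then pass the buffer pointer (not &buffer).
    size_t bufferSize = 0;
    checkCudaErrors(cusparseSpMV_bufferSize(
        cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
        &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));

    void *buffer = NULL;
    checkCudaErrors(cudaMalloc(&buffer, bufferSize));

    checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F,
                                 CUSPARSE_SPMV_ALG_DEFAULT, buffer));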
## Build and Run diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu index f7fad12d..d09c9da8 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu @@ -223,8 +223,10 @@ __device__ void gpuDotProduct(float *vecA, float *vecB, int size, cg::sync(cta); if (tile32.meta_group_rank() == 0) { - temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0; - temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); + temp_sum = tile32.thread_rank() < tile32.meta_group_size() + ? tmp[tile32.thread_rank()] + : 0.0; + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); if (tile32.thread_rank() == 0) { atomicAdd(&grid_dot_result, temp_sum); @@ -239,8 +241,9 @@ __device__ void gpuCopyVector(float *srcA, float *destB, int size, } } -__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, int size, - const cg::multi_grid_group &multi_grid) { +__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, + int size, + const cg::multi_grid_group &multi_grid) { for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) { y[i] = a * x[i] + scale * y[i]; } @@ -360,10 +363,11 @@ std::multimap, int> getIdenticalGPUs() { // Filter unsupported devices if (deviceProp.cooperativeMultiDeviceLaunch && deviceProp.concurrentManagedAccess) { - identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), i); + identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), + i); } printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, - deviceProp.name, deviceProp.major, deviceProp.minor); + deviceProp.name, deviceProp.major, deviceProp.minor); } return identicalGpus; @@ -387,15 +391,17 @@ int main(int argc, char **argv) { auto bestFit = std::make_pair(it, it); // use std::distance to find the largest number of GPUs amongst architectures - auto distance = [](decltype(bestFit) p){return std::distance(p.first, p.second);}; + auto distance = [](decltype(bestFit) p) { + return std::distance(p.first, p.second); + }; // Read each unique key/pair element in order for (; it != end; it = gpusByArch.upper_bound(it->first)) { // first and second are iterators bounded within the architecture group auto testFit = gpusByArch.equal_range(it->first); - // Always use devices with highest architecture version or whichever has the most devices available - if (distance(bestFit) <= distance(testFit)) - bestFit = testFit; + // Always use devices with highest architecture version or whichever has the + // most devices available + if (distance(bestFit) <= distance(testFit)) bestFit = testFit; } if (distance(bestFit) < kNumGpusRequired) { @@ -408,33 +414,35 @@ int main(int argc, char **argv) { std::set bestFitDeviceIds; - // check & select peer-to-peer access capable GPU devices as enabling p2p access between participating + // check & select peer-to-peer access capable GPU devices as enabling p2p + // access between participating // GPUs gives better performance for multi_grid sync. 
for (auto itr = bestFit.first; itr != bestFit.second; itr++) { int deviceId = itr->second; checkCudaErrors(cudaSetDevice(deviceId)); - std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds](decltype(*itr) mapPair) { - if (deviceId != mapPair.second) - { + std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds, + &kNumGpusRequired]( + decltype(*itr) mapPair) { + if (deviceId != mapPair.second) { int access = 0; - checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); - printf("Device=%d %s Access Peer Device=%d\n", deviceId, access ? "CAN" : "CANNOT", mapPair.second); + checkCudaErrors( + cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); + printf("Device=%d %s Access Peer Device=%d\n", deviceId, + access ? "CAN" : "CANNOT", mapPair.second); if (access && bestFitDeviceIds.size() < kNumGpusRequired) { bestFitDeviceIds.emplace(deviceId); bestFitDeviceIds.emplace(mapPair.second); - } - else { + } else { printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); } } }); - if (bestFitDeviceIds.size() >= kNumGpusRequired) - { + if (bestFitDeviceIds.size() >= kNumGpusRequired) { printf("Selected p2p capable devices - "); - for (auto devicesItr = bestFitDeviceIds.begin(); devicesItr != bestFitDeviceIds.end(); devicesItr++) - { + for (auto devicesItr = bestFitDeviceIds.begin(); + devicesItr != bestFitDeviceIds.end(); devicesItr++) { printf("deviceId = %d ", *devicesItr); } printf("\n"); @@ -442,33 +450,34 @@ int main(int argc, char **argv) { } } - // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p capable, + // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p + // capable, // hence we add it without p2p capability check. - if (!bestFitDeviceIds.size()) - { - printf("Devices involved are not p2p capable.. selecting %zu of them\n", kNumGpusRequired); - std::for_each(bestFit.first, bestFit.second, [&bestFitDeviceIds](decltype(*bestFit.first) mapPair) { - if (bestFitDeviceIds.size() < kNumGpusRequired) { - bestFitDeviceIds.emplace(mapPair.second); - } - else { - printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); - } - // Insert the sequence into the deviceIds set - }); - } - else - { - // perform cudaDeviceEnablePeerAccess in both directions for all participating devices - // of a cudaLaunchCooperativeKernelMultiDevice call this gives better performance for multi_grid sync. - for (auto p1_itr = bestFitDeviceIds.begin(); p1_itr != bestFitDeviceIds.end(); p1_itr++) - { + if (!bestFitDeviceIds.size()) { + printf("Devices involved are not p2p capable.. selecting %zu of them\n", + kNumGpusRequired); + std::for_each(bestFit.first, bestFit.second, + [&bestFitDeviceIds, + &kNumGpusRequired](decltype(*bestFit.first) mapPair) { + if (bestFitDeviceIds.size() < kNumGpusRequired) { + bestFitDeviceIds.emplace(mapPair.second); + } else { + printf("Ignoring device %i (max devices exceeded)\n", + mapPair.second); + } + // Insert the sequence into the deviceIds set + }); + } else { + // perform cudaDeviceEnablePeerAccess in both directions for all + // participating devices of a cudaLaunchCooperativeKernelMultiDevice call + // this gives better performance for multi_grid sync. 
+ for (auto p1_itr = bestFitDeviceIds.begin(); + p1_itr != bestFitDeviceIds.end(); p1_itr++) { checkCudaErrors(cudaSetDevice(*p1_itr)); - for (auto p2_itr = bestFitDeviceIds.begin(); p2_itr != bestFitDeviceIds.end(); p2_itr++) - { - if (*p1_itr != *p2_itr) - { - checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0 )); + for (auto p2_itr = bestFitDeviceIds.begin(); + p2_itr != bestFitDeviceIds.end(); p2_itr++) { + if (*p1_itr != *p2_itr) { + checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0)); checkCudaErrors(cudaSetDevice(*p1_itr)); } } @@ -518,7 +527,7 @@ int main(int argc, char **argv) { std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl; cudaStream_t nStreams[kNumGpusRequired]; - int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK/32) + 1); + int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK / 32) + 1); int numBlocksPerSm = INT_MAX; int numThreads = THREADS_PER_BLOCK; int numSms = INT_MAX; @@ -530,17 +539,16 @@ int main(int argc, char **argv) { checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId)); - int numBlocksPerSm_current=0; + int numBlocksPerSm_current = 0; checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, sMemSize)); + &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, + sMemSize)); - if (numBlocksPerSm > numBlocksPerSm_current) - { - numBlocksPerSm = numBlocksPerSm_current; + if (numBlocksPerSm > numBlocksPerSm_current) { + numBlocksPerSm = numBlocksPerSm_current; } - if (numSms > deviceProp.multiProcessorCount) - { - numSms = deviceProp.multiProcessorCount; + if (numSms > deviceProp.multiProcessorCount) { + numSms = deviceProp.multiProcessorCount; } deviceId++; } @@ -554,7 +562,7 @@ int main(int argc, char **argv) { int device_count = 0; int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK; - deviceId = bestFitDeviceIds.begin();; + deviceId = bestFitDeviceIds.begin(); while (deviceId != bestFitDeviceIds.end()) { checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaStreamCreate(&nStreams[device_count])); @@ -621,14 +629,15 @@ int main(int argc, char **argv) { printf("Total threads per GPU = %d numBlocksPerSm = %d\n", numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm); - dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1); + dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), + dimBlock(THREADS_PER_BLOCK, 1, 1); void *kernelArgs[] = { (void *)&I, (void *)&J, (void *)&val, (void *)&x, (void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result, (void *)&nz, (void *)&N, (void *)&tol, }; - cudaLaunchParams *launchParamsList = (cudaLaunchParams *)malloc( - sizeof(cudaLaunchParams) * kNumGpusRequired); + cudaLaunchParams *launchParamsList = + (cudaLaunchParams *)malloc(sizeof(cudaLaunchParams) * kNumGpusRequired); for (int i = 0; i < kNumGpusRequired; i++) { launchParamsList[i].func = (void *)multiGpuConjugateGradient; launchParamsList[i].gridDim = dimGrid; @@ -645,12 +654,11 @@ int main(int argc, char **argv) { cudaCooperativeLaunchMultiDeviceNoPreSync | cudaCooperativeLaunchMultiDeviceNoPostSync)); + checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); checkCudaErrors( - cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); - checkCudaErrors( - cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); + cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); - deviceId = bestFitDeviceIds.begin();; + deviceId 
= bestFitDeviceIds.begin(); device_count = 0; while (deviceId != bestFitDeviceIds.end()) { checkCudaErrors(cudaSetDevice(*deviceId)); @@ -658,7 +666,7 @@ int main(int argc, char **argv) { deviceId++; } - r1 = *dot_result; + r1 = (float)*dot_result; printf("GPU Final, residual = %e \n ", sqrt(r1)); diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj index f6ab8299..281b9f54 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj index bd54470f..da03363e 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cuSolverDn_LinearSolver/Makefile b/Samples/cuSolverDn_LinearSolver/Makefile index f8b34a31..61e55f47 100644 --- a/Samples/cuSolverDn_LinearSolver/Makefile +++ b/Samples/cuSolverDn_LinearSolver/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - cuSolverDn_LinearSolver is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ifeq ($(TARGET_OS),linux) ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" endif diff --git a/Samples/cuSolverDn_LinearSolver/README.md b/Samples/cuSolverDn_LinearSolver/README.md index 194c9e42..185f577b 100644 --- a/Samples/cuSolverDn_LinearSolver/README.md +++ b/Samples/cuSolverDn_LinearSolver/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
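The multi-device CG changes above keep the same peer-to-peer setup: each candidate pair is checked with cudaDeviceCanAccessPeer and, when access is possible, enabled in both directions with cudaDeviceEnablePeerAccess before the cooperative multi-device launch. A minimal sketch of that setup for one pair of device IDs (helper name is illustrative, not from the sample):

    // Sketch: enable P2P access in both directions between two GPUs.
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    void enablePeerAccessPair(int devA, int devB) {
      int aCanB = 0, bCanA = 0;
      checkCudaErrors(cudaDeviceCanAccessPeer(&aCanB, devA, devB));
      checkCudaErrors(cudaDeviceCanAccessPeer(&bCanA, devB, devA));
      if (!aCanB || !bCanA) return;  // caller falls back to the non-P2P path

      checkCudaErrors(cudaSetDevice(devA));
      checkCudaErrors(cudaDeviceEnablePeerAccess(devB, 0));  // flags must be 0

      checkCudaErrors(cudaSetDevice(devB));
      checkCudaErrors(cudaDeviceEnablePeerAccess(devA, 0));
    }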
## Build and Run diff --git a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj index 910879f6..8d77015c 100644 --- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj +++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -110,6 +110,6 @@ - + diff --git a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj index 194bb165..d3f1e05e 100644 --- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj +++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/cuSolverSp_LinearSolver/Makefile b/Samples/cuSolverSp_LinearSolver/Makefile index 59a043d9..cc002581 100644 --- a/Samples/cuSolverSp_LinearSolver/Makefile +++ b/Samples/cuSolverSp_LinearSolver/Makefile @@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - cuSolverSp_LinearSolver is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ifeq ($(TARGET_OS),linux) ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" endif diff --git a/Samples/cuSolverSp_LinearSolver/README.md b/Samples/cuSolverSp_LinearSolver/README.md index 45e2e442..35c105ee 100644 --- a/Samples/cuSolverSp_LinearSolver/README.md +++ b/Samples/cuSolverSp_LinearSolver/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp index fabb33fb..331c733b 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp @@ -495,13 +495,13 @@ int main(int argc, char *argv[]) { size_t bufferSize = 0; checkCudaErrors(cusparseSpMV_bufferSize( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); + &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); void *buffer = NULL; checkCudaErrors(cudaMalloc(&buffer, bufferSize)); checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_MV_ALG_DEFAULT, &buffer)); + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost, stream)); @@ -559,7 +559,7 @@ int main(int argc, char *argv[]) { checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_MV_ALG_DEFAULT, &buffer)); + CUSPARSE_SPMV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost, stream)); diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj index 57cdcc14..1bdf5779 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -110,6 +110,6 @@ - + diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj index 8f85e583..665f795e 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/cudaCompressibleMemory/README.md b/Samples/cudaCompressibleMemory/README.md index aaa62565..6492f737 100644 --- a/Samples/cudaCompressibleMemory/README.md +++ b/Samples/cudaCompressibleMemory/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj index 58de5026..4f450a2c 100644 --- a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj index 19b01a21..a932dd47 100644 --- a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cudaNvSci/Makefile b/Samples/cudaNvSci/Makefile index 1ef041a8..d7db232f 100644 --- a/Samples/cudaNvSci/Makefile +++ b/Samples/cudaNvSci/Makefile @@ -279,6 +279,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - cudaNvSci is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/cudaNvSci/README.md b/Samples/cudaNvSci/README.md index 8a4f56bf..58d95d19 100644 --- a/Samples/cudaNvSci/README.md +++ b/Samples/cudaNvSci/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cudaNvSciNvMedia/README.md b/Samples/cudaNvSciNvMedia/README.md index 2ff41323..a8e1a41c 100644 --- a/Samples/cudaNvSciNvMedia/README.md +++ b/Samples/cudaNvSciNvMedia/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cudaOpenMP/README.md b/Samples/cudaOpenMP/README.md index fb9c1a2b..c2f88493 100644 --- a/Samples/cudaOpenMP/README.md +++ b/Samples/cudaOpenMP/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
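The cudaOpenMP sample drives multiple GPUs from OpenMP host threads using the runtime calls listed above (cudaMalloc, cudaFree, cudaMemcpy). A hedged sketch of that thread-per-GPU pattern, not the sample's code:

    // Sketch: one OpenMP host thread per visible GPU, each issuing its own CUDA work.
    #include <omp.h>
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <helper_cuda.h>

    void runPerGpu(size_t bytesPerGpu) {
      int numGpus = 0;
      checkCudaErrors(cudaGetDeviceCount(&numGpus));
    #pragma omp parallel num_threads(numGpus)
      {
        int dev = omp_get_thread_num();
        checkCudaErrors(cudaSetDevice(dev));   // bind this host thread to one GPU
        void *d_buf = NULL;
        checkCudaErrors(cudaMalloc(&d_buf, bytesPerGpu));
        checkCudaErrors(cudaMemset(d_buf, 0, bytesPerGpu));
        checkCudaErrors(cudaDeviceSynchronize());
        checkCudaErrors(cudaFree(d_buf));
        printf("host thread %d finished on device %d\n", omp_get_thread_num(), dev);
      }
    }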
## Build and Run diff --git a/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj b/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj index 28041d97..d3a04a75 100644 --- a/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj +++ b/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj b/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj index 51dfcf9b..59018165 100644 --- a/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj +++ b/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/README.md b/Samples/cudaTensorCoreGemm/README.md index 4f5b2152..502941ae 100644 --- a/Samples/cudaTensorCoreGemm/README.md +++ b/Samples/cudaTensorCoreGemm/README.md @@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj index 748796aa..622ffe8f 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj index cbb96dc4..fb649d4c 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/deviceQuery/README.md b/Samples/deviceQuery/README.md index bbbb7d18..76fb08d5 100644 --- a/Samples/deviceQuery/README.md +++ b/Samples/deviceQuery/README.md @@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
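The deviceQuery edits below are formatting-only (wider field widths for the multiprocessor counts and re-wrapped sprintf_s calls). For context, a hedged sketch of the per-device query loop those printed lines come from, using the _ConvertSMVer2Cores helper from helper_cuda.h:

    // Sketch: enumerate devices and report SM and CUDA-core counts.
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <helper_cuda.h>   // _ConvertSMVer2Cores, checkCudaErrors

    void printCoreCounts() {
      int deviceCount = 0;
      checkCudaErrors(cudaGetDeviceCount(&deviceCount));
      for (int dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp prop;
        checkCudaErrors(cudaGetDeviceProperties(&prop, dev));
        int coresPerSM = _ConvertSMVer2Cores(prop.major, prop.minor);
        printf("  (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
               prop.multiProcessorCount, coresPerSM,
               coresPerSM * prop.multiProcessorCount);
      }
    }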
## Build and Run diff --git a/Samples/deviceQuery/deviceQuery.cpp b/Samples/deviceQuery/deviceQuery.cpp index c002cc5c..4c6c3369 100644 --- a/Samples/deviceQuery/deviceQuery.cpp +++ b/Samples/deviceQuery/deviceQuery.cpp @@ -112,10 +112,10 @@ int main(int argc, char **argv) { char msg[256]; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) sprintf_s(msg, sizeof(msg), - " Total amount of global memory: %.0f MBytes " - "(%llu bytes)\n", - static_cast(deviceProp.totalGlobalMem / 1048576.0f), - (unsigned long long)deviceProp.totalGlobalMem); + " Total amount of global memory: %.0f MBytes " + "(%llu bytes)\n", + static_cast(deviceProp.totalGlobalMem / 1048576.0f), + (unsigned long long)deviceProp.totalGlobalMem); #else snprintf(msg, sizeof(msg), " Total amount of global memory: %.0f MBytes " @@ -125,7 +125,7 @@ int main(int argc, char **argv) { #endif printf("%s", msg); - printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", + printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * @@ -250,8 +250,7 @@ int main(int argc, char **argv) { "device)", "Exclusive Process (many threads in one process is able to use " "::cudaSetDevice() with this device)", - "Unknown", - NULL}; + "Unknown", NULL}; printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); } @@ -272,7 +271,7 @@ int main(int argc, char **argv) { // must be enabled to support this && prop[i].tccDriver #endif - ) { + ) { // This is an array of P2P capable GPUs gpuid[gpu_p2p_count++] = i; } @@ -307,7 +306,8 @@ int main(int argc, char **argv) { // driver version sProfileString += ", CUDA Driver Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); + sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, + (driverVersion % 100) / 10); #else snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10); @@ -317,7 +317,8 @@ int main(int argc, char **argv) { // Runtime version sProfileString += ", CUDA Runtime Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); + sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, + (runtimeVersion % 100) / 10); #else snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10); diff --git a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj index 9fc4c3e9..5bd56297 100644 --- a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj index b8fd169a..f8532544 100644 --- a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/dmmaTensorCoreGemm/README.md b/Samples/dmmaTensorCoreGemm/README.md index 9730739a..aa6e6f16 100644 --- a/Samples/dmmaTensorCoreGemm/README.md +++ b/Samples/dmmaTensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA 
Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj index 7d8414e5..5ea929c1 100644 --- a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj index 4fe4bfc6..b415db92 100644 --- a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/globalToShmemAsyncCopy/README.md b/Samples/globalToShmemAsyncCopy/README.md index bd13a04d..233d5b50 100644 --- a/Samples/globalToShmemAsyncCopy/README.md +++ b/Samples/globalToShmemAsyncCopy/README.md @@ -30,7 +30,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu index ad0229a7..c1b70b46 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu @@ -28,12 +28,14 @@ /** * Matrix multiplication: C = A * B. * - * This sample demonstrates implements matrix multiplication which makes use of shared memory - * to ensure data reuse, the matrix multiplication is done using tiling approach. - * With compute capability 8.0 or higher the CUDA kernels involved uses asynchronously copy data - * from global to shared memory; a.k.a., async-copy. - * This sample has been written for clarity of exposition to illustrate various CUDA programming - * principles, not with the goal of providing the most performant generic kernel for matrix multiplication. + * This sample demonstrates implements matrix multiplication which makes use of + * shared memory to ensure data reuse, the matrix multiplication is done using + * tiling approach. + * With compute capability 8.0 or higher the CUDA kernels involved uses + * asynchronously copy data from global to shared memory; a.k.a., async-copy. + * This sample has been written for clarity of exposition to illustrate various + * CUDA programming principles, not with the goal of providing the most + * performant generic kernel for matrix multiplication. 
*/ // System includes @@ -47,7 +49,7 @@ #if __CUDA_ARCH__ >= 700 #include #endif -#include +#include namespace cg = cooperative_groups; @@ -55,966 +57,1015 @@ namespace cg = cooperative_groups; #include #include -enum kernels -{ - AsyncCopyMultiStageLargeChunk = 0, - AsyncCopyLargeChunk = 1, - AsyncCopyLargeChunkAWBarrier = 2, - AsyncCopyMultiStageSharedState = 3, - AsyncCopyMultiStage = 4, - AsyncCopySingleStage = 5, - Naive = 6, - NaiveLargeChunk = 7 +enum kernels { + AsyncCopyMultiStageLargeChunk = 0, + AsyncCopyLargeChunk = 1, + AsyncCopyLargeChunkAWBarrier = 2, + AsyncCopyMultiStageSharedState = 3, + AsyncCopyMultiStage = 4, + AsyncCopySingleStage = 5, + Naive = 6, + NaiveLargeChunk = 7 }; -const char* kernelNames[] = {"AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk", - "AsyncCopyLargeChunkAWBarrier", "AsyncCopyMultiStageSharedState", - "AsyncCopyMultiStage", "AsyncCopySingleStage", "Naive", "NaiveLargeChunk"}; +const char *kernelNames[] = {"AsyncCopyMultiStageLargeChunk", + "AsyncCopyLargeChunk", + "AsyncCopyLargeChunkAWBarrier", + "AsyncCopyMultiStageSharedState", + "AsyncCopyMultiStage", + "AsyncCopySingleStage", + "Naive", + "NaiveLargeChunk"}; constexpr int blockSize = 16; // Multi Stage memcpy_async pipeline with large chunk copy -template __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Requires BLOCK_SIZE % 4 == 0 +template +__global__ void MatrixMulAsyncCopyMultiStageLargeChunk( + float *__restrict__ C, const float *__restrict__ A, + const float *__restrict__ B, int wA, int wB) { + // Requires BLOCK_SIZE % 4 == 0 - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ alignas(alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ alignas( + alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ alignas(alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ alignas( + alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * (BLOCK_SIZE) * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * (BLOCK_SIZE)*blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + 
// Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - const int t4x = threadIdx.x * 4; - const auto shape4 = cuda::aligned_size_t(sizeof(float4)); + const int t4x = threadIdx.x * 4; + const auto shape4 = cuda::aligned_size_t(sizeof(float4)); - cuda::pipeline pipe = cuda::make_pipeline(); - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; a += aStep, b += bStep, ++i ) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix - for ( ; aStage <= a + aStep * maxPipelineStages ; aStage += aStep, bStage += bStep, ++iStage ) - { - pipe.producer_acquire(); - if ( aStage <= aEnd && t4x < BLOCK_SIZE ) - { - // Rotating buffer - const int j = iStage % maxPipelineStages; - cuda::memcpy_async(&As[j][threadIdx.y][t4x], &A[aStage + wA * threadIdx.y + t4x], shape4, pipe); - cuda::memcpy_async(&Bs[j][threadIdx.y][t4x], &B[aStage + wA * threadIdx.y + t4x], shape4, pipe); - } - pipe.producer_commit(); - } - - pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); + cuda::pipeline pipe = cuda::make_pipeline(); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, + iStage = 0; + a <= aEnd; a += aStep, b += bStep, ++i) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + for (; aStage <= a + aStep * maxPipelineStages; + aStage += aStep, bStage += bStep, ++iStage) { + pipe.producer_acquire(); + if (aStage <= aEnd && t4x < BLOCK_SIZE) { // Rotating buffer - const int j = i % maxPipelineStages; - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; - } - pipe.consumer_release(); - - // Don't have to synchronize because maxPipelineStages is greater than one - // therefore next iteration is loading to a different buffer. 
+ const int j = iStage % maxPipelineStages; + cuda::memcpy_async(&As[j][threadIdx.y][t4x], + &A[aStage + wA * threadIdx.y + t4x], shape4, pipe); + cuda::memcpy_async(&Bs[j][threadIdx.y][t4x], + &B[aStage + wA * threadIdx.y + t4x], shape4, pipe); + } + pipe.producer_commit(); } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; -} - - -// Single Stage memcpy_async pipeline with Large copy chunk (float4) -template __global__ void MatrixMulAsyncCopyLargeChunk(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Requires BLOCK_SIZE % 4 == 0 - - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; - - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; - - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; - - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; - - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; - - // Single-stage pipeline version - float Csub = 0.0; - - const int t4x = threadIdx.x * 4; - const auto shape4 = cuda::aligned_size_t(sizeof(float4)); - cuda::pipeline pipe = cuda::make_pipeline(); - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; - // a subset of threads loads a contiguous chunk of elements. - - // Previously, per-thread: - // As[ty][tx] = A[a + wA * ty + tx]; - // Bs[ty][tx] = B[b + wB * ty + tx]; - - // Now, one fourth of the threads load four elements of each matrix - if ( t4x < BLOCK_SIZE ) { - - pipe.producer_acquire(); - - cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x], shape4, pipe); - cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x], shape4, pipe); - - pipe.producer_commit(); - pipe.consumer_wait(); - } - - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - pipe.consumer_release(); - - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. 
- __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; -} - -// Single Stage memcpy_async pipeline with Large copy chunk (float4) using arrive-wait barrier -template __global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { -#if __CUDA_ARCH__ >= 700 -#pragma diag_suppress static_var_with_dynamic_init - // Requires BLOCK_SIZE % 4 == 0 - - __shared__ cuda::barrier bar; - - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - - if (threadIdx.x == 0) { - init(&bar, blockDim.x*blockDim.y); - } + pipe.consumer_wait(); + // Synchronize to make sure the matrices are loaded __syncthreads(); - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Rotating buffer + const int j = i % maxPipelineStages; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; - - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; - - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; - - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; - - float Csub = 0.0; - - const int t4x = threadIdx.x * 4; - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; - // a subset of threads loads a contiguous chunk of elements. - - // Now, one fourth of the threads load four elements of each matrix - if ( t4x < BLOCK_SIZE ) { - float4 * const A4s = reinterpret_cast(& As[threadIdx.y][t4x]); - float4 * const B4s = reinterpret_cast(& Bs[threadIdx.y][t4x]); - const float4 * const A4 = reinterpret_cast(& A[a + wA * threadIdx.y + t4x]); - const float4 * const B4 = reinterpret_cast(& B[a + wA * threadIdx.y + t4x]); - - cuda::memcpy_async(A4s, A4, sizeof(float4), bar); - cuda::memcpy_async(B4s, B4, sizeof(float4), bar); - } - - // Synchronize to make sure the matrices are loaded - bar.arrive_and_wait(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + pipe.consumer_release(); - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. - bar.arrive_and_wait(); + // Don't have to synchronize because maxPipelineStages is greater than one + // therefore next iteration is loading to a different buffer. 
+ } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +// Single Stage memcpy_async pipeline with Large copy chunk (float4) +template +__global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C, + const float *__restrict__ A, + const float *__restrict__ B, + int wA, int wB) { + // Requires BLOCK_SIZE % 4 == 0 + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Single-stage pipeline version + float Csub = 0.0; + + const int t4x = threadIdx.x * 4; + const auto shape4 = cuda::aligned_size_t(sizeof(float4)); + cuda::pipeline pipe = cuda::make_pipeline(); + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. + + // Previously, per-thread: + // As[ty][tx] = A[a + wA * ty + tx]; + // Bs[ty][tx] = B[b + wB * ty + tx]; + + // Now, one fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + pipe.producer_acquire(); + + cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x], + shape4, pipe); + cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x], + shape4, pipe); + + pipe.producer_commit(); + pipe.consumer_wait(); } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Synchronize to make sure the matrices are loaded + __syncthreads(); + +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + pipe.consumer_release(); + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. 
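The shape4 value threaded through these float4 kernels is a cuda::aligned_size_t: a byte count whose guaranteed alignment travels as a template argument, which lets memcpy_async choose wider (and, on hardware that supports it, asynchronous) copy instructions. A minimal sketch follows; the helper name stageRow is made up, and it assumes the pointers really do have the stated alignments.

```cuda
// Illustrative helper (not part of the sample): stage one 16-byte chunk and
// one single float through the same thread-scope pipeline.
#include <cuda/pipeline>

__device__ void stageRow(float *smem4, const float *gmem4,  // 16-byte aligned
                         float *smem1, const float *gmem1,  // 4-byte aligned
                         cuda::pipeline<cuda::thread_scope_thread> &pipe) {
  // Size and guaranteed alignment travel together in the "shape".
  const auto shape4 = cuda::aligned_size_t<alignof(float4)>(sizeof(float4));
  const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));

  pipe.producer_acquire();
  cuda::memcpy_async(smem4, gmem4, shape4, pipe);  // one float4-sized chunk
  cuda::memcpy_async(smem1, gmem1, shape1, pipe);  // one float element
  pipe.producer_commit();
}
```

Passing a plain size_t would also be correct, but the copy could then only assume minimal alignment; conversely, using aligned_size_t with pointers that are not actually aligned that way is undefined.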
+ __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} + +// Single Stage memcpy_async pipeline with Large copy chunk (float4) using +// arrive-wait barrier +template +__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier( + float *__restrict__ C, const float *__restrict__ A, + const float *__restrict__ B, int wA, int wB) { +#if __CUDA_ARCH__ >= 700 +#pragma diag_suppress static_var_with_dynamic_init + // Requires BLOCK_SIZE % 4 == 0 + + __shared__ cuda::barrier bar; + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + if (threadIdx.x == 0) { + init(&bar, blockDim.x * blockDim.y); + } + __syncthreads(); + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + float Csub = 0.0; + + const int t4x = threadIdx.x * 4; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. + + // Now, one fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + float4 *const A4s = reinterpret_cast(&As[threadIdx.y][t4x]); + float4 *const B4s = reinterpret_cast(&Bs[threadIdx.y][t4x]); + const float4 *const A4 = + reinterpret_cast(&A[a + wA * threadIdx.y + t4x]); + const float4 *const B4 = + reinterpret_cast(&B[a + wA * threadIdx.y + t4x]); + + cuda::memcpy_async(A4s, A4, sizeof(float4), bar); + cuda::memcpy_async(B4s, B4, sizeof(float4), bar); + } + + // Synchronize to make sure the matrices are loaded + bar.arrive_and_wait(); + +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. 
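The arrive-wait-barrier kernel above binds each memcpy_async to a block-scoped cuda::barrier instead of a pipeline. Stripped of the matrix indexing (and of the __CUDA_ARCH__ guard the sample uses), the pattern is roughly the hypothetical kernel below; tileScale, the 256-element tile and the scaling step are assumptions for illustration only.

```cuda
// Hypothetical kernel: copy one tile with memcpy_async bound to a barrier,
// then scale it. Assumes a 1-D block with blockDim.x <= 256.
#include <cuda/barrier>

__global__ void tileScale(float *out, const float *in, float s) {
#pragma diag_suppress static_var_with_dynamic_init
  __shared__ float tile[256];
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;

  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);  // expected arrivals: every thread in the block
  }
  __syncthreads();           // make the initialized barrier visible to all

  const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  // The copy is registered with the barrier rather than with a pipeline stage.
  cuda::memcpy_async(&tile[threadIdx.x], &in[i], sizeof(float), bar);

  // Completes once every thread has arrived and every registered copy is done.
  bar.arrive_and_wait();

  out[i] = s * tile[threadIdx.x];
}
```

One arrive_and_wait() stands in for both "the copies have landed" and "every thread has arrived"; the second arrive_and_wait() in the kernel above plays the role that the trailing __syncthreads() plays in the synchronous kernels.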
+ bar.arrive_and_wait(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; #endif } // Single Stage memcpy_async pipeline with float copy -template __global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A, - const float *B, int wA, - int wB) { +template +__global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A, + const float *B, int wA, int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Single-stage pipeline version + float Csub = 0.0; - // Single-stage pipeline version - float Csub = 0.0; + cuda::pipeline pipe = cuda::make_pipeline(); + const auto shape1 = cuda::aligned_size_t(sizeof(float)); - cuda::pipeline pipe = cuda::make_pipeline(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + { + pipe.producer_acquire(); + cuda::memcpy_async(&As[threadIdx.y][threadIdx.x], + &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe); + cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x], + &B[b + wB * threadIdx.y + threadIdx.x], shape1, pipe); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix - { - pipe.producer_acquire(); - - cuda::memcpy_async(&As[threadIdx.y][threadIdx.x], &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe); - cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x], &B[b + wB * threadIdx.y + threadIdx.x], shape1, pipe); - - pipe.producer_commit(); - } - - pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two 
matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. - __syncthreads(); + pipe.producer_commit(); } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + pipe.consumer_wait(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); + +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } -// Multi Stage memcpy_async thread_scope_thread pipeline with single-element async-copy -template __global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; +// Multi Stage memcpy_async thread_scope_thread pipeline with single-element +// async-copy +template +__global__ void MatrixMulAsyncCopyMultiStage(float *__restrict__ C, + const float *__restrict__ A, + const float *__restrict__ B, + int wA, int wB) { + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the 
sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - cuda::pipeline pipe = cuda::make_pipeline(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); + cuda::pipeline pipe = cuda::make_pipeline(); + const auto shape1 = cuda::aligned_size_t(sizeof(float)); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; a += aStep, b += bStep, ++i ) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, + iStage = 0; + a <= aEnd; a += aStep, b += bStep, ++i) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix - for ( ; aStage <= a + aStep * maxPipelineStages ; aStage += aStep, bStage += bStep, ++iStage ) - { - if ( aStage <= aEnd ) - { - // Rotating buffer - const int j = iStage % maxPipelineStages; + for (; aStage <= a + aStep * maxPipelineStages; + aStage += aStep, bStage += bStep, ++iStage) { + if (aStage <= aEnd) { + // Rotating buffer + const int j = iStage % maxPipelineStages; - pipe.producer_acquire(); + pipe.producer_acquire(); - cuda::memcpy_async(&As[j][threadIdx.y][threadIdx.x], &A[aStage + wA * threadIdx.y + threadIdx.x], shape1, pipe); - cuda::memcpy_async(&Bs[j][threadIdx.y][threadIdx.x], &B[bStage + wB * threadIdx.y + threadIdx.x], shape1, pipe); + cuda::memcpy_async(&As[j][threadIdx.y][threadIdx.x], + &A[aStage + wA * threadIdx.y + threadIdx.x], shape1, + pipe); + cuda::memcpy_async(&Bs[j][threadIdx.y][threadIdx.x], + &B[bStage + wB * threadIdx.y + threadIdx.x], shape1, + pipe); - pipe.producer_commit(); - } - } - pipe.consumer_wait(); + pipe.producer_commit(); + } + } + pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); - const int j = i % maxPipelineStages; + const int j = i % maxPipelineStages; - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; - } - - pipe.consumer_release(); - // Don't have to synchronize because maxPipelineStages is greater than one - // therefore next iteration is loading to a different buffer. + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; } - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + pipe.consumer_release(); + // Don't have to synchronize because maxPipelineStages is greater than one + // therefore next iteration is loading to a different buffer. 
+ } + + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } // Multi Stage shared state memcpy_async pipeline thread_scope_block // with parititioned producer & consumer, here we've 1 warp as producer // group which issues memcpy_async operations and rest all warps are part of -// consumer group which perform gemm computation on the loaded matrices by producer. -template __global__ void MatrixMulAsyncCopyMultiStageSharedState(float* __restrict__ C, - const float* __restrict__ A, - const float* __restrict__ B, int wA, - int wB) { - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; +// consumer group which perform gemm computation on the loaded matrices by +// producer. +template +__global__ void MatrixMulAsyncCopyMultiStageSharedState( + float *__restrict__ C, const float *__restrict__ A, + const float *__restrict__ B, int wA, int wB) { + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - constexpr int aStep = BLOCK_SIZE_X; + // Step size used to iterate through the sub-matrices of A + constexpr int aStep = BLOCK_SIZE_X; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE_X * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE_X * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE_X * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE_X * wB; - auto cta = cg::this_thread_block(); + auto cta = cg::this_thread_block(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); - __shared__ cuda::pipeline_shared_state shared_state; - constexpr int consumer_row_count = BLOCK_SIZE_X; + const auto shape1 = cuda::aligned_size_t(sizeof(float)); + __shared__ cuda::pipeline_shared_state shared_state; + constexpr int consumer_row_count = BLOCK_SIZE_X; - const auto thread_role = (cta.thread_index().y < consumer_row_count) - ? cuda::pipeline_role::consumer - : cuda::pipeline_role::producer; - auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role); + const auto thread_role = (cta.thread_index().y < consumer_row_count) + ? 
cuda::pipeline_role::consumer + : cuda::pipeline_role::producer; + auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; - a <= aEnd; a += aStep, b += bStep, ++i) { - if (threadIdx.y >= consumer_row_count) { - // this is a whole producer warp because threadIdx.y >= 16 where 16 == consumer_row_count, - // which loads the matrices from device memory to shared memory; - for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) { - if (aStage <= aEnd) { - // Rotating buffer - const int j = iStage % maxPipelineStages; - const int strideRows = (blockDim.y - consumer_row_count); - pipe.producer_acquire(); - for (int rowId = threadIdx.y - consumer_row_count; rowId < BLOCK_SIZE_X; rowId += strideRows) { - cuda::memcpy_async(&As[j][rowId][threadIdx.x], - &A[aStage + wA * rowId + threadIdx.x], shape1, pipe); - cuda::memcpy_async(&Bs[j][rowId][threadIdx.x], - &B[bStage + wB * rowId + threadIdx.x], shape1, pipe); - } - pipe.producer_commit(); - } - } - } - else { - // this is a whole set of consumer group because threadIdx.y < consumer_row_count where consumer_row_count == 16, - // which computes gemm operation on matrices loaded in shared memory by producer warp. - const int j = i % maxPipelineStages; - // Synchronize consumer group to make sure the matrices are loaded by producer group. - pipe.consumer_wait(); - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - #pragma unroll - for (int k = 0; k < BLOCK_SIZE_X; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; - } - pipe.consumer_release(); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, + iStage = 0; + a <= aEnd; a += aStep, b += bStep, ++i) { + if (threadIdx.y >= consumer_row_count) { + // this is a whole producer warp because threadIdx.y >= 16 where 16 == + // consumer_row_count, + // which loads the matrices from device memory to shared memory; + for (; aStage <= a + aStep * maxPipelineStages; + aStage += aStep, bStage += bStep, ++iStage) { + if (aStage <= aEnd) { + // Rotating buffer + const int j = iStage % maxPipelineStages; + const int strideRows = (blockDim.y - consumer_row_count); + pipe.producer_acquire(); + for (int rowId = threadIdx.y - consumer_row_count; + rowId < BLOCK_SIZE_X; rowId += strideRows) { + cuda::memcpy_async(&As[j][rowId][threadIdx.x], + &A[aStage + wA * rowId + threadIdx.x], shape1, + pipe); + cuda::memcpy_async(&Bs[j][rowId][threadIdx.x], + &B[bStage + wB * rowId + threadIdx.x], shape1, + pipe); + } + pipe.producer_commit(); } + } + } else { + // this is a whole set of consumer group because threadIdx.y < + // consumer_row_count where consumer_row_count == 16, + // which computes gemm operation on matrices loaded in shared memory by + // producer warp. + const int j = i % maxPipelineStages; + // Synchronize consumer group to make sure the matrices are loaded by + // producer group. 
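MatrixMulAsyncCopyMultiStageSharedState splits one block into a producer warp and consumer rows that share a single block-scoped pipeline. The hypothetical kernel below (splitCopyScale; tile size, stage count and indexing are illustrative only) shows just that role split, exercising a single stage.

```cuda
// Illustrative only: the last rows of the block copy a tile, the first TILE
// rows consume it; both sides communicate through one pipeline_shared_state.
// Assumes blockDim == (TILE, TILE + 2) and one TILE*TILE tile per block.
#include <cooperative_groups.h>
#include <cuda/pipeline>

namespace cg = cooperative_groups;

template <int TILE, int STAGES>
__global__ void splitCopyScale(float *out, const float *in, float s) {
  __shared__ float tile[STAGES][TILE][TILE];
  __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, STAGES>
      state;

  auto block = cg::this_thread_block();
  const bool consumer = threadIdx.y < TILE;  // the extra rows are producers
  auto pipe = cuda::make_pipeline(block, &state,
                                  consumer ? cuda::pipeline_role::consumer
                                           : cuda::pipeline_role::producer);

  const float *src = in + (size_t)blockIdx.x * TILE * TILE;
  const int slot = 0;  // this sketch only exercises one stage

  if (!consumer) {
    // Producer rows cooperatively fill the whole TILE x TILE slot.
    pipe.producer_acquire();
    for (int r = threadIdx.y - TILE; r < TILE; r += blockDim.y - TILE) {
      cuda::memcpy_async(&tile[slot][r][threadIdx.x],
                         &src[r * TILE + threadIdx.x], sizeof(float), pipe);
    }
    pipe.producer_commit();
  } else {
    pipe.consumer_wait();  // returns once the producers' commit has landed
    float v = s * tile[slot][threadIdx.y][threadIdx.x];
    pipe.consumer_release();
    out[blockIdx.x * TILE * TILE + threadIdx.y * TILE + threadIdx.x] = v;
  }
}
```

The two groups meet only at the commit/wait handshake, which is what allows the full kernel above to copy later tiles while earlier tiles are still being multiplied.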
+ pipe.consumer_wait(); +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE_X; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + pipe.consumer_release(); } + } - // Write the block sub-matrix to device memory; - // each thread writes four element - if (threadIdx.y < consumer_row_count) - { - const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; - } + // Write the block sub-matrix to device memory; + // each thread writes four element + if (threadIdx.y < consumer_row_count) { + const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; + } } /** * Matrix multiplication (CUDA Kernel) on the device: C = A * B * wA is A's width and wB is B's width */ -template __global__ void MatrixMulNaive(float *C, float *A, - float *B, int wA, - int wB) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; +template +__global__ void MatrixMulNaive(float *C, float *A, float *B, int wA, int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; - a <= aEnd; - a += aStep, b += bStep) { + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x]; + Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x]; - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - 
As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x]; - Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x]; + // Synchronize to make sure the matrices are loaded + __syncthreads(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; } - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } -template __global__ void MatrixMulNaiveLargeChunk(float *C, float *A, - float *B, int wA, - int wB) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; +template +__global__ void MatrixMulNaiveLargeChunk(float *C, float *A, float *B, int wA, + int wB) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - int t4x = threadIdx.x * 4 ; + int t4x = threadIdx.x * 4; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // 
Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; - a <= aEnd; - a += aStep, b += bStep) { + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory + // to shared memory; - // Load the matrices from device memory - // to shared memory; - - // One fourth of the threads load four elements of each matrix - if ( t4x < BLOCK_SIZE ) { - float4 * const A4s = reinterpret_cast(& As[threadIdx.y][t4x]); - float4 * const B4s = reinterpret_cast(& Bs[threadIdx.y][t4x]); - const float4 * const A4 = reinterpret_cast(& A[a + wA * threadIdx.y + t4x]); - const float4 * const B4 = reinterpret_cast(& B[a + wA * threadIdx.y + t4x]); - *A4s = *A4 ; - *B4s = *B4 ; - } - - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; - } - - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); + // One fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + float4 *const A4s = reinterpret_cast(&As[threadIdx.y][t4x]); + float4 *const B4s = reinterpret_cast(&Bs[threadIdx.y][t4x]); + const float4 *const A4 = + reinterpret_cast(&A[a + wA * threadIdx.y + t4x]); + const float4 *const B4 = + reinterpret_cast(&B[a + wA * threadIdx.y + t4x]); + *A4s = *A4; + *B4s = *B4; } - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; -} + // Synchronize to make sure the matrices are loaded + __syncthreads(); +// Multiply the two matrices together; +// each thread computes one element +// of the block sub-matrix +#pragma unroll + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; +} void ConstantInit(float *data, int size, float val) { - for (int i = 0; i < size; ++i) { - data[i] = val; - } + for (int i = 0; i < size; ++i) { + data[i] = val; + } } /** * Run matrix multiplication using CUDA */ -int MatrixMultiply(int argc, char **argv, - const dim3 &dimsA, - const dim3 &dimsB, +int MatrixMultiply(int argc, char **argv, const dim3 &dimsA, const dim3 &dimsB, kernels kernel_number) { - // Allocate host memory for matrices A and B - unsigned int size_A = dimsA.x * dimsA.y; - unsigned int mem_size_A = sizeof(float) * size_A; - float* h_A; - checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); - unsigned int size_B = dimsB.x * dimsB.y; - unsigned int mem_size_B = sizeof(float) * size_B; - float* h_B; - checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); - cudaStream_t stream; + // Allocate host memory for matrices A and B + unsigned int size_A = dimsA.x * dimsA.y; + unsigned int mem_size_A = sizeof(float) * 
size_A; + float *h_A; + checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); + unsigned int size_B = dimsB.x * dimsB.y; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B; + checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); + cudaStream_t stream; - // Initialize host memory - const float valB = 2.10f; - ConstantInit(h_A, size_A, 1.0f); - ConstantInit(h_B, size_B, valB); + // Initialize host memory + const float valB = 2.10f; + ConstantInit(h_A, size_A, 1.0f); + ConstantInit(h_B, size_B, valB); - // Allocate device memory - float *d_A, *d_B, *d_C; + // Allocate device memory + float *d_A, *d_B, *d_C; - // Allocate host matrix C - dim3 dimsC(dimsB.x, dimsA.y, 1); - unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); - float* h_C; - checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); + // Allocate host matrix C + dim3 dimsC(dimsB.x, dimsA.y, 1); + unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); + float *h_C; + checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); - if (h_C == NULL) { - fprintf(stderr, "Failed to allocate host matrix C!\n"); - exit(EXIT_FAILURE); + if (h_C == NULL) { + fprintf(stderr, "Failed to allocate host matrix C!\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); + // Allocate CUDA events that we'll use for timing + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // copy host memory to device + checkCudaErrors( + cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); + checkCudaErrors( + cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream)); + + // Setup execution parameters + dim3 threads(blockSize, blockSize); + dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); + + // Here the block size is 16x18, where first 16 rows are consumer thread group + // and last 2 rows (1 warp) is producer thread group + dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1); + dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x, + dimsA.y / threadsSharedStateKernel.x); + + printf("Running kernel = %d - %s\n", kernel_number, + kernelNames[kernel_number]); + // Create and start timer + printf("Computing result using CUDA Kernel...\n"); + + // Performs warmup operation using matrixMul CUDA kernel + switch (kernel_number) { + case AsyncCopyMultiStageLargeChunk: + default: + MatrixMulAsyncCopyMultiStageLargeChunk< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyLargeChunk: + MatrixMulAsyncCopyLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunkAWBarrier: + MatrixMulAsyncCopyLargeChunkAWBarrier< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyMultiStageSharedState: + MatrixMulAsyncCopyMultiStageSharedState<<< + gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStage: + MatrixMulAsyncCopyMultiStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopySingleStage: + MatrixMulAsyncCopySingleStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case Naive: + MatrixMulNaive<<>>(d_C, d_A, d_B, + dimsA.x, dimsB.x); + break; + case 
NaiveLargeChunk: + MatrixMulNaiveLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + } + + printf("done\n"); + checkCudaErrors(cudaStreamSynchronize(stream)); + + // Execute the kernel + int nIter = 100; + + // Record the start event + checkCudaErrors(cudaEventRecord(start, stream)); + + for (int j = 0; j < nIter; j++) { + switch (kernel_number) { + case AsyncCopyMultiStageLargeChunk: + default: + MatrixMulAsyncCopyMultiStageLargeChunk< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyLargeChunk: + MatrixMulAsyncCopyLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunkAWBarrier: + MatrixMulAsyncCopyLargeChunkAWBarrier< + blockSize><<>>(d_C, d_A, d_B, dimsA.x, + dimsB.x); + break; + case AsyncCopyMultiStageSharedState: + MatrixMulAsyncCopyMultiStageSharedState<<< + gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStage: + MatrixMulAsyncCopyMultiStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopySingleStage: + MatrixMulAsyncCopySingleStage<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case Naive: + MatrixMulNaive<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case NaiveLargeChunk: + MatrixMulNaiveLargeChunk<<>>( + d_C, d_A, d_B, dimsA.x, dimsB.x); + break; } + } - checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); - // Allocate CUDA events that we'll use for timing - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); + // Record the stop event + checkCudaErrors(cudaEventRecord(stop, stream)); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + // Wait for the stop event to complete + checkCudaErrors(cudaEventSynchronize(stop)); - // copy host memory to device - checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream)); + float msecTotal = 0.0f; + checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); - // Setup execution parameters - dim3 threads(blockSize, blockSize); - dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); + // Compute and print the performance + float msecPerMatrixMul = msecTotal / nIter; + double flopsPerMatrixMul = 2.0 * static_cast(dimsA.x) * + static_cast(dimsA.y) * + static_cast(dimsB.x); + double gigaFlops = + (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); + printf( + "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," + " WorkgroupSize= %u threads/block\n", + gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y); - // Here the block size is 16x18, where first 16 rows are consumer thread group - // and last 2 rows (1 warp) is producer thread group - dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1); - dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x, dimsA.y / threadsSharedStateKernel.x); + // Copy result from device to host + checkCudaErrors( + cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Running kernel = %d - %s\n", kernel_number, kernelNames[kernel_number]); - // Create and start timer - printf("Computing result 
using CUDA Kernel...\n"); + printf("Checking computed result for correctness: "); + bool correct = true; - // Performs warmup operation using matrixMul CUDA kernel - switch (kernel_number) - { - case AsyncCopyMultiStageLargeChunk : - default: - MatrixMulAsyncCopyMultiStageLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunk : - MatrixMulAsyncCopyLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunkAWBarrier : - MatrixMulAsyncCopyLargeChunkAWBarrier<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStageSharedState : - MatrixMulAsyncCopyMultiStageSharedState<<>> - (d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStage : - MatrixMulAsyncCopyMultiStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopySingleStage : - MatrixMulAsyncCopySingleStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case Naive : - MatrixMulNaive<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case NaiveLargeChunk: - MatrixMulNaiveLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; + // test relative error by the formula + // |_cpu - _gpu|/<|x|, |y|> < eps + double eps = 1.e-6; // machine zero + + for (int i = 0; i < static_cast(dimsC.x * dimsC.y); i++) { + double abs_err = fabs(h_C[i] - (dimsA.x * valB)); + double dot_length = dimsA.x; + double abs_val = fabs(h_C[i]); + double rel_err = abs_err / abs_val / dot_length; + + if (rel_err > eps) { + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, + h_C[i], dimsA.x * valB, eps); + correct = false; } + } - printf("done\n"); - checkCudaErrors(cudaStreamSynchronize(stream)); + printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + // Clean up memory + checkCudaErrors(cudaFreeHost(h_A)); + checkCudaErrors(cudaFreeHost(h_B)); + checkCudaErrors(cudaFreeHost(h_C)); + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_C)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + printf( + "\nNOTE: The CUDA Samples are not meant for performance " + "measurements. 
Results may vary when GPU Boost is enabled.\n"); - // Execute the kernel - int nIter = 100; - - // Record the start event - checkCudaErrors(cudaEventRecord(start, stream)); - - for (int j = 0; j < nIter; j++) { - switch (kernel_number) - { - case AsyncCopyMultiStageLargeChunk : - default: - MatrixMulAsyncCopyMultiStageLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunk : - MatrixMulAsyncCopyLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunkAWBarrier : - MatrixMulAsyncCopyLargeChunkAWBarrier<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStageSharedState : - MatrixMulAsyncCopyMultiStageSharedState<<>> - (d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStage : - MatrixMulAsyncCopyMultiStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopySingleStage : - MatrixMulAsyncCopySingleStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case Naive : - MatrixMulNaive<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case NaiveLargeChunk: - MatrixMulNaiveLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - } - } - - // Record the stop event - checkCudaErrors(cudaEventRecord(stop, stream)); - - // Wait for the stop event to complete - checkCudaErrors(cudaEventSynchronize(stop)); - - float msecTotal = 0.0f; - checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); - - // Compute and print the performance - float msecPerMatrixMul = msecTotal / nIter; - double flopsPerMatrixMul = 2.0 * static_cast(dimsA.x) * - static_cast(dimsA.y) * - static_cast(dimsB.x); - double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / - (msecPerMatrixMul / 1000.0f); - printf( - "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," \ - " WorkgroupSize= %u threads/block\n", - gigaFlops, - msecPerMatrixMul, - flopsPerMatrixMul, - threads.x * threads.y); - - // Copy result from device to host - checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - printf("Checking computed result for correctness: "); - bool correct = true; - - // test relative error by the formula - // |_cpu - _gpu|/<|x|, |y|> < eps - double eps = 1.e-6; // machine zero - - for (int i = 0; i < static_cast(dimsC.x * dimsC.y); i++) { - double abs_err = fabs(h_C[i] - (dimsA.x * valB)); - double dot_length = dimsA.x; - double abs_val = fabs(h_C[i]); - double rel_err = abs_err / abs_val / dot_length; - - if (rel_err > eps) { - printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", - i, h_C[i], dimsA.x * valB, eps); - correct = false; - } - } - - printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); - - // Clean up memory - checkCudaErrors(cudaFreeHost(h_A)); - checkCudaErrors(cudaFreeHost(h_B)); - checkCudaErrors(cudaFreeHost(h_C)); - checkCudaErrors(cudaFree(d_A)); - checkCudaErrors(cudaFree(d_B)); - checkCudaErrors(cudaFree(d_C)); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); - printf("\nNOTE: The CUDA Samples are not meant for performance "\ - "measurements. 
Results may vary when GPU Boost is enabled.\n"); - - if (correct) { - return EXIT_SUCCESS; - } else { - return EXIT_FAILURE; - } + if (correct) { + return EXIT_SUCCESS; + } else { + return EXIT_FAILURE; + } } - int main(int argc, char **argv) { - printf("[globalToShmemAsyncCopy] - Starting...\n"); + printf("[globalToShmemAsyncCopy] - Starting...\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "?")) { - printf("Usage -device=n (n >= 0 for deviceID)\n"); - printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); - printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); - printf(" -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)\n"); - printf(" (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)\n"); - printf(" (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)\n"); - printf(" (7 - NaiveLargeChunk without memcpy_async)\n"); - printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "help") || + checkCmdLineFlag(argc, (const char **)argv, "?")) { + printf("Usage -device=n (n >= 0 for deviceID)\n"); + printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); + printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); + printf( + " -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - " + "AsyncCopyLargeChunk)\n"); + printf( + " (2 - AsyncCopyLargeChunkAWBarrier; 3 - " + "AsyncCopyMultiStageSharedState)\n"); + printf( + " (4 - AsyncCopyMultiStage; 5 - " + "AsyncCopySingleStage; 6 - Naive without memcpy_async)\n"); + printf( + " (7 - NaiveLargeChunk without " + "memcpy_async)\n"); + printf( + " Note: Outer matrix dimensions of A & B matrices must be equal.\n"); - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); + } + + // This will pick the best possible CUDA capable device, otherwise + // override the device ID based on input provided at the command line + int dev = findCudaDevice(argc, (const char **)argv); + + int matrixBlock = 32; + dim3 dimsA(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); + dim3 dimsB(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); + + // width of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { + dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); + } + + // height of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { + dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); + } + + // width of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { + dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); + } + + // height of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { + dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); + } + + if (dimsA.x != dimsB.y) { + printf("Error: outer matrix dimensions must be equal. 
(%d != %d)\n", + dimsA.x, dimsB.y); + exit(EXIT_FAILURE); + } + + kernels selected_kernel = AsyncCopyMultiStageLargeChunk; + + // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + int kernel_number = + getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + if (kernel_number < 8) { + selected_kernel = (kernels)kernel_number; + } else { + printf( + "Error: kernel number should be between 0 to 6, you have entered " + "%d\n", + kernel_number); + exit(EXIT_FAILURE); } + } - // This will pick the best possible CUDA capable device, otherwise - // override the device ID based on input provided at the command line - int dev = findCudaDevice(argc, (const char **)argv); + int major = 0; + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + if (major < 7) { + printf("globalToShmemAsyncCopy requires SM 7.0 or higher. Exiting...\n"); + exit(EXIT_WAIVED); + } - int matrixBlock = 32; - dim3 dimsA(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); - dim3 dimsB(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); + printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, + dimsB.y); - // width of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { - dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); - } + int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel); - // height of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { - dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); - } - - // width of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { - dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); - } - - // height of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { - dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); - } - - if (dimsA.x != dimsB.y) { - printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", - dimsA.x, dimsB.y); - exit(EXIT_FAILURE); - } - - kernels selected_kernel = AsyncCopyMultiStageLargeChunk; - - // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) - if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { - int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); - if (kernel_number < 8) - { - selected_kernel = (kernels)kernel_number; - } - else - { - printf("Error: kernel number should be between 0 to 6, you have entered %d\n", kernel_number); - exit(EXIT_FAILURE); - } - } - - int major = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - if (major < 7) - { - printf("globalToShmemAsyncCopy requires SM 7.0 or higher. 
Exiting...\n"); - exit(EXIT_WAIVED); - } - - printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, - dimsB.x, dimsB.y); - - int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel); - - exit(matrix_result); + exit(matrix_result); } - diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj index 1fd5cdd3..added1d2 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj index c8c02a93..bf65f63a 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/immaTensorCoreGemm/README.md b/Samples/immaTensorCoreGemm/README.md index 21262589..3c07bb95 100644 --- a/Samples/immaTensorCoreGemm/README.md +++ b/Samples/immaTensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj index 5055f031..d6942bc2 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj index 6139830e..6ecb5d5f 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/README.md b/Samples/jacobiCudaGraphs/README.md index 78a2e97e..c6223ff2 100644 --- a/Samples/jacobiCudaGraphs/README.md +++ b/Samples/jacobiCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj index a811971b..c899fc38 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj index c02f5c88..c6158ebd 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/main.cpp b/Samples/jacobiCudaGraphs/main.cpp index 1257e32a..5cb7db72 100644 --- a/Samples/jacobiCudaGraphs/main.cpp +++ b/Samples/jacobiCudaGraphs/main.cpp @@ -100,8 +100,10 @@ int main(int argc, char **argv) { double *b = NULL; float *A = NULL; - b = (double *)calloc(N_ROWS, sizeof(double)); - A = (float *)calloc(N_ROWS * N_ROWS, sizeof(float)); + checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double))); + memset(b, 0, N_ROWS * sizeof(double)); + checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float))); + memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); createLinearSystem(A, b); double *x = NULL; @@ -170,6 +172,9 @@ int main(int argc, char **argv) { checkCudaErrors(cudaFree(d_x)); checkCudaErrors(cudaFree(d_x_new)); + checkCudaErrors(cudaFreeHost(A)); + checkCudaErrors(cudaFreeHost(b)); + printf("&&&& jacobiCudaGraphs %s\n", (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); diff --git a/Samples/matrixMul/README.md b/Samples/matrixMul/README.md index 85b1f138..5d9dba69 100644 --- a/Samples/matrixMul/README.md +++ b/Samples/matrixMul/README.md @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/matrixMul/matrixMul_vs2017.vcxproj b/Samples/matrixMul/matrixMul_vs2017.vcxproj index ca222d53..c362684f 100644 --- a/Samples/matrixMul/matrixMul_vs2017.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2019.vcxproj b/Samples/matrixMul/matrixMul_vs2019.vcxproj index b61bb8d7..084d32b0 100644 --- a/Samples/matrixMul/matrixMul_vs2019.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/matrixMulDrv/README.md b/Samples/matrixMulDrv/README.md index f4672258..248b61c5 100644 --- a/Samples/matrixMulDrv/README.md +++ b/Samples/matrixMulDrv/README.md @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
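The jacobiCudaGraphs change above swaps calloc for page-locked allocations so the later cudaMemcpyAsync calls can overlap with device work. A minimal sketch of that allocation pattern, with an illustrative size in place of the sample's N_ROWS:

```cpp
// Sketch of the pinned-host-memory pattern adopted above: cudaMallocHost in
// place of calloc, with an explicit memset because cudaMallocHost does not
// zero-initialize. Sizes are illustrative only.
#include <cstring>
#include <cuda_runtime.h>

int main() {
  const int n = 512;  // illustrative size
  double *b = nullptr;
  float *A = nullptr;

  if (cudaMallocHost(&b, n * sizeof(double)) != cudaSuccess) return 1;
  memset(b, 0, n * sizeof(double));
  if (cudaMallocHost(&A, n * n * sizeof(float)) != cudaSuccess) return 1;
  memset(A, 0, n * n * sizeof(float));

  // ... fill A and b, copy to device with cudaMemcpyAsync, run kernels ...

  cudaFreeHost(A);  // pinned memory is released with cudaFreeHost, not free()
  cudaFreeHost(b);
  return 0;
}
```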
## Build and Run diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj index 76200467..6360c07c 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj index fdff280d..69a91d3c 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/memMapIPCDrv/Makefile b/Samples/memMapIPCDrv/Makefile index 8a0c1c3c..fe711fd3 100644 --- a/Samples/memMapIPCDrv/Makefile +++ b/Samples/memMapIPCDrv/Makefile @@ -302,14 +302,10 @@ LIBRARIES := ################################################################################ -FATBIN_FILE := memMapIpc_kernel${TARGET_SIZE}.fatbin +PTX_FILE := memMapIpc_kernel${TARGET_SIZE}.ptx # Gencode arguments -ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) -SMS ?= 35 37 50 52 60 61 70 72 75 80 86 -else -SMS ?= 35 37 50 52 60 61 70 75 80 86 -endif +SMS ?= ifeq ($(GENCODE_FLAGS),) # Generate SASS code for each SM architecture listed in $(SMS) @@ -395,7 +391,7 @@ endif # Target rules all: build -build: memMapIPCDrv $(FATBIN_FILE) +build: memMapIPCDrv $(PTX_FILE) check.deps: ifeq ($(SAMPLE_ENABLED),0) @@ -404,8 +400,8 @@ else @echo "Sample is ready - all dependencies have been met" endif -$(FATBIN_FILE): memMapIpc_kernel.cu - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $< +$(PTX_FILE): memMapIpc_kernel.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -ptx $< $(EXEC) mkdir -p data $(EXEC) cp -f $@ ./data $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) @@ -426,9 +422,8 @@ run: build $(EXEC) ./memMapIPCDrv clean: - rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(FATBIN_FILE) $(FATBIN_FILE) + rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(PTX_FILE) $(PTX_FILE) rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/memMapIPCDrv - - rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(FATBIN_FILE) + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(PTX_FILE) clobber: clean diff --git a/Samples/memMapIPCDrv/README.md b/Samples/memMapIPCDrv/README.md index 04c7f0c4..1e343fd1 100644 --- a/Samples/memMapIPCDrv/README.md +++ b/Samples/memMapIPCDrv/README.md @@ -30,7 +30,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuLaunchKernel, cuMemcpyD ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj index c3107e07..86d80be6 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -67,7 +67,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -105,14 +105,14 @@ - data/%(Filename)64.fatbin - fatbin + data/%(Filename)64.ptx + ptx - + diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj index 78a96686..3c928e83 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/memMapIPCDrv.exe - compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + compute_35,compute_35; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -101,14 +101,14 @@ - data/%(Filename)64.fatbin - fatbin + data/%(Filename)64.ptx + ptx - + diff --git a/Samples/memMapIPCDrv/memMapIpc.cpp b/Samples/memMapIPCDrv/memMapIpc.cpp index ae4dee08..729cd231 100644 --- a/Samples/memMapIPCDrv/memMapIpc.cpp +++ b/Samples/memMapIPCDrv/memMapIpc.cpp @@ -64,9 +64,13 @@ typedef struct shmStruct_st { int sense; } shmStruct; -// define input fatbin file -#ifndef FATBIN_FILE -#define FATBIN_FILE "memMapIpc_kernel64.fatbin" +bool findModulePath(const char *, string &, char **, string &); + +// define input ptx file for different platforms +#if defined(_WIN64) || defined(__LP64__) +#define PTX_FILE "memMapIpc_kernel64.ptx" +#else +#define PTX_FILE "memMapIpc_kernel32.ptx" #endif // `ipcHandleTypeFlag` specifies the platform specific handle type this sample @@ -255,23 +259,44 @@ static void memMapUnmapAndFreeMemory(CUdeviceptr dptr, size_t size) { static void memMapGetDeviceFunction(char **argv) { // first search for the module path before we load the results - string module_path; - std::ostringstream fatbin; - - if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { - exit(EXIT_FAILURE); + string module_path, ptx_source; + if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) { + if (!findModulePath("memMapIpc_kernel.cubin", module_path, argv, + ptx_source)) { + printf( + "> findModulePath could not find ptx or cubin\n"); + exit(EXIT_FAILURE); + } } else { printf("> initCUDA loading module: <%s>\n", module_path.c_str()); } - if (!fatbin.str().size()) { - printf("fatbin file empty. 
exiting..\n"); - exit(EXIT_FAILURE); + // Create module from binary file (PTX or CUBIN) + if (module_path.rfind("ptx") != string::npos) { + // in this branch we use compilation with parameters + const unsigned int jitNumOptions = 3; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void *[jitNumOptions]; + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; + checkCudaErrors(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), + jitNumOptions, jitOptions, + (void **)jitOptVals)); + printf("> PTX JIT log:\n%s\n", jitLogBuffer); + } else { + checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str())); } - // Create module from binary file (FATBIN) - checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); - // Get function handle from module checkCudaErrors( cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel")); @@ -585,3 +610,37 @@ int main(int argc, char **argv) { return EXIT_SUCCESS; #endif } + +bool inline findModulePath(const char *module_file, string &module_path, + char **argv, string &ptx_source) { + char *actual_path = sdkFindFilePath(module_file, argv[0]); + + if (actual_path) { + module_path = actual_path; + } else { + printf("> findModulePath file not found: <%s> \n", module_file); + return false; + } + + if (module_path.empty()) { + printf("> findModulePath could not find file: <%s> \n", module_file); + return false; + } else { + printf("> findModulePath found file at <%s>\n", module_path.c_str()); + + if (module_path.rfind(".ptx") != string::npos) { + FILE *fp = fopen(module_path.c_str(), "rb"); + fseek(fp, 0, SEEK_END); + int file_size = ftell(fp); + char *buf = new char[file_size + 1]; + fseek(fp, 0, SEEK_SET); + fread(buf, sizeof(char), file_size, fp); + fclose(fp); + buf[file_size] = '\0'; + ptx_source = buf; + delete[] buf; + } + + return true; + } +} \ No newline at end of file diff --git a/Samples/nvJPEG/Makefile b/Samples/nvJPEG/Makefile index d8c228df..f3515c78 100644 --- a/Samples/nvJPEG/Makefile +++ b/Samples/nvJPEG/Makefile @@ -277,6 +277,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - nvJPEG is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG/README.md b/Samples/nvJPEG/README.md index 9a86bf7e..53c1b60d 100644 --- a/Samples/nvJPEG/README.md +++ b/Samples/nvJPEG/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
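The memMapIpc.cpp hunk above replaces the fatbin path with PTX that is JIT-compiled at module-load time through cuModuleLoadDataEx. A self-contained driver-API sketch of that loading sequence follows; the .ptx file name and kernel name are placeholders, not the sample's:

```cpp
// Sketch of driver-API PTX JIT loading: read a .ptx file into memory, load it
// with cuModuleLoadDataEx while capturing the JIT info log and capping the
// register count, then look up a kernel. Link against -lcuda.
#include <cstdio>
#include <cstdlib>
#include <string>
#include <cuda.h>

static std::string readFile(const char *path) {
  FILE *fp = fopen(path, "rb");
  if (!fp) { fprintf(stderr, "cannot open %s\n", path); exit(EXIT_FAILURE); }
  fseek(fp, 0, SEEK_END);
  long size = ftell(fp);
  fseek(fp, 0, SEEK_SET);
  std::string buf(size, '\0');
  fread(&buf[0], 1, size, fp);
  fclose(fp);
  return buf;
}

int main() {
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  std::string ptx = readFile("kernel64.ptx");  // placeholder file name

  // JIT options: an info-log buffer plus a register cap, as in the sample.
  char logBuffer[1024] = {0};
  CUjit_option opts[] = {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                         CU_JIT_INFO_LOG_BUFFER, CU_JIT_MAX_REGISTERS};
  void *vals[] = {(void *)(size_t)sizeof(logBuffer), (void *)logBuffer,
                  (void *)(size_t)32};

  CUmodule mod;
  if (cuModuleLoadDataEx(&mod, ptx.c_str(), 3, opts, vals) != CUDA_SUCCESS) {
    fprintf(stderr, "PTX JIT failed:\n%s\n", logBuffer);
    return EXIT_FAILURE;
  }
  printf("PTX JIT log:\n%s\n", logBuffer);

  CUfunction fn;
  cuModuleGetFunction(&fn, mod, "myKernel");  // placeholder kernel name
  // ... cuLaunchKernel(fn, ...), then clean up ...
  cuModuleUnload(mod);
  cuCtxDestroy(ctx);
  return EXIT_SUCCESS;
}
```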
## Build and Run diff --git a/Samples/nvJPEG/nvJPEG_vs2017.vcxproj b/Samples/nvJPEG/nvJPEG_vs2017.vcxproj index 94b76d62..7d16e568 100644 --- a/Samples/nvJPEG/nvJPEG_vs2017.vcxproj +++ b/Samples/nvJPEG/nvJPEG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/nvJPEG/nvJPEG_vs2019.vcxproj b/Samples/nvJPEG/nvJPEG_vs2019.vcxproj index 0bf2340d..378b9198 100644 --- a/Samples/nvJPEG/nvJPEG_vs2019.vcxproj +++ b/Samples/nvJPEG/nvJPEG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/nvJPEG_encoder/Makefile b/Samples/nvJPEG_encoder/Makefile index 05228d1d..da0b82b7 100644 --- a/Samples/nvJPEG_encoder/Makefile +++ b/Samples/nvJPEG_encoder/Makefile @@ -277,6 +277,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - nvJPEG_encoder is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG_encoder/README.md b/Samples/nvJPEG_encoder/README.md index ced6a27f..40f092b3 100644 --- a/Samples/nvJPEG_encoder/README.md +++ b/Samples/nvJPEG_encoder/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj index 9b102304..765f1f35 100644 --- a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj +++ b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj index 314cb390..76fcec11 100644 --- a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj +++ b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/README.md b/Samples/p2pBandwidthLatencyTest/README.md index 4adbc8ed..ab1f4685 100644 --- a/Samples/p2pBandwidthLatencyTest/README.md +++ b/Samples/p2pBandwidthLatencyTest/README.md @@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
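p2pBandwidthLatencyTest (README bumped above) is built around the peer-access APIs its key-concepts list names. A small sketch of the basic capability check and enable step, assuming devices 0 and 1 exist:

```cpp
// Sketch of the peer-access check behind p2pBandwidthLatencyTest, using
// cudaDeviceCanAccessPeer / cudaDeviceEnablePeerAccess. Device indices 0 and 1
// are assumptions for illustration.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int count = 0;
  cudaGetDeviceCount(&count);
  if (count < 2) { printf("need at least two GPUs, waiving\n"); return 0; }

  int canAccess01 = 0, canAccess10 = 0;
  cudaDeviceCanAccessPeer(&canAccess01, 0, 1);  // can device 0 map device 1?
  cudaDeviceCanAccessPeer(&canAccess10, 1, 0);

  if (canAccess01 && canAccess10) {
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);  // flags must be 0
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);
    printf("P2P enabled between devices 0 and 1\n");
  } else {
    printf("P2P not supported between devices 0 and 1\n");
  }
  return 0;
}
```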
## Build and Run diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj index d8832b17..28d5f5cd 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj index 7d04df50..8a2d5450 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/reduction/README.md b/Samples/reduction/README.md index 158abb00..172d748a 100644 --- a/Samples/reduction/README.md +++ b/Samples/reduction/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/reduction/reduction_vs2017.vcxproj b/Samples/reduction/reduction_vs2017.vcxproj index c1cf9fb3..7e14bc82 100644 --- a/Samples/reduction/reduction_vs2017.vcxproj +++ b/Samples/reduction/reduction_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/reduction/reduction_vs2019.vcxproj b/Samples/reduction/reduction_vs2019.vcxproj index 0fd929e9..74fb1d6a 100644 --- a/Samples/reduction/reduction_vs2019.vcxproj +++ b/Samples/reduction/reduction_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/shfl_scan/README.md b/Samples/shfl_scan/README.md index efa00f56..87b6872b 100644 --- a/Samples/shfl_scan/README.md +++ b/Samples/shfl_scan/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj index 6f91ea60..beaad3f8 100644 --- a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj index 5504852e..8757714e 100644 --- a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/simpleAWBarrier/README.md b/Samples/simpleAWBarrier/README.md index ca95266a..c4003183 100644 --- a/Samples/simpleAWBarrier/README.md +++ b/Samples/simpleAWBarrier/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpyAsync ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleAWBarrier/simpleAWBarrier.cu b/Samples/simpleAWBarrier/simpleAWBarrier.cu index 8beb371a..b36af811 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier.cu +++ b/Samples/simpleAWBarrier/simpleAWBarrier.cu @@ -25,7 +25,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - // Includes, system #include @@ -35,229 +34,222 @@ #include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check +#include // helper functions for CUDA error check namespace cg = cooperative_groups; - #if __CUDA_ARCH__ >= 700 -template __device__ void reduceBlockData(cuda::barrier &barrier, - cg::thread_block_tile<32> &tile32, double &threadSum, double *result) -{ - extern __shared__ double tmp[]; +template +__device__ void reduceBlockData( + cuda::barrier &barrier, + cg::thread_block_tile<32> &tile32, double &threadSum, double *result) { + extern __shared__ double tmp[]; - #pragma unroll - for (int offset = tile32.size()/2; offset > 0; offset /= 2) - { - threadSum += tile32.shfl_down(threadSum, offset); - } - if (tile32.thread_rank() == 0) - { - tmp[tile32.meta_group_rank()] = threadSum; +#pragma unroll + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + threadSum += tile32.shfl_down(threadSum, offset); + } + if (tile32.thread_rank() == 0) { + tmp[tile32.meta_group_rank()] = threadSum; + } + + auto token = barrier.arrive(); + + barrier.wait(std::move(token)); + + // The warp 0 will perform last round of reduction + if (tile32.meta_group_rank() == 0) { + double beta = tile32.thread_rank() < tile32.meta_group_size() + ? tmp[tile32.thread_rank()] + : 0.0; + +#pragma unroll + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + beta += tile32.shfl_down(beta, offset); } - auto token = barrier.arrive(); - - barrier.wait(std::move(token)); - - // The warp 0 will perform last round of reduction - if (tile32.meta_group_rank() == 0) { - - double beta = tile32.thread_rank() < tile32.meta_group_size() ? 
tmp[tile32.thread_rank()] : 0.0; - - #pragma unroll - for (int offset = tile32.size()/2; offset > 0; offset /= 2) - { - beta += tile32.shfl_down(beta, offset); - } - - if (tile32.thread_rank() == 0) - { - if (writeSquareRoot) - *result = sqrt(beta); - else - *result = beta; - } + if (tile32.thread_rank() == 0) { + if (writeSquareRoot) + *result = sqrt(beta); + else + *result = beta; } + } } #endif -__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size) -{ +__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, + double *partialResults, int size) { #if __CUDA_ARCH__ >= 700 #pragma diag_suppress static_var_with_dynamic_init - cg::thread_block cta = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid();; - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + ; + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - __shared__ cuda::barrier barrier; + __shared__ cuda::barrier barrier; - if (threadIdx.x == 0) { - init(&barrier, blockDim.x); + if (threadIdx.x == 0) { + init(&barrier, blockDim.x); + } + + cg::sync(cta); + + double threadSum = 0.0; + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + threadSum += (double)(vecA[i] * vecB[i]); + } + + // Each thread block performs reduction of partial dotProducts and writes to + // global mem. + reduceBlockData(barrier, tile32, threadSum, + &partialResults[blockIdx.x]); + + cg::sync(grid); + + // One block performs the final summation of partial dot products + // of all the thread blocks and writes the sqrt of final dot product. + if (blockIdx.x == 0) { + threadSum = 0.0; + for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) { + threadSum += partialResults[i]; } + reduceBlockData(barrier, tile32, threadSum, &partialResults[0]); + } - cg::sync(cta); + cg::sync(grid); - double threadSum = 0.0; - for (int i = grid.thread_rank(); i < size; i += grid.size()) - { - threadSum += (double) (vecA[i] * vecB[i]); - } + const double finalValue = partialResults[0]; - // Each thread block performs reduction of partial dotProducts and writes to - // global mem. - reduceBlockData(barrier, tile32, threadSum, &partialResults[blockIdx.x]); - - cg::sync(grid); - - // One block performs the final summation of partial dot products - // of all the thread blocks and writes the sqrt of final dot product. - if (blockIdx.x == 0) - { - threadSum = 0.0; - for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) - { - threadSum += partialResults[i]; - } - reduceBlockData(barrier, tile32, threadSum, &partialResults[0]); - } - - cg::sync(grid); - - const double finalValue = partialResults[0]; - - // Perform normalization of vecA & vecB. - for (int i = grid.thread_rank(); i < size; i += grid.size()) - { - vecA[i] = (float)vecA[i] / finalValue; - vecB[i] = (float)vecB[i] / finalValue; - } + // Perform normalization of vecA & vecB. 
+ for (int i = grid.thread_rank(); i < size; i += grid.size()) { + vecA[i] = (float)vecA[i] / finalValue; + vecB[i] = (float)vecB[i] / finalValue; + } #endif } - int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) -{ - printf("%s starting...\n", argv[0]); +int main(int argc, char **argv) { + printf("%s starting...\n", argv[0]); - // This will pick the best possible CUDA capable device - int dev = findCudaDevice(argc, (const char **)argv); + // This will pick the best possible CUDA capable device + int dev = findCudaDevice(argc, (const char **)argv); - int major = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + int major = 0; + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher. - if (major < 7) { - printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n"); - exit(EXIT_WAIVED); - } + // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher. + if (major < 7) { + printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n"); + exit(EXIT_WAIVED); + } - int supportsCooperativeLaunch = 0; - checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev)); + int supportsCooperativeLaunch = 0; + checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, + cudaDevAttrCooperativeLaunch, dev)); - if (!supportsCooperativeLaunch) - { - printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run\n", dev); - exit(EXIT_WAIVED); - } + if (!supportsCooperativeLaunch) { + printf( + "\nSelected GPU (%d) does not support Cooperative Kernel Launch, " + "Waiving the run\n", + dev); + exit(EXIT_WAIVED); + } - int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev); + int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev); - printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!"); + exit(testResult ? 
EXIT_SUCCESS : EXIT_FAILURE); } -int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) -{ - float *vecA, *d_vecA; - float *vecB, *d_vecB; - double *d_partialResults; - int size = 10000000; +int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) { + float *vecA, *d_vecA; + float *vecB, *d_vecB; + double *d_partialResults; + int size = 10000000; - vecA = new float[size]; - vecB = new float[size]; + checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size)); + checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size)); - checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float)*size)); - checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float)*size)); + checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size)); + checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size)); - float baseVal = 2.0; - for (int i = 0; i < size; i++) - { - vecA[i] = vecB[i] = baseVal; + float baseVal = 2.0; + for (int i = 0; i < size; i++) { + vecA[i] = vecB[i] = baseVal; + } + + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, + cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, + cudaMemcpyHostToDevice, stream)); + + // Kernel configuration, where a one-dimensional + // grid and one-dimensional blocks are configured. + int minGridSize = 0, blockSize = 0; + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( + &minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size)); + + int smemSize = ((blockSize / 32) + 1) * sizeof(double); + + int numBlocksPerSm = 0; + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); + + int multiProcessorCount = 0; + checkCudaErrors(cudaDeviceGetAttribute( + &multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); + + minGridSize = multiProcessorCount * numBlocksPerSm; + checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double))); + + printf( + "Launching normVecByDotProductAWBarrier kernel with numBlocks = %d " + "blockSize = %d\n", + minGridSize, blockSize); + + dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); + + void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, + (void *)&d_partialResults, (void *)&size}; + + checkCudaErrors( + cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid, + dimBlock, kernelArgs, smemSize, stream)); + + checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, + cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + float expectedResult = (baseVal / sqrt(size * baseVal * baseVal)); + unsigned int matches = 0; + for (int i = 0; i < size; i++) { + if ((vecA[i] - expectedResult) > 0.00001) { + printf("mismatch at i = %d\n", i); + break; + } else { + matches++; } + } - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + printf("Result = %s\n", matches == size ? "PASSED" : "FAILED"); + checkCudaErrors(cudaFree(d_vecA)); + checkCudaErrors(cudaFree(d_vecB)); + checkCudaErrors(cudaFree(d_partialResults)); - checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float)*size, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float)*size, cudaMemcpyHostToDevice, stream)); - - // Kernel configuration, where a one-dimensional - // grid and one-dimensional blocks are configured. 
- int minGridSize = 0, blockSize = 0; - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( - &minGridSize, - &blockSize, - (void*)normVecByDotProductAWBarrier, - 0, - size)); - - int smemSize = ((blockSize/32)+1) * sizeof(double); - - int numBlocksPerSm = 0; - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); - - int multiProcessorCount = 0; - checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); - - minGridSize = multiProcessorCount * numBlocksPerSm; - checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize*sizeof(double))); - - printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d blockSize = %d\n", minGridSize, blockSize); - - dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); - - void *kernelArgs[] = { - (void*)&d_vecA, - (void*)&d_vecB, - (void*)&d_partialResults, - (void*)&size - }; - - checkCudaErrors(cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream)); - - checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float)*size, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - float expectedResult = (baseVal / sqrt(size*baseVal*baseVal)); - unsigned int matches = 0; - for (int i=0; i < size; i++) - { - if ((vecA[i] - expectedResult) > 0.00001) - { - printf("mismatch at i = %d\n", i); - break; - } - else - { - matches++; - } - } - - printf("Result = %s\n", matches == size ? "PASSED" : "FAILED"); - checkCudaErrors(cudaFree(d_vecA)); - checkCudaErrors(cudaFree(d_vecB)); - checkCudaErrors(cudaFree(d_partialResults)); - - delete[] vecA; - delete[] vecB; - return matches == size; + checkCudaErrors(cudaFreeHost(vecA)); + checkCudaErrors(cudaFreeHost(vecB)); + return matches == size; } diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj index b83f8823..e03ef6e2 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj index 62e6fccd..b4be9610 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleAttributes/README.md b/Samples/simpleAttributes/README.md index f5947c5f..5d643c86 100644 --- a/Samples/simpleAttributes/README.md +++ b/Samples/simpleAttributes/README.md @@ -27,7 +27,7 @@ cudaCtxResetPersistingL2Cache, cudaDeviceSetLimit, cudaFree, cudaGetDeviceProper ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
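The reformatted simpleAWBarrier host code above sizes a cooperative launch from the occupancy APIs and the SM count. A standalone sketch of that sizing pattern with a stand-in kernel (not the sample's barrier reduction); it assumes the file is built with -rdc=true for an SM 7.0+ architecture:

```cpp
// Sketch of cooperative-launch sizing: ask the occupancy API for a block size,
// size the grid to exactly fill the device, then launch cooperatively so
// grid.sync() is legal inside the kernel.
#include <cstdio>
#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

__global__ void dummyGridSyncKernel(int *data, int n) {
  cg::grid_group grid = cg::this_grid();
  for (int i = grid.thread_rank(); i < n; i += grid.size()) data[i] += 1;
  grid.sync();  // whole-grid barrier, valid only under cooperative launch
  for (int i = grid.thread_rank(); i < n; i += grid.size()) data[i] *= 2;
}

int main() {
  int dev = 0, smCount = 0, coop = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&coop, cudaDevAttrCooperativeLaunch, dev);
  if (!coop) { printf("cooperative launch unsupported, waiving\n"); return 0; }
  cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, dev);

  int minGridSize = 0, blockSize = 0;
  cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
                                     dummyGridSyncKernel, 0, 0);
  int blocksPerSm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm,
                                                dummyGridSyncKernel,
                                                blockSize, 0);
  int gridSize = smCount * blocksPerSm;  // fill the device exactly

  int n = 1 << 20, *d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(int));
  cudaMemset(d_data, 0, n * sizeof(int));

  void *args[] = {(void *)&d_data, (void *)&n};
  cudaLaunchCooperativeKernel((void *)dummyGridSyncKernel, dim3(gridSize),
                              dim3(blockSize), args, 0, 0);
  cudaDeviceSynchronize();
  printf("launched %d blocks of %d threads\n", gridSize, blockSize);
  cudaFree(d_data);
  return 0;
}
```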
## Build and Run diff --git a/Samples/simpleAttributes/simpleAttributes.cu b/Samples/simpleAttributes/simpleAttributes.cu index 46f00aae..d71b4bb1 100644 --- a/Samples/simpleAttributes/simpleAttributes.cu +++ b/Samples/simpleAttributes/simpleAttributes.cu @@ -36,22 +36,22 @@ // includes, project #include -#include // helper functions for SDK examples +#include // helper functions for SDK examples //////////////////////////////////////////////////////////////////////////////// // declaration, forward void runTest(int argc, char **argv); -cudaAccessPolicyWindow -initAccessPolicyWindow(void) { - cudaAccessPolicyWindow accessPolicyWindow = { 0 }; - accessPolicyWindow.base_ptr = (void *)0; - accessPolicyWindow.num_bytes = 0; - accessPolicyWindow.hitRatio = 0.f; - accessPolicyWindow.hitProp = cudaAccessPropertyNormal; - accessPolicyWindow.missProp = cudaAccessPropertyStreaming; - return accessPolicyWindow; +cudaAccessPolicyWindow initAccessPolicyWindow(void) { + cudaAccessPolicyWindow accessPolicyWindow = {0}; + accessPolicyWindow.base_ptr = (void *)0; + accessPolicyWindow.num_bytes = 0; + accessPolicyWindow.hitRatio = 0.f; + accessPolicyWindow.hitProp = cudaAccessPropertyNormal; + accessPolicyWindow.missProp = cudaAccessPropertyStreaming; + return accessPolicyWindow; } + //////////////////////////////////////////////////////////////////////////////// //! Simple test kernel for device functionality //! @param data input data in global memory @@ -60,146 +60,155 @@ initAccessPolicyWindow(void) { //! @param bigDataSize input bigData size //! @param hitcount how many data access are done within block //////////////////////////////////////////////////////////////////////////////// -static __global__ void -kernCacheSegmentTest(int* data, int dataSize, int *trash, int bigDataSize, int hitCount) -{ - __shared__ unsigned int hit; - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - int tID = row * blockDim.y + col; - uint32_t psRand = tID; +static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, + int bigDataSize, int hitCount) { + __shared__ unsigned int hit; + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int tID = row * blockDim.y + col; + uint32_t psRand = tID; - atomicExch(&hit, 0); - __syncthreads(); - while (hit < hitCount) { - psRand ^= psRand << 13; - psRand ^= psRand >> 17; - psRand ^= psRand << 5; + atomicExch(&hit, 0); + __syncthreads(); + while (hit < hitCount) { + psRand ^= psRand << 13; + psRand ^= psRand >> 17; + psRand ^= psRand << 5; - int idx = tID - psRand; - if (idx < 0) { - idx = -idx; - } - - if((tID % 2) == 0) { - data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; - } else { - trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize]; - } - - atomicAdd(&hit, 1); + int idx = tID - psRand; + if (idx < 0) { + idx = -idx; } + + if ((tID % 2) == 0) { + data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; + } else { + trash[psRand % bigDataSize] = + trash[psRand % bigDataSize] + trash[idx % bigDataSize]; + } + + atomicAdd(&hit, 1); + } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int -main(int argc, char **argv) -{ - runTest(argc, argv); -} +int main(int argc, char **argv) { runTest(argc, argv); } 
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void -runTest(int argc, char **argv) -{ - bool bTestResult = true; - cudaAccessPolicyWindow accessPolicyWindow; - cudaDeviceProp deviceProp; - cudaStreamAttrValue streamAttrValue; - cudaStream_t stream; - cudaStreamAttrID streamAttrID; - dim3 threads(32, 32); - int *dataDevicePointer; - int *dataHostPointer; - int dataSize; - int *bigDataDevicePointer; - int *bigDataHostPointer; - int bigDataSize; - StopWatchInterface *timer = 0; +void runTest(int argc, char **argv) { + bool bTestResult = true; + cudaAccessPolicyWindow accessPolicyWindow; + cudaDeviceProp deviceProp; + cudaStreamAttrValue streamAttrValue; + cudaStream_t stream; + cudaStreamAttrID streamAttrID; + dim3 threads(32, 32); + int *dataDevicePointer; + int *dataHostPointer; + int dataSize; + int *bigDataDevicePointer; + int *bigDataHostPointer; + int bigDataSize; + StopWatchInterface *timer = 0; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest Gflops/s - int devID = findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - //Get device properties - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - dim3 blocks(deviceProp.maxGridSize[1], 1); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + // Get device properties + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + dim3 blocks(deviceProp.maxGridSize[1], 1); - //Make sure device the l2 optimization - if (deviceProp.persistingL2CacheMaxSize == 0) { - printf("Waiving execution as device %d does not support persisting L2 Caching\n", devID); - exit(EXIT_WAIVED); + // Make sure device the l2 optimization + if (deviceProp.persistingL2CacheMaxSize == 0) { + printf( + "Waiving execution as device %d does not support persisting L2 " + "Caching\n", + devID); + exit(EXIT_WAIVED); + } + + // Create stream to assiocate with window + checkCudaErrors(cudaStreamCreate(&stream)); + + // Set the amount of l2 cache that will be persisting to maximum the device + // can support + checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, + deviceProp.persistingL2CacheMaxSize)); + + // Stream attribute to set + streamAttrID = cudaStreamAttributeAccessPolicyWindow; + + // Default window + streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); + accessPolicyWindow = initAccessPolicyWindow(); + + // Allocate size of both buffers + bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int); + dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int); + + // Allocate data + checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int))); + checkCudaErrors( + cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int))); + + for (int i = 0; i < bigDataSize; ++i) { + if (i < dataSize) { + dataHostPointer[i] = i; } - //Create stream to assiocate with window - checkCudaErrors(cudaStreamCreate(&stream)); + bigDataHostPointer[bigDataSize - i - 1] = i; + } - //Set the amount of l2 cache that will be persisting to maximum the device can support - checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize)); + checkCudaErrors( + 
cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int))); + checkCudaErrors( + cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int))); + checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer, + dataSize * sizeof(int), + cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer, + bigDataSize * sizeof(int), + cudaMemcpyHostToDevice, stream)); - //Stream attribute to set - streamAttrID = cudaStreamAttributeAccessPolicyWindow; + // Make a window for the buffer of interest + accessPolicyWindow.base_ptr = (void *)dataDevicePointer; + accessPolicyWindow.num_bytes = dataSize * sizeof(int); + accessPolicyWindow.hitRatio = 1.f; + accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; + accessPolicyWindow.missProp = cudaAccessPropertyNormal; + streamAttrValue.accessPolicyWindow = accessPolicyWindow; - //Default window - streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); - accessPolicyWindow = initAccessPolicyWindow(); + // Assign window to stream + checkCudaErrors( + cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue)); - //Allocate size of both buffers - bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int); - dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int); + // Demote any previous persisting lines + checkCudaErrors(cudaCtxResetPersistingL2Cache()); - //Allocate data - dataHostPointer = (int *)malloc(dataSize * sizeof(int)); - bigDataHostPointer = (int *)malloc(bigDataSize * sizeof(int)); + checkCudaErrors(cudaStreamSynchronize(stream)); + kernCacheSegmentTest<<>>( + dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF); - for ( int i = 0; i < bigDataSize; ++i) { - if (i < dataSize) { - dataHostPointer[i] = i; - } + checkCudaErrors(cudaStreamSynchronize(stream)); + // check if kernel execution generated and error + getLastCudaError("Kernel execution failed"); - bigDataHostPointer[bigDataSize - i - 1] = i; - } + // Free memory + checkCudaErrors(cudaFreeHost(dataHostPointer)); + checkCudaErrors(cudaFreeHost(bigDataHostPointer)); + checkCudaErrors(cudaFree(dataDevicePointer)); + checkCudaErrors(cudaFree(bigDataDevicePointer)); - checkCudaErrors(cudaMalloc((void**) &dataDevicePointer, dataSize * sizeof(int))); - checkCudaErrors(cudaMalloc((void**) &bigDataDevicePointer, bigDataSize * sizeof(int))); - checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); - //Make a window for the buffer of interest - accessPolicyWindow.base_ptr = (void *)dataDevicePointer; - accessPolicyWindow.num_bytes = dataSize * sizeof(int); - accessPolicyWindow.hitRatio = 1.f; - accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; - accessPolicyWindow.missProp = cudaAccessPropertyNormal; - streamAttrValue.accessPolicyWindow = accessPolicyWindow; - - //Assign window to stream - checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue)); - - //Demote any previous persisting lines - checkCudaErrors(cudaCtxResetPersistingL2Cache()); - - checkCudaErrors(cudaStreamSynchronize(stream)); - kernCacheSegmentTest<<>>(dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF); - - checkCudaErrors(cudaStreamSynchronize(stream)); - // check if 
kernel execution generated and error - getLastCudaError("Kernel execution failed"); - - //Free memory - free(dataHostPointer); - free(bigDataHostPointer); - checkCudaErrors(cudaFree(dataDevicePointer)); - checkCudaErrors(cudaFree(bigDataDevicePointer)); - - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - sdkDeleteTimer(&timer); - - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj index 692be656..a31cf815 100644 --- a/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj +++ b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj index 47fdd889..b74d221e 100644 --- a/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj +++ b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS/Makefile b/Samples/simpleCUBLAS/Makefile index bbdaed39..516da194 100644 --- a/Samples/simpleCUBLAS/Makefile +++ b/Samples/simpleCUBLAS/Makefile @@ -263,6 +263,14 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) +SAMPLE_ENABLED := 1 + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUBLAS is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -297,6 +305,10 @@ ALL_CCFLAGS += --threads 0 LIBRARIES += -lcublas +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + ################################################################################ # Target rules @@ -304,16 +316,23 @@ all: build build: simpleCUBLAS +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + simpleCUBLAS.o:simpleCUBLAS.cpp - $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< simpleCUBLAS: simpleCUBLAS.o - $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) run: build - ./simpleCUBLAS + $(EXEC) ./simpleCUBLAS clean: rm -f simpleCUBLAS simpleCUBLAS.o diff --git a/Samples/simpleCUBLAS/README.md b/Samples/simpleCUBLAS/README.md index d4735374..67dd4ce5 100644 --- a/Samples/simpleCUBLAS/README.md +++ b/Samples/simpleCUBLAS/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
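The simpleAttributes hunk above moves its host buffers to cudaMallocHost and keeps the persisting-L2 access-policy-window setup. A compact sketch of that window configuration on a stream, with an illustrative buffer size; it requires a device reporting a non-zero persistingL2CacheMaxSize:

```cpp
// Sketch of persisting-L2 setup: reserve persisting L2, attach an access
// policy window for one device buffer to a stream, then demote the lines with
// cudaCtxResetPersistingL2Cache when done.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int dev = 0;
  cudaDeviceProp prop;
  cudaGetDevice(&dev);
  cudaGetDeviceProperties(&prop, dev);
  if (prop.persistingL2CacheMaxSize == 0) {
    printf("persisting L2 cache not supported, waiving\n");
    return 0;
  }

  // Reserve as much persisting L2 as the device allows.
  cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
                     prop.persistingL2CacheMaxSize);

  size_t numBytes = prop.l2CacheSize / 4;  // illustrative window size
  int *d_buf = nullptr;
  cudaMalloc(&d_buf, numBytes);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Mark the buffer as persisting for work submitted to this stream.
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.base_ptr = d_buf;
  attr.accessPolicyWindow.num_bytes = numBytes;
  attr.accessPolicyWindow.hitRatio = 1.0f;
  attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  attr.accessPolicyWindow.missProp = cudaAccessPropertyNormal;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);

  // ... launch kernels that repeatedly touch d_buf on this stream ...

  cudaCtxResetPersistingL2Cache();  // demote any persisting cache lines
  cudaStreamDestroy(stream);
  cudaFree(d_buf);
  return 0;
}
```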
## Build and Run diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj index 161fe55f..3a68d707 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj index b43537ff..6370f200 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLASXT/Makefile b/Samples/simpleCUBLASXT/Makefile index 24e5af89..b5759857 100644 --- a/Samples/simpleCUBLASXT/Makefile +++ b/Samples/simpleCUBLASXT/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUBLASXT is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/simpleCUBLASXT/README.md b/Samples/simpleCUBLASXT/README.md index 00b4ce9b..fd3decae 100644 --- a/Samples/simpleCUBLASXT/README.md +++ b/Samples/simpleCUBLASXT/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj index faaaba0e..3805ce26 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj index 7b0324fb..b0472a39 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS_LU/Makefile b/Samples/simpleCUBLAS_LU/Makefile new file mode 100644 index 00000000..2c49cc17 --- /dev/null +++ b/Samples/simpleCUBLAS_LU/Makefile @@ -0,0 +1,357 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
+ endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += 
--unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - simpleCUBLAS_LU is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - simpleCUBLAS_LU is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUBLAS_LU is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 86 +else +SMS ?= 35 37 50 52 60 61 70 75 80 86 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach 
sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --threads 0 + +LIBRARIES += -lcublas + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleCUBLAS_LU + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +simpleCUBLAS_LU.o:simpleCUBLAS_LU.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleCUBLAS_LU: simpleCUBLAS_LU.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleCUBLAS_LU + +clean: + rm -f simpleCUBLAS_LU simpleCUBLAS_LU.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleCUBLAS_LU + +clobber: clean diff --git a/Samples/simpleCUBLAS_LU/NsightEclipse.xml b/Samples/simpleCUBLAS_LU/NsightEclipse.xml new file mode 100644 index 00000000..dea35d1a --- /dev/null +++ b/Samples/simpleCUBLAS_LU/NsightEclipse.xml @@ -0,0 +1,68 @@ + + + + simpleCUBLAS_LU + + whole + + ./ + ../ + ../../common/inc + + + CUBLAS Library + LU decomposition + + + CUDA + CUBLAS + Linear Algebra + LU decomposition + + + cublas + + + + true + simpleCUBLAS_LU.cpp + + CUBLAS + + + 1:CUDA Basic Topics + 3:Linear Algebra + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + sm86 + + + x86_64 + linux + + + windows7 + + + aarch64 + + + ppc64le + linux + + + + all + + Simple CUBLAS LU + exe + diff --git a/Samples/simpleCUBLAS_LU/README.md b/Samples/simpleCUBLAS_LU/README.md new file mode 100644 index 00000000..9ef4764b --- /dev/null +++ b/Samples/simpleCUBLAS_LU/README.md @@ -0,0 +1,71 @@ +# simpleCUBLAS_LU - Simple CUBLAS LU + +## Description + +CUDA sample demonstrating cuBLAS API cublasDgetrfBatched() for lower-upper (LU) decomposition of a matrix. + +## Key Concepts + +CUBLAS Library, LU decomposition + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +## Dependencies needed to build/run +[CUBLAS](../../README.md#cublas) + +## Prerequisites + +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory:
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The sample makefiles can take advantage of certain options:
+* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
+ By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+* **dbg=1** - build with debug symbols
+ ```
+ $ make dbg=1
+ ```
+* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+ ```
+ $ make SMS="50 60"
+ ```
+
+* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+ $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp
new file mode 100644
index 00000000..7b7e4d7a
--- /dev/null
+++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp
@@ -0,0 +1,417 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This example demonstrates how to use the cuBLAS library API
+ * for lower-upper (LU) decomposition of a matrix. LU decomposition
+ * factors a matrix as the product of a lower triangular matrix and
+ * an upper triangular matrix.
+ *
+ * https://en.wikipedia.org/wiki/LU_decomposition
+ *
+ * This sample uses 10000 matrices of size 4x4 and performs
+ * LU decomposition of them using the batched decomposition API
+ * of the cuBLAS library. To test the correctness of the upper and lower
+ * matrices generated, they are multiplied and compared with the
+ * original input matrix.
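+ *
+ * As a small worked illustration of the factorization checked here
+ * (P * A = L * U with partial pivoting), take the 2x2 case
+ *     A = [4 3; 6 3]:  P = [0 1; 1 0], so P * A = [6 3; 4 3], and
+ *     L = [1 0; 2/3 1], U = [6 3; 0 1] give L * U = [6 3; 4 3] = P * A.
+ * (The 2x2 case is only for illustration; the sample itself uses 4x4
+ * matrices.)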
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// cuda libraries and helpers
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+// configurable parameters
+// dimension of matrix
+#define N 4
+#define BATCH_SIZE 10000
+
+// use double precision data type
+#define DOUBLE_PRECISION /* comment this to use single precision */
+#ifdef DOUBLE_PRECISION
+#define DATA_TYPE double
+#define MAX_ERROR 1e-15
+#else
+#define DATA_TYPE float
+#define MAX_ERROR 1e-6
+#endif /* DOUBLE_PRECISION */
+
+// use pivot vector while decomposing
+#define PIVOT /* comment this to disable pivot use */
+
+// helper functions
+
+// wrapper around cublas<t>getrfBatched()
+cublasStatus_t cublasXgetrfBatched(cublasHandle_t handle, int n,
+                                   DATA_TYPE* const A[], int lda, int* P,
+                                   int* info, int batchSize) {
+#ifdef DOUBLE_PRECISION
+  return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
+#else
+  return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
+#endif
+}
+
+// wrapper around malloc
+// clears the allocated memory to 0
+// terminates the program if malloc fails
+void* xmalloc(size_t size) {
+  void* ptr = malloc(size);
+  if (ptr == NULL) {
+    printf("> ERROR: malloc for size %zu failed..\n", size);
+    exit(EXIT_FAILURE);
+  }
+  memset(ptr, 0, size);
+  return ptr;
+}
+
+// initialize identity matrix
+void initIdentityMatrix(DATA_TYPE* mat) {
+  // clear the matrix
+  memset(mat, 0, N * N * sizeof(DATA_TYPE));
+
+  // set all diagonal elements to 1
+  for (int i = 0; i < N; i++) {
+    mat[(i * N) + i] = 1.0;
+  }
+}
+
+// initialize matrix with all elements as 0
+void initZeroMatrix(DATA_TYPE* mat) {
+  memset(mat, 0, N * N * sizeof(DATA_TYPE));
+}
+
+// fill random values into a column-major matrix
+void initRandomMatrix(DATA_TYPE* mat) {
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      mat[(j * N) + i] =
+          (DATA_TYPE)1.0 + ((DATA_TYPE)rand() / (DATA_TYPE)RAND_MAX);
+    }
+  }
+
+  // make the matrix diagonally dominant to ensure it is invertible
+  for (int i = 0; i < N; i++) {
+    mat[(i * N) + i] += (DATA_TYPE)N;
+  }
+}
+
+// print column-major matrix
+void printMatrix(DATA_TYPE* mat) {
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      printf("%20.16f ", mat[(j * N) + i]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+// matrix multiplication
+void matrixMultiply(DATA_TYPE* res, DATA_TYPE* mat1, DATA_TYPE* mat2) {
+  initZeroMatrix(res);
+
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+        res[(j * N) + i] += mat1[(k * N) + i] * mat2[(j * N) + k];
+      }
+    }
+  }
+}
+
+// check matrix equality within a relative error bound
+bool checkRelativeError(DATA_TYPE* mat1, DATA_TYPE* mat2, DATA_TYPE maxError) {
+  DATA_TYPE err = (DATA_TYPE)0.0;
+  DATA_TYPE refNorm = (DATA_TYPE)0.0;
+  DATA_TYPE relError = (DATA_TYPE)0.0;
+  DATA_TYPE relMaxError = (DATA_TYPE)0.0;
+
+  for (int i = 0; i < N * N; i++) {
+    refNorm = abs(mat1[i]);
+    err = abs(mat1[i] - mat2[i]);
+
+    if (refNorm != 0.0 && err > 0.0) {
+      relError = err / refNorm;
+      relMaxError = MAX(relMaxError, relError);
+    }
+
+    if (relMaxError > maxError) return false;
+  }
+  return true;
+}
+
+// decode lower and upper matrix from single matrix
+// returned by getrfBatched()
+void getLUdecoded(DATA_TYPE* mat, DATA_TYPE* L, DATA_TYPE* U) {
+  // init L as identity matrix
+  initIdentityMatrix(L);
+
+  // copy lower triangular values from mat to L (skip diagonal)
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < i; j++) {
+      L[(j * N) + i] = mat[(j * N) + i];
+    }
+  }
+
+  // init U as all zero
+  initZeroMatrix(U);
+
+  // copy upper triangular values from mat to U
+  for (int i = 0; i
< N; i++) { + for (int j = i; j < N; j++) { + U[(j * N) + i] = mat[(j * N) + i]; + } + } +} + +// generate permutation matrix from pivot vector +void getPmatFromPivot(DATA_TYPE* Pmat, int* P) { + int pivot[N]; + + // pivot vector in base-1 + // convert it to base-0 + for (int i = 0; i < N; i++) { + P[i]--; + } + + // generate permutation vector from pivot + // initialize pivot with identity sequence + for (int k = 0; k < N; k++) { + pivot[k] = k; + } + + // swap the indices according to pivot vector + for (int k = 0; k < N; k++) { + int q = P[k]; + + // swap pivot(k) and pivot(q) + int s = pivot[k]; + int t = pivot[q]; + pivot[k] = t; + pivot[q] = s; + } + + // generate permutation matrix from pivot vector + initZeroMatrix(Pmat); + for (int i = 0; i < N; i++) { + int j = pivot[i]; + Pmat[(j * N) + i] = (DATA_TYPE)1.0; + } +} + +int main(int argc, char** argv) { + // cuBLAS variables + cublasStatus_t status; + cublasHandle_t handle; + + // host variables + size_t matSize = N * N * sizeof(DATA_TYPE); + + DATA_TYPE* h_AarrayInput; + DATA_TYPE* h_AarrayOutput; + DATA_TYPE* h_ptr_array[BATCH_SIZE]; + + int* h_pivotArray; + int* h_infoArray; + + // device variables + DATA_TYPE* d_Aarray; + DATA_TYPE** d_ptr_array; + + int* d_pivotArray; + int* d_infoArray; + + int err_count = 0; + + // seed the rand() function with time + srand(12345); + + // find cuda device + printf("> initializing..\n"); + int dev = findCudaDevice(argc, (const char**)argv); + if (dev == -1) { + return (EXIT_FAILURE); + } + + // initialize cuBLAS + status = cublasCreate(&handle); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("> ERROR: cuBLAS initialization failed..\n"); + return (EXIT_FAILURE); + } + +#ifdef DOUBLE_PRECISION + printf("> using DOUBLE precision..\n"); +#else + printf("> using SINGLE precision..\n"); +#endif + +#ifdef PIVOT + printf("> pivot ENABLED..\n"); +#else + printf("> pivot DISABLED..\n"); +#endif + + // allocate memory for host variables + h_AarrayInput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize); + h_AarrayOutput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize); + + h_pivotArray = (int*)xmalloc(N * BATCH_SIZE * sizeof(int)); + h_infoArray = (int*)xmalloc(BATCH_SIZE * sizeof(int)); + + // allocate memory for device variables + checkCudaErrors(cudaMalloc((void**)&d_Aarray, BATCH_SIZE * matSize)); + checkCudaErrors( + cudaMalloc((void**)&d_pivotArray, N * BATCH_SIZE * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_infoArray, BATCH_SIZE * sizeof(int))); + checkCudaErrors( + cudaMalloc((void**)&d_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*))); + + // fill matrix with random data + printf("> generating random matrices..\n"); + for (int i = 0; i < BATCH_SIZE; i++) { + initRandomMatrix(h_AarrayInput + (i * N * N)); + } + + // copy data to device from host + printf("> copying data from host memory to GPU memory..\n"); + checkCudaErrors(cudaMemcpy(d_Aarray, h_AarrayInput, BATCH_SIZE * matSize, + cudaMemcpyHostToDevice)); + + // create pointer array for matrices + for (int i = 0; i < BATCH_SIZE; i++) h_ptr_array[i] = d_Aarray + (i * N * N); + + // copy pointer array to device memory + checkCudaErrors(cudaMemcpy(d_ptr_array, h_ptr_array, + BATCH_SIZE * sizeof(DATA_TYPE*), + cudaMemcpyHostToDevice)); + + // perform LU decomposition + printf("> performing LU decomposition..\n"); +#ifdef PIVOT + status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray, + d_infoArray, BATCH_SIZE); +#else + status = cublasXgetrfBatched(handle, N, d_ptr_array, N, NULL, d_infoArray, + BATCH_SIZE); +#endif /* PIVOT */ + if 
(status != CUBLAS_STATUS_SUCCESS) { + printf("> ERROR: cublasDgetrfBatched() failed with error %s..\n", + _cudaGetErrorEnum(status)); + return (EXIT_FAILURE); + } + + // copy data to host from device + printf("> copying data from GPU memory to host memory..\n"); + checkCudaErrors(cudaMemcpy(h_AarrayOutput, d_Aarray, BATCH_SIZE * matSize, + cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_infoArray, d_infoArray, BATCH_SIZE * sizeof(int), + cudaMemcpyDeviceToHost)); +#ifdef PIVOT + checkCudaErrors(cudaMemcpy(h_pivotArray, d_pivotArray, + N * BATCH_SIZE * sizeof(int), + cudaMemcpyDeviceToHost)); +#endif /* PIVOT */ + + // verify the result + printf("> verifying the result..\n"); + for (int i = 0; i < BATCH_SIZE; i++) { + if (h_infoArray[i] == 0) { + DATA_TYPE* A = h_AarrayInput + (i * N * N); + DATA_TYPE* LU = h_AarrayOutput + (i * N * N); + DATA_TYPE L[N * N]; + DATA_TYPE U[N * N]; + getLUdecoded(LU, L, U); + + // test P * A = L * U + int* P = h_pivotArray + (i * N); + DATA_TYPE Pmat[N * N]; +#ifdef PIVOT + getPmatFromPivot(Pmat, P); +#else + initIdentityMatrix(Pmat); +#endif /* PIVOT */ + + // perform matrix multiplication + DATA_TYPE PxA[N * N]; + DATA_TYPE LxU[N * N]; + matrixMultiply(PxA, Pmat, A); + matrixMultiply(LxU, L, U); + + // check for equality of matrices + if (!checkRelativeError(PxA, LxU, (DATA_TYPE)MAX_ERROR)) { + printf("> ERROR: accuracy check failed for matrix number %05d..\n", + i + 1); + err_count++; + } + + } else if (h_infoArray[i] > 0) { + printf( + "> execution for matrix %05d is successful, but U is singular and " + "U(%d,%d) = 0..\n", + i + 1, h_infoArray[i] - 1, h_infoArray[i] - 1); + } else // (h_infoArray[i] < 0) + { + printf("> ERROR: matrix %05d have an illegal value at index %d = %lf..\n", + i + 1, -h_infoArray[i], + *(h_AarrayInput + (i * N * N) + (-h_infoArray[i]))); + } + } + + // free device variables + checkCudaErrors(cudaFree(d_ptr_array)); + checkCudaErrors(cudaFree(d_infoArray)); + checkCudaErrors(cudaFree(d_pivotArray)); + checkCudaErrors(cudaFree(d_Aarray)); + + // free host variables + if (h_infoArray) free(h_infoArray); + if (h_pivotArray) free(h_pivotArray); + if (h_AarrayOutput) free(h_AarrayOutput); + if (h_AarrayInput) free(h_AarrayInput); + + // destroy cuBLAS handle + status = cublasDestroy(handle); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("> ERROR: cuBLAS uninitialization failed..\n"); + return (EXIT_FAILURE); + } + + if (err_count > 0) { + printf("> TEST FAILED for %d matrices, with precision: %g\n", err_count, + MAX_ERROR); + return (EXIT_FAILURE); + } + + printf("> TEST SUCCESSFUL, with precision: %g\n", MAX_ERROR); + return (EXIT_SUCCESS); +} diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln new file mode 100644 index 00000000..96272bc1 --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS_LU", "simpleCUBLAS_LU_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj new file mode 100644 index 00000000..7599aeda --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj @@ -0,0 +1,113 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_LU_vs2017 + simpleCUBLAS_LU + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS_LU.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + --threads 0 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln new file mode 100644 index 00000000..950f5c83 --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS_LU", "simpleCUBLAS_LU_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj new file mode 100644 index 00000000..154dac0a --- /dev/null +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj @@ -0,0 +1,109 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCUBLAS_LU_vs2019 + simpleCUBLAS_LU + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + 
true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCUBLAS_LU.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + --threads 0 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleCUFFT/Makefile b/Samples/simpleCUFFT/Makefile index 9e9475ee..c716cd0c 100644 --- a/Samples/simpleCUFFT/Makefile +++ b/Samples/simpleCUFFT/Makefile @@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleCUFFT is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/simpleCUFFT/README.md b/Samples/simpleCUFFT/README.md index ecf41085..67227805 100644 --- a/Samples/simpleCUFFT/README.md +++ b/Samples/simpleCUFFT/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj index 0914f439..914b65a7 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj index 075276a2..339d7959 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCudaGraphs/README.md b/Samples/simpleCudaGraphs/README.md index c6982a12..9e044f33 100644 --- a/Samples/simpleCudaGraphs/README.md +++ b/Samples/simpleCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaLaunchHostFunc, cudaGraphCreat ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
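For orientation, the pattern this sample exercises is capturing work submitted to a stream into a CUDA graph and then replaying it. The sketch below is illustrative rather than the sample's actual code; `myKernel` is a placeholder and `checkCudaErrors` comes from the samples' common `helper_cuda.h`.
```
#include <cuda_runtime.h>
#include <helper_cuda.h>

__global__ void myKernel(float *data, int n) {  // placeholder kernel
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;
}

void runWithGraph(float *d_data, int n, cudaStream_t stream) {
  cudaGraph_t graph;
  cudaGraphExec_t graphExec;

  // Record the stream work into a graph instead of executing it immediately.
  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  myKernel<<<(n + 255) / 256, 256, 0, stream>>>(d_data, n);
  checkCudaErrors(cudaStreamEndCapture(stream, &graph));

  // Instantiate once (CUDA 11.x signature), then launch the graph on the stream.
  checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  checkCudaErrors(cudaGraphExecDestroy(graphExec));
  checkCudaErrors(cudaGraphDestroy(graph));
}
```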
## Build and Run diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs.cu b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu index b62d04e2..82b6c160 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs.cu +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu @@ -393,7 +393,7 @@ int main(int argc, char **argv) { float *inputVec_d = NULL, *inputVec_h = NULL; double *outputVec_d = NULL, *result_d; - inputVec_h = (float *)malloc(sizeof(float) * size); + checkCudaErrors(cudaMallocHost(&inputVec_h, sizeof(float) * size)); checkCudaErrors(cudaMalloc(&inputVec_d, sizeof(float) * size)); checkCudaErrors(cudaMalloc(&outputVec_d, sizeof(double) * maxBlocks)); checkCudaErrors(cudaMalloc(&result_d, sizeof(double))); @@ -408,5 +408,6 @@ int main(int argc, char **argv) { checkCudaErrors(cudaFree(inputVec_d)); checkCudaErrors(cudaFree(outputVec_d)); checkCudaErrors(cudaFree(result_d)); + checkCudaErrors(cudaFreeHost(inputVec_h)); return EXIT_SUCCESS; } diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj index 6cc0c2f1..5a48206e 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj index 3f977e78..e7aeecd2 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleD3D11/README.md b/Samples/simpleD3D11/README.md index 1262eeaa..eb8d6428 100644 --- a/Samples/simpleD3D11/README.md +++ b/Samples/simpleD3D11/README.md @@ -30,7 +30,7 @@ cudaD3D11GetDevice, cudaImportExternalSemaphore, cudaImportExternalMemory, cudaE ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj index acd1e849..651c1a4c 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj index fb464f50..c3bcdbe4 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleD3D12/README.md b/Samples/simpleD3D12/README.md index 0622cfdc..4c60ed5b 100644 --- a/Samples/simpleD3D12/README.md +++ b/Samples/simpleD3D12/README.md @@ -30,7 +30,7 @@ cudaWaitExternalSemaphoresAsync, cudaSignalExternalSemaphoresAsync, cudaImportEx ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
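Synchronization between the D3D12 queue and CUDA in this sample goes through an imported external semaphore. The following is a hedged sketch of the wait/signal calls involved, assuming `extSem` was created earlier with `cudaImportExternalSemaphore` and `fenceValue` mirrors the D3D12 fence; it is not the sample's exact code.
```
#include <cuda_runtime.h>
#include <helper_cuda.h>

void syncWithD3D12(cudaExternalSemaphore_t extSem,
                   unsigned long long &fenceValue, cudaStream_t stream) {
  // Wait until the D3D12 side has reached the current fence value.
  cudaExternalSemaphoreWaitParams waitParams = {};
  waitParams.params.fence.value = fenceValue;
  checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSem, &waitParams, 1, stream));

  // ... enqueue CUDA work on `stream` that uses the shared resource ...

  // Signal the next fence value so D3D12 can wait for the CUDA work.
  cudaExternalSemaphoreSignalParams signalParams = {};
  signalParams.params.fence.value = ++fenceValue;
  checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSem, &signalParams, 1, stream));
}
```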
## Build and Run diff --git a/Samples/simpleD3D12/simpleD3D12.cpp b/Samples/simpleD3D12/simpleD3D12.cpp index 5eb64764..fe072d30 100755 --- a/Samples/simpleD3D12/simpleD3D12.cpp +++ b/Samples/simpleD3D12/simpleD3D12.cpp @@ -266,17 +266,8 @@ void DX12CudaInterop::LoadAssets() { parameter.InitAsDescriptorTable(1, &range, D3D12_SHADER_VISIBILITY_VERTEX); D3D12_ROOT_SIGNATURE_FLAGS rootSignatureFlags = - D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT | // Only - // the - // input - // assembler - // stage - // needs - // access - // to - // the - // constant - // buffer. + // Only the input assembler stage needs access to the constant buffer. + D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT | D3D12_ROOT_SIGNATURE_FLAG_DENY_DOMAIN_SHADER_ROOT_ACCESS | D3D12_ROOT_SIGNATURE_FLAG_DENY_GEOMETRY_SHADER_ROOT_ACCESS | D3D12_ROOT_SIGNATURE_FLAG_DENY_HULL_SHADER_ROOT_ACCESS | @@ -390,6 +381,7 @@ void DX12CudaInterop::LoadAssets() { checkCudaErrors( cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc)); + CloseHandle(sharedHandle); cudaExternalMemoryBufferDesc externalMemoryBufferDesc; memset(&externalMemoryBufferDesc, 0, sizeof(externalMemoryBufferDesc)); @@ -468,6 +460,7 @@ void DX12CudaInterop::OnDestroy() { WaitForGpu(); checkCudaErrors(cudaDestroyExternalSemaphore(m_externalSemaphore)); checkCudaErrors(cudaDestroyExternalMemory(m_externalMemory)); + checkCudaErrors(cudaFree(m_cudaDevVertptr)); CloseHandle(m_fenceEvent); } diff --git a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj index 7932d3de..e0bee149 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -120,6 +120,6 @@ - + diff --git a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj index 72fe6656..ec724d05 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj @@ -39,7 +39,7 @@ - + @@ -121,6 +121,6 @@ - + diff --git a/Samples/simpleDrvRuntime/README.md b/Samples/simpleDrvRuntime/README.md index bce62a82..8c09f93b 100644 --- a/Samples/simpleDrvRuntime/README.md +++ b/Samples/simpleDrvRuntime/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaMalloc, cudaStreamCreateWithFlags ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
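One functional change in this update is that the host vectors are now allocated with `cudaMallocHost` instead of `malloc` (see the simpleDrvRuntime source diff further below). Pinned (page-locked) host memory allows genuinely asynchronous host/device copies. A minimal sketch of the pattern, assuming a device buffer `d_A` and a stream already exist:
```
#include <cuda_runtime.h>
#include <helper_cuda.h>

void copyPinned(float *d_A, size_t n, cudaStream_t stream) {
  float *h_A = NULL;
  size_t size = n * sizeof(float);

  checkCudaErrors(cudaMallocHost(&h_A, size));  // page-locked host allocation
  // ... fill h_A on the host ...
  checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  checkCudaErrors(cudaFreeHost(h_A));
}
```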
## Build and Run diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp b/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp index f2926be1..cd2c0c3b 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime.cpp @@ -117,9 +117,9 @@ int main(int argc, char **argv) { cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel")); // Allocate input vectors h_A and h_B in host memory - h_A = (float *)malloc(size); - h_B = (float *)malloc(size); - h_C = (float *)malloc(size); + checkCudaErrors(cudaMallocHost(&h_A, size)); + checkCudaErrors(cudaMallocHost(&h_B, size)); + checkCudaErrors(cudaMallocHost(&h_C, size)); // Initialize input vectors RandomInit(h_A, N); @@ -179,15 +179,15 @@ int CleanupNoFailure(CUcontext &cuContext) { // Free host memory if (h_A) { - free(h_A); + checkCudaErrors(cudaFreeHost(h_A)); } if (h_B) { - free(h_B); + checkCudaErrors(cudaFreeHost(h_B)); } if (h_C) { - free(h_C); + checkCudaErrors(cudaFreeHost(h_C)); } checkCudaDrvErrors(cuCtxDestroy(cuContext)); diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj index 862b0dc0..94e29419 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj index 879c0699..42556e0c 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleGL/README.md b/Samples/simpleGL/README.md index 35a42076..5176ee1b 100644 --- a/Samples/simpleGL/README.md +++ b/Samples/simpleGL/README.md @@ -30,7 +30,7 @@ cudaGraphicsMapResources, cudaGraphicsUnmapResources, cudaGraphicsResourceGetMap ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleGL/simpleGL_vs2017.vcxproj b/Samples/simpleGL/simpleGL_vs2017.vcxproj index 87d2f599..d096e815 100644 --- a/Samples/simpleGL/simpleGL_vs2017.vcxproj +++ b/Samples/simpleGL/simpleGL_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/simpleGL/simpleGL_vs2019.vcxproj b/Samples/simpleGL/simpleGL_vs2019.vcxproj index d5e1f2e3..527b22d0 100644 --- a/Samples/simpleGL/simpleGL_vs2019.vcxproj +++ b/Samples/simpleGL/simpleGL_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/simpleIPC/README.md b/Samples/simpleIPC/README.md index 445db7c9..3fcb740a 100644 --- a/Samples/simpleIPC/README.md +++ b/Samples/simpleIPC/README.md @@ -30,7 +30,7 @@ cudaIpcGetEventHandle, cudaIpcOpenMemHandle, cudaIpcCloseMemHandle, cudaMemcpyAs ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
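For orientation, the IPC flow this sample exercises is: the owning process exports a device allocation as a handle, and a second process maps it. A minimal sketch using the APIs named above (not the sample's exact code; transporting the handle between processes is left out, and `nbytes` is assumed):
```
// Process A: export a device allocation
float *d_buf = NULL;
checkCudaErrors(cudaMalloc(&d_buf, nbytes));
cudaIpcMemHandle_t handle;
checkCudaErrors(cudaIpcGetMemHandle(&handle, d_buf));
// ... send `handle` to process B (pipe, socket, shared memory, ...) ...

// Process B: map the same allocation
void *d_mapped = NULL;
checkCudaErrors(cudaIpcOpenMemHandle(&d_mapped, handle, cudaIpcMemLazyEnablePeerAccess));
// ... use d_mapped like any other device pointer ...
checkCudaErrors(cudaIpcCloseMemHandle(d_mapped));
```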
## Build and Run diff --git a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj index 47270332..640802b1 100644 --- a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj index 271da1f3..8c03b709 100644 --- a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/README.md b/Samples/simpleVoteIntrinsics/README.md index cbf4ec3f..314de841 100644 --- a/Samples/simpleVoteIntrinsics/README.md +++ b/Samples/simpleVoteIntrinsics/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj index 0170b401..62b48298 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj index eaf28122..630c35e5 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleVulkan/README.md b/Samples/simpleVulkan/README.md index de1bbe69..4cbb0122 100644 --- a/Samples/simpleVulkan/README.md +++ b/Samples/simpleVulkan/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaImportExternalS ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
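The CUDA side of the interop imports the Vulkan-allocated height-map buffer through the external memory APIs listed above. A hedged sketch of that import on Linux (Windows uses an opaque Win32 handle instead of a file descriptor; `fd` and `size` are assumed to come from the Vulkan exporter and are not defined here):
```
cudaExternalMemoryHandleDesc memDesc = {};
memDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
memDesc.handle.fd = fd;   // exported from Vulkan via VK_KHR_external_memory_fd
memDesc.size = size;

cudaExternalMemory_t extMem;
checkCudaErrors(cudaImportExternalMemory(&extMem, &memDesc));

cudaExternalMemoryBufferDesc bufDesc = {};
bufDesc.offset = 0;
bufDesc.size = size;

void *d_heightMap = NULL;
checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&d_heightMap, extMem, &bufDesc));
// d_heightMap can now be passed to CUDA kernels such as the sinewave kernel below
```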
## Build and Run diff --git a/Samples/simpleVulkan/SineWaveSimulation.cu b/Samples/simpleVulkan/SineWaveSimulation.cu index 68e63d7a..f0289733 100644 --- a/Samples/simpleVulkan/SineWaveSimulation.cu +++ b/Samples/simpleVulkan/SineWaveSimulation.cu @@ -29,110 +29,106 @@ #include #include -__global__ void sinewave(float *heightMap, unsigned int width, unsigned int height, float time) -{ - const float freq = 4.0f; - const size_t stride = gridDim.x * blockDim.x; +__global__ void sinewave(float *heightMap, unsigned int width, + unsigned int height, float time) { + const float freq = 4.0f; + const size_t stride = gridDim.x * blockDim.x; - // Iterate through the entire array in a way that is - // independent of the grid configuration - for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < width * height; tid += stride) { - // Calculate the x, y coordinates - const size_t y = tid / width; - const size_t x = tid - y * width; - // Normalize x, y to [0,1] - const float u = ((2.0f * x) / width) - 1.0f; - const float v = ((2.0f * y) / height) - 1.0f; - // Calculate the new height value - const float w = 0.5f * sinf(u * freq + time) * cosf(v * freq + time); - // Store this new height value - heightMap[tid] = w; - } + // Iterate through the entire array in a way that is + // independent of the grid configuration + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < width * height; + tid += stride) { + // Calculate the x, y coordinates + const size_t y = tid / width; + const size_t x = tid - y * width; + // Normalize x, y to [0,1] + const float u = ((2.0f * x) / width) - 1.0f; + const float v = ((2.0f * y) / height) - 1.0f; + // Calculate the new height value + const float w = 0.5f * sinf(u * freq + time) * cosf(v * freq + time); + // Store this new height value + heightMap[tid] = w; + } } -SineWaveSimulation::SineWaveSimulation(size_t width, size_t height) - : m_heightMap(nullptr), m_width(width), m_height(height) -{ +SineWaveSimulation::SineWaveSimulation(size_t width, size_t height) + : m_heightMap(nullptr), m_width(width), m_height(height) {} + +void SineWaveSimulation::initCudaLaunchConfig(int device) { + cudaDeviceProp prop = {}; + checkCudaErrors(cudaSetDevice(device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); + + // We don't need large block sizes, since there's not much inter-thread + // communication + m_threads = prop.warpSize; + + // Use the occupancy calculator and fill the gpu as best as we can + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &m_blocks, sinewave, prop.warpSize, 0)); + m_blocks *= prop.multiProcessorCount; + + // Go ahead and the clamp the blocks to the minimum needed for this + // height/width + m_blocks = std::min(m_blocks, + (int)((m_width * m_height + m_threads - 1) / m_threads)); } -void SineWaveSimulation::initCudaLaunchConfig(int device) -{ - cudaDeviceProp prop = {}; - checkCudaErrors(cudaSetDevice(device)); - checkCudaErrors(cudaGetDeviceProperties(&prop, device)); +int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE) { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; - // We don't need large block sizes, since there's not much inter-thread communication - m_threads = prop.warpSize; + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); - // Use the occupancy calculator and fill the gpu as best as we can - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, sinewave, prop.warpSize, 0)); - m_blocks *= 
prop.multiProcessorCount; + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } - // Go ahead and the clamp the blocks to the minimum needed for this height/width - m_blocks = std::min(m_blocks, (int)((m_width * m_height + m_threads - 1) / m_threads)); -} + // Find the GPU which is selected by Vulkan + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); -int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE) -{ - int current_device = 0; - int device_count = 0; - int devices_prohibited = 0; + if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + // Compare the cuda device UUID with vulkan UUID + int ret = memcmp((void *)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE); + if (ret == 0) { + checkCudaErrors(cudaSetDevice(current_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, deviceProp.name, deviceProp.major, + deviceProp.minor); - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceCount(&device_count)); + return current_device; + } - if (device_count == 0) { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); + } else { + devices_prohibited++; } - // Find the GPU which is selected by Vulkan - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); + current_device++; + } - if ((deviceProp.computeMode != cudaComputeModeProhibited)) { - // Compare the cuda device UUID with vulkan UUID - int ret = memcmp((void*)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE); - if (ret == 0) - { - checkCudaErrors(cudaSetDevice(current_device)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, deviceProp.name, deviceProp.major, - deviceProp.minor); + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No Vulkan-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } - return current_device; - } - - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, - "CUDA error:" - " No Vulkan-CUDA Interop capable GPU found.\n"); - exit(EXIT_FAILURE); - } - - return -1; + return -1; } -SineWaveSimulation::~SineWaveSimulation() -{ - m_heightMap = NULL; +SineWaveSimulation::~SineWaveSimulation() { m_heightMap = NULL; } + +void SineWaveSimulation::initSimulation(float *heights) { + m_heightMap = heights; } -void SineWaveSimulation::initSimulation(float *heights) -{ - m_heightMap = heights; -} - -void SineWaveSimulation::stepSimulation(float time, cudaStream_t stream) -{ - sinewave <<< m_blocks, m_threads, 0, stream >>> (m_heightMap, m_width, m_height, time); - getLastCudaError("Failed to launch CUDA simulation"); +void SineWaveSimulation::stepSimulation(float time, cudaStream_t stream) { + sinewave<<>>(m_heightMap, m_width, m_height, + time); + getLastCudaError("Failed to launch CUDA simulation"); } diff --git a/Samples/simpleVulkan/SineWaveSimulation.h b/Samples/simpleVulkan/SineWaveSimulation.h index dc889b4b..aadc4828 100644 --- a/Samples/simpleVulkan/SineWaveSimulation.h +++ b/Samples/simpleVulkan/SineWaveSimulation.h @@ -34,25 +34,21 @@ #include #include "linmath.h" -class SineWaveSimulation -{ - float *m_heightMap; - size_t m_width, m_height; - int m_blocks, m_threads; -public: - 
SineWaveSimulation(size_t width, size_t height); - ~SineWaveSimulation(); - void initSimulation(float *heightMap); - void stepSimulation(float time, cudaStream_t stream = 0); - void initCudaLaunchConfig(int device); - int initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE); +class SineWaveSimulation { + float *m_heightMap; + size_t m_width, m_height; + int m_blocks, m_threads; - size_t getWidth() const { - return m_width; - } - size_t getHeight() const { - return m_height; - } + public: + SineWaveSimulation(size_t width, size_t height); + ~SineWaveSimulation(); + void initSimulation(float *heightMap); + void stepSimulation(float time, cudaStream_t stream = 0); + void initCudaLaunchConfig(int device); + int initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE); + + size_t getWidth() const { return m_width; } + size_t getHeight() const { return m_height; } }; -#endif // __SINESIM_H__ +#endif // __SINESIM_H__ diff --git a/Samples/simpleVulkan/VulkanBaseApp.cpp b/Samples/simpleVulkan/VulkanBaseApp.cpp index 05dece53..fbc4049c 100644 --- a/Samples/simpleVulkan/VulkanBaseApp.cpp +++ b/Samples/simpleVulkan/VulkanBaseApp.cpp @@ -55,1665 +55,1886 @@ #define countof(x) (sizeof(x) / sizeof(*(x))) #endif -static const char *validationLayers[] = { "VK_LAYER_KHRONOS_validation" }; +static const char *validationLayers[] = {"VK_LAYER_KHRONOS_validation"}; static const size_t MAX_FRAMES_IN_FLIGHT = 5; -void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) -{ - VulkanBaseApp *app = reinterpret_cast(glfwGetWindowUserPointer(window)); - app->m_framebufferResized = true; +void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) { + VulkanBaseApp *app = + reinterpret_cast(glfwGetWindowUserPointer(window)); + app->m_framebufferResized = true; } -static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageType, const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, void *pUserData) -{ - std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; +static VKAPI_ATTR VkBool32 VKAPI_CALL +debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + VkDebugUtilsMessageTypeFlagsEXT messageType, + const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, + void *pUserData) { + std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; - return VK_FALSE; + return VK_FALSE; } -VulkanBaseApp::VulkanBaseApp(const std::string& appName, bool enableValidation) : - m_appName(appName), - m_enableValidation(enableValidation), - m_instance(VK_NULL_HANDLE), - m_window(nullptr), - m_debugMessenger(VK_NULL_HANDLE), - m_surface(VK_NULL_HANDLE), - m_physicalDevice(VK_NULL_HANDLE), - m_device(VK_NULL_HANDLE), - m_graphicsQueue(VK_NULL_HANDLE), - m_presentQueue(VK_NULL_HANDLE), - m_swapChain(VK_NULL_HANDLE), - m_vkDeviceUUID(), - m_swapChainImages(), - m_swapChainFormat(), - m_swapChainExtent(), - m_swapChainImageViews(), - m_shaderFiles(), - m_renderPass(), - m_pipelineLayout(VK_NULL_HANDLE), - m_graphicsPipeline(VK_NULL_HANDLE), - m_swapChainFramebuffers(), - m_commandPool(VK_NULL_HANDLE), - m_commandBuffers(), - m_imageAvailableSemaphores(), - m_renderFinishedSemaphores(), - m_inFlightFences(), - m_uniformBuffers(), - m_uniformMemory(), - m_descriptorSetLayout(VK_NULL_HANDLE), - m_descriptorPool(VK_NULL_HANDLE), - m_descriptorSets(), - m_depthImage(VK_NULL_HANDLE), - m_depthImageMemory(VK_NULL_HANDLE), - m_depthImageView(VK_NULL_HANDLE), - m_currentFrame(0), - 
m_framebufferResized(false) -{ -} +VulkanBaseApp::VulkanBaseApp(const std::string &appName, bool enableValidation) + : m_appName(appName), + m_enableValidation(enableValidation), + m_instance(VK_NULL_HANDLE), + m_window(nullptr), + m_debugMessenger(VK_NULL_HANDLE), + m_surface(VK_NULL_HANDLE), + m_physicalDevice(VK_NULL_HANDLE), + m_device(VK_NULL_HANDLE), + m_graphicsQueue(VK_NULL_HANDLE), + m_presentQueue(VK_NULL_HANDLE), + m_swapChain(VK_NULL_HANDLE), + m_vkDeviceUUID(), + m_swapChainImages(), + m_swapChainFormat(), + m_swapChainExtent(), + m_swapChainImageViews(), + m_shaderFiles(), + m_renderPass(), + m_pipelineLayout(VK_NULL_HANDLE), + m_graphicsPipeline(VK_NULL_HANDLE), + m_swapChainFramebuffers(), + m_commandPool(VK_NULL_HANDLE), + m_commandBuffers(), + m_imageAvailableSemaphores(), + m_renderFinishedSemaphores(), + m_inFlightFences(), + m_uniformBuffers(), + m_uniformMemory(), + m_descriptorSetLayout(VK_NULL_HANDLE), + m_descriptorPool(VK_NULL_HANDLE), + m_descriptorSets(), + m_depthImage(VK_NULL_HANDLE), + m_depthImageMemory(VK_NULL_HANDLE), + m_depthImageView(VK_NULL_HANDLE), + m_currentFrame(0), + m_framebufferResized(false) {} -VkExternalSemaphoreHandleTypeFlagBits VulkanBaseApp::getDefaultSemaphoreHandleType() -{ +VkExternalSemaphoreHandleTypeFlagBits +VulkanBaseApp::getDefaultSemaphoreHandleType() { #ifdef _WIN64 - return IsWindows8OrGreater() ? - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8OrGreater() + ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ } -VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() -{ +VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() { #ifdef _WIN64 - return IsWindows8Point1OrGreater() ? - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT : - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8Point1OrGreater() + ? 
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ } -VulkanBaseApp::~VulkanBaseApp() -{ - cleanupSwapChain(); +VulkanBaseApp::~VulkanBaseApp() { + cleanupSwapChain(); - if (m_descriptorSetLayout != VK_NULL_HANDLE) { - vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); - } + if (m_descriptorSetLayout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); + } - for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { - vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); - vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); - vkDestroyFence(m_device, m_inFlightFences[i], nullptr); - } - if (m_commandPool != VK_NULL_HANDLE) { - vkDestroyCommandPool(m_device, m_commandPool, nullptr); - } +#ifdef _VK_TIMELINE_SEMAPHORE + if (m_vkPresentationSemaphore != VK_NULL_HANDLE) { + vkDestroySemaphore(m_device, m_vkPresentationSemaphore, nullptr); + } +#endif /* _VK_TIMELINE_SEMAPHORE */ - if (m_device != VK_NULL_HANDLE) { - vkDestroyDevice(m_device, nullptr); - } + for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { + vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); + vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); + vkDestroyFence(m_device, m_inFlightFences[i], nullptr); + } + if (m_commandPool != VK_NULL_HANDLE) { + vkDestroyCommandPool(m_device, m_commandPool, nullptr); + } - if (m_enableValidation) { - PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkDestroyDebugUtilsMessengerEXT"); - if (func != nullptr) { - func(m_instance, m_debugMessenger, nullptr); - } - } + if (m_device != VK_NULL_HANDLE) { + vkDestroyDevice(m_device, nullptr); + } - if (m_surface != VK_NULL_HANDLE) { - vkDestroySurfaceKHR(m_instance, m_surface, nullptr); + if (m_enableValidation) { + PFN_vkDestroyDebugUtilsMessengerEXT func = + (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr( + m_instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func != nullptr) { + func(m_instance, m_debugMessenger, nullptr); } + } - if (m_instance != VK_NULL_HANDLE) { - vkDestroyInstance(m_instance, nullptr); - } + if (m_surface != VK_NULL_HANDLE) { + vkDestroySurfaceKHR(m_instance, m_surface, nullptr); + } - if (m_window) { - glfwDestroyWindow(m_window); - } + if (m_instance != VK_NULL_HANDLE) { + vkDestroyInstance(m_instance, nullptr); + } - glfwTerminate(); + if (m_window) { + glfwDestroyWindow(m_window); + } + + glfwTerminate(); } -void VulkanBaseApp::init() -{ - initWindow(); - initVulkan(); +void VulkanBaseApp::init() { + initWindow(); + initVulkan(); } -VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() -{ - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandPool = m_commandPool; - allocInfo.commandBufferCount = 1; +VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() { + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandPool = m_commandPool; + allocInfo.commandBufferCount = 1; - VkCommandBuffer commandBuffer; - vkAllocateCommandBuffers(m_device, 
&allocInfo, &commandBuffer); + VkCommandBuffer commandBuffer; + vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer); - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - vkBeginCommandBuffer(commandBuffer, &beginInfo); + vkBeginCommandBuffer(commandBuffer, &beginInfo); - return commandBuffer; + return commandBuffer; } -void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) -{ - vkEndCommandBuffer(commandBuffer); +void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) { + vkEndCommandBuffer(commandBuffer); - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffer; - vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); - vkQueueWaitIdle(m_graphicsQueue); + vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); + vkQueueWaitIdle(m_graphicsQueue); - vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); + vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); } -void VulkanBaseApp::initWindow() -{ - glfwInit(); +void VulkanBaseApp::initWindow() { + glfwInit(); - glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); - glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); - m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); - glfwSetWindowUserPointer(m_window, this); - glfwSetFramebufferSizeCallback(m_window, resizeCallback); + m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); + glfwSetWindowUserPointer(m_window, this); + glfwSetFramebufferSizeCallback(m_window, resizeCallback); } - -std::vector VulkanBaseApp::getRequiredExtensions() const -{ - return std::vector(); +std::vector VulkanBaseApp::getRequiredExtensions() const { + return std::vector(); } -std::vector VulkanBaseApp::getRequiredDeviceExtensions() const -{ - return std::vector(); +std::vector VulkanBaseApp::getRequiredDeviceExtensions() const { + return std::vector(); } -void VulkanBaseApp::initVulkan() -{ - createInstance(); - createSurface(); - createDevice(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createDescriptorSetLayout(); - createGraphicsPipeline(); - createCommandPool(); - createDepthResources(); - createFramebuffers(); - initVulkanApp(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); - createSyncObjects(); +void VulkanBaseApp::initVulkan() { + createInstance(); + createSurface(); + createDevice(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createCommandPool(); + createDepthResources(); + createFramebuffers(); + initVulkanApp(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + createSyncObjects(); } #ifdef _WIN64 -class WindowsSecurityAttributes -{ -protected: - SECURITY_ATTRIBUTES m_winSecurityAttributes; - PSECURITY_DESCRIPTOR 
m_winPSecurityDescriptor; +class WindowsSecurityAttributes { + protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; -public: - WindowsSecurityAttributes(); - SECURITY_ATTRIBUTES *operator&(); - ~WindowsSecurityAttributes(); + public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES *operator&(); + ~WindowsSecurityAttributes(); }; -WindowsSecurityAttributes::WindowsSecurityAttributes() -{ - m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); - if (!m_winPSecurityDescriptor) { - throw std::runtime_error("Failed to allocate memory for security descriptor"); - } +WindowsSecurityAttributes::WindowsSecurityAttributes() { + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( + 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); + if (!m_winPSecurityDescriptor) { + throw std::runtime_error( + "Failed to allocate memory for security descriptor"); + } - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); + InitializeSecurityDescriptor(m_winPSecurityDescriptor, + SECURITY_DESCRIPTOR_REVISION); - SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; - AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = + SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, + 0, 0, 0, 0, 0, ppSID); - EXPLICIT_ACCESS explicitAccess; - ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); - explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; - explicitAccess.grfAccessMode = SET_ACCESS; - explicitAccess.grfInheritance = INHERIT_ONLY; - explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; - explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; - explicitAccess.Trustee.ptstrName = (LPTSTR) * ppSID; + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = + STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; - SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); - SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); - m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); - m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; - m_winSecurityAttributes.bInheritHandle = TRUE; + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; } -SECURITY_ATTRIBUTES * -WindowsSecurityAttributes::operator&() -{ - return &m_winSecurityAttributes; +SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { + return 
&m_winSecurityAttributes; } -WindowsSecurityAttributes::~WindowsSecurityAttributes() -{ - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); +WindowsSecurityAttributes::~WindowsSecurityAttributes() { + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - if (*ppSID) { - FreeSid(*ppSID); - } - if (*ppACL) { - LocalFree(*ppACL); - } - free(m_winPSecurityDescriptor); + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); } #endif /* _WIN64 */ - -static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, const std::vector& candidates, VkImageTiling tiling, VkFormatFeatureFlags features) -{ - for (VkFormat format : candidates) { - VkFormatProperties props; - vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); - if (tiling == VK_IMAGE_TILING_LINEAR && (props.linearTilingFeatures & features) == features) { - return format; - } - else if (tiling == VK_IMAGE_TILING_OPTIMAL && (props.optimalTilingFeatures & features) == features) { - return format; - } +static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, + const std::vector &candidates, + VkImageTiling tiling, + VkFormatFeatureFlags features) { + for (VkFormat format : candidates) { + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); + if (tiling == VK_IMAGE_TILING_LINEAR && + (props.linearTilingFeatures & features) == features) { + return format; + } else if (tiling == VK_IMAGE_TILING_OPTIMAL && + (props.optimalTilingFeatures & features) == features) { + return format; } - throw std::runtime_error("Failed to find supported format!"); + } + throw std::runtime_error("Failed to find supported format!"); } -static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, uint32_t typeFilter, VkMemoryPropertyFlags properties) -{ - VkPhysicalDeviceMemoryProperties memProperties; - vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); - for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { - if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { - return i; - } +static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, + uint32_t typeFilter, + VkMemoryPropertyFlags properties) { + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if (typeFilter & (1 << i) && + (memProperties.memoryTypes[i].propertyFlags & properties) == + properties) { + return i; } - return ~0; + } + return ~0; } -static bool supportsValidationLayers() -{ - std::vector availableLayers; - uint32_t layerCount; +static bool supportsValidationLayers() { + std::vector availableLayers; + uint32_t layerCount; - vkEnumerateInstanceLayerProperties(&layerCount, nullptr); - availableLayers.resize(layerCount); - vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + availableLayers.resize(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); - for (const char * layerName : validationLayers) { - bool layerFound = false; + for (const char *layerName : validationLayers) { + bool layerFound = false; - for (const auto & layerProperties : 
availableLayers) { - if (strcmp(layerName, layerProperties.layerName) == 0) { - layerFound = true; - break; - } - } - - if (!layerFound) { - return false; - } + for (const auto &layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } } - return true; + if (!layerFound) { + return false; + } + } + + return true; } -void VulkanBaseApp::createInstance() -{ - if (m_enableValidation && !supportsValidationLayers()) { - throw std::runtime_error("Validation requested, but not supported!"); - } +void VulkanBaseApp::createInstance() { + if (m_enableValidation && !supportsValidationLayers()) { + throw std::runtime_error("Validation requested, but not supported!"); + } - VkApplicationInfo appInfo = {}; - appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - appInfo.pApplicationName = m_appName.c_str(); - appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.pEngineName = "No Engine"; - appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.apiVersion = VK_API_VERSION_1_0; + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = m_appName.c_str(); + appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_2; - VkInstanceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - createInfo.pApplicationInfo = &appInfo; + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; - std::vector exts = getRequiredExtensions(); + std::vector exts = getRequiredExtensions(); - { - uint32_t glfwExtensionCount = 0; - const char **glfwExtensions; + { + uint32_t glfwExtensionCount = 0; + const char **glfwExtensions; - glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); - exts.insert(exts.begin(), glfwExtensions, glfwExtensions + glfwExtensionCount); - - if (m_enableValidation) { - exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); - } - } - - createInfo.enabledExtensionCount = static_cast(exts.size()); - createInfo.ppEnabledExtensionNames = exts.data(); - VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; - if (m_enableValidation) { - createInfo.enabledLayerCount = static_cast(countof(validationLayers)); - createInfo.ppEnabledLayerNames = validationLayers; - - debugCreateInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; - debugCreateInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; - debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; - debugCreateInfo.pfnUserCallback = debugCallback; - - createInfo.pNext = &debugCreateInfo; - } - else { - createInfo.enabledLayerCount = 0; - createInfo.pNext = nullptr; - } - - if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { - throw std::runtime_error("Failed to create Vulkan instance!"); - } + exts.insert(exts.begin(), glfwExtensions, + glfwExtensions + glfwExtensionCount); if (m_enableValidation) { - PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, 
"vkCreateDebugUtilsMessengerEXT"); - if (func == nullptr || func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != VK_SUCCESS) { - throw std::runtime_error("Failed to set up debug messenger!"); - } + exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); } + } + + createInfo.enabledExtensionCount = static_cast(exts.size()); + createInfo.ppEnabledExtensionNames = exts.data(); + VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; + if (m_enableValidation) { + createInfo.enabledLayerCount = + static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + + debugCreateInfo.sType = + VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + debugCreateInfo.messageSeverity = + VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + debugCreateInfo.messageType = + VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + debugCreateInfo.pfnUserCallback = debugCallback; + + createInfo.pNext = &debugCreateInfo; + } else { + createInfo.enabledLayerCount = 0; + createInfo.pNext = nullptr; + } + + if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { + throw std::runtime_error("Failed to create Vulkan instance!"); + } + + if (m_enableValidation) { + PFN_vkCreateDebugUtilsMessengerEXT func = + (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr( + m_instance, "vkCreateDebugUtilsMessengerEXT"); + if (func == nullptr || + func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != + VK_SUCCESS) { + throw std::runtime_error("Failed to set up debug messenger!"); + } + } } -void VulkanBaseApp::createSurface() -{ - if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != VK_SUCCESS) { - throw std::runtime_error("failed to create window surface!"); - } +void VulkanBaseApp::createSurface() { + if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != + VK_SUCCESS) { + throw std::runtime_error("failed to create window surface!"); + } } -static bool findGraphicsQueueIndicies(VkPhysicalDevice device, VkSurfaceKHR surface, uint32_t& graphicsFamily, uint32_t& presentFamily) -{ - uint32_t queueFamilyCount = 0; +static bool findGraphicsQueueIndicies(VkPhysicalDevice device, + VkSurfaceKHR surface, + uint32_t &graphicsFamily, + uint32_t &presentFamily) { + uint32_t queueFamilyCount = 0; - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); - std::vector queueFamilies(queueFamilyCount); - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); + std::vector queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, + queueFamilies.data()); - graphicsFamily = presentFamily = ~0; + graphicsFamily = presentFamily = ~0; - for (uint32_t i = 0; i < queueFamilyCount; i++) { - - if (queueFamilies[i].queueCount > 0) { - if (graphicsFamily == ~0 && queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { - graphicsFamily = i; - } - uint32_t presentSupport = 0; - vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); - if (presentFamily == ~0 && presentSupport) { - presentFamily = i; - } - if (presentFamily != ~0 && graphicsFamily != ~0) { - break; - } - } + for (uint32_t i = 0; i < queueFamilyCount; i++) { + if 
(queueFamilies[i].queueCount > 0) { + if (graphicsFamily == ~0 && + queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + graphicsFamily = i; + } + uint32_t presentSupport = 0; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + if (presentFamily == ~0 && presentSupport) { + presentFamily = i; + } + if (presentFamily != ~0 && graphicsFamily != ~0) { + break; + } } + } - return graphicsFamily != ~0 && presentFamily != ~0; + return graphicsFamily != ~0 && presentFamily != ~0; } -static bool hasAllExtensions(VkPhysicalDevice device, const std::vector& deviceExtensions) -{ - uint32_t extensionCount; - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); - std::vector availableExtensions(extensionCount); - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); +static bool hasAllExtensions( + VkPhysicalDevice device, + const std::vector &deviceExtensions) { + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, + nullptr); + std::vector availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, + availableExtensions.data()); - std::set requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); + std::set requiredExtensions(deviceExtensions.begin(), + deviceExtensions.end()); - for (const auto & extension : availableExtensions) { - requiredExtensions.erase(extension.extensionName); - } + for (const auto &extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); + } - return requiredExtensions.empty(); + return requiredExtensions.empty(); } -static void getSwapChainProperties(VkPhysicalDevice device, VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR& capabilities, std::vector& formats, std::vector& presentModes) -{ - vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); - uint32_t formatCount; - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); - if (formatCount != 0) { - formats.resize(formatCount); - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, formats.data()); - } - uint32_t presentModeCount; - vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); - if (presentModeCount != 0) { - presentModes.resize(presentModeCount); - vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, presentModes.data()); - } +static void getSwapChainProperties( + VkPhysicalDevice device, VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR &capabilities, + std::vector &formats, + std::vector &presentModes) { + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + if (formatCount != 0) { + formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, + formats.data()); + } + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, + nullptr); + if (presentModeCount != 0) { + presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR( + device, surface, &presentModeCount, presentModes.data()); + } } -bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const -{ - uint32_t graphicsQueueIndex, presentQueueIndex; - std::vector deviceExtensions = getRequiredDeviceExtensions(); - VkSurfaceCapabilitiesKHR caps; +bool 
VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const { + uint32_t graphicsQueueIndex, presentQueueIndex; + std::vector deviceExtensions = getRequiredDeviceExtensions(); + VkSurfaceCapabilitiesKHR caps; + std::vector formats; + std::vector presentModes; + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + getSwapChainProperties(dev, m_surface, caps, formats, presentModes); + return hasAllExtensions(dev, deviceExtensions) && !formats.empty() && + !presentModes.empty() && + findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, + presentQueueIndex); +} + +void VulkanBaseApp::createDevice() { + { + uint32_t deviceCount = 0; + vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); + if (deviceCount == 0) { + throw std::runtime_error("Failed to find Vulkan capable GPUs!"); + } + std::vector phyDevs(deviceCount); + vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); + std::vector::iterator it = + std::find_if(phyDevs.begin(), phyDevs.end(), + std::bind(&VulkanBaseApp::isSuitableDevice, this, + std::placeholders::_1)); + if (it == phyDevs.end()) { + throw std::runtime_error("No suitable device found!"); + } + m_physicalDevice = *it; + } + + uint32_t graphicsQueueIndex, presentQueueIndex; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, + presentQueueIndex); + + std::vector queueCreateInfos; + std::set uniqueFamilyIndices = {graphicsQueueIndex, + presentQueueIndex}; + + float queuePriority = 1.0f; + + for (uint32_t queueFamily : uniqueFamilyIndices) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = queueFamily; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + deviceFeatures.fillModeNonSolid = true; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = + static_cast(queueCreateInfos.size()); + + createInfo.pEnabledFeatures = &deviceFeatures; + + std::vector deviceExtensions = getRequiredDeviceExtensions(); + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + + createInfo.enabledExtensionCount = + static_cast(deviceExtensions.size()); + createInfo.ppEnabledExtensionNames = deviceExtensions.data(); + + if (m_enableValidation) { + createInfo.enabledLayerCount = + static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + } else { + createInfo.enabledLayerCount = 0; + } + + if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != + VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); + } + + vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); + vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); + + VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; + vkPhysicalDeviceIDProperties.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + vkPhysicalDeviceIDProperties.pNext = NULL; + + VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; + vkPhysicalDeviceProperties2.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; + + PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; + fpGetPhysicalDeviceProperties2 = + 
(PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr( + m_instance, "vkGetPhysicalDeviceProperties2"); + if (fpGetPhysicalDeviceProperties2 == NULL) { + throw std::runtime_error( + "Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not " + "found.\n"); + } + + fpGetPhysicalDeviceProperties2(m_physicalDevice, + &vkPhysicalDeviceProperties2); + + memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, VK_UUID_SIZE); +} + +static VkSurfaceFormatKHR chooseSwapSurfaceFormat( + const std::vector &availableFormats) { + if (availableFormats.size() == 1 && + availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; + } + + for (const auto &availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && + availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } + + return availableFormats[0]; +} + +static VkPresentModeKHR chooseSwapPresentMode( + const std::vector &availablePresentModes) { + VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto &availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } + } + + return bestMode; +} + +static VkExtent2D chooseSwapExtent( + GLFWwindow *window, const VkSurfaceCapabilitiesKHR &capabilities) { + if (capabilities.currentExtent.width != + std::numeric_limits::max()) { + return capabilities.currentExtent; + } else { + int width, height; + glfwGetFramebufferSize(window, &width, &height); + VkExtent2D actualExtent = {static_cast(width), + static_cast(height)}; + + actualExtent.width = std::max( + capabilities.minImageExtent.width, + std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max( + capabilities.minImageExtent.height, + std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } +} + +void VulkanBaseApp::createSwapChain() { + VkSurfaceCapabilitiesKHR capabilities; + VkSurfaceFormatKHR format; + VkPresentModeKHR presentMode; + VkExtent2D extent; + uint32_t imageCount; + + { std::vector formats; std::vector presentModes; - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - getSwapChainProperties(dev, m_surface, caps, formats, presentModes); - return hasAllExtensions(dev, deviceExtensions) - && !formats.empty() && !presentModes.empty() - && findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, presentQueueIndex); + + getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, + presentModes); + format = chooseSwapSurfaceFormat(formats); + presentMode = chooseSwapPresentMode(presentModes); + extent = chooseSwapExtent(m_window, capabilities); + imageCount = capabilities.minImageCount + 1; + if (capabilities.maxImageCount > 0 && + imageCount > capabilities.maxImageCount) { + imageCount = capabilities.maxImageCount; + } + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = m_surface; + + createInfo.minImageCount = imageCount; + createInfo.imageFormat = format.format; + createInfo.imageColorSpace = format.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + uint32_t queueFamilyIndices[2]; + 
findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], + queueFamilyIndices[1]); + + if (queueFamilyIndices[0] != queueFamilyIndices[1]) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + } + + createInfo.preTransform = capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + + createInfo.oldSwapchain = VK_NULL_HANDLE; + + if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != + VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } + + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); + m_swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, + m_swapChainImages.data()); + + m_swapChainFormat = format.format; + m_swapChainExtent = extent; } -void VulkanBaseApp::createDevice() -{ - { - uint32_t deviceCount = 0; - vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); - if (deviceCount == 0) { - throw std::runtime_error("Failed to find Vulkan capable GPUs!"); - } - std::vector phyDevs(deviceCount); - vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); - std::vector::iterator it = std::find_if(phyDevs.begin(), phyDevs.end(), - std::bind(&VulkanBaseApp::isSuitableDevice, this, std::placeholders::_1)); - if (it == phyDevs.end()) { - throw std::runtime_error("No suitable device found!"); - } - m_physicalDevice = *it; - } +static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, + VkImageAspectFlags aspectFlags) { + VkImageView imageView; + VkImageViewCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + createInfo.image = image; + createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + createInfo.format = format; + createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.subresourceRange.aspectMask = aspectFlags; + createInfo.subresourceRange.baseMipLevel = 0; + createInfo.subresourceRange.levelCount = 1; + createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image views!"); + } - uint32_t graphicsQueueIndex, presentQueueIndex; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, presentQueueIndex); - - std::vector queueCreateInfos; - std::set uniqueFamilyIndices = { graphicsQueueIndex, presentQueueIndex }; - - float queuePriority = 1.0f; - - for (uint32_t queueFamily : uniqueFamilyIndices) { - VkDeviceQueueCreateInfo queueCreateInfo = {}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueCreateInfo.queueFamilyIndex = graphicsQueueIndex; - queueCreateInfo.queueCount = 1; - queueCreateInfo.pQueuePriorities = &queuePriority; - queueCreateInfos.push_back(queueCreateInfo); - } - - VkPhysicalDeviceFeatures deviceFeatures = {}; - deviceFeatures.fillModeNonSolid = true; - - VkDeviceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - - 
createInfo.pQueueCreateInfos = queueCreateInfos.data(); - createInfo.queueCreateInfoCount = static_cast(queueCreateInfos.size()); - - createInfo.pEnabledFeatures = &deviceFeatures; - - std::vector deviceExtensions = getRequiredDeviceExtensions(); - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - - createInfo.enabledExtensionCount = static_cast(deviceExtensions.size()); - createInfo.ppEnabledExtensionNames = deviceExtensions.data(); - - if (m_enableValidation) { - createInfo.enabledLayerCount = static_cast(countof(validationLayers)); - createInfo.ppEnabledLayerNames = validationLayers; - } - else { - createInfo.enabledLayerCount = 0; - } - - if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != VK_SUCCESS) { - throw std::runtime_error("failed to create logical device!"); - } - - vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); - vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); - - VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; - vkPhysicalDeviceIDProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; - vkPhysicalDeviceIDProperties.pNext = NULL; - - VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; - vkPhysicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; - - PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; - fpGetPhysicalDeviceProperties2 = (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr(m_instance, "vkGetPhysicalDeviceProperties2"); - if (fpGetPhysicalDeviceProperties2 == NULL) { - throw std::runtime_error("Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not found.\n"); - } - - fpGetPhysicalDeviceProperties2(m_physicalDevice, &vkPhysicalDeviceProperties2); - - memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, VK_UUID_SIZE); + return imageView; } -static VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector& availableFormats) -{ - if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) { - return { VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR }; - } +static void createImage(VkPhysicalDevice physicalDevice, VkDevice device, + uint32_t width, uint32_t height, VkFormat format, + VkImageTiling tiling, VkImageUsageFlags usage, + VkMemoryPropertyFlags properties, VkImage &image, + VkDeviceMemory &imageMemory) { + VkImageCreateInfo imageInfo = {}; + imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + imageInfo.imageType = VK_IMAGE_TYPE_2D; + imageInfo.extent.width = width; + imageInfo.extent.height = height; + imageInfo.extent.depth = 1; + imageInfo.mipLevels = 1; + imageInfo.arrayLayers = 1; + imageInfo.format = format; + imageInfo.tiling = tiling; + imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageInfo.usage = usage; + imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - for (const auto & availableFormat : availableFormats) { - if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { - return availableFormat; - } - } + if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { + throw std::runtime_error("failed to create image!"); + } - return availableFormats[0]; + VkMemoryRequirements memRequirements; + vkGetImageMemoryRequirements(device, image, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = 
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType( + physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate image memory!"); + } + + vkBindImageMemory(device, image, imageMemory, 0); } -static VkPresentModeKHR chooseSwapPresentMode(const std::vector& availablePresentModes) -{ - VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; +void VulkanBaseApp::createImageViews() { + m_swapChainImageViews.resize(m_swapChainImages.size()); - for (const auto & availablePresentMode : availablePresentModes) { - if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { - return availablePresentMode; - } - else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { - bestMode = availablePresentMode; - } - } - - return bestMode; + for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { + m_swapChainImageViews[i] = + createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, + VK_IMAGE_ASPECT_COLOR_BIT); + } } -static VkExtent2D chooseSwapExtent(GLFWwindow *window, const VkSurfaceCapabilitiesKHR& capabilities) -{ - if (capabilities.currentExtent.width != std::numeric_limits::max()) { - return capabilities.currentExtent; - } - else { - int width, height; - glfwGetFramebufferSize(window, &width, &height); - VkExtent2D actualExtent = { static_cast(width), static_cast(height) }; +void VulkanBaseApp::createRenderPass() { + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = m_swapChainFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - actualExtent.width = std::max(capabilities.minImageExtent.width, std::min(capabilities.maxImageExtent.width, actualExtent.width)); - actualExtent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, actualExtent.height)); + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - return actualExtent; - } + VkAttachmentDescription depthAttachment = {}; + depthAttachment.format = findSupportedFormat( + m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + depthAttachment.finalLayout = + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkAttachmentReference depthAttachmentRef = {}; + depthAttachmentRef.attachment = 1; + depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + 
subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + subpass.pDepthStencilAttachment = &depthAttachmentRef; + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + + VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = countof(attachments); + renderPassInfo.pAttachments = attachments; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != + VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } } -void VulkanBaseApp::createSwapChain() -{ - VkSurfaceCapabilitiesKHR capabilities; - VkSurfaceFormatKHR format; - VkPresentModeKHR presentMode; - VkExtent2D extent; - uint32_t imageCount; +void VulkanBaseApp::createDescriptorSetLayout() { + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.pImmutableSamplers = nullptr; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - { - std::vector formats; - std::vector presentModes; + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &uboLayoutBinding; - getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, presentModes); - format = chooseSwapSurfaceFormat(formats); - presentMode = chooseSwapPresentMode(presentModes); - extent = chooseSwapExtent(m_window, capabilities); - imageCount = capabilities.minImageCount + 1; - if (capabilities.maxImageCount > 0 && imageCount > capabilities.maxImageCount) { - imageCount = capabilities.maxImageCount; - } - } - - VkSwapchainCreateInfoKHR createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - createInfo.surface = m_surface; - - createInfo.minImageCount = imageCount; - createInfo.imageFormat = format.format; - createInfo.imageColorSpace = format.colorSpace; - createInfo.imageExtent = extent; - createInfo.imageArrayLayers = 1; - createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - - uint32_t queueFamilyIndices[2]; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], queueFamilyIndices[1]); - - if (queueFamilyIndices[0] != queueFamilyIndices[1]) { - createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; - createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); - createInfo.pQueueFamilyIndices = queueFamilyIndices; - } - else { - createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - } - - createInfo.preTransform = capabilities.currentTransform; - createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - createInfo.presentMode = presentMode; - createInfo.clipped = VK_TRUE; - - createInfo.oldSwapchain = VK_NULL_HANDLE; - - if (vkCreateSwapchainKHR(m_device, 
&createInfo, nullptr, &m_swapChain) != VK_SUCCESS) { - throw std::runtime_error("failed to create swap chain!"); - } - - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); - m_swapChainImages.resize(imageCount); - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, m_swapChainImages.data()); - - m_swapChainFormat = format.format; - m_swapChainExtent = extent; + if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, + &m_descriptorSetLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor set layout!"); + } } -static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, VkImageAspectFlags aspectFlags) -{ - VkImageView imageView; - VkImageViewCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - createInfo.image = image; - createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; - createInfo.format = format; - createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.subresourceRange.aspectMask = aspectFlags; - createInfo.subresourceRange.baseMipLevel = 0; - createInfo.subresourceRange.levelCount = 1; - createInfo.subresourceRange.baseArrayLayer = 0; - createInfo.subresourceRange.layerCount = 1; - if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image views!"); - } +VkShaderModule createShaderModule(VkDevice device, const char *filename) { + std::vector shaderContents; + std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); + VkShaderModuleCreateInfo createInfo = {}; + VkShaderModule shaderModule; - return imageView; + if (!shaderFile.good()) { + throw std::runtime_error("Failed to load shader contents"); + } + readFile(shaderFile, shaderContents); + + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = shaderContents.size(); + createInfo.pCode = reinterpret_cast(shaderContents.data()); + + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != + VK_SUCCESS) { + throw std::runtime_error("Failed to create shader module!"); + } + + return shaderModule; } -static void createImage(VkPhysicalDevice physicalDevice, VkDevice device, uint32_t width, uint32_t height, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage, VkMemoryPropertyFlags properties, VkImage& image, VkDeviceMemory& imageMemory) -{ - VkImageCreateInfo imageInfo = {}; - imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - imageInfo.imageType = VK_IMAGE_TYPE_2D; - imageInfo.extent.width = width; - imageInfo.extent.height = height; - imageInfo.extent.depth = 1; - imageInfo.mipLevels = 1; - imageInfo.arrayLayers = 1; - imageInfo.format = format; - imageInfo.tiling = tiling; - imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageInfo.usage = usage; - imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; - imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; +void VulkanBaseApp::getVertexDescriptions( + std::vector &bindingDesc, + std::vector &attribDesc) {} - if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { - throw std::runtime_error("failed to create image!"); - } +void VulkanBaseApp::getAssemblyStateInfo( + VkPipelineInputAssemblyStateCreateInfo &info) {} - VkMemoryRequirements memRequirements; - vkGetImageMemoryRequirements(device, image, &memRequirements); 
+void VulkanBaseApp::createGraphicsPipeline() { + std::vector shaderStageInfos( + m_shaderFiles.size()); + for (size_t i = 0; i < m_shaderFiles.size(); i++) { + shaderStageInfos[i] = {}; + shaderStageInfos[i].sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shaderStageInfos[i].stage = m_shaderFiles[i].first; + shaderStageInfos[i].module = + createShaderModule(m_device, m_shaderFiles[i].second.c_str()); + shaderStageInfos[i].pName = "main"; + } - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType(physicalDevice, memRequirements.memoryTypeBits, properties); + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate image memory!"); - } + std::vector vertexBindingDescriptions; + std::vector vertexAttributeDescriptions; - vkBindImageMemory(device, image, imageMemory, 0); + getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); + + vertexInputInfo.sType = + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertexInputInfo.vertexBindingDescriptionCount = + static_cast(vertexBindingDescriptions.size()); + vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); + vertexInputInfo.vertexAttributeDescriptionCount = + static_cast(vertexAttributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = + vertexAttributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + getAssemblyStateInfo(inputAssembly); + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)m_swapChainExtent.width; + viewport.height = (float)m_swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset = {0, 0}; + scissor.extent = m_swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_LINE; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_NONE; + rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineDepthStencilStateCreateInfo depthStencil = {}; + depthStencil.sType = + VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depthStencil.depthTestEnable = VK_TRUE; + depthStencil.depthWriteEnable = VK_TRUE; + depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; + 
depthStencil.depthBoundsTestEnable = VK_FALSE; + depthStencil.stencilTestEnable = VK_FALSE; + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; + colorBlending.blendConstants[1] = 0.0f; + colorBlending.blendConstants[2] = 0.0f; + colorBlending.blendConstants[3] = 0.0f; + + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, + &m_pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); + pipelineInfo.pStages = shaderStageInfos.data(); + + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = &depthStencil; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + + pipelineInfo.layout = m_pipelineLayout; + + pipelineInfo.renderPass = m_renderPass; + pipelineInfo.subpass = 0; + + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, + nullptr, &m_graphicsPipeline) != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + + for (size_t i = 0; i < shaderStageInfos.size(); i++) { + vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); + } } -void VulkanBaseApp::createImageViews() -{ - m_swapChainImageViews.resize(m_swapChainImages.size()); +void VulkanBaseApp::createFramebuffers() { + m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + VkImageView attachments[] = {m_swapChainImageViews[i], m_depthImageView}; - for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { - m_swapChainImageViews[i] = createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, VK_IMAGE_ASPECT_COLOR_BIT); + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = m_renderPass; + framebufferInfo.attachmentCount = countof(attachments); + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = m_swapChainExtent.width; + framebufferInfo.height = m_swapChainExtent.height; + 
framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, + &m_swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); } + } } -void VulkanBaseApp::createRenderPass() -{ - VkAttachmentDescription colorAttachment = {}; - colorAttachment.format = m_swapChainFormat; - colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; +void VulkanBaseApp::createCommandPool() { + VkCommandPoolCreateInfo poolInfo = {}; + uint32_t graphicsIndex, presentIndex; - VkAttachmentReference colorAttachmentRef = {}; - colorAttachmentRef.attachment = 0; - colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, + presentIndex); - VkAttachmentDescription depthAttachment = {}; - depthAttachment.format = findSupportedFormat(m_physicalDevice, - { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - depthAttachment.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = graphicsIndex; + poolInfo.flags = 0; // Optional - VkAttachmentReference depthAttachmentRef = {}; - depthAttachmentRef.attachment = 1; - depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass = {}; - subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass.colorAttachmentCount = 1; - subpass.pColorAttachments = &colorAttachmentRef; - subpass.pDepthStencilAttachment = &depthAttachmentRef; - - - VkSubpassDependency dependency = {}; - dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - - VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; - VkRenderPassCreateInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderPassInfo.attachmentCount = countof(attachments); - renderPassInfo.pAttachments = attachments; - renderPassInfo.subpassCount = 1; - renderPassInfo.pSubpasses = &subpass; - renderPassInfo.dependencyCount = 1; - renderPassInfo.pDependencies = &dependency; - - if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != VK_SUCCESS) { - throw std::runtime_error("failed to create render pass!"); - } + if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != + VK_SUCCESS) { + throw 
std::runtime_error("Failed to create command pool!"); + } } -void VulkanBaseApp::createDescriptorSetLayout() -{ - VkDescriptorSetLayoutBinding uboLayoutBinding = {}; - uboLayoutBinding.binding = 0; - uboLayoutBinding.descriptorCount = 1; - uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - uboLayoutBinding.pImmutableSamplers = nullptr; - uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; +static void transitionImageLayout(VulkanBaseApp *app, VkImage image, + VkFormat format, VkImageLayout oldLayout, + VkImageLayout newLayout) { + VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); - VkDescriptorSetLayoutCreateInfo layoutInfo = {}; - layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - layoutInfo.bindingCount = 1; - layoutInfo.pBindings = &uboLayoutBinding; + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; - if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, &m_descriptorSetLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor set layout!"); + if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || + format == VK_FORMAT_D24_UNORM_S8_UINT) { + barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; } + } else { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + } + + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags sourceStage; + VkPipelineStageFlags destinationStage; + + if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && + newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + } else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && + newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + } else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && + newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; + } else { + throw std::invalid_argument("unsupported layout transition!"); + } + + vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, + nullptr, 0, nullptr, 1, &barrier); + + app->endSingleTimeCommands(commandBuffer); } -VkShaderModule createShaderModule(VkDevice device, const char *filename) -{ - std::vector shaderContents; - std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); - VkShaderModuleCreateInfo createInfo = {}; - VkShaderModule shaderModule; - - if (!shaderFile.good()) { - throw std::runtime_error("Failed to 
load shader contents"); - } - readFile(shaderFile, shaderContents); - - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.codeSize = shaderContents.size(); - createInfo.pCode = reinterpret_cast(shaderContents.data()); - - if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { - throw std::runtime_error("Failed to create shader module!"); - } - - return shaderModule; +void VulkanBaseApp::createDepthResources() { + VkFormat depthFormat = findSupportedFormat( + m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + createImage(m_physicalDevice, m_device, m_swapChainExtent.width, + m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, + m_depthImageMemory); + m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, + VK_IMAGE_ASPECT_DEPTH_BIT); + transitionImageLayout(this, m_depthImage, depthFormat, + VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); } -void VulkanBaseApp::getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc) -{ -} - -void VulkanBaseApp::getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) -{ - -} - -void VulkanBaseApp::createGraphicsPipeline() -{ - std::vector shaderStageInfos(m_shaderFiles.size()); - for (size_t i = 0; i < m_shaderFiles.size(); i++) { - shaderStageInfos[i] = {}; - shaderStageInfos[i].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shaderStageInfos[i].stage = m_shaderFiles[i].first; - shaderStageInfos[i].module = createShaderModule(m_device, m_shaderFiles[i].second.c_str()); - shaderStageInfos[i].pName = "main"; - } - - VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - - std::vector vertexBindingDescriptions; - std::vector vertexAttributeDescriptions; - - getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); - - vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertexInputInfo.vertexBindingDescriptionCount = static_cast(vertexBindingDescriptions.size()); - vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); - vertexInputInfo.vertexAttributeDescriptionCount = static_cast(vertexAttributeDescriptions.size()); - vertexInputInfo.pVertexAttributeDescriptions = vertexAttributeDescriptions.data(); - - VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; - getAssemblyStateInfo(inputAssembly); - - VkViewport viewport = {}; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = (float)m_swapChainExtent.width; - viewport.height = (float)m_swapChainExtent.height; - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - - VkRect2D scissor = {}; - scissor.offset = { 0, 0 }; - scissor.extent = m_swapChainExtent; - - VkPipelineViewportStateCreateInfo viewportState = {}; - viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewportState.viewportCount = 1; - viewportState.pViewports = &viewport; - viewportState.scissorCount = 1; - viewportState.pScissors = &scissor; - - VkPipelineRasterizationStateCreateInfo rasterizer = {}; - rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterizer.depthClampEnable = VK_FALSE; - rasterizer.rasterizerDiscardEnable = VK_FALSE; - rasterizer.polygonMode = VK_POLYGON_MODE_LINE; - 
rasterizer.lineWidth = 1.0f; - rasterizer.cullMode = VK_CULL_MODE_NONE; - rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; - rasterizer.depthBiasEnable = VK_FALSE; - - VkPipelineMultisampleStateCreateInfo multisampling = {}; - multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling.sampleShadingEnable = VK_FALSE; - multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisampling.minSampleShading = 1.0f; // Optional - multisampling.pSampleMask = nullptr; // Optional - multisampling.alphaToCoverageEnable = VK_FALSE; // Optional - multisampling.alphaToOneEnable = VK_FALSE; // Optional - - VkPipelineDepthStencilStateCreateInfo depthStencil = {}; - depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depthStencil.depthTestEnable = VK_TRUE; - depthStencil.depthWriteEnable = VK_TRUE; - depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; - depthStencil.depthBoundsTestEnable = VK_FALSE; - depthStencil.stencilTestEnable = VK_FALSE; - - VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; - colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - colorBlendAttachment.blendEnable = VK_FALSE; - - VkPipelineColorBlendStateCreateInfo colorBlending = {}; - colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - colorBlending.logicOpEnable = VK_FALSE; - colorBlending.logicOp = VK_LOGIC_OP_COPY; - colorBlending.attachmentCount = 1; - colorBlending.pAttachments = &colorBlendAttachment; - colorBlending.blendConstants[0] = 0.0f; - colorBlending.blendConstants[1] = 0.0f; - colorBlending.blendConstants[2] = 0.0f; - colorBlending.blendConstants[3] = 0.0f; - - VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; - pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutInfo.setLayoutCount = 1; // Optional - pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional - pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional - pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional - - if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, &m_pipelineLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create pipeline layout!"); - } - - VkGraphicsPipelineCreateInfo pipelineInfo = {}; - pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); - pipelineInfo.pStages = shaderStageInfos.data(); - - pipelineInfo.pVertexInputState = &vertexInputInfo; - pipelineInfo.pInputAssemblyState = &inputAssembly; - pipelineInfo.pViewportState = &viewportState; - pipelineInfo.pRasterizationState = &rasterizer; - pipelineInfo.pMultisampleState = &multisampling; - pipelineInfo.pDepthStencilState = &depthStencil; // Optional - pipelineInfo.pColorBlendState = &colorBlending; - pipelineInfo.pDynamicState = nullptr; // Optional - - pipelineInfo.layout = m_pipelineLayout; - - pipelineInfo.renderPass = m_renderPass; - pipelineInfo.subpass = 0; - - pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional - pipelineInfo.basePipelineIndex = -1; // Optional - - if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &m_graphicsPipeline) != VK_SUCCESS) { - throw std::runtime_error("failed to create graphics pipeline!"); - } - - for (size_t i = 0; i < shaderStageInfos.size(); i++) { - vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); - } -} - 
-void VulkanBaseApp::createFramebuffers() -{ - m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); - for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { - VkImageView attachments[] = { - m_swapChainImageViews[i], - m_depthImageView - }; - - VkFramebufferCreateInfo framebufferInfo = {}; - framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebufferInfo.renderPass = m_renderPass; - framebufferInfo.attachmentCount = countof(attachments); - framebufferInfo.pAttachments = attachments; - framebufferInfo.width = m_swapChainExtent.width; - framebufferInfo.height = m_swapChainExtent.height; - framebufferInfo.layers = 1; - - if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, &m_swapChainFramebuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to create framebuffer!"); - } - } -} - -void VulkanBaseApp::createCommandPool() -{ - VkCommandPoolCreateInfo poolInfo = {}; - uint32_t graphicsIndex, presentIndex; - - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, presentIndex); - - poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - poolInfo.queueFamilyIndex = graphicsIndex; - poolInfo.flags = 0; // Optional - - if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != VK_SUCCESS) { - throw std::runtime_error("Failed to create command pool!"); - } -} - -static void transitionImageLayout(VulkanBaseApp *app, VkImage image, VkFormat format, VkImageLayout oldLayout, VkImageLayout newLayout) -{ - VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); - - VkImageMemoryBarrier barrier = {}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - - if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - - if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_D24_UNORM_S8_UINT) { - barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; - } - } - else { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - } - - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - - VkPipelineStageFlags sourceStage; - VkPipelineStageFlags destinationStage; - - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - - sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; - } - else 
{ - throw std::invalid_argument("unsupported layout transition!"); - } - - vkCmdPipelineBarrier( - commandBuffer, - sourceStage, destinationStage, - 0, - 0, nullptr, - 0, nullptr, - 1, &barrier - ); - - app->endSingleTimeCommands(commandBuffer); -} - -void VulkanBaseApp::createDepthResources() -{ - VkFormat depthFormat = findSupportedFormat(m_physicalDevice, - { VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT }, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - createImage(m_physicalDevice, m_device, m_swapChainExtent.width, m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, m_depthImageMemory); - m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, VK_IMAGE_ASPECT_DEPTH_BIT); - transitionImageLayout(this, m_depthImage, depthFormat, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); -} - -void VulkanBaseApp::createUniformBuffers() -{ - VkDeviceSize size = getUniformSize(); - if (size > 0) { - m_uniformBuffers.resize(m_swapChainImages.size()); - m_uniformMemory.resize(m_swapChainImages.size()); - for (size_t i = 0; i < m_uniformBuffers.size(); i++) { - createBuffer(getUniformSize(), - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - m_uniformBuffers[i], m_uniformMemory[i]); - } - } -} - -void VulkanBaseApp::createDescriptorPool() -{ - VkDescriptorPoolSize poolSize = {}; - poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - poolSize.descriptorCount = static_cast(m_swapChainImages.size()); - VkDescriptorPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - poolInfo.poolSizeCount = 1; - poolInfo.pPoolSizes = &poolSize; - poolInfo.maxSets = static_cast(m_swapChainImages.size());; - if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor pool!"); - } -} - -void VulkanBaseApp::createDescriptorSets() -{ - std::vector layouts(m_swapChainImages.size(), m_descriptorSetLayout); - VkDescriptorSetAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocInfo.descriptorPool = m_descriptorPool; - allocInfo.descriptorSetCount = static_cast(m_swapChainImages.size()); - allocInfo.pSetLayouts = layouts.data(); - m_descriptorSets.resize(m_swapChainImages.size()); - - if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate descriptor sets!"); - } - - VkDescriptorBufferInfo bufferInfo = {}; - bufferInfo.offset = 0; - bufferInfo.range = VK_WHOLE_SIZE; - VkWriteDescriptorSet descriptorWrite = {}; - descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrite.dstBinding = 0; - descriptorWrite.dstArrayElement = 0; - descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - descriptorWrite.descriptorCount = 1; - descriptorWrite.pBufferInfo = &bufferInfo; - descriptorWrite.pImageInfo = nullptr; // Optional - descriptorWrite.pTexelBufferView = nullptr; // Optional - - for (size_t i = 0; i < m_swapChainImages.size(); i++) { - bufferInfo.buffer = m_uniformBuffers[i]; - descriptorWrite.dstSet = m_descriptorSets[i]; - vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); - } -} - -void VulkanBaseApp::createCommandBuffers() -{ - 
m_commandBuffers.resize(m_swapChainFramebuffers.size()); - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.commandPool = m_commandPool; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size(); - - if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate command buffers!"); - } - - for (size_t i = 0; i < m_commandBuffers.size(); i++) { - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - beginInfo.pInheritanceInfo = nullptr; // Optional - - if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { - throw std::runtime_error("failed to begin recording command buffer!"); - } - - VkRenderPassBeginInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderPassInfo.renderPass = m_renderPass; - renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; - - renderPassInfo.renderArea.offset = { 0, 0 }; - renderPassInfo.renderArea.extent = m_swapChainExtent; - - VkClearValue clearColors[2]; - clearColors[0].color = { 0.0f, 0.0f, 0.0f, 1.0f }; - clearColors[1].depthStencil = { 1.0f, 0 }; - renderPassInfo.clearValueCount = countof(clearColors); - renderPassInfo.pClearValues = clearColors; - - vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); - - vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_graphicsPipeline); - - vkCmdBindDescriptorSets(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout, 0, 1, &m_descriptorSets[i], 0, nullptr); - - fillRenderingCommandBuffer(m_commandBuffers[i]); - - vkCmdEndRenderPass(m_commandBuffers[i]); - - if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to record command buffer!"); - } - } -} - -void VulkanBaseApp::createSyncObjects() -{ - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - VkFenceCreateInfo fenceInfo = {}; - fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; - - m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); - m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); - m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); - - for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - } -} - -void VulkanBaseApp::getWaitFrameSemaphores(std::vector& wait, std::vector& waitStages) const -{ -} - -void VulkanBaseApp::getSignalFrameSemaphores(std::vector& signal) const -{ -} - -VkDeviceSize VulkanBaseApp::getUniformSize() const -{ - return VkDeviceSize(0); -} - -void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex) -{ -} - -void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags 
usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory) -{ - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); - - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate buffer memory!"); - } - - vkBindBufferMemory(m_device, buffer, bufferMemory, 0); -} - -void VulkanBaseApp::createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory) -{ - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; - vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; - vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; -#endif - VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; - vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; -#ifdef _WIN64 - vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR ? 
&vulkanExportMemoryWin32HandleInfoKHR : NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; -#else - vulkanExportMemoryAllocateInfoKHR.pNext = NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { - throw std::runtime_error("failed to allocate external buffer memory!"); - } - - vkBindBufferMemory(m_device, buffer, bufferMemory, 0); -} - -void *VulkanBaseApp::getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) -{ -#ifdef _WIN64 - HANDLE handle = 0; - - VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; - vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; - vkMemoryGetWin32HandleInfoKHR.pNext = NULL; - vkMemoryGetWin32HandleInfoKHR.memory = memory; - vkMemoryGetWin32HandleInfoKHR.handleType = handleType; - - PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; - fpGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryWin32HandleKHR"); - if (!fpGetMemoryWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)handle; -#else - int fd = -1; - - VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; - vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; - vkMemoryGetFdInfoKHR.pNext = NULL; - vkMemoryGetFdInfoKHR.memory = memory; - vkMemoryGetFdInfoKHR.handleType = handleType; - - PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; - fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); - if (!fpGetMemoryFdKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)(uintptr_t)fd; -#endif /* _WIN64 */ -} - -void *VulkanBaseApp::getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) -{ -#ifdef _WIN64 - HANDLE handle; - - VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; - semaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; - semaphoreGetWin32HandleInfoKHR.pNext = NULL; - semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; - semaphoreGetWin32HandleInfoKHR.handleType = handleType; - - PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; - fpGetSemaphoreWin32HandleKHR = (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreWin32HandleKHR"); - if (!fpGetSemaphoreWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - - return (void *)handle; -#else - int fd; - - VkSemaphoreGetFdInfoKHR 
semaphoreGetFdInfoKHR = {}; - semaphoreGetFdInfoKHR.sType =VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; - semaphoreGetFdInfoKHR.pNext = NULL; - semaphoreGetFdInfoKHR.semaphore = semaphore; - semaphoreGetFdInfoKHR.handleType = handleType; - - PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; - fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreFdKHR"); - if (!fpGetSemaphoreFdKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - - return (void *)(uintptr_t)fd; -#endif -} - -void VulkanBaseApp::createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) -{ - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {}; - exportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {}; - exportSemaphoreWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; - exportSemaphoreWin32HandleInfoKHR.pNext = NULL; - exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - exportSemaphoreWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; - exportSemaphoreCreateInfo.pNext = (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) ? &exportSemaphoreWin32HandleInfoKHR : NULL; -#else - exportSemaphoreCreateInfo.pNext = NULL; -#endif - exportSemaphoreCreateInfo.handleTypes = handleType; - semaphoreInfo.pNext = &exportSemaphoreCreateInfo; - - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) != VK_SUCCESS) { - throw std::runtime_error("failed to create synchronization objects for a CUDA-Vulkan!"); - } -} - -void VulkanBaseApp::importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory) -{ - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); - -#ifdef _WIN64 - VkImportMemoryWin32HandleInfoKHR handleInfo = {}; - handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - handleInfo.pNext = NULL; - handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; - handleInfo.handle = handle; - handleInfo.name = NULL; -#else - VkImportMemoryFdInfoKHR handleInfo = {}; - handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR; - handleInfo.pNext = NULL; - handleInfo.fd = (int)(uintptr_t)handle; - handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif /* _WIN64 */ - - VkMemoryAllocateInfo memAllocation = {}; - memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memAllocation.pNext = (void *)&handleInfo; - 
memAllocation.allocationSize = size; - memAllocation.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) != VK_SUCCESS) { - throw std::runtime_error("Failed to import allocation!"); - } - - vkBindBufferMemory(m_device, buffer, memory, 0); -} - -void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) -{ - - VkCommandBuffer commandBuffer = beginSingleTimeCommands(); - - VkBufferCopy copyRegion = {}; - copyRegion.size = size; - vkCmdCopyBuffer(commandBuffer, src, dst, 1, ©Region); - - endSingleTimeCommands(commandBuffer); -} - -void VulkanBaseApp::drawFrame() -{ - size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; - vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, std::numeric_limits::max()); - - uint32_t imageIndex; - VkResult result = vkAcquireNextImageKHR(m_device, m_swapChain, std::numeric_limits::max(), m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex); - if (result == VK_ERROR_OUT_OF_DATE_KHR) { - recreateSwapChain(); - } - else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { - throw std::runtime_error("Failed to acquire swap chain image!"); - } - - updateUniformBuffer(imageIndex); - - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - - std::vector waitSemaphores; - std::vector waitStages; - - waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); - waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); - getWaitFrameSemaphores(waitSemaphores, waitStages); - - submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); - submitInfo.pWaitSemaphores = waitSemaphores.data(); - submitInfo.pWaitDstStageMask = waitStages.data(); - - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; - - std::vector signalSemaphores; - getSignalFrameSemaphores(signalSemaphores); - signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); - submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); - submitInfo.pSignalSemaphores = signalSemaphores.data(); - - vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]); - - if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { - throw std::runtime_error("failed to submit draw command buffer!"); - } - - VkPresentInfoKHR presentInfo = {}; - presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - presentInfo.waitSemaphoreCount = 1; - presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; - - VkSwapchainKHR swapChains[] = { m_swapChain }; - presentInfo.swapchainCount = 1; - presentInfo.pSwapchains = swapChains; - presentInfo.pImageIndices = &imageIndex; - - result = vkQueuePresentKHR(m_presentQueue, &presentInfo); - if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || m_framebufferResized) { - recreateSwapChain(); - m_framebufferResized = false; - } - else if (result != VK_SUCCESS) { - throw std::runtime_error("Failed to acquire swap chain image!"); - } - - m_currentFrame++; -} - -void VulkanBaseApp::cleanupSwapChain() -{ - - if (m_depthImageView != VK_NULL_HANDLE) { - vkDestroyImageView(m_device, m_depthImageView, nullptr); - } - if (m_depthImage != VK_NULL_HANDLE) { - vkDestroyImage(m_device, m_depthImage, nullptr); - } - if (m_depthImageMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_depthImageMemory, nullptr); - } - 
+void VulkanBaseApp::createUniformBuffers() { + VkDeviceSize size = getUniformSize(); + if (size > 0) { + m_uniformBuffers.resize(m_swapChainImages.size()); + m_uniformMemory.resize(m_swapChainImages.size()); for (size_t i = 0; i < m_uniformBuffers.size(); i++) { - vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); - vkFreeMemory(m_device, m_uniformMemory[i], nullptr); - } - - if (m_descriptorPool != VK_NULL_HANDLE) { - vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); - } - - for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { - vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); - } - - if (m_graphicsPipeline != VK_NULL_HANDLE) { - vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); - } - - if (m_pipelineLayout != VK_NULL_HANDLE) { - vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); - } - - if (m_renderPass != VK_NULL_HANDLE) { - vkDestroyRenderPass(m_device, m_renderPass, nullptr); - } - - for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { - vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); - } - - if (m_swapChain != VK_NULL_HANDLE) { - vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); + createBuffer(getUniformSize(), VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + m_uniformBuffers[i], m_uniformMemory[i]); } + } } -void VulkanBaseApp::recreateSwapChain() -{ - int width, height; +void VulkanBaseApp::createDescriptorPool() { + VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = static_cast(m_swapChainImages.size()); + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = static_cast(m_swapChainImages.size()); + if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != + VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } +} +void VulkanBaseApp::createDescriptorSets() { + std::vector layouts(m_swapChainImages.size(), + m_descriptorSetLayout); + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = m_descriptorPool; + allocInfo.descriptorSetCount = + static_cast(m_swapChainImages.size()); + allocInfo.pSetLayouts = layouts.data(); + m_descriptorSets.resize(m_swapChainImages.size()); + + if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor sets!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + for (size_t i = 0; i < m_swapChainImages.size(); i++) { + bufferInfo.buffer = m_uniformBuffers[i]; + descriptorWrite.dstSet = m_descriptorSets[i]; + vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); + } +} + +void VulkanBaseApp::createCommandBuffers() { + 
m_commandBuffers.resize(m_swapChainFramebuffers.size());
+  VkCommandBufferAllocateInfo allocInfo = {};
+  allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  allocInfo.commandPool = m_commandPool;
+  allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+  allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size();
+
+  if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) !=
+      VK_SUCCESS) {
+    throw std::runtime_error("failed to allocate command buffers!");
+  }
+
+  for (size_t i = 0; i < m_commandBuffers.size(); i++) {
+    VkCommandBufferBeginInfo beginInfo = {};
+    beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+    beginInfo.pInheritanceInfo = nullptr;  // Optional
+
+    if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) {
+      throw std::runtime_error("failed to begin recording command buffer!");
+    }
+
+    VkRenderPassBeginInfo renderPassInfo = {};
+    renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
+    renderPassInfo.renderPass = m_renderPass;
+    renderPassInfo.framebuffer = m_swapChainFramebuffers[i];
+
+    renderPassInfo.renderArea.offset = {0, 0};
+    renderPassInfo.renderArea.extent = m_swapChainExtent;
+
+    VkClearValue clearColors[2];
+    clearColors[0].color = {0.0f, 0.0f, 0.0f, 1.0f};
+    clearColors[1].depthStencil = {1.0f, 0};
+    renderPassInfo.clearValueCount = countof(clearColors);
+    renderPassInfo.pClearValues = clearColors;
+
+    vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo,
+                         VK_SUBPASS_CONTENTS_INLINE);
+
+    vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS,
+                      m_graphicsPipeline);
+
+    vkCmdBindDescriptorSets(m_commandBuffers[i],
+                            VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout,
+                            0, 1, &m_descriptorSets[i], 0, nullptr);
+
+    fillRenderingCommandBuffer(m_commandBuffers[i]);
+
+    vkCmdEndRenderPass(m_commandBuffers[i]);
+
+    if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) {
+      throw std::runtime_error("failed to record command buffer!");
+    }
+  }
+}
+
+void VulkanBaseApp::createSyncObjects() {
+  VkSemaphoreCreateInfo semaphoreInfo = {};
+  semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  VkFenceCreateInfo fenceInfo = {};
+  fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+  fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT;
+
+  m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT);
+  m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT);
+  m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT);
+
+  for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
+    if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr,
+                          &m_imageAvailableSemaphores[i]) != VK_SUCCESS) {
+      throw std::runtime_error("Failed to create image available semaphore!");
+    }
+    if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr,
+                          &m_renderFinishedSemaphores[i]) != VK_SUCCESS) {
+      throw std::runtime_error("Failed to create render finished semaphore!");
+    }
+    if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) !=
+        VK_SUCCESS) {
+      throw std::runtime_error("Failed to create in-flight fence!");
+    }
+  }
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+  if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr,
+                        &m_vkPresentationSemaphore) != VK_SUCCESS) {
+    throw std::runtime_error("Failed to create binary semaphore!");
+  }
+#endif /* _VK_TIMELINE_SEMAPHORE */
+}
+
+void VulkanBaseApp::getWaitFrameSemaphores(
+    std::vector<VkSemaphore> &wait,
+    std::vector<VkPipelineStageFlags> &waitStages) const {}
+
+void VulkanBaseApp::getSignalFrameSemaphores(
+
std::vector &signal) const {} + +VkDeviceSize VulkanBaseApp::getUniformSize() const { return VkDeviceSize(0); } + +void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex) {} + +void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkBuffer &buffer, + VkDeviceMemory &bufferMemory) { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType( + m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void VulkanBaseApp::createExternalBuffer( + VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer &buffer, + VkDeviceMemory &bufferMemory) { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; + externalMemoryBufferInfo.sType = + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; + externalMemoryBufferInfo.handleTypes = extMemHandleType; + bufferInfo.pNext = &externalMemoryBufferInfo; + + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif /* _WIN64 */ + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportMemoryAllocateInfoKHR.pNext = + extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR + ? 
&vulkanExportMemoryWin32HandleInfoKHR + : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; +#else + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif /* _WIN64 */ + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType( + m_physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } + + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); +} + +void *VulkanBaseApp::getMemHandle( + VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) { +#ifdef _WIN64 + HANDLE handle = 0; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = memory; + vkMemoryGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + fpGetMemoryWin32HandleKHR = + (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr( + m_device, "vkGetMemoryWin32HandleKHR"); + if (!fpGetMemoryWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, + &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)handle; +#else + int fd = -1; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = memory; + vkMemoryGetFdInfoKHR.handleType = handleType; + + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + fpGetMemoryFdKHR = + (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); + if (!fpGetMemoryFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)(uintptr_t)fd; +#endif /* _WIN64 */ +} + +void *VulkanBaseApp::getSemaphoreHandle( + VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) { +#ifdef _WIN64 + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; + semaphoreGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + semaphoreGetWin32HandleInfoKHR.pNext = NULL; + semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; + semaphoreGetWin32HandleInfoKHR.handleType = handleType; + + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; + fpGetSemaphoreWin32HandleKHR = + (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr( + m_device, "vkGetSemaphoreWin32HandleKHR"); + if (!fpGetSemaphoreWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, + &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + + return (void *)handle; +#else 
+  int fd;
+
+  VkSemaphoreGetFdInfoKHR semaphoreGetFdInfoKHR = {};
+  semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
+  semaphoreGetFdInfoKHR.pNext = NULL;
+  semaphoreGetFdInfoKHR.semaphore = semaphore;
+  semaphoreGetFdInfoKHR.handleType = handleType;
+
+  PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR;
+  fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(
+      m_device, "vkGetSemaphoreFdKHR");
+  if (!fpGetSemaphoreFdKHR) {
+    throw std::runtime_error("Failed to retrieve vkGetSemaphoreFdKHR!");
+  }
+  if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) !=
+      VK_SUCCESS) {
+    throw std::runtime_error("Failed to retrieve handle for semaphore!");
+  }
+
+  return (void *)(uintptr_t)fd;
+#endif /* _WIN64 */
+}
+
+void VulkanBaseApp::createExternalSemaphore(
+    VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) {
+  VkSemaphoreCreateInfo semaphoreInfo = {};
+  semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {};
+  exportSemaphoreCreateInfo.sType =
+      VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR;
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+  VkSemaphoreTypeCreateInfo timelineCreateInfo;
+  timelineCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO;
+  timelineCreateInfo.pNext = NULL;
+  timelineCreateInfo.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE;
+  timelineCreateInfo.initialValue = 0;
+#endif /* _VK_TIMELINE_SEMAPHORE */
+
+#ifdef _WIN64
+  WindowsSecurityAttributes winSecurityAttributes;
+
+  VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {};
+  exportSemaphoreWin32HandleInfoKHR.sType =
+      VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR;
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+  exportSemaphoreWin32HandleInfoKHR.pNext = &timelineCreateInfo;
+#else
+  exportSemaphoreWin32HandleInfoKHR.pNext = NULL;
+#endif /* _VK_TIMELINE_SEMAPHORE */
+
+  exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes;
+  exportSemaphoreWin32HandleInfoKHR.dwAccess =
+      DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;
+  exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL;
+  exportSemaphoreCreateInfo.pNext =
+      (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT)
+          ? &exportSemaphoreWin32HandleInfoKHR
+          : NULL;
+#else
+#ifdef _VK_TIMELINE_SEMAPHORE
+  exportSemaphoreCreateInfo.pNext = &timelineCreateInfo;
+#else
+  exportSemaphoreCreateInfo.pNext = NULL;
+#endif /* _VK_TIMELINE_SEMAPHORE */
+#endif /* _WIN64 */
+  exportSemaphoreCreateInfo.handleTypes = handleType;
+  semaphoreInfo.pNext = &exportSemaphoreCreateInfo;
+
+  if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) !=
+      VK_SUCCESS) {
+    throw std::runtime_error(
+        "failed to create synchronization objects for CUDA-Vulkan interop!");
+  }
+}
+
+void VulkanBaseApp::importExternalBuffer(
+    void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size,
+    VkBufferUsageFlags usage, VkMemoryPropertyFlags properties,
+    VkBuffer &buffer, VkDeviceMemory &memory) {
+  VkBufferCreateInfo bufferInfo = {};
+  bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+  bufferInfo.size = size;
+  bufferInfo.usage = usage;
+  bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+
+  if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
+    throw std::runtime_error("failed to create buffer!");
+  }
+
+  VkMemoryRequirements memRequirements;
+  vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements);
+
+#ifdef _WIN64
+  VkImportMemoryWin32HandleInfoKHR handleInfo = {};
+  handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR;
+  handleInfo.pNext = NULL;
+  handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+  handleInfo.handle = handle;
+  handleInfo.name = NULL;
+#else
+  VkImportMemoryFdInfoKHR handleInfo = {};
+  handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR;
+  handleInfo.pNext = NULL;
+  handleInfo.fd = (int)(uintptr_t)handle;
+  handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+#endif /* _WIN64 */
+
+  VkMemoryAllocateInfo memAllocation = {};
+  memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+  memAllocation.pNext = (void *)&handleInfo;
+  memAllocation.allocationSize = size;
+  memAllocation.memoryTypeIndex = findMemoryType(
+      m_physicalDevice, memRequirements.memoryTypeBits, properties);
+
+  if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) !=
+      VK_SUCCESS) {
+    throw std::runtime_error("Failed to import allocation!");
+  }
+
+  vkBindBufferMemory(m_device, buffer, memory, 0);
+}
+
+void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) {
+  VkCommandBuffer commandBuffer = beginSingleTimeCommands();
+
+  VkBufferCopy copyRegion = {};
+  copyRegion.size = size;
+  vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion);
+
+  endSingleTimeCommands(commandBuffer);
+}
+
+#ifdef _VK_TIMELINE_SEMAPHORE
+void VulkanBaseApp::drawFrame() {
+  const uint64_t waitValue = 0;
+  const uint64_t signalValue = 1;
+
+  VkSemaphoreWaitInfo semaphoreWaitInfo = {};
+  semaphoreWaitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
+  semaphoreWaitInfo.pSemaphores = &m_vkTimelineSemaphore;
+  semaphoreWaitInfo.semaphoreCount = 1;
+  semaphoreWaitInfo.pValues = &waitValue;
+  vkWaitSemaphores(m_device, &semaphoreWaitInfo,
+                   std::numeric_limits<uint64_t>::max());
+
+  uint32_t imageIndex;
+  VkResult result = vkAcquireNextImageKHR(
+      m_device, m_swapChain, std::numeric_limits<uint64_t>::max(),
+      m_vkPresentationSemaphore, VK_NULL_HANDLE, &imageIndex);
+  if (result == VK_ERROR_OUT_OF_DATE_KHR) {
+    recreateSwapChain();
+  } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
+    throw std::runtime_error("Failed to acquire swap chain image!");
+  }
+
+  updateUniformBuffer(imageIndex);
+
+  VkSubmitInfo submitInfo
= {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + std::vector waitSemaphores; + std::vector waitStages; + + waitSemaphores.push_back(m_vkTimelineSemaphore); + waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + + submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); + submitInfo.pWaitSemaphores = waitSemaphores.data(); + submitInfo.pWaitDstStageMask = waitStages.data(); + + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; + + std::vector signalSemaphores; + signalSemaphores.push_back(m_vkTimelineSemaphore); + submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); + submitInfo.pSignalSemaphores = signalSemaphores.data(); + + VkTimelineSemaphoreSubmitInfo timelineInfo = {}; + timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO; + timelineInfo.waitSemaphoreValueCount = 1; + timelineInfo.pWaitSemaphoreValues = &waitValue; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + + submitInfo.pNext = &timelineInfo; + + if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != + VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = &m_vkPresentationSemaphore; + + VkSwapchainKHR swapChains[] = {m_swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + + result = vkQueuePresentKHR(m_presentQueue, &presentInfo); + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || + m_framebufferResized) { + recreateSwapChain(); + m_framebufferResized = false; + } else if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + m_currentFrame++; +} +#else +void VulkanBaseApp::drawFrame() { + size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; + vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, + std::numeric_limits::max()); + + uint32_t imageIndex; + VkResult result = vkAcquireNextImageKHR( + m_device, m_swapChain, std::numeric_limits::max(), + m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex); + if (result == VK_ERROR_OUT_OF_DATE_KHR) { + recreateSwapChain(); + } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + updateUniformBuffer(imageIndex); + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + std::vector waitSemaphores; + std::vector waitStages; + + waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); + waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + getWaitFrameSemaphores(waitSemaphores, waitStages); + + submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); + submitInfo.pWaitSemaphores = waitSemaphores.data(); + submitInfo.pWaitDstStageMask = waitStages.data(); + + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; + + std::vector signalSemaphores; + getSignalFrameSemaphores(signalSemaphores); + signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); + submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); + submitInfo.pSignalSemaphores = signalSemaphores.data(); + + vkResetFences(m_device, 
1, &m_inFlightFences[currentFrameIdx]); + + if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, + m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; + + VkSwapchainKHR swapChains[] = {m_swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + + result = vkQueuePresentKHR(m_presentQueue, &presentInfo); + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || + m_framebufferResized) { + recreateSwapChain(); + m_framebufferResized = false; + } else if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } + + m_currentFrame++; +} +#endif /* _VK_TIMELINE_SEMAPHORE */ + +void VulkanBaseApp::cleanupSwapChain() { + if (m_depthImageView != VK_NULL_HANDLE) { + vkDestroyImageView(m_device, m_depthImageView, nullptr); + } + if (m_depthImage != VK_NULL_HANDLE) { + vkDestroyImage(m_device, m_depthImage, nullptr); + } + if (m_depthImageMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_depthImageMemory, nullptr); + } + + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); + vkFreeMemory(m_device, m_uniformMemory[i], nullptr); + } + + if (m_descriptorPool != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); + } + + for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { + vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); + } + + if (m_graphicsPipeline != VK_NULL_HANDLE) { + vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); + } + + if (m_pipelineLayout != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); + } + + if (m_renderPass != VK_NULL_HANDLE) { + vkDestroyRenderPass(m_device, m_renderPass, nullptr); + } + + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); + } + + if (m_swapChain != VK_NULL_HANDLE) { + vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); + } +} + +void VulkanBaseApp::recreateSwapChain() { + int width, height; + + glfwGetFramebufferSize(m_window, &width, &height); + while (width == 0 || height == 0) { + glfwWaitEvents(); glfwGetFramebufferSize(m_window, &width, &height); - while (width == 0 || height == 0) { - glfwWaitEvents(); - glfwGetFramebufferSize(m_window, &width, &height); - } + } - vkDeviceWaitIdle(m_device); + vkDeviceWaitIdle(m_device); - cleanupSwapChain(); + cleanupSwapChain(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createGraphicsPipeline(); - createDepthResources(); - createFramebuffers(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createGraphicsPipeline(); + createDepthResources(); + createFramebuffers(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); } -void VulkanBaseApp::mainLoop() -{ - while (!glfwWindowShouldClose(m_window)) { - glfwPollEvents(); - drawFrame(); - } - vkDeviceWaitIdle(m_device); +void VulkanBaseApp::mainLoop() { + while (!glfwWindowShouldClose(m_window)) { + glfwPollEvents(); + drawFrame(); + 
} + vkDeviceWaitIdle(m_device); } -void readFile(std::istream& s, std::vector& data) -{ - s.seekg(0, std::ios_base::end); - data.resize(s.tellg()); - s.clear(); - s.seekg(0, std::ios_base::beg); - s.read(data.data(), data.size()); +void readFile(std::istream &s, std::vector &data) { + s.seekg(0, std::ios_base::end); + data.resize(s.tellg()); + s.clear(); + s.seekg(0, std::ios_base::beg); + s.read(data.data(), data.size()); } diff --git a/Samples/simpleVulkan/VulkanBaseApp.h b/Samples/simpleVulkan/VulkanBaseApp.h index 5cb7396d..5609acd7 100644 --- a/Samples/simpleVulkan/VulkanBaseApp.h +++ b/Samples/simpleVulkan/VulkanBaseApp.h @@ -38,101 +38,125 @@ #include #endif /* _WIN64 */ +/* remove _VK_TIMELINE_SEMAPHORE to use binary semaphores */ +// use vulkan timeline semaphore +#define _VK_TIMELINE_SEMAPHORE + struct GLFWwindow; -class VulkanBaseApp -{ -public: - VulkanBaseApp(const std::string& appName, bool enableValidation = false); - static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); - static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); - virtual ~VulkanBaseApp(); - void init(); - void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); - void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); - void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); - void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory); - void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory); - void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory); - void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); - VkCommandBuffer beginSingleTimeCommands(); - void endSingleTimeCommands(VkCommandBuffer commandBuffer); - void mainLoop(); -protected: - const std::string m_appName; - const bool m_enableValidation; - VkInstance m_instance; - VkDebugUtilsMessengerEXT m_debugMessenger; - VkSurfaceKHR m_surface; - VkPhysicalDevice m_physicalDevice; - VkDevice m_device; - VkQueue m_graphicsQueue; - VkQueue m_presentQueue; - VkSwapchainKHR m_swapChain; - std::vector m_swapChainImages; - VkFormat m_swapChainFormat; - VkExtent2D m_swapChainExtent; - std::vector m_swapChainImageViews; - std::vector > m_shaderFiles; - VkRenderPass m_renderPass; - VkPipelineLayout m_pipelineLayout; - VkPipeline m_graphicsPipeline; - std::vector m_swapChainFramebuffers; - VkCommandPool m_commandPool; - std::vector m_commandBuffers; - std::vector m_imageAvailableSemaphores; - std::vector m_renderFinishedSemaphores; - std::vector m_inFlightFences; - std::vector m_uniformBuffers; - std::vector m_uniformMemory; - VkDescriptorSetLayout m_descriptorSetLayout; - VkDescriptorPool m_descriptorPool; - std::vector m_descriptorSets; - VkImage m_depthImage; - VkDeviceMemory m_depthImageMemory; - VkImageView m_depthImageView; - size_t m_currentFrame; - bool m_framebufferResized; - uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; +class VulkanBaseApp { + public: + VulkanBaseApp(const std::string& appName, bool enableValidation = false); + static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); + static 
VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); + virtual ~VulkanBaseApp(); + void init(); + void* getMemHandle(VkDeviceMemory memory, + VkExternalMemoryHandleTypeFlagBits handleType); + void* getSemaphoreHandle(VkSemaphore semaphore, + VkExternalSemaphoreHandleTypeFlagBits handleType); + void createExternalSemaphore( + VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, VkBuffer& buffer, + VkDeviceMemory& bufferMemory); + void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, + VkBuffer& buffer, VkDeviceMemory& bufferMemory); + void importExternalBuffer(void* handle, + VkExternalMemoryHandleTypeFlagBits handleType, + size_t size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, VkBuffer& buffer, + VkDeviceMemory& memory); + void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); + VkCommandBuffer beginSingleTimeCommands(); + void endSingleTimeCommands(VkCommandBuffer commandBuffer); + void mainLoop(); - virtual void initVulkanApp() {} - virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} - virtual std::vector getRequiredExtensions() const; - virtual std::vector getRequiredDeviceExtensions() const; - virtual void getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc); - virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info); - virtual void getWaitFrameSemaphores(std::vector& wait, std::vector< VkPipelineStageFlags>& waitStages) const; - virtual void getSignalFrameSemaphores(std::vector& signal) const; - virtual VkDeviceSize getUniformSize() const; - virtual void updateUniformBuffer(uint32_t imageIndex); - virtual void drawFrame(); -private: - GLFWwindow *m_window; + protected: + const std::string m_appName; + const bool m_enableValidation; + VkInstance m_instance; + VkDebugUtilsMessengerEXT m_debugMessenger; + VkSurfaceKHR m_surface; + VkPhysicalDevice m_physicalDevice; + VkDevice m_device; + VkQueue m_graphicsQueue; + VkQueue m_presentQueue; + VkSwapchainKHR m_swapChain; + std::vector m_swapChainImages; + VkFormat m_swapChainFormat; + VkExtent2D m_swapChainExtent; + std::vector m_swapChainImageViews; + std::vector > m_shaderFiles; + VkRenderPass m_renderPass; + VkPipelineLayout m_pipelineLayout; + VkPipeline m_graphicsPipeline; + std::vector m_swapChainFramebuffers; + VkCommandPool m_commandPool; + std::vector m_commandBuffers; + std::vector m_imageAvailableSemaphores; + std::vector m_renderFinishedSemaphores; + std::vector m_inFlightFences; + std::vector m_uniformBuffers; + std::vector m_uniformMemory; + VkSemaphore m_vkPresentationSemaphore; + VkSemaphore m_vkTimelineSemaphore; + VkDescriptorSetLayout m_descriptorSetLayout; + VkDescriptorPool m_descriptorPool; + std::vector m_descriptorSets; + VkImage m_depthImage; + VkDeviceMemory m_depthImageMemory; + VkImageView m_depthImageView; + size_t m_currentFrame; + bool m_framebufferResized; + uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; - void initWindow(); - void initVulkan(); - void createInstance(); - void createSurface(); - void createDevice(); - void createSwapChain(); - void createImageViews(); - void createRenderPass(); - void createDescriptorSetLayout(); - void createGraphicsPipeline(); - void createFramebuffers(); - void createCommandPool(); - void createDepthResources(); - void createUniformBuffers(); - void 
createDescriptorPool(); - void createDescriptorSets(); - void createCommandBuffers(); - void createSyncObjects(); + virtual void initVulkanApp() {} + virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} + virtual std::vector getRequiredExtensions() const; + virtual std::vector getRequiredDeviceExtensions() const; + virtual void getVertexDescriptions( + std::vector& bindingDesc, + std::vector& attribDesc); + virtual void getAssemblyStateInfo( + VkPipelineInputAssemblyStateCreateInfo& info); + virtual void getWaitFrameSemaphores( + std::vector& wait, + std::vector& waitStages) const; + virtual void getSignalFrameSemaphores(std::vector& signal) const; + virtual VkDeviceSize getUniformSize() const; + virtual void updateUniformBuffer(uint32_t imageIndex); + virtual void drawFrame(); - void cleanupSwapChain(); - void recreateSwapChain(); + private: + GLFWwindow* m_window; - bool isSuitableDevice(VkPhysicalDevice dev) const; - static void resizeCallback(GLFWwindow *window, int width, int height); + void initWindow(); + void initVulkan(); + void createInstance(); + void createSurface(); + void createDevice(); + void createSwapChain(); + void createImageViews(); + void createRenderPass(); + void createDescriptorSetLayout(); + void createGraphicsPipeline(); + void createFramebuffers(); + void createCommandPool(); + void createDepthResources(); + void createUniformBuffers(); + void createDescriptorPool(); + void createDescriptorSets(); + void createCommandBuffers(); + void createSyncObjects(); + + void cleanupSwapChain(); + void recreateSwapChain(); + + bool isSuitableDevice(VkPhysicalDevice dev) const; + static void resizeCallback(GLFWwindow* window, int width, int height); }; void readFile(std::istream& s, std::vector& data); diff --git a/Samples/simpleVulkan/main.cpp b/Samples/simpleVulkan/main.cpp index 303361b1..3277cb7b 100644 --- a/Samples/simpleVulkan/main.cpp +++ b/Samples/simpleVulkan/main.cpp @@ -46,28 +46,28 @@ std::string execution_path; #define ENABLE_VALIDATION (true) #endif -class VulkanCudaSineWave : public VulkanBaseApp -{ +class VulkanCudaSineWave : public VulkanBaseApp { + typedef struct UniformBufferObject_st { + mat4x4 modelViewProj; + } UniformBufferObject; - typedef struct UniformBufferObject_st { - mat4x4 modelViewProj; - } UniformBufferObject; + VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer; + VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory; + UniformBufferObject m_ubo; + VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; + SineWaveSimulation m_sim; + cudaStream_t m_stream; + cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore, + m_cudaTimelineSemaphore; + cudaExternalMemory_t m_cudaVertMem; + float *m_cudaHeightMap; + using chrono_tp = std::chrono::time_point; + chrono_tp m_lastTime; + size_t m_lastFrame; - VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer; - VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory; - UniformBufferObject m_ubo; - VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; - SineWaveSimulation m_sim; - cudaStream_t m_stream; - cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore; - cudaExternalMemory_t m_cudaVertMem; - float *m_cudaHeightMap; - using chrono_tp = std::chrono::time_point; - chrono_tp m_lastTime; - size_t m_lastFrame; -public: - VulkanCudaSineWave(size_t width, size_t height) : - VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION), + public: + VulkanCudaSineWave(size_t width, size_t height) + : VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION), 
m_heightBuffer(VK_NULL_HANDLE), m_xyBuffer(VK_NULL_HANDLE), m_indexBuffer(VK_NULL_HANDLE), @@ -81,361 +81,458 @@ public: m_vkSignalSemaphore(VK_NULL_HANDLE), m_cudaWaitSemaphore(), m_cudaSignalSemaphore(), + m_cudaTimelineSemaphore(), m_cudaVertMem(), m_cudaHeightMap(nullptr), m_lastFrame(0) { - // Our index buffer can only index 32-bits of the vertex buffer - if ((width * height) > (1ULL << 32ULL)) { - throw std::runtime_error("Requested height and width is too large for this sample!"); - } - // Add our compiled vulkan shader files - char* vertex_shader_path = sdkFindFilePath("sinewave.vert", execution_path.c_str()); - char* fragment_shader_path = sdkFindFilePath("sinewave.frag", execution_path.c_str()); - m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); - m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); - + // Our index buffer can only index 32-bits of the vertex buffer + if ((width * height) > (1ULL << 32ULL)) { + throw std::runtime_error( + "Requested height and width is too large for this sample!"); } - ~VulkanCudaSineWave() { - // Make sure there's no pending work before we start tearing down - checkCudaErrors(cudaStreamSynchronize(m_stream)); + // Add our compiled vulkan shader files + char *vertex_shader_path = + sdkFindFilePath("sinewave.vert", execution_path.c_str()); + char *fragment_shader_path = + sdkFindFilePath("sinewave.frag", execution_path.c_str()); + m_shaderFiles.push_back( + std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); + m_shaderFiles.push_back( + std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); + } + ~VulkanCudaSineWave() { + // Make sure there's no pending work before we start tearing down + checkCudaErrors(cudaStreamSynchronize(m_stream)); - if (m_vkSignalSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); - vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); - } - if (m_vkWaitSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); - vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); - } +#ifdef _VK_TIMELINE_SEMAPHORE + if (m_vkTimelineSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaTimelineSemaphore)); + vkDestroySemaphore(m_device, m_vkTimelineSemaphore, nullptr); + } +#endif /* _VK_TIMELINE_SEMAPHORE */ - if (m_xyBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_xyBuffer, nullptr); - } - if (m_xyMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_xyMemory, nullptr); - } - - if (m_heightBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_heightBuffer, nullptr); - } - if (m_heightMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_heightMemory, nullptr); - } - if (m_cudaHeightMap) { - checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem)); - } - - if (m_indexBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_indexBuffer, nullptr); - } - if (m_indexMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_indexMemory, nullptr); - } + if (m_vkSignalSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); + vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); + } + if (m_vkWaitSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); + vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); } - void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) { - VkBuffer 
vertexBuffers[] = { m_heightBuffer, m_xyBuffer }; - VkDeviceSize offsets[] = { 0, 0 }; - vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets); - vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32); - vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6), 1, 0, 0, 0); + if (m_xyBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_xyBuffer, nullptr); + } + if (m_xyMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_xyMemory, nullptr); } - void getVertexDescriptions(std::vector& bindingDesc, std::vector& attribDesc) { - bindingDesc.resize(2); - attribDesc.resize(2); - - bindingDesc[0].binding = 0; - bindingDesc[0].stride = sizeof(float); - bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - bindingDesc[1].binding = 1; - bindingDesc[1].stride = sizeof(vec2); - bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - attribDesc[0].binding = 0; - attribDesc[0].location = 0; - attribDesc[0].format = VK_FORMAT_R32_SFLOAT; - attribDesc[0].offset = 0; - - attribDesc[1].binding = 1; - attribDesc[1].location = 1; - attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; - attribDesc[1].offset = 0; + if (m_heightBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_heightBuffer, nullptr); + } + if (m_heightMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_heightMemory, nullptr); + } + if (m_cudaHeightMap) { + checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem)); } - void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) { - info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; - info.primitiveRestartEnable = VK_FALSE; + if (m_indexBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_indexBuffer, nullptr); + } + if (m_indexMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_indexMemory, nullptr); + } + } + + void fillRenderingCommandBuffer(VkCommandBuffer &commandBuffer) { + VkBuffer vertexBuffers[] = {m_heightBuffer, m_xyBuffer}; + VkDeviceSize offsets[] = {0, 0}; + vkCmdBindVertexBuffers(commandBuffer, 0, + sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), + vertexBuffers, offsets); + vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32); + vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * + (m_sim.getHeight() - 1) * 6), + 1, 0, 0, 0); + } + + void getVertexDescriptions( + std::vector &bindingDesc, + std::vector &attribDesc) { + bindingDesc.resize(2); + attribDesc.resize(2); + + bindingDesc[0].binding = 0; + bindingDesc[0].stride = sizeof(float); + bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + bindingDesc[1].binding = 1; + bindingDesc[1].stride = sizeof(vec2); + bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + attribDesc[0].binding = 0; + attribDesc[0].location = 0; + attribDesc[0].format = VK_FORMAT_R32_SFLOAT; + attribDesc[0].offset = 0; + + attribDesc[1].binding = 1; + attribDesc[1].location = 1; + attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; + attribDesc[1].offset = 0; + } + + void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info) { + info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + info.primitiveRestartEnable = VK_FALSE; + } + + void getWaitFrameSemaphores( + std::vector &wait, + std::vector &waitStages) const { + if (m_currentFrame != 0) { + // Have vulkan wait until cuda is done 
with the vertex buffer before + // rendering, We don't do this on the first frame, as the wait semaphore + // hasn't been initialized yet + wait.push_back(m_vkWaitSemaphore); + // We want to wait until all the pipeline commands are complete before + // letting cuda work + waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + } + } + + void getSignalFrameSemaphores(std::vector &signal) const { + // Add this semaphore for vulkan to signal once the vertex buffer is ready + // for cuda to modify + signal.push_back(m_vkSignalSemaphore); + } + + void initVulkanApp() { + int cuda_device = -1; + + // Select cuda device where vulkan is running. + cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE); + if (cuda_device == -1) { + printf("Error: No CUDA-Vulkan interop capable device found\n"); + exit(EXIT_FAILURE); } - void getWaitFrameSemaphores(std::vector& wait, std::vector< VkPipelineStageFlags>& waitStages) const { - if (m_currentFrame != 0) { - // Have vulkan wait until cuda is done with the vertex buffer before rendering - // We don't do this on the first frame, as the wait semaphore hasn't been initialized yet - wait.push_back(m_vkWaitSemaphore); - // We want to wait until all the pipeline commands are complete before letting cuda work - waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + m_sim.initCudaLaunchConfig(cuda_device); + + // Create the cuda stream we'll be using + checkCudaErrors( + cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); + + const size_t nVerts = m_sim.getWidth() * m_sim.getHeight(); + const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6; + + // Create the height map cuda will write to + createExternalBuffer( + nVerts * sizeof(float), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, getDefaultMemHandleType(), + m_heightBuffer, m_heightMemory); + + // Create the vertex buffer that will hold the xy coordinates for the grid + createBuffer(nVerts * sizeof(vec2), VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyBuffer, m_xyMemory); + + // Create the index buffer that references from both buffers above + createBuffer( + nInds * sizeof(uint32_t), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_indexBuffer, m_indexMemory); + + // Import the height map into cuda and retrieve a device pointer to use + importCudaExternalMemory((void **)&m_cudaHeightMap, m_cudaVertMem, + m_heightMemory, nVerts * sizeof(*m_cudaHeightMap), + getDefaultMemHandleType()); + // Set the height map to use in the simulation + m_sim.initSimulation(m_cudaHeightMap); + + { + // Set up the initial values for the vertex buffers with Vulkan + void *stagingBase; + VkBuffer stagingBuffer; + VkDeviceMemory stagingMemory; + VkDeviceSize stagingSz = + std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t)); + createBuffer(stagingSz, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + stagingBuffer, stagingMemory); + + vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase); + + memset(stagingBase, 0, nVerts * sizeof(float)); + copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float)); + + for (size_t y = 0; y < m_sim.getHeight(); y++) { + for (size_t x = 0; x < m_sim.getWidth(); x++) { + vec2 *stagedVert = (vec2 *)stagingBase; + stagedVert[y * m_sim.getWidth() + x][0] = + (2.0f * x) / 
(m_sim.getWidth() - 1) - 1; + stagedVert[y * m_sim.getWidth() + x][1] = + (2.0f * y) / (m_sim.getHeight() - 1) - 1; } + } + copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2)); + + { + uint32_t *indices = (uint32_t *)stagingBase; + for (size_t y = 0; y < m_sim.getHeight() - 1; y++) { + for (size_t x = 0; x < m_sim.getWidth() - 1; x++) { + indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0)); + indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); + indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); + indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); + indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1)); + indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); + indices += 6; + } + } + } + copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t)); + + vkUnmapMemory(m_device, stagingMemory); + vkDestroyBuffer(m_device, stagingBuffer, nullptr); + vkFreeMemory(m_device, stagingMemory, nullptr); } - void getSignalFrameSemaphores(std::vector& signal) const { - // Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify - signal.push_back(m_vkSignalSemaphore); +#ifdef _VK_TIMELINE_SEMAPHORE + // Create the timeline semaphore to sync cuda and vulkan access to vertex + // buffer + createExternalSemaphore(m_vkTimelineSemaphore, + getDefaultSemaphoreHandleType()); + // Import the timeline semaphore cuda will use to sync cuda and vulkan + // access to vertex buffer + importCudaExternalSemaphore(m_cudaTimelineSemaphore, m_vkTimelineSemaphore, + getDefaultSemaphoreHandleType()); +#else + + // Create the semaphore vulkan will signal when it's done with the vertex + // buffer + createExternalSemaphore(m_vkSignalSemaphore, + getDefaultSemaphoreHandleType()); + // Create the semaphore vulkan will wait for before using the vertex buffer + createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait + importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, + getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait + importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, + getDefaultSemaphoreHandleType()); +#endif /* _VK_TIMELINE_SEMAPHORE */ + } + + void importCudaExternalMemory(void **cudaPtr, cudaExternalMemory_t &cudaMem, + VkDeviceMemory &vkMem, VkDeviceSize size, + VkExternalMemoryHandleTypeFlagBits handleType) { + cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; + + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; + } else if (handleType & + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalMemoryHandleDesc.type = + cudaExternalMemoryHandleTypeOpaqueWin32Kmt; + } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; + } else { + throw std::runtime_error("Unknown handle type requested!"); } - void initVulkanApp() { - int cuda_device = -1; - - // Select cuda device where vulkan is running. 
- cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE); - if (cuda_device == -1) - { - printf("Error: No CUDA-Vulkan interop capable device found\n"); - exit(EXIT_FAILURE); - } - - m_sim.initCudaLaunchConfig(cuda_device); - - // Create the cuda stream we'll be using - checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); - - const size_t nVerts = m_sim.getWidth() * m_sim.getHeight(); - const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6; - - // Create the height map cuda will write to - createExternalBuffer(nVerts * sizeof(float), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - getDefaultMemHandleType(), - m_heightBuffer, m_heightMemory); - - // Create the vertex buffer that will hold the xy coordinates for the grid - createBuffer(nVerts * sizeof(vec2), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - m_xyBuffer, m_xyMemory); - - // Create the index buffer that references from both buffers above - createBuffer(nInds * sizeof(uint32_t), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - m_indexBuffer, m_indexMemory); - - // Import the height map into cuda and retrieve a device pointer to use - importCudaExternalMemory((void **)&m_cudaHeightMap, m_cudaVertMem, m_heightMemory, nVerts * sizeof(*m_cudaHeightMap), getDefaultMemHandleType()); - // Set the height map to use in the simulation - m_sim.initSimulation(m_cudaHeightMap); - - { - // Set up the initial values for the vertex buffers with Vulkan - void *stagingBase; - VkBuffer stagingBuffer; - VkDeviceMemory stagingMemory; - VkDeviceSize stagingSz = std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t)); - createBuffer(stagingSz, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBuffer, stagingMemory); - - vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase); - - memset(stagingBase, 0, nVerts * sizeof(float)); - copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float)); - - for (size_t y = 0; y < m_sim.getHeight(); y++) { - for (size_t x = 0; x < m_sim.getWidth(); x++) { - vec2 *stagedVert = (vec2 *)stagingBase; - stagedVert[y * m_sim.getWidth() + x][0] = (2.0f * x) / (m_sim.getWidth() - 1) - 1; - stagedVert[y * m_sim.getWidth() + x][1] = (2.0f * y) / (m_sim.getHeight() - 1) - 1; - } - } - copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2)); - - { - uint32_t *indices = (uint32_t *)stagingBase; - for (size_t y = 0; y < m_sim.getHeight() - 1; y++) { - for (size_t x = 0; x < m_sim.getWidth() - 1; x++) { - indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0)); - indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); - indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); - indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); - indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1)); - indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); - indices += 6; - } - } - } - copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t)); - - vkUnmapMemory(m_device, stagingMemory); - vkDestroyBuffer(m_device, stagingBuffer, nullptr); - vkFreeMemory(m_device, stagingMemory, nullptr); - } - - // Create the semaphore vulkan will signal when it's done with the vertex buffer - createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); - // Create 
the semaphore vulkan will wait for before using the vertex buffer - createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait - importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait - importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); - } - - void importCudaExternalMemory(void **cudaPtr, cudaExternalMemory_t& cudaMem, VkDeviceMemory& vkMem, VkDeviceSize size, VkExternalMemoryHandleTypeFlagBits handleType) { - cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; - - if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32Kmt; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; - } - else { - throw std::runtime_error("Unknown handle type requested!"); - } - - externalMemoryHandleDesc.size = size; + externalMemoryHandleDesc.size = size; #ifdef _WIN64 - externalMemoryHandleDesc.handle.win32.handle = (HANDLE)getMemHandle(vkMem, handleType); + externalMemoryHandleDesc.handle.win32.handle = + (HANDLE)getMemHandle(vkMem, handleType); #else - externalMemoryHandleDesc.handle.fd = (int)(uintptr_t)getMemHandle(vkMem, handleType); + externalMemoryHandleDesc.handle.fd = + (int)(uintptr_t)getMemHandle(vkMem, handleType); #endif - checkCudaErrors(cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc)); + checkCudaErrors( + cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc)); - cudaExternalMemoryBufferDesc externalMemBufferDesc = {}; - externalMemBufferDesc.offset = 0; - externalMemBufferDesc.size = size; - externalMemBufferDesc.flags = 0; + cudaExternalMemoryBufferDesc externalMemBufferDesc = {}; + externalMemBufferDesc.offset = 0; + externalMemBufferDesc.size = size; + externalMemBufferDesc.flags = 0; - checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem, &externalMemBufferDesc)); + checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem, + &externalMemBufferDesc)); + } + + void importCudaExternalSemaphore( + cudaExternalSemaphore_t &cudaSem, VkSemaphore &vkSem, + VkExternalSemaphoreHandleTypeFlagBits handleType) { + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; + +#ifdef _VK_TIMELINE_SEMAPHORE + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; + } else if (handleType & + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; + } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd; + } +#else + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeOpaqueWin32; + } else if (handleType & + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalSemaphoreHandleDesc.type = + 
cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = + cudaExternalSemaphoreHandleTypeOpaqueFd; + } +#endif /* _VK_TIMELINE_SEMAPHORE */ + else { + throw std::runtime_error("Unknown handle type requested!"); } - void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) { - cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; - - if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - } - else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; - } - else { - throw std::runtime_error("Unknown handle type requested!"); - } - #ifdef _WIN64 - externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.win32.handle = + (HANDLE)getSemaphoreHandle(vkSem, handleType); #else - externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.fd = + (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); #endif - externalSemaphoreHandleDesc.flags = 0; + externalSemaphoreHandleDesc.flags = 0; - checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); + checkCudaErrors( + cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); + } + + VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); } + + void updateUniformBuffer(uint32_t imageIndex) { + { + mat4x4 view, proj; + vec3 eye = {1.75f, 1.75f, 1.25f}; + vec3 center = {0.0f, 0.0f, -0.25f}; + vec3 up = {0.0f, 0.0f, 1.0f}; + + mat4x4_perspective( + proj, (float)degreesToRadians(45.0f), + m_swapChainExtent.width / (float)m_swapChainExtent.height, 0.1f, + 10.0f); + proj[1][1] *= -1.0f; // Flip y axis + + mat4x4_look_at(view, eye, center, up); + mat4x4_mul(m_ubo.modelViewProj, proj, view); } - VkDeviceSize getUniformSize() const { - return sizeof(UniformBufferObject); - } + void *data; + vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, + &data); + memcpy(data, &m_ubo, sizeof(m_ubo)); + vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); + } - void updateUniformBuffer(uint32_t imageIndex) { - { - mat4x4 view, proj; - vec3 eye = { 1.75f, 1.75f, 1.25f }; - vec3 center = { 0.0f, 0.0f, -0.25f }; - vec3 up = { 0.0f, 0.0f, 1.0f }; + std::vector getRequiredExtensions() const { + std::vector extensions; + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); + return extensions; + } - mat4x4_perspective(proj, (float)degreesToRadians(45.0f), m_swapChainExtent.width / (float)m_swapChainExtent.height, 0.1f, 10.0f); - proj[1][1] *= -1.0f; // Flip y axis - - mat4x4_look_at(view, eye, center, up); - mat4x4_mul(m_ubo.modelViewProj, proj, view); - } - - void *data; - vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data); - memcpy(data, &m_ubo, sizeof(m_ubo)); - vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); - } - - std::vector getRequiredExtensions() const { - 
std::vector extensions; - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); - return extensions; - } - - std::vector getRequiredDeviceExtensions() const { - std::vector extensions; - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); + std::vector getRequiredDeviceExtensions() const { + std::vector extensions; + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); + extensions.push_back(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME); #ifdef _WIN64 - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); #else - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); #endif /* _WIN64 */ - return extensions; + return extensions; + } + + void drawFrame() { + static chrono_tp startTime = std::chrono::high_resolution_clock::now(); + + chrono_tp currentTime = std::chrono::high_resolution_clock::now(); + float time = std::chrono::duration( + currentTime - startTime) + .count(); + + if (m_currentFrame == 0) { + m_lastTime = startTime; } - void drawFrame() { - static chrono_tp startTime = std::chrono::high_resolution_clock::now(); + float frame_time = + std::chrono::duration(currentTime - + m_lastTime) + .count(); - chrono_tp currentTime = std::chrono::high_resolution_clock::now(); - float time = std::chrono::duration(currentTime - startTime).count(); + // Have vulkan draw the current frame... 
+ VulkanBaseApp::drawFrame(); - if (m_currentFrame == 0) { - m_lastTime = startTime; - } +#ifdef _VK_TIMELINE_SEMAPHORE + cudaExternalSemaphoreWaitParams waitParams = {}; + waitParams.flags = 0; + waitParams.params.fence.value = 1; - float frame_time = std::chrono::duration(currentTime - m_lastTime).count(); + cudaExternalSemaphoreSignalParams signalParams = {}; + signalParams.flags = 0; + signalParams.params.fence.value = 0; + // Wait for vulkan to complete it's work + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaTimelineSemaphore, + &waitParams, 1, m_stream)); + // Now step the simulation + m_sim.stepSimulation(time, m_stream); + // Signal vulkan to continue with the updated buffers + checkCudaErrors(cudaSignalExternalSemaphoresAsync( + &m_cudaTimelineSemaphore, &signalParams, 1, m_stream)); +#else + cudaExternalSemaphoreWaitParams waitParams = {}; + waitParams.flags = 0; + waitParams.params.fence.value = 0; - cudaExternalSemaphoreWaitParams waitParams = {}; - waitParams.flags = 0; - waitParams.params.fence.value = 0; + cudaExternalSemaphoreSignalParams signalParams = {}; + signalParams.flags = 0; + signalParams.params.fence.value = 0; - cudaExternalSemaphoreSignalParams signalParams = {}; - signalParams.flags = 0; - signalParams.params.fence.value = 0; + // Wait for vulkan to complete it's work + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, + &waitParams, 1, m_stream)); + // Now step the simulation + m_sim.stepSimulation(time, m_stream); + // Signal vulkan to continue with the updated buffers + checkCudaErrors(cudaSignalExternalSemaphoresAsync( + &m_cudaSignalSemaphore, &signalParams, 1, m_stream)); +#endif /* _VK_TIMELINE_SEMAPHORE */ - // Have vulkan draw the current frame... - VulkanBaseApp::drawFrame(); - // Wait for vulkan to complete it's work - checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream)); - // Now step the simulation - m_sim.stepSimulation(time, m_stream); - // Signal vulkan to continue with the updated buffers - checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream)); - - // Output a naive measurement of the frames per second every five seconds - if (frame_time > 5) { - std::cout << "Average FPS (over " - << std::fixed << std::setprecision(2) << frame_time - << " seconds): " - << std::fixed << std::setprecision(2) - << ((m_currentFrame - m_lastFrame) / frame_time) - << std::endl; - m_lastFrame = m_currentFrame; - m_lastTime = currentTime; - } + // Output a naive measurement of the frames per second every five seconds + if (frame_time > 5) { + std::cout << "Average FPS (over " << std::fixed << std::setprecision(2) + << frame_time << " seconds): " << std::fixed + << std::setprecision(2) + << ((m_currentFrame - m_lastFrame) / frame_time) << std::endl; + m_lastFrame = m_currentFrame; + m_lastTime = currentTime; } + } }; -int main(int argc, char **argv) -{ - execution_path = argv[0]; - VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL)); - app.init(); - app.mainLoop(); - return 0; +int main(int argc, char **argv) { + execution_path = argv[0]; + VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL)); + app.init(); + app.mainLoop(); + return 0; } diff --git a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj index b44dbe13..713ae122 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -122,6 +122,6 @@ - + diff 
--git a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj index 0f6b767a..a03ea3de 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/simpleVulkanMMAP/README.md b/Samples/simpleVulkanMMAP/README.md index 9a51b048..bd3aeb63 100644 --- a/Samples/simpleVulkanMMAP/README.md +++ b/Samples/simpleVulkanMMAP/README.md @@ -33,7 +33,7 @@ cudaGetDeviceProperties, cudaImportExternalMemory, cudaExternalMemoryGetMappedBu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj index b9d1658c..95ec4011 100644 --- a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -124,6 +124,6 @@ - + diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj index f2232967..c15cb955 100644 --- a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -120,6 +120,6 @@ - + diff --git a/Samples/simpleZeroCopy/README.md b/Samples/simpleZeroCopy/README.md index 128cdf31..12919ca0 100644 --- a/Samples/simpleZeroCopy/README.md +++ b/Samples/simpleZeroCopy/README.md @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj b/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj index a40d2464..72ad3aaa 100644 --- a/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj +++ b/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj b/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj index 20f55821..40b06783 100644 --- a/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj +++ b/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/streamOrderedAllocation/README.md b/Samples/streamOrderedAllocation/README.md index 2dd455bd..be8d5602 100644 --- a/Samples/streamOrderedAllocation/README.md +++ b/Samples/streamOrderedAllocation/README.md @@ -27,7 +27,7 @@ cudaMallocAsync, cudaFreeAsync, cudaMemPoolSetAttribute, cudaDeviceGetDefaultMem ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj index 402a041c..1113cafe 100644 --- a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj +++ b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj index d5f8b08d..5e884ceb 100644 --- a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj +++ b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/streamOrderedAllocationIPC/Makefile b/Samples/streamOrderedAllocationIPC/Makefile new file mode 100644 index 00000000..910b6064 --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/Makefile @@ -0,0 +1,423 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + 
ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on aarch64 +ifeq ($(TARGET_ARCH),aarch64) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on aarch64 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - streamOrderedAllocationIPC is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 86 +else +SMS ?= 35 37 50 52 60 61 70 75 80 86 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(TARGET_OS),darwin) + ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA +else + ifeq ($(TARGET_ARCH),x86_64) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif + endif + + ifeq ($(TARGET_ARCH),ppc64le) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs + endif + + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) + ifeq ("$(CUDALIB)","") + $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. 
<<<) + SAMPLE_ENABLED := 0 + else + CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) + LIBRARIES += -L$(CUDALIB) -lcuda + endif +endif + +ALL_CCFLAGS += --std=c++11 --threads 0 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: streamOrderedAllocationIPC + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +helper_multiprocess.o:../../common/src/helper_multiprocess.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +streamOrderedAllocationIPC.o:streamOrderedAllocationIPC.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +streamOrderedAllocationIPC: helper_multiprocess.o streamOrderedAllocationIPC.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./streamOrderedAllocationIPC + +clean: + rm -f streamOrderedAllocationIPC helper_multiprocess.o streamOrderedAllocationIPC.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/streamOrderedAllocationIPC + +clobber: clean diff --git a/Samples/streamOrderedAllocationIPC/NsightEclipse.xml b/Samples/streamOrderedAllocationIPC/NsightEclipse.xml new file mode 100644 index 00000000..713a3d8f --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/NsightEclipse.xml @@ -0,0 +1,65 @@ + + + + streamOrderedAllocationIPC + + --std=c++11 + + + cudaMallocAsync + cudaFreeAsync + cudaMemPoolCreate + cudaMemPoolImportPointer + cudaMemPoolSetAccess + cudaMemPoolGetAccess + cudaMemPoolExportToShareableHandle + cudaMemPoolExportPointer + cudaMemPoolDestroy + + + whole + + ./ + ../ + ../../common/inc + + + Performance Strategies + + + + + cuda + CUDA + + + + true + streamOrderedAllocationIPC.cu + + 1:CUDA Basic Topics + 1:Performance Strategies + + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + sm86 + + ../../common/src/helper_multiprocess.cpp + ../../common/inc/helper_multiprocess.h + + + + x86_64 + linux + + + + 6.0 + + stream Ordered Allocation IPC Pools + exe + diff --git a/Samples/streamOrderedAllocationIPC/README.md b/Samples/streamOrderedAllocationIPC/README.md new file mode 100644 index 00000000..04948fae --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/README.md @@ -0,0 +1,60 @@ +# streamOrderedAllocationIPC - stream Ordered Allocation IPC Pools + +## Description + +This sample demonstrates IPC pools of stream ordered memory allocated using cudaMallocAsync and cudaMemPool family of APIs. 
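+
+In outline, the exporting (parent) process creates an IPC-capable pool on a device, allocates from it with cudaMallocAsync, and shares the pool's shareable handle plus the allocation's export data with the importing (child) processes, which rebuild the pool and pointer on their side. The sketch below mirrors the calls made in this sample; the variable names are illustrative, and error checking and the socket/shared-memory transport that actually carries the handle and export data are omitted.
+
+```
+// Exporting process: create an IPC-capable pool on one device and share an allocation.
+int device = 0;                      // illustrative; the sample does this for every eligible device
+size_t size = 64ULL << 20;
+cudaStream_t stream;
+cudaSetDevice(device);
+cudaStreamCreate(&stream);
+
+cudaMemPoolProps props = {};
+props.allocType = cudaMemAllocationTypePinned;
+props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
+props.location.type = cudaMemLocationTypeDevice;
+props.location.id = device;
+
+cudaMemPool_t pool;
+cudaMemPoolCreate(&pool, &props);
+
+void *ptr;
+cudaMallocAsync(&ptr, size, pool, stream);   // pool-specific overload used by this sample
+
+int poolFd;                                   // sent to the peer, e.g. over a local socket
+cudaMemPoolExportToShareableHandle(&poolFd, pool, cudaMemHandleTypePosixFileDescriptor, 0);
+
+cudaMemPoolPtrExportData exportData;          // opaque blob, placed in shared memory
+cudaMemPoolExportPointer(&exportData, ptr);
+
+// Importing process: rebuild the pool from the received descriptor and recover the pointer.
+cudaMemPool_t importedPool;
+cudaMemPoolImportFromShareableHandle(&importedPool, (void *)(uintptr_t)poolFd,
+                                     cudaMemHandleTypePosixFileDescriptor, 0);
+void *importedPtr;
+cudaMemPoolImportPointer(&importedPtr, importedPool, &exportData);
+
+// ... use importedPtr in stream-ordered work, then free it before the exporter frees it.
+cudaFreeAsync(importedPtr, stream);
+```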
+ +## Key Concepts + +Performance Strategies + +## Supported SM Architectures + +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux + +## Supported CPU Architecture + +x86_64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMallocAsync, cudaFreeAsync, cudaMemPoolCreate, cudaMemPoolImportPointer, cudaMemPoolSetAccess, cudaMemPoolGetAccess, cudaMemPoolExportToShareableHandle, cudaMemPoolExportPointer, cudaMemPoolDestroy + +## Prerequisites + +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples' makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64. + By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu b/Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu new file mode 100644 index 00000000..f8f783b8 --- /dev/null +++ b/Samples/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu @@ -0,0 +1,440 @@ +/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This sample demonstrates Inter Process Communication + * using one process per GPU for computation. + */ + +#include +#include +#include +#include +#define CUDA_DRIVER_API 1 +#include "helper_cuda.h" +#include "helper_cuda_drvapi.h" +#include "helper_multiprocess.h" + +static const char shmName[] = "streamOrderedAllocationIPCshm"; +static const char ipcName[] = "streamOrderedAllocationIPC_pipe"; +// For direct NVLINK and PCI-E peers, at max 8 simultaneous peers are allowed +// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited +// in the same way. 
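+
+// Overall flow: the parent process creates an IPC-capable memory pool on each
+// eligible device, allocates one buffer per pool with cudaMallocAsync, and
+// publishes the pool handles over a socket created by the helper_multiprocess
+// utilities and the per-allocation export data in shared memory. Each child
+// process imports every pool and pointer, repeatedly launches simpleKernel on
+// the peer-mapped buffers (coordinating with its siblings through a
+// shared-memory barrier), verifies the buffer assigned to it, and frees its
+// imported pointers before the parent destroys the pools.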
+#define MAX_DEVICES (32) +#define DATA_SIZE (64ULL << 20ULL) // 64MB + +#if defined(__linux__) +#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x) +#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#define cpu_atomic_add32(a, x) InterlockedAdd((volatile LONG *)a, x) +#else +#error Unsupported system +#endif + +typedef struct shmStruct_st { + size_t nprocesses; + int barrier; + int sense; + int devices[MAX_DEVICES]; + cudaMemPoolPtrExportData exportPtrData[MAX_DEVICES]; +} shmStruct; + +__global__ void simpleKernel(char *ptr, int sz, char val) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (; idx < sz; idx += (gridDim.x * blockDim.x)) { + ptr[idx] = val; + } +} + +static void barrierWait(volatile int *barrier, volatile int *sense, + unsigned int n) { + int count; + + // Check-in + count = cpu_atomic_add32(barrier, 1); + if (count == n) // Last one in + *sense = 1; + while (!*sense) + ; + + // Check-out + count = cpu_atomic_add32(barrier, -1); + if (count == 0) // Last one out + *sense = 0; + while (*sense) + ; +} + +static void childProcess(int id) { + volatile shmStruct *shm = NULL; + cudaStream_t stream; + sharedMemoryInfo info; + size_t procCount, i; + int blocks = 0; + int threads = 128; + cudaDeviceProp prop; + std::vector<void *> ptrs; + + std::vector<char> verification_buffer(DATA_SIZE); + + ipcHandle *ipcChildHandle = NULL; + checkIpcErrors(ipcOpenSocket(ipcChildHandle)); + + if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } + shm = (volatile shmStruct *)info.addr; + procCount = shm->nprocesses; + + barrierWait(&shm->barrier, &shm->sense, (unsigned int)(procCount + 1)); + + // Receive all allocation handles shared by Parent. + std::vector<ShareableHandle> shHandle(shm->nprocesses); + checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle)); + + checkCudaErrors(cudaSetDevice(shm->devices[id])); + checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id])); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks, simpleKernel, threads, 0)); + blocks *= prop.multiProcessorCount; + + std::vector<cudaMemPool_t> pools(shm->nprocesses); + + cudaMemAllocationHandleType handleType = cudaMemHandleTypePosixFileDescriptor; + + // Import mem pools from all the devices created in the master + // process using shareable handles received via socket + // and import the pointer to the allocated buffer using + // exportData filled in shared memory by the master process. + for (i = 0; i < procCount; i++) { + checkCudaErrors(cudaMemPoolImportFromShareableHandle( + &pools[i], (void *)shHandle[i], handleType, 0)); + + cudaMemAccessFlags accessFlags; + cudaMemLocation location; + location.type = cudaMemLocationTypeDevice; + location.id = shm->devices[id]; + checkCudaErrors(cudaMemPoolGetAccess(&accessFlags, pools[i], &location)); + if (accessFlags != cudaMemAccessFlagsProtReadWrite) { + cudaMemAccessDesc desc; + memset(&desc, 0, sizeof(cudaMemAccessDesc)); + desc.location.type = cudaMemLocationTypeDevice; + desc.location.id = shm->devices[id]; + desc.flags = cudaMemAccessFlagsProtReadWrite; + checkCudaErrors(cudaMemPoolSetAccess(pools[i], &desc, 1)); + } + + // Import the allocation from each memory pool by iterating over exportData + // until an import succeeds.
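+ // (cudaMemPoolImportPointer only succeeds when the export data belongs to an
+ // allocation made from the pool it is given, so trying each entry in turn
+ // recovers the one pointer the parent exported from this pool.)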
+ for (int j = 0; j < procCount; j++) { + void *ptr = NULL; + // Import the allocation using the opaque export data retrieved through + // the shared memory. + cudaError_t ret = cudaMemPoolImportPointer( + &ptr, pools[i], (cudaMemPoolPtrExportData *)&shm->exportPtrData[j]); + + if (ret == cudaSuccess) { + // The pointer import succeeded, so add it to the ptrs bag. + ptrs.push_back(ptr); + break; + } else { + // Clear the error returned by the failed cudaMemPoolImportPointer call + // before trying the next export data entry. + cudaGetLastError(); + } + } + // Since we have imported allocations shared by the parent with us, we can + // close this ShareableHandle. + checkIpcErrors(ipcCloseShareableHandle(shHandle[i])); + } + + // Since we have imported allocations shared by the parent with us, we can + // close the socket. + checkIpcErrors(ipcCloseSocket(ipcChildHandle)); + + // At each iteration of the loop, each sibling process will push work on + // their respective devices accessing the next peer mapped buffer allocated + // by the master process (these can come from other sibling processes as + // well). To coordinate each process' access, we force the stream to wait for + // the work already accessing this buffer. + for (i = 0; i < procCount; i++) { + size_t bufferId = (i + id) % procCount; + + // Push a simple kernel on it + simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], + DATA_SIZE, id); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaStreamSynchronize(stream)); + + // Wait for all my sibling processes to push this stage of their work + // before proceeding to the next. This prevents siblings from racing + // ahead and clobbering a buffer that another process is still using. + barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount); + if (id == 0) { + printf("Step %lld done\n", (unsigned long long)i); + } + } + + // Now wait for my buffer to be ready so I can copy it locally and verify it + checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, + cudaMemcpyDeviceToHost, stream)); + + // And wait for all the queued up work to complete + checkCudaErrors(cudaStreamSynchronize(stream)); + + printf("Process %d: verifying...\n", id); + + // The contents should have the id of the sibling just after me + char compareId = (char)((id + 1) % procCount); + for (unsigned long long j = 0; j < DATA_SIZE; j++) { + if (verification_buffer[j] != compareId) { + printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j, + (int)verification_buffer[j], (int)compareId); + } + } + + // Clean up!
+ for (i = 0; i < procCount; i++) { + // Free the memory before the exporter process frees it + checkCudaErrors(cudaFreeAsync(ptrs[i], stream)); + } + + // And wait for all the queued up work to complete + checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaStreamDestroy(stream)); + + printf("Process %d complete!\n", id); +} + +static void parentProcess(char *app) { + sharedMemoryInfo info; + int devCount, i; + volatile shmStruct *shm = NULL; + std::vector<void *> ptrs; + std::vector<Process> processes; + + checkCudaErrors(cudaGetDeviceCount(&devCount)); + std::vector<CUdevice> devices(devCount); + for (i = 0; i < devCount; i++) { + cuDeviceGet(&devices[i], i); + } + + if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } + shm = (volatile shmStruct *)info.addr; + memset((void *)shm, 0, sizeof(*shm)); + + // Pick all the devices that can access each other's memory for this test + // Keep in mind that CUDA has minimal support for fork() without a + // corresponding exec() in the child process, but in this case our + // spawnProcess will always exec, so no need to worry. + for (i = 0; i < devCount; i++) { + bool allPeers = true; + cudaDeviceProp prop; + checkCudaErrors(cudaGetDeviceProperties(&prop, i)); + + int isMemPoolSupported = 0; + checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported, + cudaDevAttrMemoryPoolsSupported, i)); + // This sample requires devices that support CUDA memory pools + if (!isMemPoolSupported) { + printf("Device %d does not support CUDA memory pools, skipping...\n", i); + continue; + } + int deviceSupportsIpcHandle = 0; +#if defined(__linux__) + checkCudaErrors(cuDeviceGetAttribute( + &deviceSupportsIpcHandle, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, + devices[i])); +#else + cuDeviceGetAttribute(&deviceSupportsIpcHandle, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, + devices[i]); +#endif + + if (!deviceSupportsIpcHandle) { + printf("Device %d does not support CUDA IPC Handle, skipping...\n", i); + continue; + } + // This sample requires two processes accessing each device, so we need + // to ensure exclusive or prohibited mode is not set + if (prop.computeMode != cudaComputeModeDefault) { + printf("Device %d is in an unsupported compute mode for this sample\n", + i); + continue; + } +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // CUDA IPC on Windows is only supported on TCC + if (!prop.tccDriver) { + printf("Device %d is not in TCC mode\n", i); + continue; + } +#endif + + for (int j = 0; j < shm->nprocesses; j++) { + int canAccessPeerIJ, canAccessPeerJI; + checkCudaErrors( + cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i)); + checkCudaErrors( + cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j])); + if (!canAccessPeerIJ || !canAccessPeerJI) { + allPeers = false; + break; + } + } + if (allPeers) { + // Enable peers here. This isn't necessary for IPC, but it will + // set up the peers for the device.
For systems that only allow 8 + // peers per GPU at a time, this acts to remove devices from CanAccessPeer + for (int j = 0; j < shm->nprocesses; j++) { + checkCudaErrors(cudaSetDevice(i)); + checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0)); + checkCudaErrors(cudaSetDevice(shm->devices[j])); + checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0)); + } + shm->devices[shm->nprocesses++] = i; + if (shm->nprocesses >= MAX_DEVICES) break; + } else { + printf( + "Device %d is not peer capable with some other selected peers, " + "skipping\n", + i); + } + } + + if (shm->nprocesses == 0) { + printf("No CUDA devices support IPC\n"); + exit(EXIT_WAIVED); + } + + std::vector<ShareableHandle> shareableHandles(shm->nprocesses); + std::vector<cudaStream_t> streams(shm->nprocesses); + std::vector<cudaMemPool_t> pools(shm->nprocesses); + + // Now allocate memory for each process, fill the shared memory buffer with + // the export data, and collect the memPool shareable handles to communicate. + for (i = 0; i < shm->nprocesses; i++) { + void *ptr = NULL; + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors( + cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking)); + // Allocate an explicit pool with IPC capabilities + cudaMemPoolProps poolProps; + memset(&poolProps, 0, sizeof(cudaMemPoolProps)); + poolProps.allocType = cudaMemAllocationTypePinned; + poolProps.handleTypes = cudaMemHandleTypePosixFileDescriptor; + + poolProps.location.type = cudaMemLocationTypeDevice; + poolProps.location.id = shm->devices[i]; + + checkCudaErrors(cudaMemPoolCreate(&pools[i], &poolProps)); + + // Query the shareable handle for the pool + cudaMemAllocationHandleType handleType = + cudaMemHandleTypePosixFileDescriptor; + // Allocate memory in a stream from the pool just created + checkCudaErrors(cudaMallocAsync(&ptr, DATA_SIZE, pools[i], streams[i])); + + checkCudaErrors(cudaMemPoolExportToShareableHandle( + &shareableHandles[i], pools[i], handleType, 0)); + + // Get the opaque 'bag-of-bits' representing the allocation + memset((void *)&shm->exportPtrData[i], 0, sizeof(cudaMemPoolPtrExportData)); + checkCudaErrors(cudaMemPoolExportPointer( + (cudaMemPoolPtrExportData *)&shm->exportPtrData[i], ptr)); + ptrs.push_back(ptr); + } + + // Launch the child processes! + for (i = 0; i < shm->nprocesses; i++) { + char devIdx[10]; + char *const args[] = {app, devIdx, NULL}; + Process process; + + SPRINTF(devIdx, "%d", i); + + if (spawnProcess(&process, app, args)) { + printf("Failed to create process\n"); + exit(EXIT_FAILURE); + } + + processes.push_back(process); + } + + barrierWait(&shm->barrier, &shm->sense, (unsigned int)(shm->nprocesses + 1)); + + ipcHandle *ipcParentHandle = NULL; + checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes)); + checkIpcErrors( + ipcSendShareableHandles(ipcParentHandle, shareableHandles, processes)); + + // Close the shareable handles as they are not needed anymore. + for (int i = 0; i < shm->nprocesses; i++) { + checkIpcErrors(ipcCloseShareableHandle(shareableHandles[i])); + } + checkIpcErrors(ipcCloseSocket(ipcParentHandle)); + + // And wait for them to finish + for (i = 0; i < processes.size(); i++) { + if (waitProcess(&processes[i]) != EXIT_SUCCESS) { + printf("Process %d failed!\n", i); + exit(EXIT_FAILURE); + } + } + + // Clean up!
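+ // All children have exited at this point (waitProcess above), so the parent
+ // can free its own allocation from each pool and then destroy the pools.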
+ for (i = 0; i < shm->nprocesses; i++) { + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors(cudaFreeAsync(ptrs[i], streams[i])); + checkCudaErrors(cudaStreamSynchronize(streams[i])); + checkCudaErrors(cudaMemPoolDestroy(pools[i])); + } + + sharedMemoryClose(&info); +} + +// Host code +int main(int argc, char **argv) { +#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || \ + defined(_WIN32) || defined(WIN64) || defined(_WIN64) + printf("Not supported on ARM\n"); + return EXIT_WAIVED; +#else + if (argc == 1) { + parentProcess(argv[0]); + } else { + childProcess(atoi(argv[1])); + } + return EXIT_SUCCESS; +#endif +} diff --git a/Samples/streamOrderedAllocationP2P/README.md b/Samples/streamOrderedAllocationP2P/README.md index ce4c20a6..164284a9 100644 --- a/Samples/streamOrderedAllocationP2P/README.md +++ b/Samples/streamOrderedAllocationP2P/README.md @@ -27,7 +27,7 @@ cudaMallocAsync, cudaFreeAsync, cudaMemPoolSetAccess, cudaDeviceGetDefaultMemPoo ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu index e289de4e..3c6cffb5 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu @@ -122,24 +122,23 @@ std::pair getP2PCapableGpuPair() { int deviceId = itr->second; checkCudaErrors(cudaSetDevice(deviceId)); - std::for_each( - itr, bestFit.second, - [&deviceId, &bestFitDeviceIds](decltype(*itr) mapPair) { - if (deviceId != mapPair.second) { - int access = 0; - checkCudaErrors( - cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); - printf("Device=%d %s Access Peer Device=%d\n", deviceId, - access ? "CAN" : "CANNOT", mapPair.second); - if (access && bestFitDeviceIds.size() < kNumGpusRequired) { - bestFitDeviceIds.emplace(deviceId); - bestFitDeviceIds.emplace(mapPair.second); - } else { - printf("Ignoring device %i (max devices exceeded)\n", - mapPair.second); - } - } - }); + std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds, + &kNumGpusRequired]( + decltype(*itr) mapPair) { + if (deviceId != mapPair.second) { + int access = 0; + checkCudaErrors( + cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); + printf("Device=%d %s Access Peer Device=%d\n", deviceId, + access ? 
"CAN" : "CANNOT", mapPair.second); + if (access && bestFitDeviceIds.size() < kNumGpusRequired) { + bestFitDeviceIds.emplace(deviceId); + bestFitDeviceIds.emplace(mapPair.second); + } else { + printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); + } + } + }); if (bestFitDeviceIds.size() >= kNumGpusRequired) { printf("Selected p2p capable devices - "); diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj index 724e2228..116d3c9c 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj index abe713cb..50529ea0 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/systemWideAtomics/README.md b/Samples/systemWideAtomics/README.md index bfd9d101..530df9d3 100644 --- a/Samples/systemWideAtomics/README.md +++ b/Samples/systemWideAtomics/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/tf32TensorCoreGemm/README.md b/Samples/tf32TensorCoreGemm/README.md index 517eb9bf..c1513be1 100644 --- a/Samples/tf32TensorCoreGemm/README.md +++ b/Samples/tf32TensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEv ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj index 29078f91..4cd44a20 100644 --- a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj index 6d7501ec..5ed41711 100644 --- a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/vectorAddMMAP/README.md b/Samples/vectorAddMMAP/README.md index 3d8bfeeb..c385d627 100644 --- a/Samples/vectorAddMMAP/README.md +++ b/Samples/vectorAddMMAP/README.md @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj index 6a2619bd..39f9ba5b 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -113,6 +113,6 @@ - + diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj index 366a3747..ccc98fe0 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/README.md b/Samples/vectorAdd_nvrtc/README.md index 24c16b06..64132b80 100644 --- a/Samples/vectorAdd_nvrtc/README.md +++ b/Samples/vectorAdd_nvrtc/README.md @@ -30,7 +30,7 @@ cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj index bf1d6e6a..3fecdd25 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj index eafec39f..697e47aa 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/vulkanImageCUDA/README.md b/Samples/vulkanImageCUDA/README.md index b970b993..f37e25f3 100644 --- a/Samples/vulkanImageCUDA/README.md +++ b/Samples/vulkanImageCUDA/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray, cudaImportE ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj index 84fedd67..f14a0515 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj index 0d444252..42673869 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/README.md b/Samples/warpAggregatedAtomicsCG/README.md index fe0541f3..c4c351a7 100644 --- a/Samples/warpAggregatedAtomicsCG/README.md +++ b/Samples/warpAggregatedAtomicsCG/README.md @@ -22,7 +22,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj index c1087bb9..6a6c8655 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj index 141ebb15..6e83354a 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/watershedSegmentationNPP/Makefile b/Samples/watershedSegmentationNPP/Makefile index a65719dc..c03a879d 100644 --- a/Samples/watershedSegmentationNPP/Makefile +++ b/Samples/watershedSegmentationNPP/Makefile @@ -271,6 +271,12 @@ ifeq ($(TARGET_OS),darwin) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - watershedSegmentationNPP is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/watershedSegmentationNPP/README.md b/Samples/watershedSegmentationNPP/README.md index 496cb03b..0b320280 100644 --- a/Samples/watershedSegmentationNPP/README.md +++ b/Samples/watershedSegmentationNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj index 48a54fee..548b5361 100644 --- a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj +++ b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj index 8956ca3b..ee297d72 100644 --- a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj +++ b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - +