From e950012e722c4cb583f4ae7ce01d07a78d5e5e60 Mon Sep 17 00:00:00 2001 From: Rutwik Choughule Date: Wed, 30 Jun 2021 11:26:41 +0530 Subject: [PATCH] add and update samples with CUDA 11.4 support --- README.md | 66 +- Samples/EGLStream_CUDA_Interop/Makefile | 6 - Samples/EGLStream_CUDA_Interop/README.md | 2 +- Samples/MersenneTwisterGP11213/Makefile | 29 +- .../MersenneTwisterGP11213_vs2017.vcxproj | 4 +- .../MersenneTwisterGP11213_vs2019.vcxproj | 4 +- Samples/MersenneTwisterGP11213/README.md | 2 +- .../NV12toBGRandResize_vs2017.vcxproj | 4 +- .../NV12toBGRandResize_vs2019.vcxproj | 4 +- Samples/NV12toBGRandResize/README.md | 2 +- Samples/UnifiedMemoryPerf/README.md | 2 +- .../UnifiedMemoryPerf_vs2017.vcxproj | 4 +- .../UnifiedMemoryPerf_vs2019.vcxproj | 4 +- Samples/bandwidthTest/README.md | 2 +- .../bandwidthTest_vs2017.vcxproj | 4 +- .../bandwidthTest_vs2019.vcxproj | 4 +- .../Makefile | 6 - .../README.md | 2 +- ...rkersAndLabelCompressionNPP_vs2017.vcxproj | 4 +- ...rkersAndLabelCompressionNPP_vs2019.vcxproj | 4 +- Samples/bf16TensorCoreGemm/README.md | 2 +- .../bf16TensorCoreGemm_vs2017.vcxproj | 4 +- .../bf16TensorCoreGemm_vs2019.vcxproj | 4 +- Samples/binaryPartitionCG/README.md | 2 +- .../binaryPartitionCG_vs2017.vcxproj | 4 +- .../binaryPartitionCG_vs2019.vcxproj | 4 +- Samples/boxFilterNPP/README.md | 2 +- .../boxFilterNPP/boxFilterNPP_vs2017.vcxproj | 4 +- .../boxFilterNPP/boxFilterNPP_vs2019.vcxproj | 4 +- Samples/cannyEdgeDetectorNPP/README.md | 2 +- .../cannyEdgeDetectorNPP_vs2017.vcxproj | 4 +- .../cannyEdgeDetectorNPP_vs2019.vcxproj | 4 +- Samples/cdpQuadtree/Makefile | 370 +++++++++ Samples/cdpQuadtree/NsightEclipse.xml | 72 ++ Samples/cdpQuadtree/README.md | 71 ++ Samples/cdpQuadtree/cdpQuadtree.cu | 742 ++++++++++++++++++ Samples/cdpQuadtree/cdpQuadtree_vs2017.sln | 20 + .../cdpQuadtree/cdpQuadtree_vs2017.vcxproj | 114 +++ Samples/cdpQuadtree/cdpQuadtree_vs2019.sln | 20 + .../cdpQuadtree/cdpQuadtree_vs2019.vcxproj | 110 +++ 
Samples/concurrentKernels/README.md | 2 +- .../concurrentKernels_vs2017.vcxproj | 4 +- .../concurrentKernels_vs2019.vcxproj | 4 +- Samples/conjugateGradientCudaGraphs/Makefile | 6 - Samples/conjugateGradientCudaGraphs/README.md | 2 +- ...conjugateGradientCudaGraphs_vs2017.vcxproj | 4 +- ...conjugateGradientCudaGraphs_vs2019.vcxproj | 4 +- .../conjugateGradientMultiBlockCG/README.md | 2 +- ...njugateGradientMultiBlockCG_vs2017.vcxproj | 4 +- ...njugateGradientMultiBlockCG_vs2019.vcxproj | 4 +- .../conjugateGradientMultiDeviceCG/README.md | 2 +- ...jugateGradientMultiDeviceCG_vs2017.vcxproj | 4 +- ...jugateGradientMultiDeviceCG_vs2019.vcxproj | 4 +- Samples/cuSolverDn_LinearSolver/Makefile | 6 - Samples/cuSolverDn_LinearSolver/README.md | 2 +- .../cuSolverDn_LinearSolver_vs2017.vcxproj | 4 +- .../cuSolverDn_LinearSolver_vs2019.vcxproj | 4 +- Samples/cuSolverSp_LinearSolver/Makefile | 6 - Samples/cuSolverSp_LinearSolver/README.md | 2 +- .../cuSolverSp_LinearSolver_vs2017.vcxproj | 4 +- .../cuSolverSp_LinearSolver_vs2019.vcxproj | 4 +- Samples/cudaCompressibleMemory/README.md | 2 +- .../cudaCompressibleMemory_vs2017.vcxproj | 4 +- .../cudaCompressibleMemory_vs2019.vcxproj | 4 +- Samples/cudaNvSci/Makefile | 6 - Samples/cudaNvSci/README.md | 2 +- Samples/cudaNvSciNvMedia/README.md | 2 +- Samples/cudaOpenMP/README.md | 2 +- Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj | 4 +- Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj | 4 +- Samples/cudaTensorCoreGemm/README.md | 2 +- .../cudaTensorCoreGemm_vs2017.vcxproj | 4 +- .../cudaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/deviceQuery/README.md | 2 +- .../deviceQuery/deviceQuery_vs2017.vcxproj | 4 +- .../deviceQuery/deviceQuery_vs2019.vcxproj | 4 +- Samples/dmmaTensorCoreGemm/README.md | 2 +- .../dmmaTensorCoreGemm_vs2017.vcxproj | 4 +- .../dmmaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/globalToShmemAsyncCopy/README.md | 2 +- .../globalToShmemAsyncCopy_vs2017.vcxproj | 4 +- .../globalToShmemAsyncCopy_vs2019.vcxproj | 4 +- 
Samples/immaTensorCoreGemm/README.md | 2 +- .../immaTensorCoreGemm_vs2017.vcxproj | 4 +- .../immaTensorCoreGemm_vs2019.vcxproj | 4 +- Samples/jacobiCudaGraphs/README.md | 2 +- .../jacobiCudaGraphs_vs2017.vcxproj | 4 +- .../jacobiCudaGraphs_vs2019.vcxproj | 4 +- Samples/matrixMul/README.md | 2 +- Samples/matrixMul/matrixMul_vs2017.vcxproj | 4 +- Samples/matrixMul/matrixMul_vs2019.vcxproj | 4 +- Samples/matrixMulDrv/README.md | 2 +- .../matrixMulDrv/matrixMulDrv_vs2017.vcxproj | 4 +- .../matrixMulDrv/matrixMulDrv_vs2019.vcxproj | 4 +- Samples/memMapIPCDrv/README.md | 2 +- .../memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj | 4 +- .../memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj | 4 +- Samples/nvJPEG/Makefile | 6 - Samples/nvJPEG/README.md | 2 +- Samples/nvJPEG/nvJPEG_vs2017.vcxproj | 4 +- Samples/nvJPEG/nvJPEG_vs2019.vcxproj | 4 +- Samples/nvJPEG_encoder/Makefile | 6 - Samples/nvJPEG_encoder/README.md | 2 +- .../nvJPEG_encoder_vs2017.vcxproj | 4 +- .../nvJPEG_encoder_vs2019.vcxproj | 4 +- Samples/p2pBandwidthLatencyTest/README.md | 2 +- .../p2pBandwidthLatencyTest_vs2017.vcxproj | 4 +- .../p2pBandwidthLatencyTest_vs2019.vcxproj | 4 +- Samples/reduction/README.md | 2 +- Samples/reduction/reduction_vs2017.vcxproj | 4 +- Samples/reduction/reduction_vs2019.vcxproj | 4 +- Samples/shfl_scan/README.md | 2 +- Samples/shfl_scan/shfl_scan_vs2017.vcxproj | 4 +- Samples/shfl_scan/shfl_scan_vs2019.vcxproj | 4 +- Samples/simpleAWBarrier/README.md | 2 +- .../simpleAWBarrier_vs2017.vcxproj | 4 +- .../simpleAWBarrier_vs2019.vcxproj | 4 +- Samples/simpleAttributes/README.md | 2 +- .../simpleAttributes_vs2017.vcxproj | 4 +- .../simpleAttributes_vs2019.vcxproj | 4 +- Samples/simpleCUBLAS/Makefile | 29 +- Samples/simpleCUBLAS/README.md | 2 +- .../simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj | 4 +- .../simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj | 4 +- Samples/simpleCUBLASXT/Makefile | 6 - Samples/simpleCUBLASXT/README.md | 2 +- .../simpleCUBLASXT_vs2017.vcxproj | 4 +- .../simpleCUBLASXT_vs2019.vcxproj | 4 
+- Samples/simpleCUBLAS_LU/Makefile | 6 - Samples/simpleCUBLAS_LU/README.md | 2 +- .../simpleCUBLAS_LU_vs2017.vcxproj | 4 +- .../simpleCUBLAS_LU_vs2019.vcxproj | 4 +- Samples/simpleCUFFT/Makefile | 6 - Samples/simpleCUFFT/README.md | 2 +- .../simpleCUFFT/simpleCUFFT_vs2017.vcxproj | 4 +- .../simpleCUFFT/simpleCUFFT_vs2019.vcxproj | 4 +- Samples/simpleCudaGraphs/README.md | 2 +- .../simpleCudaGraphs_vs2017.vcxproj | 4 +- .../simpleCudaGraphs_vs2019.vcxproj | 4 +- Samples/simpleD3D11/README.md | 2 +- .../simpleD3D11/simpleD3D11_vs2017.vcxproj | 4 +- .../simpleD3D11/simpleD3D11_vs2019.vcxproj | 4 +- Samples/simpleD3D12/README.md | 2 +- .../simpleD3D12/simpleD3D12_vs2017.vcxproj | 4 +- .../simpleD3D12/simpleD3D12_vs2019.vcxproj | 4 +- Samples/simpleDrvRuntime/README.md | 2 +- .../simpleDrvRuntime_vs2017.vcxproj | 4 +- .../simpleDrvRuntime_vs2019.vcxproj | 4 +- Samples/simpleGL/README.md | 2 +- Samples/simpleGL/simpleGL_vs2017.vcxproj | 4 +- Samples/simpleGL/simpleGL_vs2019.vcxproj | 4 +- Samples/simpleIPC/README.md | 2 +- Samples/simpleIPC/simpleIPC_vs2017.vcxproj | 4 +- Samples/simpleIPC/simpleIPC_vs2019.vcxproj | 4 +- Samples/simpleVoteIntrinsics/README.md | 2 +- .../simpleVoteIntrinsics_vs2017.vcxproj | 4 +- .../simpleVoteIntrinsics_vs2019.vcxproj | 4 +- Samples/simpleVulkan/README.md | 2 +- .../simpleVulkan/simpleVulkan_vs2017.vcxproj | 4 +- .../simpleVulkan/simpleVulkan_vs2019.vcxproj | 4 +- Samples/simpleVulkanMMAP/README.md | 2 +- .../simpleVulkanMMAP_vs2017.vcxproj | 4 +- .../simpleVulkanMMAP_vs2019.vcxproj | 4 +- Samples/simpleZeroCopy/README.md | 2 +- .../simpleZeroCopy_vs2017.vcxproj | 4 +- .../simpleZeroCopy_vs2019.vcxproj | 4 +- Samples/streamOrderedAllocation/README.md | 2 +- .../streamOrderedAllocation_vs2017.vcxproj | 4 +- .../streamOrderedAllocation_vs2019.vcxproj | 4 +- Samples/streamOrderedAllocationIPC/README.md | 2 +- Samples/streamOrderedAllocationP2P/README.md | 2 +- .../streamOrderedAllocationP2P_vs2017.vcxproj | 4 +- 
.../streamOrderedAllocationP2P_vs2019.vcxproj | 4 +- Samples/systemWideAtomics/README.md | 2 +- Samples/tf32TensorCoreGemm/README.md | 2 +- .../tf32TensorCoreGemm_vs2017.vcxproj | 4 +- .../tf32TensorCoreGemm_vs2019.vcxproj | 4 +- Samples/vectorAddMMAP/README.md | 2 +- .../vectorAddMMAP_vs2017.vcxproj | 4 +- .../vectorAddMMAP_vs2019.vcxproj | 4 +- Samples/vectorAdd_nvrtc/README.md | 2 +- .../vectorAdd_nvrtc_vs2017.vcxproj | 4 +- .../vectorAdd_nvrtc_vs2019.vcxproj | 4 +- Samples/vulkanImageCUDA/README.md | 2 +- .../vulkanImageCUDA_vs2017.vcxproj | 4 +- .../vulkanImageCUDA_vs2019.vcxproj | 4 +- Samples/warpAggregatedAtomicsCG/README.md | 2 +- .../warpAggregatedAtomicsCG_vs2017.vcxproj | 4 +- .../warpAggregatedAtomicsCG_vs2019.vcxproj | 4 +- Samples/watershedSegmentationNPP/Makefile | 6 - Samples/watershedSegmentationNPP/README.md | 2 +- .../watershedSegmentationNPP_vs2017.vcxproj | 4 +- .../watershedSegmentationNPP_vs2019.vcxproj | 4 +- 193 files changed, 1844 insertions(+), 431 deletions(-) create mode 100644 Samples/cdpQuadtree/Makefile create mode 100644 Samples/cdpQuadtree/NsightEclipse.xml create mode 100644 Samples/cdpQuadtree/README.md create mode 100644 Samples/cdpQuadtree/cdpQuadtree.cu create mode 100644 Samples/cdpQuadtree/cdpQuadtree_vs2017.sln create mode 100644 Samples/cdpQuadtree/cdpQuadtree_vs2017.vcxproj create mode 100644 Samples/cdpQuadtree/cdpQuadtree_vs2019.sln create mode 100644 Samples/cdpQuadtree/cdpQuadtree_vs2019.vcxproj diff --git a/README.md b/README.md index a4fafa7f..5efaefc0 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,15 @@ # CUDA Samples -Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads). +Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads). 
## Release Notes This section describes the release notes for the CUDA Samples on GitHub only. +### CUDA 11.4 +* Added `cdpQuadtree`. Demonstrates Quad Trees implementation using CUDA Dynamic Parallelism. +* Updated `simpleVulkan`, `simpleVulkanMMAP` and `vulkanImageCUDA`. Demonstrates use of SPIR-V shaders. + ### CUDA 11.3 * Added `streamOrderedAllocationIPC`. Demonstrates Inter Process Communication using one process per GPU for computation. * Added `simpleCUBLAS_LU`. Demonstrates batched matrix LU decomposition using cuBLAS API `cublasgetrfBatched()` @@ -109,7 +113,7 @@ This is the first release of CUDA Samples on GitHub: ### Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). 
### Getting the CUDA Samples @@ -166,39 +170,39 @@ The samples makefiles can take advantage of certain options: ### Samples by OS #### Linux -**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | +**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[shfl_scan](./Samples/shfl_scan)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[concurrentKernels](./Samples/concurrentKernels)** | ---|---|---|---| -**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | -**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaNvSci](./Samples/cudaNvSci)** | -**[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | -**[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | -**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[matrixMul](./Samples/matrixMul)** | -**[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | 
**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | -**[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[reduction](./Samples/reduction)** | -**[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | -**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | -**[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | -**[simpleVulkan](./Samples/simpleVulkan)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | -**[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | -**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | +**[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | +**[nvJPEG](./Samples/nvJPEG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | 
**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | +**[matrixMul](./Samples/matrixMul)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaNvSci](./Samples/cudaNvSci)** | +**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | +**[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | +**[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | +**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleGL](./Samples/simpleGL)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | +**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[deviceQuery](./Samples/deviceQuery)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | +**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | +**[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | 
**[simpleIPC](./Samples/simpleIPC)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | +**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[cdpQuadtree](./Samples/cdpQuadtree)** | +**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[reduction](./Samples/reduction)** | +**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | #### Windows -**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | +**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[shfl_scan](./Samples/shfl_scan)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[concurrentKernels](./Samples/concurrentKernels)** | ---|---|---|---| -**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | -**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | -**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | 
-**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | -**[matrixMul](./Samples/matrixMul)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | -**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | -**[reduction](./Samples/reduction)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | -**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | -**[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | -**[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleVulkan](./Samples/simpleVulkan)** | -**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | -**[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | -**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | 
**[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | +**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[nvJPEG](./Samples/nvJPEG)** | +**[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[matrixMul](./Samples/matrixMul)** | +**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | +**[cudaOpenMP](./Samples/cudaOpenMP)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | +**[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | +**[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleGL](./Samples/simpleGL)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | +**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[deviceQuery](./Samples/deviceQuery)** | +**[matrixMulDrv](./Samples/matrixMulDrv)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | 
**[simpleAttributes](./Samples/simpleAttributes)** | +**[simpleD3D12](./Samples/simpleD3D12)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[simpleIPC](./Samples/simpleIPC)** | +**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[bandwidthTest](./Samples/bandwidthTest)** | +**[cdpQuadtree](./Samples/cdpQuadtree)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | +**[reduction](./Samples/reduction)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | ## Dependencies diff --git a/Samples/EGLStream_CUDA_Interop/Makefile b/Samples/EGLStream_CUDA_Interop/Makefile index 1e901d99..010ce65c 100644 --- a/Samples/EGLStream_CUDA_Interop/Makefile +++ b/Samples/EGLStream_CUDA_Interop/Makefile @@ -285,12 +285,6 @@ ifeq ($(TARGET_OS),android) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/EGLStream_CUDA_Interop/README.md b/Samples/EGLStream_CUDA_Interop/README.md index 2a0f654d..204c6e4f 100644 --- a/Samples/EGLStream_CUDA_Interop/README.md +++ b/Samples/EGLStream_CUDA_Interop/README.md @@ -30,7 +30,7 @@ cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount, ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding 
platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/MersenneTwisterGP11213/Makefile b/Samples/MersenneTwisterGP11213/Makefile index fb3aa590..e40b5b99 100644 --- a/Samples/MersenneTwisterGP11213/Makefile +++ b/Samples/MersenneTwisterGP11213/Makefile @@ -263,14 +263,6 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) -SAMPLE_ENABLED := 1 - -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - MersenneTwisterGP11213 is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -305,10 +297,6 @@ ALL_CCFLAGS += --threads 0 LIBRARIES += -lcurand_static -lculibos -ifeq ($(SAMPLE_ENABLED),0) -EXEC ?= @echo "[@]" -endif - ################################################################################ # Target rules @@ -316,23 +304,16 @@ all: build build: MersenneTwisterGP11213 -check.deps: -ifeq ($(SAMPLE_ENABLED),0) - @echo "Sample will be waived due to the above missing dependencies" -else - @echo "Sample is ready - all dependencies have been met" -endif - MersenneTwister.o:MersenneTwister.cpp - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< MersenneTwisterGP11213: MersenneTwister.o - $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + cp $@ 
../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) run: build - $(EXEC) ./MersenneTwisterGP11213 + ./MersenneTwisterGP11213 clean: rm -f MersenneTwisterGP11213 MersenneTwister.o diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj index e39c60a4..07a7c276 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj index 8648205f..ffe973b8 100644 --- a/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj +++ b/Samples/MersenneTwisterGP11213/MersenneTwisterGP11213_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/MersenneTwisterGP11213/README.md b/Samples/MersenneTwisterGP11213/README.md index eb8bd797..24a482d8 100644 --- a/Samples/MersenneTwisterGP11213/README.md +++ b/Samples/MersenneTwisterGP11213/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj index 3bbad98a..18faacc5 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -113,6 +113,6 @@ - + diff --git a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj index a5149390..78191a45 100644 --- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj +++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/NV12toBGRandResize/README.md b/Samples/NV12toBGRandResize/README.md index 8070aaf0..28d44a1b 100644 --- a/Samples/NV12toBGRandResize/README.md +++ b/Samples/NV12toBGRandResize/README.md @@ -27,7 +27,7 @@ cudaMemcpy2D, cudaMallocManaged ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/UnifiedMemoryPerf/README.md b/Samples/UnifiedMemoryPerf/README.md index 43cecf11..3a054229 100644 --- a/Samples/UnifiedMemoryPerf/README.md +++ b/Samples/UnifiedMemoryPerf/README.md @@ -28,7 +28,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj index b767c25f..a8e56e5f 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -111,6 +111,6 @@ - + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj index cfcb126c..1e740893 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -107,6 +107,6 @@ - + diff --git a/Samples/bandwidthTest/README.md b/Samples/bandwidthTest/README.md index 8f70b9c0..d5546001 100644 --- a/Samples/bandwidthTest/README.md +++ b/Samples/bandwidthTest/README.md @@ -27,7 +27,7 @@ cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj index c6979275..22944af3 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj index 40850f7e..786e0a3b 100644 --- a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj +++ b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile b/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile index fccab0a1..00ee41fa 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/Makefile @@ -271,12 +271,6 @@ ifeq ($(TARGET_OS),darwin) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md b/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md index 16270de7..240c8efe 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/README.md @@ -28,7 +28,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj index 0f34cd13..aa91fca2 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj index 22205f6b..74ec541d 100644 --- a/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj +++ b/Samples/batchedLabelMarkersAndLabelCompressionNPP/batchedLabelMarkersAndLabelCompressionNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/bf16TensorCoreGemm/README.md b/Samples/bf16TensorCoreGemm/README.md index 5a51bb4e..8910ac7f 100644 --- a/Samples/bf16TensorCoreGemm/README.md +++ b/Samples/bf16TensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj index 156376ad..e814745b 100644 --- a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj index 1146105a..fc0b68f7 100644 --- a/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj +++ b/Samples/bf16TensorCoreGemm/bf16TensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/binaryPartitionCG/README.md b/Samples/binaryPartitionCG/README.md index 98c3418d..f7d335fd 100644 --- a/Samples/binaryPartitionCG/README.md +++ b/Samples/binaryPartitionCG/README.md @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj index 2399c9ec..51aa64df 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj index fe7bb11f..bdee4499 100644 --- a/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj +++ b/Samples/binaryPartitionCG/binaryPartitionCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/boxFilterNPP/README.md b/Samples/boxFilterNPP/README.md index 54f26d6a..4824e0e4 100644 --- a/Samples/boxFilterNPP/README.md +++ b/Samples/boxFilterNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj index 580c3df5..8772ff10 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj index 91f4db2d..6499c47d 100644 --- a/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj +++ b/Samples/boxFilterNPP/boxFilterNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/cannyEdgeDetectorNPP/README.md b/Samples/cannyEdgeDetectorNPP/README.md index 0c969c8e..7e88a000 100644 --- a/Samples/cannyEdgeDetectorNPP/README.md +++ b/Samples/cannyEdgeDetectorNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj index f0140b6a..f4166ea6 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj index f919b081..6f6c4407 100644 --- a/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj +++ b/Samples/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/cdpQuadtree/Makefile b/Samples/cdpQuadtree/Makefile new file mode 100644 index 00000000..56cec011 --- /dev/null +++ b/Samples/cdpQuadtree/Makefile @@ -0,0 +1,370 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= 
$(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell 
expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -L/usr/lib/aarch64-qnx-gnu + CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" + ifdef TARGET_OVERRIDE + LDFLAGS += -lslog2 + endif + + ifneq ($(TARGET_FS),) + LDFLAGS += -L$(TARGET_FS)/usr/lib + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" + LDFLAGS += -L$(TARGET_FS)/usr/libnvidia + CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" + CCFLAGS += -I$(TARGET_FS)/../include + endif + endif +endif + +ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) +endif + +# Install directory of different arch 
+CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - cdpQuadtree is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +#Detect if installed version of GCC supports required C++14 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." 
+ GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) +# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 50000) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 5.0.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required is 5.0.0<<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 86 +else +SMS ?= 35 37 50 52 60 61 70 75 80 86 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += -dc --std=c++14 --threads 0 + +LIBRARIES += -lcudadevrt + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: cdpQuadtree + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +cdpQuadtree.o:cdpQuadtree.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + 
+cdpQuadtree: cdpQuadtree.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./cdpQuadtree + +clean: + rm -f cdpQuadtree cdpQuadtree.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cdpQuadtree + +clobber: clean diff --git a/Samples/cdpQuadtree/NsightEclipse.xml b/Samples/cdpQuadtree/NsightEclipse.xml new file mode 100644 index 00000000..0dd9a41c --- /dev/null +++ b/Samples/cdpQuadtree/NsightEclipse.xml @@ -0,0 +1,72 @@ + + + + cdpQuadtree + + -dc + --std=c++14 + + + + ./ + ../ + ../../common/inc + + + Cooperative Groups + CUDA Dynamic Parallelism + + + GPGPU + CPP14 + + + cudadevrt + + + + true + cdpQuadTree.cu + + CDP + + + 1:CUDA Advanced Topics + + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + sm80 + sm86 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + 3.5 + + Quad Tree (CUDA Dynamic Parallelism) + exe + diff --git a/Samples/cdpQuadtree/README.md b/Samples/cdpQuadtree/README.md new file mode 100644 index 00000000..80054911 --- /dev/null +++ b/Samples/cdpQuadtree/README.md @@ -0,0 +1,71 @@ +# cdpQuadtree - Quad Tree (CUDA Dynamic Parallelism) + +## Description + +This sample demonstrates Quad Trees implemented using CUDA Dynamic Parallelism. This sample requires devices with compute capability 3.5 or higher. 
+ +## Key Concepts + +Cooperative Groups, CUDA Dynamic Parallelism + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +## Dependencies needed to build/run +[CDP](../../README.md#cdp) + +## Prerequisites + +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/cdpQuadtree/cdpQuadtree.cu b/Samples/cdpQuadtree/cdpQuadtree.cu new file mode 100644 index 00000000..256b1b8f --- /dev/null +++ b/Samples/cdpQuadtree/cdpQuadtree.cu @@ -0,0 +1,742 @@ +/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +namespace cg = cooperative_groups; +#include + +//////////////////////////////////////////////////////////////////////////////// +// A structure of 2D points (structure of arrays). +//////////////////////////////////////////////////////////////////////////////// +class Points { + float *m_x; + float *m_y; + + public: + // Constructor. + __host__ __device__ Points() : m_x(NULL), m_y(NULL) {} + + // Constructor. + __host__ __device__ Points(float *x, float *y) : m_x(x), m_y(y) {} + + // Get a point. + __host__ __device__ __forceinline__ float2 get_point(int idx) const { + return make_float2(m_x[idx], m_y[idx]); + } + + // Set a point. + __host__ __device__ __forceinline__ void set_point(int idx, const float2 &p) { + m_x[idx] = p.x; + m_y[idx] = p.y; + } + + // Set the pointers. + __host__ __device__ __forceinline__ void set(float *x, float *y) { + m_x = x; + m_y = y; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// A 2D bounding box +//////////////////////////////////////////////////////////////////////////////// +class Bounding_box { + // Extreme points of the bounding box. + float2 m_p_min; + float2 m_p_max; + + public: + // Constructor. Create a unit box. + __host__ __device__ Bounding_box() { + m_p_min = make_float2(0.0f, 0.0f); + m_p_max = make_float2(1.0f, 1.0f); + } + + // Compute the center of the bounding-box. 
+ __host__ __device__ void compute_center(float2 &center) const { + center.x = 0.5f * (m_p_min.x + m_p_max.x); + center.y = 0.5f * (m_p_min.y + m_p_max.y); + } + + // The points of the box. + __host__ __device__ __forceinline__ const float2 &get_max() const { + return m_p_max; + } + + __host__ __device__ __forceinline__ const float2 &get_min() const { + return m_p_min; + } + + // Does a box contain a point. + __host__ __device__ bool contains(const float2 &p) const { + return p.x >= m_p_min.x && p.x < m_p_max.x && p.y >= m_p_min.y && + p.y < m_p_max.y; + } + + // Define the bounding box. + __host__ __device__ void set(float min_x, float min_y, float max_x, + float max_y) { + m_p_min.x = min_x; + m_p_min.y = min_y; + m_p_max.x = max_x; + m_p_max.y = max_y; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// A node of a quadtree. +//////////////////////////////////////////////////////////////////////////////// +class Quadtree_node { + // The identifier of the node. + int m_id; + // The bounding box of the tree. + Bounding_box m_bounding_box; + // The range of points. + int m_begin, m_end; + + public: + // Constructor. + __host__ __device__ Quadtree_node() : m_id(0), m_begin(0), m_end(0) {} + + // The ID of a node at its level. + __host__ __device__ int id() const { return m_id; } + + // Set the ID of a node at its level. + __host__ __device__ void set_id(int new_id) { m_id = new_id; } + + // The bounding box. + __host__ __device__ __forceinline__ const Bounding_box &bounding_box() const { + return m_bounding_box; + } + + // Set the bounding box. + __host__ __device__ __forceinline__ void set_bounding_box(float min_x, + float min_y, + float max_x, + float max_y) { + m_bounding_box.set(min_x, min_y, max_x, max_y); + } + + // The number of points in the tree. + __host__ __device__ __forceinline__ int num_points() const { + return m_end - m_begin; + } + + // The range of points in the tree. 
+ __host__ __device__ __forceinline__ int points_begin() const { + return m_begin; + } + + __host__ __device__ __forceinline__ int points_end() const { return m_end; } + + // Define the range for that node. + __host__ __device__ __forceinline__ void set_range(int begin, int end) { + m_begin = begin; + m_end = end; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Algorithm parameters. +//////////////////////////////////////////////////////////////////////////////// +struct Parameters { + // Choose the right set of points to use as in/out. + int point_selector; + // The number of nodes at a given level (4^k for level k). + int num_nodes_at_this_level; + // The recursion depth. + int depth; + // The max value for depth. + const int max_depth; + // The minimum number of points in a node to stop recursion. + const int min_points_per_node; + + // Constructor set to default values. + __host__ __device__ Parameters(int max_depth, int min_points_per_node) + : point_selector(0), + num_nodes_at_this_level(1), + depth(0), + max_depth(max_depth), + min_points_per_node(min_points_per_node) {} + + // Copy constructor. Changes the values for next iteration. + __host__ __device__ Parameters(const Parameters &params, bool) + : point_selector((params.point_selector + 1) % 2), + num_nodes_at_this_level(4 * params.num_nodes_at_this_level), + depth(params.depth + 1), + max_depth(params.max_depth), + min_points_per_node(params.min_points_per_node) {} +}; + +//////////////////////////////////////////////////////////////////////////////// +// Build a quadtree on the GPU. Use CUDA Dynamic Parallelism. +// +// The algorithm works as follows. The host (CPU) launches one block of +// NUM_THREADS_PER_BLOCK threads. That block will do the following steps: +// +// 1- Check the number of points and its depth. +// +// We impose a maximum depth to the tree and a minimum number of points per +// node. 
If the maximum depth is exceeded or the minimum number of points is +// reached, the threads in the block exit. +// +// Before exiting, they perform a buffer swap if it is needed. Indeed, the +// algorithm uses two buffers to permute the points and make sure they are +// properly distributed in the quadtree. By design we want all points to be +// in the first buffer of points at the end of the algorithm. It is the reason +// why we may have to swap the buffer before leaving (if the points are in the +// 2nd buffer). +// +// 2- Count the number of points in each child. +// +// If the depth is not too high and the number of points is sufficient, the +// block has to dispatch the points into four geometrical buckets: its +// children. For that purpose, we compute the center of the bounding box and +// count the number of points in each quadrant. +// +// The set of points is divided into sections. Each section is given to a +// warp of threads (32 threads). Warps use __ballot and __popc intrinsics +// to count the points. See the Programming Guide for more information about +// those functions. +// +// 3- Scan the warps' results to know the "global" numbers. +// +// Warps work independently from each other. At the end, each warp knows the +// number of points in its section. To know the numbers for the block, the +// block has to run a scan/reduce at the block level. It's a traditional +// approach. The implementation in this sample is not as optimized as what +// could be found in fast radix sorts, for example, but it relies on the same +// idea. +// +// 4- Move points. +// +// Now that the block knows how many points go in each of its 4 children, it +// remains to dispatch the points. It is straightforward. +// +// 5- Launch new blocks. +// +// The block launches four new blocks: one per child. Each of the four blocks +// will apply the same algorithm. 
+//////////////////////////////////////////////////////////////////////////////// +template <int NUM_THREADS_PER_BLOCK> +__global__ void build_quadtree_kernel(Quadtree_node *nodes, Points *points, + Parameters params) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // The number of warps in a block. + const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warpSize; + + // Shared memory to store the number of points. + extern __shared__ int smem[]; + + // s_num_pts[4][NUM_WARPS_PER_BLOCK]; + // Addresses of shared memory. + volatile int *s_num_pts[4]; + + for (int i = 0; i < 4; ++i) + s_num_pts[i] = (volatile int *)&smem[i * NUM_WARPS_PER_BLOCK]; + + // Compute the coordinates of the threads in the block. + const int warp_id = threadIdx.x / warpSize; + const int lane_id = threadIdx.x % warpSize; + + // Mask for compaction (unsigned shift: 1 << 31 would overflow a signed int). + // Same as: asm( "mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt) ); + int lane_mask_lt = (1u << lane_id) - 1; + + // The current node. + Quadtree_node &node = nodes[blockIdx.x]; + + // The number of points in the node. + int num_points = node.num_points(); + + float2 center; + int range_begin, range_end; + int warp_cnts[4] = {0, 0, 0, 0}; + // + // 1- Check the number of points and its depth. + // + + // Stop the recursion here. Make sure points[0] contains all the points. + if (params.depth >= params.max_depth || + num_points <= params.min_points_per_node) { + if (params.point_selector == 1) { + int it = node.points_begin(), end = node.points_end(); + + for (it += threadIdx.x; it < end; it += NUM_THREADS_PER_BLOCK) + if (it < end) points[0].set_point(it, points[1].get_point(it)); + } + + return; + } + + // Compute the center of the bounding box of the points. + const Bounding_box &bbox = node.bounding_box(); + + bbox.compute_center(center); + + // Find how many points to give to each warp.
+ int num_points_per_warp = max( + warpSize, (num_points + NUM_WARPS_PER_BLOCK - 1) / NUM_WARPS_PER_BLOCK); + + // Each warp of threads will compute the number of points to move to each + // quadrant. + range_begin = node.points_begin() + warp_id * num_points_per_warp; + range_end = min(range_begin + num_points_per_warp, node.points_end()); + + // + // 2- Count the number of points in each child. + // + + // Input points. + const Points &in_points = points[params.point_selector]; + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + // Compute the number of points. + for (int range_it = range_begin + tile32.thread_rank(); + tile32.any(range_it < range_end); range_it += warpSize) { + // Is it still an active thread? + bool is_active = range_it < range_end; + + // Load the coordinates of the point. + float2 p = + is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f); + + // Count top-left points. + int num_pts = + __popc(tile32.ballot(is_active && p.x < center.x && p.y >= center.y)); + warp_cnts[0] += tile32.shfl(num_pts, 0); + + // Count top-right points. + num_pts = + __popc(tile32.ballot(is_active && p.x >= center.x && p.y >= center.y)); + warp_cnts[1] += tile32.shfl(num_pts, 0); + + // Count bottom-left points. + num_pts = + __popc(tile32.ballot(is_active && p.x < center.x && p.y < center.y)); + warp_cnts[2] += tile32.shfl(num_pts, 0); + + // Count bottom-right points. + num_pts = + __popc(tile32.ballot(is_active && p.x >= center.x && p.y < center.y)); + warp_cnts[3] += tile32.shfl(num_pts, 0); + } + + if (tile32.thread_rank() == 0) { + s_num_pts[0][warp_id] = warp_cnts[0]; + s_num_pts[1][warp_id] = warp_cnts[1]; + s_num_pts[2][warp_id] = warp_cnts[2]; + s_num_pts[3][warp_id] = warp_cnts[3]; + } + + // Make sure warps have finished counting. + cg::sync(cta); + + // + // 3- Scan the warps' results to know the "global" numbers. + // + + // First 4 warps scan the numbers of points per child (inclusive scan).
+ if (warp_id < 4) { + int num_pts = tile32.thread_rank() < NUM_WARPS_PER_BLOCK + ? s_num_pts[warp_id][tile32.thread_rank()] + : 0; +#pragma unroll + + for (int offset = 1; offset < NUM_WARPS_PER_BLOCK; offset *= 2) { + int n = tile32.shfl_up(num_pts, offset); + + if (tile32.thread_rank() >= offset) num_pts += n; + } + + if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK) + s_num_pts[warp_id][tile32.thread_rank()] = num_pts; + } + + cg::sync(cta); + + // Compute global offsets. + if (warp_id == 0) { + int sum = s_num_pts[0][NUM_WARPS_PER_BLOCK - 1]; + + for (int row = 1; row < 4; ++row) { + int tmp = s_num_pts[row][NUM_WARPS_PER_BLOCK - 1]; + cg::sync(tile32); + + if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK) + s_num_pts[row][tile32.thread_rank()] += sum; + + cg::sync(tile32); + sum += tmp; + } + } + + cg::sync(cta); + + // Make the scan exclusive. + int val = 0; + if (threadIdx.x < 4 * NUM_WARPS_PER_BLOCK) { + val = threadIdx.x == 0 ? 0 : smem[threadIdx.x - 1]; + val += node.points_begin(); + } + + cg::sync(cta); + + if (threadIdx.x < 4 * NUM_WARPS_PER_BLOCK) { + smem[threadIdx.x] = val; + } + + cg::sync(cta); + + // + // 4- Move points. + // + if (!(params.depth >= params.max_depth || + num_points <= params.min_points_per_node)) { + // Output points. + Points &out_points = points[(params.point_selector + 1) % 2]; + + warp_cnts[0] = s_num_pts[0][warp_id]; + warp_cnts[1] = s_num_pts[1][warp_id]; + warp_cnts[2] = s_num_pts[2][warp_id]; + warp_cnts[3] = s_num_pts[3][warp_id]; + + const Points &in_points = points[params.point_selector]; + // Reorder points. + for (int range_it = range_begin + tile32.thread_rank(); + tile32.any(range_it < range_end); range_it += warpSize) { + // Is it still an active thread? + bool is_active = range_it < range_end; + + // Load the coordinates of the point. + float2 p = + is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f); + + // Move top-left points.
+ bool pred = is_active && p.x < center.x && p.y >= center.y; + int vote = tile32.ballot(pred); + int dest = warp_cnts[0] + __popc(vote & lane_mask_lt); + + if (pred) out_points.set_point(dest, p); + + warp_cnts[0] += tile32.shfl(__popc(vote), 0); + + // Move top-right points. + pred = is_active && p.x >= center.x && p.y >= center.y; + vote = tile32.ballot(pred); + dest = warp_cnts[1] + __popc(vote & lane_mask_lt); + + if (pred) out_points.set_point(dest, p); + + warp_cnts[1] += tile32.shfl(__popc(vote), 0); + + // Move bottom-left points. + pred = is_active && p.x < center.x && p.y < center.y; + vote = tile32.ballot(pred); + dest = warp_cnts[2] + __popc(vote & lane_mask_lt); + + if (pred) out_points.set_point(dest, p); + + warp_cnts[2] += tile32.shfl(__popc(vote), 0); + + // Move bottom-right points. + pred = is_active && p.x >= center.x && p.y < center.y; + vote = tile32.ballot(pred); + dest = warp_cnts[3] + __popc(vote & lane_mask_lt); + + if (pred) out_points.set_point(dest, p); + + warp_cnts[3] += tile32.shfl(__popc(vote), 0); + } + } + + cg::sync(cta); + + if (tile32.thread_rank() == 0) { + s_num_pts[0][warp_id] = warp_cnts[0]; + s_num_pts[1][warp_id] = warp_cnts[1]; + s_num_pts[2][warp_id] = warp_cnts[2]; + s_num_pts[3][warp_id] = warp_cnts[3]; + } + + cg::sync(cta); + + // + // 5- Launch new blocks. + // + if (!(params.depth >= params.max_depth || + num_points <= params.min_points_per_node)) { + // The last thread launches new blocks. + if (threadIdx.x == NUM_THREADS_PER_BLOCK - 1) { + // The children. + Quadtree_node *children = + &nodes[params.num_nodes_at_this_level - (node.id() & ~3)]; + + // The offsets of the children at their level. + int child_offset = 4 * node.id(); + + // Set IDs.
+ children[child_offset + 0].set_id(4 * node.id() + 0); + children[child_offset + 1].set_id(4 * node.id() + 1); + children[child_offset + 2].set_id(4 * node.id() + 2); + children[child_offset + 3].set_id(4 * node.id() + 3); + + const Bounding_box &bbox = node.bounding_box(); + // Points of the bounding-box. + const float2 &p_min = bbox.get_min(); + const float2 &p_max = bbox.get_max(); + + // Set the bounding boxes of the children. + children[child_offset + 0].set_bounding_box(p_min.x, center.y, center.x, + p_max.y); // Top-left. + children[child_offset + 1].set_bounding_box(center.x, center.y, p_max.x, + p_max.y); // Top-right. + children[child_offset + 2].set_bounding_box(p_min.x, p_min.y, center.x, + center.y); // Bottom-left. + children[child_offset + 3].set_bounding_box(center.x, p_min.y, p_max.x, + center.y); // Bottom-right. + + // Set the ranges of the children. + + children[child_offset + 0].set_range(node.points_begin(), + s_num_pts[0][warp_id]); + children[child_offset + 1].set_range(s_num_pts[0][warp_id], + s_num_pts[1][warp_id]); + children[child_offset + 2].set_range(s_num_pts[1][warp_id], + s_num_pts[2][warp_id]); + children[child_offset + 3].set_range(s_num_pts[2][warp_id], + s_num_pts[3][warp_id]); + + // Launch 4 children. + build_quadtree_kernel<NUM_THREADS_PER_BLOCK><<< + 4, NUM_THREADS_PER_BLOCK, 4 * NUM_WARPS_PER_BLOCK * sizeof(int)>>>( + &children[child_offset], points, Parameters(params, true)); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Make sure a Quadtree is properly defined.
+//////////////////////////////////////////////////////////////////////////////// +bool check_quadtree(const Quadtree_node *nodes, int idx, int num_pts, + Points *pts, Parameters params) { + const Quadtree_node &node = nodes[idx]; + int num_points = node.num_points(); + + if (!(params.depth >= params.max_depth ||  // Same stop test as the kernel. + num_points <= params.min_points_per_node)) { + int num_points_in_children = 0; + + num_points_in_children += + nodes[params.num_nodes_at_this_level + 4 * idx + 0].num_points(); + num_points_in_children += + nodes[params.num_nodes_at_this_level + 4 * idx + 1].num_points(); + num_points_in_children += + nodes[params.num_nodes_at_this_level + 4 * idx + 2].num_points(); + num_points_in_children += + nodes[params.num_nodes_at_this_level + 4 * idx + 3].num_points(); + + if (num_points_in_children != node.num_points()) return false; + + return check_quadtree(&nodes[params.num_nodes_at_this_level], 4 * idx + 0, + num_pts, pts, Parameters(params, true)) && + check_quadtree(&nodes[params.num_nodes_at_this_level], 4 * idx + 1, + num_pts, pts, Parameters(params, true)) && + check_quadtree(&nodes[params.num_nodes_at_this_level], 4 * idx + 2, + num_pts, pts, Parameters(params, true)) && + check_quadtree(&nodes[params.num_nodes_at_this_level], 4 * idx + 3, + num_pts, pts, Parameters(params, true)); + } + + const Bounding_box &bbox = node.bounding_box();  // Leaf: check each point. + + for (int it = node.points_begin(); it < node.points_end(); ++it) { + if (it >= num_pts) return false; + + float2 p = pts->get_point(it); + + if (!bbox.contains(p)) return false; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +// Parallel random number generator.
+//////////////////////////////////////////////////////////////////////////////// +struct Random_generator { + int count; + + __host__ __device__ Random_generator() : count(0) {} + __host__ __device__ unsigned int hash(unsigned int a) { + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; + } + + __host__ __device__ __forceinline__ thrust::tuple<float, float> operator()() { +#ifdef __CUDA_ARCH__ + unsigned seed = hash(blockIdx.x * blockDim.x + threadIdx.x + count); + // thrust::generate may call operator() more than once per thread. + // Hence, increment count by grid size to ensure uniqueness of seed + count += blockDim.x * gridDim.x; +#else + unsigned seed = hash(0); +#endif + thrust::default_random_engine rng(seed); + thrust::random::uniform_real_distribution<float> distrib; + return thrust::make_tuple(distrib(rng), distrib(rng)); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Allocate GPU structs, launch kernel and clean up +//////////////////////////////////////////////////////////////////////////////// +bool cdpQuadtree(int warp_size) { + // Constants to control the algorithm. + const int num_points = 1024; + const int max_depth = 8; + const int min_points_per_node = 16; + + // Allocate memory for points. + thrust::device_vector<float> x_d0(num_points); + thrust::device_vector<float> x_d1(num_points); + thrust::device_vector<float> y_d0(num_points); + thrust::device_vector<float> y_d1(num_points); + + // Generate random points. + Random_generator rnd; + thrust::generate( + thrust::make_zip_iterator(thrust::make_tuple(x_d0.begin(), y_d0.begin())), + thrust::make_zip_iterator(thrust::make_tuple(x_d0.end(), y_d0.end())), + rnd); + + // Host structures to analyze the device ones.
+ Points points_init[2]; + points_init[0].set(thrust::raw_pointer_cast(&x_d0[0]), + thrust::raw_pointer_cast(&y_d0[0])); + points_init[1].set(thrust::raw_pointer_cast(&x_d1[0]), + thrust::raw_pointer_cast(&y_d1[0])); + + // Allocate memory to store points. + Points *points; + checkCudaErrors(cudaMalloc((void **)&points, 2 * sizeof(Points))); + checkCudaErrors(cudaMemcpy(points, points_init, 2 * sizeof(Points), + cudaMemcpyHostToDevice)); + + // Levels 0..max_depth inclusive (depth max_depth - 1 can still split). We could use a closed form... + int max_nodes = 0; + + for (int i = 0, num_nodes_at_level = 1; i <= max_depth; + ++i, num_nodes_at_level *= 4) + max_nodes += num_nodes_at_level; + + // Allocate memory to store the tree. + Quadtree_node root; + root.set_range(0, num_points); + Quadtree_node *nodes; + checkCudaErrors( + cudaMalloc((void **)&nodes, max_nodes * sizeof(Quadtree_node))); + checkCudaErrors( + cudaMemcpy(nodes, &root, sizeof(Quadtree_node), cudaMemcpyHostToDevice)); + + // We set the recursion limit for CDP to max_depth. + cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth); + + // Build the quadtree. + Parameters params(max_depth, min_points_per_node); + std::cout << "Launching CDP kernel to build the quadtree" << std::endl; + const int NUM_THREADS_PER_BLOCK = 128; // Do not use less than 128 threads. + const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warp_size; + const size_t smem_size = 4 * NUM_WARPS_PER_BLOCK * sizeof(int); + build_quadtree_kernel< + NUM_THREADS_PER_BLOCK><<<1, NUM_THREADS_PER_BLOCK, smem_size>>>( + nodes, points, params); + checkCudaErrors(cudaGetLastError()); + + // Copy points to CPU. + thrust::host_vector<float> x_h(x_d0); + thrust::host_vector<float> y_h(y_d0); + Points host_points; + host_points.set(thrust::raw_pointer_cast(&x_h[0]), + thrust::raw_pointer_cast(&y_h[0])); + + // Copy nodes to CPU.
+ Quadtree_node *host_nodes = new Quadtree_node[max_nodes]; + checkCudaErrors(cudaMemcpy(host_nodes, nodes, + max_nodes * sizeof(Quadtree_node), + cudaMemcpyDeviceToHost)); + + // Validate the results. + bool ok = check_quadtree(host_nodes, 0, num_points, &host_points, params); + std::cout << "Results: " << (ok ? "OK" : "FAILED") << std::endl; + + // Free CPU memory. + delete[] host_nodes; + + // Free GPU memory. + checkCudaErrors(cudaFree(nodes)); + checkCudaErrors(cudaFree(points)); + + return ok; +} + +//////////////////////////////////////////////////////////////////////////////// +// Main entry point. +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + // Find/set the device. + // The test requires an architecture SM35 or greater (CDP capable). + int cuda_device = findCudaDevice(argc, (const char **)argv); + cudaDeviceProp deviceProps; + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, cuda_device)); + int cdpCapable = (deviceProps.major == 3 && deviceProps.minor >= 5) || + deviceProps.major >= 4; + + printf("GPU device %s has compute capabilities (SM %d.%d)\n", + deviceProps.name, deviceProps.major, deviceProps.minor); + + if (!cdpCapable) { + std::cerr << "cdpQuadTree requires SM 3.5 or higher to use CUDA Dynamic " + "Parallelism. Exiting...\n" + << std::endl; + exit(EXIT_WAIVED); + } + + bool ok = cdpQuadtree(deviceProps.warpSize); + + return (ok ?
EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/Samples/cdpQuadtree/cdpQuadtree_vs2017.sln b/Samples/cdpQuadtree/cdpQuadtree_vs2017.sln new file mode 100644 index 00000000..e4f10db0 --- /dev/null +++ b/Samples/cdpQuadtree/cdpQuadtree_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cdpQuadtree", "cdpQuadtree_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/cdpQuadtree/cdpQuadtree_vs2017.vcxproj b/Samples/cdpQuadtree/cdpQuadtree_vs2017.vcxproj new file mode 100644 index 00000000..162371e0 --- /dev/null +++ b/Samples/cdpQuadtree/cdpQuadtree_vs2017.vcxproj @@ -0,0 +1,114 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + cdpQuadtree_vs2017 + cdpQuadtree + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + 
./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/cdpQuadtree.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + true + --threads 0 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/cdpQuadtree/cdpQuadtree_vs2019.sln b/Samples/cdpQuadtree/cdpQuadtree_vs2019.sln new file mode 100644 index 00000000..beafe616 --- /dev/null +++ b/Samples/cdpQuadtree/cdpQuadtree_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cdpQuadtree", "cdpQuadtree_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/cdpQuadtree/cdpQuadtree_vs2019.vcxproj b/Samples/cdpQuadtree/cdpQuadtree_vs2019.vcxproj new file 
mode 100644 index 00000000..bb56716b --- /dev/null +++ b/Samples/cdpQuadtree/cdpQuadtree_vs2019.vcxproj @@ -0,0 +1,110 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + cdpQuadtree_vs2019 + cdpQuadtree + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/cdpQuadtree.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + true + --threads 0 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/concurrentKernels/README.md b/Samples/concurrentKernels/README.md index b3a52d91..0bd5287f 100644 --- a/Samples/concurrentKernels/README.md +++ b/Samples/concurrentKernels/README.md @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj index f8036198..9c014c79 100644 --- a/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj +++ b/Samples/concurrentKernels/concurrentKernels_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj index f6224739..63ace459 100644 --- a/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj +++ b/Samples/concurrentKernels/concurrentKernels_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/Makefile b/Samples/conjugateGradientCudaGraphs/Makefile index 6609440a..13d1e4ee 100644 --- a/Samples/conjugateGradientCudaGraphs/Makefile +++ b/Samples/conjugateGradientCudaGraphs/Makefile @@ -265,12 +265,6 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - conjugateGradientCudaGraphs is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/conjugateGradientCudaGraphs/README.md b/Samples/conjugateGradientCudaGraphs/README.md index 1e723476..beebe833 100644 --- a/Samples/conjugateGradientCudaGraphs/README.md +++ b/Samples/conjugateGradientCudaGraphs/README.md @@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj index a662b455..9b9eebd9 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj index 5fe964d8..20595779 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/README.md b/Samples/conjugateGradientMultiBlockCG/README.md index 217fabf3..d2b76ee8 100644 --- a/Samples/conjugateGradientMultiBlockCG/README.md +++ b/Samples/conjugateGradientMultiBlockCG/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj index 9692e5fe..137acb27 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj index 9952e93f..0f7f669f 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/README.md b/Samples/conjugateGradientMultiDeviceCG/README.md index 099a61bd..fbf8b563 100644 --- a/Samples/conjugateGradientMultiDeviceCG/README.md +++ b/Samples/conjugateGradientMultiDeviceCG/README.md @@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj index 281b9f54..d5f0924e 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj index da03363e..68964bc3 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cuSolverDn_LinearSolver/Makefile b/Samples/cuSolverDn_LinearSolver/Makefile index 61e55f47..f8b34a31 100644 --- a/Samples/cuSolverDn_LinearSolver/Makefile +++ b/Samples/cuSolverDn_LinearSolver/Makefile @@ -271,12 +271,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - cuSolverDn_LinearSolver is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ifeq ($(TARGET_OS),linux) ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" endif diff --git a/Samples/cuSolverDn_LinearSolver/README.md b/Samples/cuSolverDn_LinearSolver/README.md index 185f577b..84190e53 100644 --- a/Samples/cuSolverDn_LinearSolver/README.md +++ b/Samples/cuSolverDn_LinearSolver/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj index 8d77015c..b725348d 100644 --- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj +++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -110,6 +110,6 @@ - + diff --git a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj index d3f1e05e..ffdac8c8 100644 --- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj +++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/cuSolverSp_LinearSolver/Makefile b/Samples/cuSolverSp_LinearSolver/Makefile index cc002581..59a043d9 100644 --- a/Samples/cuSolverSp_LinearSolver/Makefile +++ b/Samples/cuSolverSp_LinearSolver/Makefile @@ -265,12 +265,6 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - cuSolverSp_LinearSolver is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ifeq ($(TARGET_OS),linux) ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" endif diff --git a/Samples/cuSolverSp_LinearSolver/README.md b/Samples/cuSolverSp_LinearSolver/README.md index 35c105ee..fd506b35 100644 --- a/Samples/cuSolverSp_LinearSolver/README.md +++ b/Samples/cuSolverSp_LinearSolver/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding 
platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj index 1bdf5779..25e29047 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -110,6 +110,6 @@ - + diff --git a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj index 665f795e..05e8ddf0 100644 --- a/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj +++ b/Samples/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/cudaCompressibleMemory/README.md b/Samples/cudaCompressibleMemory/README.md index 6492f737..82a77278 100644 --- a/Samples/cudaCompressibleMemory/README.md +++ b/Samples/cudaCompressibleMemory/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj index 4f450a2c..a420c541 100644 --- a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj index a932dd47..1be97e5e 100644 --- a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cudaNvSci/Makefile b/Samples/cudaNvSci/Makefile index d7db232f..1ef041a8 100644 --- a/Samples/cudaNvSci/Makefile +++ b/Samples/cudaNvSci/Makefile @@ -279,12 +279,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - cudaNvSci is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/cudaNvSci/README.md b/Samples/cudaNvSci/README.md index 58d95d19..030c8ad8 100644 --- a/Samples/cudaNvSci/README.md +++ b/Samples/cudaNvSci/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/cudaNvSciNvMedia/README.md b/Samples/cudaNvSciNvMedia/README.md index a8e1a41c..1c009532 100644 --- a/Samples/cudaNvSciNvMedia/README.md +++ b/Samples/cudaNvSciNvMedia/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/cudaOpenMP/README.md b/Samples/cudaOpenMP/README.md index c2f88493..3d6fe64f 100644 --- a/Samples/cudaOpenMP/README.md +++ b/Samples/cudaOpenMP/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj b/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj index d3a04a75..42226124 100644 --- a/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj +++ b/Samples/cudaOpenMP/cudaOpenMP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj b/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj index 59018165..a12a7d9e 100644 --- a/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj +++ b/Samples/cudaOpenMP/cudaOpenMP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/README.md b/Samples/cudaTensorCoreGemm/README.md index 502941ae..3f88fe67 100644 --- a/Samples/cudaTensorCoreGemm/README.md +++ b/Samples/cudaTensorCoreGemm/README.md @@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj index 622ffe8f..7f7f17ce 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj index fb649d4c..42487257 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/deviceQuery/README.md b/Samples/deviceQuery/README.md index 76fb08d5..dbd8e3da 100644 --- a/Samples/deviceQuery/README.md +++ b/Samples/deviceQuery/README.md @@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj index 5bd56297..0013948e 100644 --- a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj index f8532544..8a797564 100644 --- a/Samples/deviceQuery/deviceQuery_vs2019.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/dmmaTensorCoreGemm/README.md b/Samples/dmmaTensorCoreGemm/README.md index aa6e6f16..f157b6f1 100644 --- a/Samples/dmmaTensorCoreGemm/README.md +++ b/Samples/dmmaTensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj index 5ea929c1..19d1e7ab 100644 --- a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj index b415db92..7423334b 100644 --- a/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/dmmaTensorCoreGemm/dmmaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/globalToShmemAsyncCopy/README.md b/Samples/globalToShmemAsyncCopy/README.md index 233d5b50..1646dc0d 100644 --- a/Samples/globalToShmemAsyncCopy/README.md +++ b/Samples/globalToShmemAsyncCopy/README.md @@ -30,7 +30,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj index added1d2..ef431b2f 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj index bf65f63a..b506e825 100644 --- a/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj +++ b/Samples/globalToShmemAsyncCopy/globalToShmemAsyncCopy_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/immaTensorCoreGemm/README.md b/Samples/immaTensorCoreGemm/README.md index 3c07bb95..69655ee6 100644 --- a/Samples/immaTensorCoreGemm/README.md +++ b/Samples/immaTensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj index d6942bc2..b6b86157 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj index 6ecb5d5f..98e76bba 100644 --- a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/README.md b/Samples/jacobiCudaGraphs/README.md index c6223ff2..591325fc 100644 --- a/Samples/jacobiCudaGraphs/README.md +++ b/Samples/jacobiCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj index c899fc38..f077e41b 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj index c6158ebd..438f463e 100644 --- a/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj +++ b/Samples/jacobiCudaGraphs/jacobiCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/matrixMul/README.md b/Samples/matrixMul/README.md index 5d9dba69..bb0e280e 100644 --- a/Samples/matrixMul/README.md +++ b/Samples/matrixMul/README.md @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/matrixMul/matrixMul_vs2017.vcxproj b/Samples/matrixMul/matrixMul_vs2017.vcxproj index c362684f..3e4e5864 100644 --- a/Samples/matrixMul/matrixMul_vs2017.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2019.vcxproj b/Samples/matrixMul/matrixMul_vs2019.vcxproj index 084d32b0..dc1a1886 100644 --- a/Samples/matrixMul/matrixMul_vs2019.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/matrixMulDrv/README.md b/Samples/matrixMulDrv/README.md index 248b61c5..a604f373 100644 --- a/Samples/matrixMulDrv/README.md +++ b/Samples/matrixMulDrv/README.md @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj index 6360c07c..8d95104b 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj index 69a91d3c..4e4254ce 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/memMapIPCDrv/README.md b/Samples/memMapIPCDrv/README.md index 1e343fd1..64347638 100644 --- a/Samples/memMapIPCDrv/README.md +++ b/Samples/memMapIPCDrv/README.md @@ -30,7 +30,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuLaunchKernel, cuMemcpyD ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj index 86d80be6..48555b9a 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -113,6 +113,6 @@ - + diff --git a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj index 3c928e83..4873be62 100644 --- a/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj +++ b/Samples/memMapIPCDrv/memMapIPCDrv_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/nvJPEG/Makefile b/Samples/nvJPEG/Makefile index f3515c78..d8c228df 100644 --- a/Samples/nvJPEG/Makefile +++ b/Samples/nvJPEG/Makefile @@ -277,12 +277,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - nvJPEG is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG/README.md b/Samples/nvJPEG/README.md index 53c1b60d..4b67d66c 100644 --- a/Samples/nvJPEG/README.md +++ b/Samples/nvJPEG/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/nvJPEG/nvJPEG_vs2017.vcxproj b/Samples/nvJPEG/nvJPEG_vs2017.vcxproj index 7d16e568..cb425c51 100644 --- a/Samples/nvJPEG/nvJPEG_vs2017.vcxproj +++ b/Samples/nvJPEG/nvJPEG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/nvJPEG/nvJPEG_vs2019.vcxproj b/Samples/nvJPEG/nvJPEG_vs2019.vcxproj index 378b9198..f3ef157d 100644 --- a/Samples/nvJPEG/nvJPEG_vs2019.vcxproj +++ b/Samples/nvJPEG/nvJPEG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/nvJPEG_encoder/Makefile b/Samples/nvJPEG_encoder/Makefile index da0b82b7..05228d1d 100644 --- a/Samples/nvJPEG_encoder/Makefile +++ b/Samples/nvJPEG_encoder/Makefile @@ -277,12 +277,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - nvJPEG_encoder is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/nvJPEG_encoder/README.md b/Samples/nvJPEG_encoder/README.md index 40f092b3..9cf7266a 100644 --- a/Samples/nvJPEG_encoder/README.md +++ b/Samples/nvJPEG_encoder/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj index 765f1f35..f81ee359 100644 --- a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj +++ b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj index 76fcec11..b4801b00 100644 --- a/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj +++ b/Samples/nvJPEG_encoder/nvJPEG_encoder_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/README.md b/Samples/p2pBandwidthLatencyTest/README.md index ab1f4685..0399afcf 100644 --- a/Samples/p2pBandwidthLatencyTest/README.md +++ b/Samples/p2pBandwidthLatencyTest/README.md @@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj index 28d5f5cd..036c966e 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj index 8a2d5450..00a6c14e 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/reduction/README.md b/Samples/reduction/README.md index 172d748a..d7be1b62 100644 --- a/Samples/reduction/README.md +++ b/Samples/reduction/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/reduction/reduction_vs2017.vcxproj b/Samples/reduction/reduction_vs2017.vcxproj index 7e14bc82..9c658bc1 100644 --- a/Samples/reduction/reduction_vs2017.vcxproj +++ b/Samples/reduction/reduction_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/reduction/reduction_vs2019.vcxproj b/Samples/reduction/reduction_vs2019.vcxproj index 74fb1d6a..08b7216f 100644 --- a/Samples/reduction/reduction_vs2019.vcxproj +++ b/Samples/reduction/reduction_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/shfl_scan/README.md b/Samples/shfl_scan/README.md index 87b6872b..a9ce5ccb 100644 --- a/Samples/shfl_scan/README.md +++ b/Samples/shfl_scan/README.md @@ -25,7 +25,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj index beaad3f8..d07230e5 100644 --- a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj index 8757714e..5a4d4dda 100644 --- a/Samples/shfl_scan/shfl_scan_vs2019.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/simpleAWBarrier/README.md b/Samples/simpleAWBarrier/README.md index c4003183..4f13fe3d 100644 --- a/Samples/simpleAWBarrier/README.md +++ b/Samples/simpleAWBarrier/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpyAsync ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj index e03ef6e2..3d2e214a 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj index b4be9610..d0c785cd 100644 --- a/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj +++ b/Samples/simpleAWBarrier/simpleAWBarrier_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleAttributes/README.md b/Samples/simpleAttributes/README.md index 5d643c86..3d829879 100644 --- a/Samples/simpleAttributes/README.md +++ b/Samples/simpleAttributes/README.md @@ -27,7 +27,7 @@ cudaCtxResetPersistingL2Cache, cudaDeviceSetLimit, cudaFree, cudaGetDeviceProper ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj index a31cf815..823668ed 100644 --- a/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj +++ b/Samples/simpleAttributes/simpleAttributes_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj index b74d221e..e66d59e7 100644 --- a/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj +++ b/Samples/simpleAttributes/simpleAttributes_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS/Makefile b/Samples/simpleCUBLAS/Makefile index 516da194..bbdaed39 100644 --- a/Samples/simpleCUBLAS/Makefile +++ b/Samples/simpleCUBLAS/Makefile @@ -263,14 +263,6 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) -SAMPLE_ENABLED := 1 - -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - simpleCUBLAS is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -305,10 +297,6 @@ ALL_CCFLAGS += --threads 0 LIBRARIES += -lcublas -ifeq ($(SAMPLE_ENABLED),0) -EXEC ?= @echo "[@]" -endif - ################################################################################ # Target rules @@ -316,23 +304,16 @@ all: build build: simpleCUBLAS -check.deps: -ifeq ($(SAMPLE_ENABLED),0) - @echo "Sample will be waived due to the above missing dependencies" -else - @echo "Sample is ready - all dependencies have been met" -endif - simpleCUBLAS.o:simpleCUBLAS.cpp - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< simpleCUBLAS: simpleCUBLAS.o - $(EXEC) $(NVCC) 
$(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) run: build - $(EXEC) ./simpleCUBLAS + ./simpleCUBLAS clean: rm -f simpleCUBLAS simpleCUBLAS.o diff --git a/Samples/simpleCUBLAS/README.md b/Samples/simpleCUBLAS/README.md index 67dd4ce5..ba3a8517 100644 --- a/Samples/simpleCUBLAS/README.md +++ b/Samples/simpleCUBLAS/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj index 3a68d707..8098fae1 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj index 6370f200..7870a053 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLASXT/Makefile b/Samples/simpleCUBLASXT/Makefile index b5759857..24e5af89 100644 --- a/Samples/simpleCUBLASXT/Makefile +++ b/Samples/simpleCUBLASXT/Makefile @@ -271,12 +271,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - simpleCUBLASXT is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/simpleCUBLASXT/README.md b/Samples/simpleCUBLASXT/README.md index fd3decae..a92b933b 100644 --- a/Samples/simpleCUBLASXT/README.md +++ b/Samples/simpleCUBLASXT/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj index 3805ce26..12c606a9 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj index b0472a39..c527611c 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS_LU/Makefile b/Samples/simpleCUBLAS_LU/Makefile index 2c49cc17..d6094132 100644 --- a/Samples/simpleCUBLAS_LU/Makefile +++ b/Samples/simpleCUBLAS_LU/Makefile @@ -277,12 +277,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - simpleCUBLAS_LU is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/simpleCUBLAS_LU/README.md b/Samples/simpleCUBLAS_LU/README.md index 9ef4764b..12d0ed1e 100644 --- a/Samples/simpleCUBLAS_LU/README.md +++ b/Samples/simpleCUBLAS_LU/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj index 7599aeda..96fc7f61 100644 --- a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj index 154dac0a..1f00aec9 100644 --- a/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj +++ b/Samples/simpleCUBLAS_LU/simpleCUBLAS_LU_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUFFT/Makefile b/Samples/simpleCUFFT/Makefile index c716cd0c..9e9475ee 100644 --- a/Samples/simpleCUFFT/Makefile +++ b/Samples/simpleCUFFT/Makefile @@ -265,12 +265,6 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - simpleCUFFT is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/simpleCUFFT/README.md b/Samples/simpleCUFFT/README.md index 67227805..1efea4fd 100644 --- a/Samples/simpleCUFFT/README.md +++ b/Samples/simpleCUFFT/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj index 914b65a7..f2b76a4b 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj index 339d7959..b8f8fa04 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCudaGraphs/README.md b/Samples/simpleCudaGraphs/README.md index 9e044f33..aa3567e2 100644 --- a/Samples/simpleCudaGraphs/README.md +++ b/Samples/simpleCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaLaunchHostFunc, cudaGraphCreat ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj index 5a48206e..7b8fa0ca 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj index e7aeecd2..348817a8 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleD3D11/README.md b/Samples/simpleD3D11/README.md index eb8d6428..bd59b087 100644 --- a/Samples/simpleD3D11/README.md +++ b/Samples/simpleD3D11/README.md @@ -30,7 +30,7 @@ cudaD3D11GetDevice, cudaImportExternalSemaphore, cudaImportExternalMemory, cudaE ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj index 651c1a4c..882d1481 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj index c3bcdbe4..95f974e1 100644 --- a/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj +++ b/Samples/simpleD3D11/simpleD3D11_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleD3D12/README.md b/Samples/simpleD3D12/README.md index 4c60ed5b..065c9360 100644 --- a/Samples/simpleD3D12/README.md +++ b/Samples/simpleD3D12/README.md @@ -30,7 +30,7 @@ cudaWaitExternalSemaphoresAsync, cudaSignalExternalSemaphoresAsync, cudaImportEx ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj index e0bee149..3804d197 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -120,6 +120,6 @@ - + diff --git a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj index ec724d05..e97672dc 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2019.vcxproj @@ -39,7 +39,7 @@ - + @@ -121,6 +121,6 @@ - + diff --git a/Samples/simpleDrvRuntime/README.md b/Samples/simpleDrvRuntime/README.md index 8c09f93b..9ea98174 100644 --- a/Samples/simpleDrvRuntime/README.md +++ b/Samples/simpleDrvRuntime/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaMalloc, cudaStreamCreateWithFlags ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj index 94e29419..9e1d0a4a 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -112,6 +112,6 @@ - + diff --git a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj index 42556e0c..554ac904 100644 --- a/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj +++ b/Samples/simpleDrvRuntime/simpleDrvRuntime_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleGL/README.md b/Samples/simpleGL/README.md index 5176ee1b..008c0be6 100644 --- a/Samples/simpleGL/README.md +++ b/Samples/simpleGL/README.md @@ -30,7 +30,7 @@ cudaGraphicsMapResources, cudaGraphicsUnmapResources, cudaGraphicsResourceGetMap ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleGL/simpleGL_vs2017.vcxproj b/Samples/simpleGL/simpleGL_vs2017.vcxproj index d096e815..92e84cc4 100644 --- a/Samples/simpleGL/simpleGL_vs2017.vcxproj +++ b/Samples/simpleGL/simpleGL_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/simpleGL/simpleGL_vs2019.vcxproj b/Samples/simpleGL/simpleGL_vs2019.vcxproj index 527b22d0..60eab5e8 100644 --- a/Samples/simpleGL/simpleGL_vs2019.vcxproj +++ b/Samples/simpleGL/simpleGL_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/simpleIPC/README.md b/Samples/simpleIPC/README.md index 3fcb740a..cf35bf0d 100644 --- a/Samples/simpleIPC/README.md +++ b/Samples/simpleIPC/README.md @@ -30,7 +30,7 @@ cudaIpcGetEventHandle, cudaIpcOpenMemHandle, cudaIpcCloseMemHandle, cudaMemcpyAs ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj index 640802b1..e3345f52 100644 --- a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj index 8c03b709..b176762f 100644 --- a/Samples/simpleIPC/simpleIPC_vs2019.vcxproj +++ b/Samples/simpleIPC/simpleIPC_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/README.md b/Samples/simpleVoteIntrinsics/README.md index 314de841..162a13fe 100644 --- a/Samples/simpleVoteIntrinsics/README.md +++ b/Samples/simpleVoteIntrinsics/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj index 62b48298..908d81ac 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj index 630c35e5..a8bfac7b 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleVulkan/README.md b/Samples/simpleVulkan/README.md index 4cbb0122..91a4f186 100644 --- a/Samples/simpleVulkan/README.md +++ b/Samples/simpleVulkan/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaImportExternalS ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj index 713ae122..069fdc1e 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -122,6 +122,6 @@ - + diff --git a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj index a03ea3de..88d61caf 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/simpleVulkanMMAP/README.md b/Samples/simpleVulkanMMAP/README.md index bd3aeb63..52f4b74f 100644 --- a/Samples/simpleVulkanMMAP/README.md +++ b/Samples/simpleVulkanMMAP/README.md @@ -33,7 +33,7 @@ cudaGetDeviceProperties, cudaImportExternalMemory, cudaExternalMemoryGetMappedBu ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj index 95ec4011..ca7b8dec 100644 --- a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -124,6 +124,6 @@ - + diff --git a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj index c15cb955..2537bca1 100644 --- a/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj +++ b/Samples/simpleVulkanMMAP/simpleVulkanMMAP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -120,6 +120,6 @@ - + diff --git a/Samples/simpleZeroCopy/README.md b/Samples/simpleZeroCopy/README.md index 12919ca0..8172982d 100644 --- a/Samples/simpleZeroCopy/README.md +++ b/Samples/simpleZeroCopy/README.md @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj b/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj index 72ad3aaa..8c74cd84 100644 --- a/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj +++ b/Samples/simpleZeroCopy/simpleZeroCopy_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj b/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj index 40b06783..f96996d8 100644 --- a/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj +++ b/Samples/simpleZeroCopy/simpleZeroCopy_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/streamOrderedAllocation/README.md b/Samples/streamOrderedAllocation/README.md index be8d5602..c2cafafe 100644 --- a/Samples/streamOrderedAllocation/README.md +++ b/Samples/streamOrderedAllocation/README.md @@ -27,7 +27,7 @@ cudaMallocAsync, cudaFreeAsync, cudaMemPoolSetAttribute, cudaDeviceGetDefaultMem ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj index 1113cafe..33b24ecd 100644 --- a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj +++ b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj index 5e884ceb..046057bc 100644 --- a/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj +++ b/Samples/streamOrderedAllocation/streamOrderedAllocation_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/streamOrderedAllocationIPC/README.md b/Samples/streamOrderedAllocationIPC/README.md index 04948fae..3ac4a03c 100644 --- a/Samples/streamOrderedAllocationIPC/README.md +++ b/Samples/streamOrderedAllocationIPC/README.md @@ -27,7 +27,7 @@ cudaMallocAsync, cudaFreeAsync, cudaMemPoolCreate, cudaMemPoolImportPointer, cud ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/streamOrderedAllocationP2P/README.md b/Samples/streamOrderedAllocationP2P/README.md index 164284a9..4b03cbdc 100644 --- a/Samples/streamOrderedAllocationP2P/README.md +++ b/Samples/streamOrderedAllocationP2P/README.md @@ -27,7 +27,7 @@ cudaMallocAsync, cudaFreeAsync, cudaMemPoolSetAccess, cudaDeviceGetDefaultMemPoo ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj index 116d3c9c..4d56b360 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj index 50529ea0..b6b72aed 100644 --- a/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj +++ b/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/systemWideAtomics/README.md b/Samples/systemWideAtomics/README.md index 530df9d3..115d8ac7 100644 --- a/Samples/systemWideAtomics/README.md +++ b/Samples/systemWideAtomics/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/tf32TensorCoreGemm/README.md b/Samples/tf32TensorCoreGemm/README.md index c1513be1..72577bdf 100644 --- a/Samples/tf32TensorCoreGemm/README.md +++ b/Samples/tf32TensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEv ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj index 4cd44a20..305b25b5 100644 --- a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj index 5ed41711..731b304a 100644 --- a/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj +++ b/Samples/tf32TensorCoreGemm/tf32TensorCoreGemm_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/vectorAddMMAP/README.md b/Samples/vectorAddMMAP/README.md index c385d627..24783aef 100644 --- a/Samples/vectorAddMMAP/README.md +++ b/Samples/vectorAddMMAP/README.md @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj index 39f9ba5b..2e987db6 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -113,6 +113,6 @@ - + diff --git a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj index ccc98fe0..22b8a10d 100644 --- a/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj +++ b/Samples/vectorAddMMAP/vectorAddMMAP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -109,6 +109,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/README.md b/Samples/vectorAdd_nvrtc/README.md index 64132b80..8520e04b 100644 --- a/Samples/vectorAdd_nvrtc/README.md +++ b/Samples/vectorAdd_nvrtc/README.md @@ -30,7 +30,7 @@ cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj index 3fecdd25..9cb2e2c1 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj index 697e47aa..72d19488 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/vulkanImageCUDA/README.md b/Samples/vulkanImageCUDA/README.md index f37e25f3..aae79199 100644 --- a/Samples/vulkanImageCUDA/README.md +++ b/Samples/vulkanImageCUDA/README.md @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedMipmappedArray, cudaImportE ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj index f14a0515..2bfc68f7 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -118,6 +118,6 @@ - + diff --git a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj index 42673869..655c8105 100644 --- a/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj +++ b/Samples/vulkanImageCUDA/vulkanImageCUDA_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -114,6 +114,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/README.md b/Samples/warpAggregatedAtomicsCG/README.md index c4c351a7..3344aa18 100644 --- a/Samples/warpAggregatedAtomicsCG/README.md +++ b/Samples/warpAggregatedAtomicsCG/README.md @@ -22,7 +22,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj index 6a6c8655..018e27e0 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj index 6e83354a..f22e4976 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/watershedSegmentationNPP/Makefile b/Samples/watershedSegmentationNPP/Makefile index c03a879d..a65719dc 100644 --- a/Samples/watershedSegmentationNPP/Makefile +++ b/Samples/watershedSegmentationNPP/Makefile @@ -271,12 +271,6 @@ ifeq ($(TARGET_OS),darwin) SAMPLE_ENABLED := 0 endif -# This sample is not supported on QNX -ifeq ($(TARGET_OS),qnx) - $(info >>> WARNING - watershedSegmentationNPP is not supported on QNX - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/watershedSegmentationNPP/README.md b/Samples/watershedSegmentationNPP/README.md index 0b320280..96169587 100644 --- a/Samples/watershedSegmentationNPP/README.md +++ b/Samples/watershedSegmentationNPP/README.md @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l ## Prerequisites -Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj index 548b5361..9fa9dbb1 100644 --- a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj +++ b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2017.vcxproj @@ -38,7 +38,7 @@ - + @@ -108,6 +108,6 @@ - + diff --git a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj index ee297d72..97d89664 100644 --- a/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj +++ b/Samples/watershedSegmentationNPP/watershedSegmentationNPP_vs2019.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - +