From 21c36d356841762b3840fefdb009ba60726bbbff Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Fri, 24 Aug 2018 22:35:15 +0530 Subject: [PATCH] Add and Update samples for CUDA 10.0 --- Common/helper_cuda.h | 525 +---- Common/helper_cuda_drvapi.h | 25 +- README.md | 40 +- Samples/UnifiedMemoryPerf/Makefile | 306 +++ Samples/UnifiedMemoryPerf/NsightEclipse.xml | 84 + Samples/UnifiedMemoryPerf/README.md | 98 + .../UnifiedMemoryPerf_vs2012.sln | 20 + .../UnifiedMemoryPerf_vs2012.vcxproj | 110 + .../UnifiedMemoryPerf_vs2013.sln | 20 + .../UnifiedMemoryPerf_vs2013.vcxproj | 110 + .../UnifiedMemoryPerf_vs2015.sln | 20 + .../UnifiedMemoryPerf_vs2015.vcxproj | 110 + .../UnifiedMemoryPerf_vs2017.sln | 20 + .../UnifiedMemoryPerf_vs2017.vcxproj | 111 + Samples/UnifiedMemoryPerf/commonDefs.hpp | 88 + Samples/UnifiedMemoryPerf/commonKernels.cu | 33 + Samples/UnifiedMemoryPerf/commonKernels.hpp | 28 + Samples/UnifiedMemoryPerf/helperFunctions.cpp | 303 +++ .../UnifiedMemoryPerf/matrixMultiplyPerf.cu | 697 ++++++ Samples/conjugateGradientCudaGraphs/Makefile | 302 +++ .../NsightEclipse.xml | 84 + Samples/conjugateGradientCudaGraphs/README.md | 98 + .../conjugateGradientCudaGraphs.cu | 466 +++++ .../conjugateGradientCudaGraphs_vs2012.sln} | 6 +- ...onjugateGradientCudaGraphs_vs2012.vcxproj} | 17 +- .../conjugateGradientCudaGraphs_vs2013.sln | 20 + ...conjugateGradientCudaGraphs_vs2013.vcxproj | 107 + .../conjugateGradientCudaGraphs_vs2015.sln | 20 + ...conjugateGradientCudaGraphs_vs2015.vcxproj | 107 + .../conjugateGradientCudaGraphs_vs2017.sln | 20 + ...conjugateGradientCudaGraphs_vs2017.vcxproj | 108 + .../conjugateGradientMultiBlockCG/Makefile | 52 +- .../NsightEclipse.xml | 1 + .../conjugateGradientMultiBlockCG/README.md | 4 +- ...njugateGradientMultiBlockCG_vs2012.vcxproj | 6 +- ...njugateGradientMultiBlockCG_vs2013.vcxproj | 6 +- ...njugateGradientMultiBlockCG_vs2015.vcxproj | 6 +- ...njugateGradientMultiBlockCG_vs2017.vcxproj | 6 +- 
.../conjugateGradientMultiDeviceCG/Makefile | 52 +- .../NsightEclipse.xml | 4 + .../conjugateGradientMultiDeviceCG/README.md | 16 +- .../conjugateGradientMultiDeviceCG.cu | 15 +- .../conjugateGradientMultiDeviceCG_vs2012.sln | 20 + ...ugateGradientMultiDeviceCG_vs2012.vcxproj} | 15 +- .../conjugateGradientMultiDeviceCG_vs2013.sln | 20 + ...jugateGradientMultiDeviceCG_vs2013.vcxproj | 108 + .../conjugateGradientMultiDeviceCG_vs2015.sln | 20 + ...jugateGradientMultiDeviceCG_vs2015.vcxproj | 108 + .../conjugateGradientMultiDeviceCG_vs2017.sln | 20 + ...jugateGradientMultiDeviceCG_vs2017.vcxproj | 109 + Samples/cudaTensorCoreGemm/Makefile | 52 +- Samples/cudaTensorCoreGemm/NsightEclipse.xml | 1 + Samples/cudaTensorCoreGemm/README.md | 4 +- .../cudaTensorCoreGemm/cudaTensorCoreGemm.cu | 235 ++- .../cudaTensorCoreGemm_vs2012.vcxproj | 6 +- .../cudaTensorCoreGemm_vs2013.vcxproj | 6 +- .../cudaTensorCoreGemm_vs2015.vcxproj | 6 +- .../cudaTensorCoreGemm_vs2017.vcxproj | 6 +- Samples/deviceQuery/Makefile | 52 +- Samples/deviceQuery/NsightEclipse.xml | 1 + Samples/deviceQuery/README.md | 4 +- .../deviceQuery/deviceQuery_vs2012.vcxproj | 6 +- .../deviceQuery/deviceQuery_vs2013.vcxproj | 6 +- .../deviceQuery/deviceQuery_vs2015.vcxproj | 6 +- .../deviceQuery/deviceQuery_vs2017.vcxproj | 6 +- Samples/matrixMul/Makefile | 52 +- Samples/matrixMul/NsightEclipse.xml | 1 + Samples/matrixMul/README.md | 4 +- Samples/matrixMul/matrixMul_vs2012.vcxproj | 6 +- Samples/matrixMul/matrixMul_vs2013.vcxproj | 6 +- Samples/matrixMul/matrixMul_vs2015.vcxproj | 6 +- Samples/matrixMul/matrixMul_vs2017.vcxproj | 6 +- Samples/matrixMulDrv/Makefile | 50 +- Samples/matrixMulDrv/README.md | 4 +- .../matrixMulDrv/matrixMulDrv_vs2012.vcxproj | 4 +- .../matrixMulDrv/matrixMulDrv_vs2013.vcxproj | 4 +- .../matrixMulDrv/matrixMulDrv_vs2015.vcxproj | 4 +- .../matrixMulDrv/matrixMulDrv_vs2017.vcxproj | 4 +- Samples/p2pBandwidthLatencyTest/Makefile | 300 +++ .../p2pBandwidthLatencyTest/NsightEclipse.xml | 77 + 
Samples/p2pBandwidthLatencyTest/README.md | 94 + .../p2pBandwidthLatencyTest.cu | 682 ++++++ .../p2pBandwidthLatencyTest_vs2012.sln} | 6 +- .../p2pBandwidthLatencyTest_vs2012.vcxproj} | 17 +- .../p2pBandwidthLatencyTest_vs2013.sln} | 6 +- .../p2pBandwidthLatencyTest_vs2013.vcxproj | 107 + .../p2pBandwidthLatencyTest_vs2015.sln | 20 + .../p2pBandwidthLatencyTest_vs2015.vcxproj | 107 + .../p2pBandwidthLatencyTest_vs2017.sln | 20 + .../p2pBandwidthLatencyTest_vs2017.vcxproj | 108 + Samples/shfl_scan/Makefile | 52 +- Samples/shfl_scan/NsightEclipse.xml | 1 + Samples/shfl_scan/README.md | 4 +- Samples/shfl_scan/shfl_scan_vs2010.vcxproj | 107 - Samples/shfl_scan/shfl_scan_vs2012.vcxproj | 6 +- Samples/shfl_scan/shfl_scan_vs2013.vcxproj | 6 +- Samples/shfl_scan/shfl_scan_vs2015.vcxproj | 6 +- Samples/shfl_scan/shfl_scan_vs2017.vcxproj | 6 +- Samples/simpleCUBLAS/Makefile | 50 +- Samples/simpleCUBLAS/NsightEclipse.xml | 1 + Samples/simpleCUBLAS/README.md | 4 +- .../simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj | 4 +- .../simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj | 4 +- .../simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj | 4 +- .../simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj | 4 +- Samples/simpleCUBLASXT/Makefile | 308 +++ Samples/simpleCUBLASXT/NsightEclipse.xml | 69 + Samples/simpleCUBLASXT/README.md | 95 + Samples/simpleCUBLASXT/simpleCUBLASXT.cpp | 301 +++ .../simpleCUBLASXT_vs2012.sln} | 6 +- .../simpleCUBLASXT_vs2012.vcxproj} | 13 +- .../simpleCUBLASXT_vs2013.sln} | 6 +- .../simpleCUBLASXT_vs2013.vcxproj} | 19 +- .../simpleCUBLASXT/simpleCUBLASXT_vs2015.sln | 20 + .../simpleCUBLASXT_vs2015.vcxproj} | 21 +- .../simpleCUBLASXT/simpleCUBLASXT_vs2017.sln | 20 + .../simpleCUBLASXT_vs2017.vcxproj} | 24 +- Samples/simpleCUFFT/Makefile | 52 +- Samples/simpleCUFFT/NsightEclipse.xml | 1 + Samples/simpleCUFFT/README.md | 4 +- Samples/simpleCUFFT/simpleCUFFT_vs2010.sln | 20 - .../simpleCUFFT/simpleCUFFT_vs2012.vcxproj | 6 +- .../simpleCUFFT/simpleCUFFT_vs2013.vcxproj | 6 +- 
.../simpleCUFFT/simpleCUFFT_vs2015.vcxproj | 6 +- .../simpleCUFFT/simpleCUFFT_vs2017.vcxproj | 6 +- Samples/simpleCudaGraphs/Makefile | 300 +++ Samples/simpleCudaGraphs/NsightEclipse.xml | 78 + Samples/simpleCudaGraphs/README.md | 94 + Samples/simpleCudaGraphs/simpleCudaGraphs.cu | 399 ++++ .../simpleCudaGraphs_vs2012.sln | 20 + .../simpleCudaGraphs_vs2012.vcxproj} | 15 +- .../simpleCudaGraphs_vs2013.sln | 20 + .../simpleCudaGraphs_vs2013.vcxproj} | 15 +- .../simpleCudaGraphs_vs2015.sln | 20 + .../simpleCudaGraphs_vs2015.vcxproj} | 15 +- .../simpleCudaGraphs_vs2017.sln | 20 + .../simpleCudaGraphs_vs2017.vcxproj | 108 + Samples/simpleVoteIntrinsics/Makefile | 52 +- .../simpleVoteIntrinsics/NsightEclipse.xml | 1 + Samples/simpleVoteIntrinsics/README.md | 4 +- .../simpleVoteIntrinsics_vs2010.sln | 20 - .../simpleVoteIntrinsics_vs2012.vcxproj | 6 +- .../simpleVoteIntrinsics_vs2013.vcxproj | 6 +- .../simpleVoteIntrinsics_vs2015.vcxproj | 6 +- .../simpleVoteIntrinsics_vs2017.vcxproj | 6 +- Samples/simpleVulkan/Build_instructions.txt | 18 + Samples/simpleVulkan/Makefile | 361 ++++ Samples/simpleVulkan/NsightEclipse.xml | 82 + Samples/simpleVulkan/README.md | 74 + Samples/simpleVulkan/findvulkan.mk | 143 ++ Samples/simpleVulkan/linmath.h | 501 +++++ Samples/simpleVulkan/shader_sine.frag | 10 + Samples/simpleVulkan/shader_sine.vert | 23 + .../simpleVulkan_vs2013.sln} | 6 +- .../simpleVulkan/simpleVulkan_vs2013.vcxproj | 122 ++ .../simpleVulkan_vs2015.sln} | 6 +- .../simpleVulkan/simpleVulkan_vs2015.vcxproj | 122 ++ .../simpleVulkan_vs2017.sln} | 6 +- .../simpleVulkan/simpleVulkan_vs2017.vcxproj | 123 ++ Samples/simpleVulkan/vulkanCUDASinewave.cu | 1863 +++++++++++++++++ Samples/systemWideAtomics/Makefile | 318 +++ Samples/systemWideAtomics/NsightEclipse.xml | 58 + Samples/systemWideAtomics/README.md | 64 + .../systemWideAtomics/systemWideAtomics.cu | 342 +++ Samples/vectorAdd_nvrtc/Makefile | 50 +- Samples/vectorAdd_nvrtc/README.md | 4 +- .../vectorAdd_nvrtc_vs2010.sln | 20 
- .../vectorAdd_nvrtc_vs2012.vcxproj | 4 +- .../vectorAdd_nvrtc_vs2013.vcxproj | 4 +- .../vectorAdd_nvrtc_vs2015.vcxproj | 4 +- .../vectorAdd_nvrtc_vs2017.vcxproj | 4 +- Samples/warpAggregatedAtomicsCG/Makefile | 52 +- .../warpAggregatedAtomicsCG/NsightEclipse.xml | 1 + Samples/warpAggregatedAtomicsCG/README.md | 4 +- .../warpAggregatedAtomicsCG_vs2012.vcxproj | 6 +- .../warpAggregatedAtomicsCG_vs2013.vcxproj | 6 +- .../warpAggregatedAtomicsCG_vs2015.vcxproj | 6 +- .../warpAggregatedAtomicsCG_vs2017.vcxproj | 6 +- 178 files changed, 12375 insertions(+), 1288 deletions(-) create mode 100644 Samples/UnifiedMemoryPerf/Makefile create mode 100644 Samples/UnifiedMemoryPerf/NsightEclipse.xml create mode 100644 Samples/UnifiedMemoryPerf/README.md create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.sln create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.sln create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.sln create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.sln create mode 100644 Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj create mode 100644 Samples/UnifiedMemoryPerf/commonDefs.hpp create mode 100644 Samples/UnifiedMemoryPerf/commonKernels.cu create mode 100644 Samples/UnifiedMemoryPerf/commonKernels.hpp create mode 100644 Samples/UnifiedMemoryPerf/helperFunctions.cpp create mode 100644 Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu create mode 100644 Samples/conjugateGradientCudaGraphs/Makefile create mode 100644 Samples/conjugateGradientCudaGraphs/NsightEclipse.xml create mode 100644 Samples/conjugateGradientCudaGraphs/README.md create mode 100644 Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu rename 
Samples/{conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.sln => conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.sln} (78%) rename Samples/{simpleCUFFT/simpleCUFFT_vs2010.vcxproj => conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj} (84%) create mode 100644 Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.sln create mode 100644 Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj create mode 100644 Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.sln create mode 100644 Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj create mode 100644 Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.sln create mode 100644 Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj create mode 100644 Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.sln rename Samples/{conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.vcxproj => conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj} (89%) create mode 100644 Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.sln create mode 100644 Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj create mode 100644 Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.sln create mode 100644 Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj create mode 100644 Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.sln create mode 100644 Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj create mode 100644 Samples/p2pBandwidthLatencyTest/Makefile create mode 100644 Samples/p2pBandwidthLatencyTest/NsightEclipse.xml create mode 100644 Samples/p2pBandwidthLatencyTest/README.md create mode 100644 
Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu rename Samples/{warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.sln => p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.sln} (76%) rename Samples/{simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.vcxproj => p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj} (90%) rename Samples/{cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.sln => p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.sln} (72%) create mode 100644 Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj create mode 100644 Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.sln create mode 100644 Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj create mode 100644 Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.sln create mode 100644 Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj delete mode 100644 Samples/shfl_scan/shfl_scan_vs2010.vcxproj create mode 100644 Samples/simpleCUBLASXT/Makefile create mode 100644 Samples/simpleCUBLASXT/NsightEclipse.xml create mode 100644 Samples/simpleCUBLASXT/README.md create mode 100644 Samples/simpleCUBLASXT/simpleCUBLASXT.cpp rename Samples/{simpleCUBLAS/simpleCUBLAS_vs2010.sln => simpleCUBLASXT/simpleCUBLASXT_vs2012.sln} (81%) rename Samples/{simpleCUBLAS/simpleCUBLAS_vs2010.vcxproj => simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj} (91%) rename Samples/{deviceQuery/deviceQuery_vs2010.sln => simpleCUBLASXT/simpleCUBLASXT_vs2013.sln} (73%) rename Samples/{cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.vcxproj => simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj} (83%) create mode 100644 Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.sln rename Samples/{vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.vcxproj => simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj} (83%) create mode 100644 Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.sln rename Samples/{matrixMulDrv/matrixMulDrv_vs2010.vcxproj => 
simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj} (82%) delete mode 100644 Samples/simpleCUFFT/simpleCUFFT_vs2010.sln create mode 100644 Samples/simpleCudaGraphs/Makefile create mode 100644 Samples/simpleCudaGraphs/NsightEclipse.xml create mode 100644 Samples/simpleCudaGraphs/README.md create mode 100644 Samples/simpleCudaGraphs/simpleCudaGraphs.cu create mode 100644 Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.sln rename Samples/{deviceQuery/deviceQuery_vs2010.vcxproj => simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj} (90%) create mode 100644 Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.sln rename Samples/{warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.vcxproj => simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj} (90%) create mode 100644 Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.sln rename Samples/{matrixMul/matrixMul_vs2010.vcxproj => simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj} (90%) create mode 100644 Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.sln create mode 100644 Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj delete mode 100644 Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.sln create mode 100644 Samples/simpleVulkan/Build_instructions.txt create mode 100644 Samples/simpleVulkan/Makefile create mode 100644 Samples/simpleVulkan/NsightEclipse.xml create mode 100644 Samples/simpleVulkan/README.md create mode 100644 Samples/simpleVulkan/findvulkan.mk create mode 100644 Samples/simpleVulkan/linmath.h create mode 100644 Samples/simpleVulkan/shader_sine.frag create mode 100644 Samples/simpleVulkan/shader_sine.vert rename Samples/{matrixMulDrv/matrixMulDrv_vs2010.sln => simpleVulkan/simpleVulkan_vs2013.sln} (75%) create mode 100644 Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj rename Samples/{shfl_scan/shfl_scan_vs2010.sln => simpleVulkan/simpleVulkan_vs2015.sln} (74%) create mode 100644 Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj rename Samples/{matrixMul/matrixMul_vs2010.sln => 
simpleVulkan/simpleVulkan_vs2017.sln} (74%) create mode 100644 Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj create mode 100644 Samples/simpleVulkan/vulkanCUDASinewave.cu create mode 100644 Samples/systemWideAtomics/Makefile create mode 100644 Samples/systemWideAtomics/NsightEclipse.xml create mode 100644 Samples/systemWideAtomics/README.md create mode 100644 Samples/systemWideAtomics/systemWideAtomics.cu delete mode 100644 Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.sln diff --git a/Common/helper_cuda.h b/Common/helper_cuda.h index 1e1c84f2..e5b8e9f3 100644 --- a/Common/helper_cuda.h +++ b/Common/helper_cuda.h @@ -51,457 +51,17 @@ // CUDA Runtime error messages #ifdef __DRIVER_TYPES_H__ static const char *_cudaGetErrorEnum(cudaError_t error) { - switch (error) { - case cudaSuccess: - return "cudaSuccess"; - - case cudaErrorMissingConfiguration: - return "cudaErrorMissingConfiguration"; - - case cudaErrorMemoryAllocation: - return "cudaErrorMemoryAllocation"; - - case cudaErrorInitializationError: - return "cudaErrorInitializationError"; - - case cudaErrorLaunchFailure: - return "cudaErrorLaunchFailure"; - - case cudaErrorPriorLaunchFailure: - return "cudaErrorPriorLaunchFailure"; - - case cudaErrorLaunchTimeout: - return "cudaErrorLaunchTimeout"; - - case cudaErrorLaunchOutOfResources: - return "cudaErrorLaunchOutOfResources"; - - case cudaErrorInvalidDeviceFunction: - return "cudaErrorInvalidDeviceFunction"; - - case cudaErrorInvalidConfiguration: - return "cudaErrorInvalidConfiguration"; - - case cudaErrorInvalidDevice: - return "cudaErrorInvalidDevice"; - - case cudaErrorInvalidValue: - return "cudaErrorInvalidValue"; - - case cudaErrorInvalidPitchValue: - return "cudaErrorInvalidPitchValue"; - - case cudaErrorInvalidSymbol: - return "cudaErrorInvalidSymbol"; - - case cudaErrorMapBufferObjectFailed: - return "cudaErrorMapBufferObjectFailed"; - - case cudaErrorUnmapBufferObjectFailed: - return "cudaErrorUnmapBufferObjectFailed"; - - case 
cudaErrorInvalidHostPointer: - return "cudaErrorInvalidHostPointer"; - - case cudaErrorInvalidDevicePointer: - return "cudaErrorInvalidDevicePointer"; - - case cudaErrorInvalidTexture: - return "cudaErrorInvalidTexture"; - - case cudaErrorInvalidTextureBinding: - return "cudaErrorInvalidTextureBinding"; - - case cudaErrorInvalidChannelDescriptor: - return "cudaErrorInvalidChannelDescriptor"; - - case cudaErrorInvalidMemcpyDirection: - return "cudaErrorInvalidMemcpyDirection"; - - case cudaErrorAddressOfConstant: - return "cudaErrorAddressOfConstant"; - - case cudaErrorTextureFetchFailed: - return "cudaErrorTextureFetchFailed"; - - case cudaErrorTextureNotBound: - return "cudaErrorTextureNotBound"; - - case cudaErrorSynchronizationError: - return "cudaErrorSynchronizationError"; - - case cudaErrorInvalidFilterSetting: - return "cudaErrorInvalidFilterSetting"; - - case cudaErrorInvalidNormSetting: - return "cudaErrorInvalidNormSetting"; - - case cudaErrorMixedDeviceExecution: - return "cudaErrorMixedDeviceExecution"; - - case cudaErrorCudartUnloading: - return "cudaErrorCudartUnloading"; - - case cudaErrorUnknown: - return "cudaErrorUnknown"; - - case cudaErrorNotYetImplemented: - return "cudaErrorNotYetImplemented"; - - case cudaErrorMemoryValueTooLarge: - return "cudaErrorMemoryValueTooLarge"; - - case cudaErrorInvalidResourceHandle: - return "cudaErrorInvalidResourceHandle"; - - case cudaErrorNotReady: - return "cudaErrorNotReady"; - - case cudaErrorInsufficientDriver: - return "cudaErrorInsufficientDriver"; - - case cudaErrorSetOnActiveProcess: - return "cudaErrorSetOnActiveProcess"; - - case cudaErrorInvalidSurface: - return "cudaErrorInvalidSurface"; - - case cudaErrorNoDevice: - return "cudaErrorNoDevice"; - - case cudaErrorECCUncorrectable: - return "cudaErrorECCUncorrectable"; - - case cudaErrorSharedObjectSymbolNotFound: - return "cudaErrorSharedObjectSymbolNotFound"; - - case cudaErrorSharedObjectInitFailed: - return "cudaErrorSharedObjectInitFailed"; - - 
case cudaErrorUnsupportedLimit: - return "cudaErrorUnsupportedLimit"; - - case cudaErrorDuplicateVariableName: - return "cudaErrorDuplicateVariableName"; - - case cudaErrorDuplicateTextureName: - return "cudaErrorDuplicateTextureName"; - - case cudaErrorDuplicateSurfaceName: - return "cudaErrorDuplicateSurfaceName"; - - case cudaErrorDevicesUnavailable: - return "cudaErrorDevicesUnavailable"; - - case cudaErrorInvalidKernelImage: - return "cudaErrorInvalidKernelImage"; - - case cudaErrorNoKernelImageForDevice: - return "cudaErrorNoKernelImageForDevice"; - - case cudaErrorIncompatibleDriverContext: - return "cudaErrorIncompatibleDriverContext"; - - case cudaErrorPeerAccessAlreadyEnabled: - return "cudaErrorPeerAccessAlreadyEnabled"; - - case cudaErrorPeerAccessNotEnabled: - return "cudaErrorPeerAccessNotEnabled"; - - case cudaErrorDeviceAlreadyInUse: - return "cudaErrorDeviceAlreadyInUse"; - - case cudaErrorProfilerDisabled: - return "cudaErrorProfilerDisabled"; - - case cudaErrorProfilerNotInitialized: - return "cudaErrorProfilerNotInitialized"; - - case cudaErrorProfilerAlreadyStarted: - return "cudaErrorProfilerAlreadyStarted"; - - case cudaErrorProfilerAlreadyStopped: - return "cudaErrorProfilerAlreadyStopped"; - - /* Since CUDA 4.0*/ - case cudaErrorAssert: - return "cudaErrorAssert"; - - case cudaErrorTooManyPeers: - return "cudaErrorTooManyPeers"; - - case cudaErrorHostMemoryAlreadyRegistered: - return "cudaErrorHostMemoryAlreadyRegistered"; - - case cudaErrorHostMemoryNotRegistered: - return "cudaErrorHostMemoryNotRegistered"; - - /* Since CUDA 5.0 */ - case cudaErrorOperatingSystem: - return "cudaErrorOperatingSystem"; - - case cudaErrorPeerAccessUnsupported: - return "cudaErrorPeerAccessUnsupported"; - - case cudaErrorLaunchMaxDepthExceeded: - return "cudaErrorLaunchMaxDepthExceeded"; - - case cudaErrorLaunchFileScopedTex: - return "cudaErrorLaunchFileScopedTex"; - - case cudaErrorLaunchFileScopedSurf: - return "cudaErrorLaunchFileScopedSurf"; - - case 
cudaErrorSyncDepthExceeded: - return "cudaErrorSyncDepthExceeded"; - - case cudaErrorLaunchPendingCountExceeded: - return "cudaErrorLaunchPendingCountExceeded"; - - case cudaErrorNotPermitted: - return "cudaErrorNotPermitted"; - - case cudaErrorNotSupported: - return "cudaErrorNotSupported"; - - /* Since CUDA 6.0 */ - case cudaErrorHardwareStackError: - return "cudaErrorHardwareStackError"; - - case cudaErrorIllegalInstruction: - return "cudaErrorIllegalInstruction"; - - case cudaErrorMisalignedAddress: - return "cudaErrorMisalignedAddress"; - - case cudaErrorInvalidAddressSpace: - return "cudaErrorInvalidAddressSpace"; - - case cudaErrorInvalidPc: - return "cudaErrorInvalidPc"; - - case cudaErrorIllegalAddress: - return "cudaErrorIllegalAddress"; - - /* Since CUDA 6.5*/ - case cudaErrorInvalidPtx: - return "cudaErrorInvalidPtx"; - - case cudaErrorInvalidGraphicsContext: - return "cudaErrorInvalidGraphicsContext"; - - case cudaErrorStartupFailure: - return "cudaErrorStartupFailure"; - - case cudaErrorApiFailureBase: - return "cudaErrorApiFailureBase"; - - /* Since CUDA 8.0*/ - case cudaErrorNvlinkUncorrectable: - return "cudaErrorNvlinkUncorrectable"; - - /* Since CUDA 8.5*/ - case cudaErrorJitCompilerNotFound: - return "cudaErrorJitCompilerNotFound"; - - /* Since CUDA 9.0*/ - case cudaErrorCooperativeLaunchTooLarge: - return "cudaErrorCooperativeLaunchTooLarge"; - } - - return ""; + return cudaGetErrorName(error); } #endif -#ifdef __cuda_cuda_h__ +#ifdef CUDA_DRIVER_API // CUDA Driver API errors static const char *_cudaGetErrorEnum(CUresult error) { - switch (error) { - case CUDA_SUCCESS: - return "CUDA_SUCCESS"; - - case CUDA_ERROR_INVALID_VALUE: - return "CUDA_ERROR_INVALID_VALUE"; - - case CUDA_ERROR_OUT_OF_MEMORY: - return "CUDA_ERROR_OUT_OF_MEMORY"; - - case CUDA_ERROR_NOT_INITIALIZED: - return "CUDA_ERROR_NOT_INITIALIZED"; - - case CUDA_ERROR_DEINITIALIZED: - return "CUDA_ERROR_DEINITIALIZED"; - - case CUDA_ERROR_PROFILER_DISABLED: - return 
"CUDA_ERROR_PROFILER_DISABLED"; - - case CUDA_ERROR_PROFILER_NOT_INITIALIZED: - return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; - - case CUDA_ERROR_PROFILER_ALREADY_STARTED: - return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; - - case CUDA_ERROR_PROFILER_ALREADY_STOPPED: - return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; - - case CUDA_ERROR_NO_DEVICE: - return "CUDA_ERROR_NO_DEVICE"; - - case CUDA_ERROR_INVALID_DEVICE: - return "CUDA_ERROR_INVALID_DEVICE"; - - case CUDA_ERROR_INVALID_IMAGE: - return "CUDA_ERROR_INVALID_IMAGE"; - - case CUDA_ERROR_INVALID_CONTEXT: - return "CUDA_ERROR_INVALID_CONTEXT"; - - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; - - case CUDA_ERROR_MAP_FAILED: - return "CUDA_ERROR_MAP_FAILED"; - - case CUDA_ERROR_UNMAP_FAILED: - return "CUDA_ERROR_UNMAP_FAILED"; - - case CUDA_ERROR_ARRAY_IS_MAPPED: - return "CUDA_ERROR_ARRAY_IS_MAPPED"; - - case CUDA_ERROR_ALREADY_MAPPED: - return "CUDA_ERROR_ALREADY_MAPPED"; - - case CUDA_ERROR_NO_BINARY_FOR_GPU: - return "CUDA_ERROR_NO_BINARY_FOR_GPU"; - - case CUDA_ERROR_ALREADY_ACQUIRED: - return "CUDA_ERROR_ALREADY_ACQUIRED"; - - case CUDA_ERROR_NOT_MAPPED: - return "CUDA_ERROR_NOT_MAPPED"; - - case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: - return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; - - case CUDA_ERROR_NOT_MAPPED_AS_POINTER: - return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; - - case CUDA_ERROR_ECC_UNCORRECTABLE: - return "CUDA_ERROR_ECC_UNCORRECTABLE"; - - case CUDA_ERROR_UNSUPPORTED_LIMIT: - return "CUDA_ERROR_UNSUPPORTED_LIMIT"; - - case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: - return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; - - case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: - return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; - - case CUDA_ERROR_INVALID_PTX: - return "CUDA_ERROR_INVALID_PTX"; - - case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: - return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; - - case CUDA_ERROR_NVLINK_UNCORRECTABLE: - return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; - - case 
CUDA_ERROR_JIT_COMPILER_NOT_FOUND: - return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND"; - - case CUDA_ERROR_INVALID_SOURCE: - return "CUDA_ERROR_INVALID_SOURCE"; - - case CUDA_ERROR_FILE_NOT_FOUND: - return "CUDA_ERROR_FILE_NOT_FOUND"; - - case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: - return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; - - case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: - return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; - - case CUDA_ERROR_OPERATING_SYSTEM: - return "CUDA_ERROR_OPERATING_SYSTEM"; - - case CUDA_ERROR_INVALID_HANDLE: - return "CUDA_ERROR_INVALID_HANDLE"; - - case CUDA_ERROR_NOT_FOUND: - return "CUDA_ERROR_NOT_FOUND"; - - case CUDA_ERROR_NOT_READY: - return "CUDA_ERROR_NOT_READY"; - - case CUDA_ERROR_ILLEGAL_ADDRESS: - return "CUDA_ERROR_ILLEGAL_ADDRESS"; - - case CUDA_ERROR_LAUNCH_FAILED: - return "CUDA_ERROR_LAUNCH_FAILED"; - - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; - - case CUDA_ERROR_LAUNCH_TIMEOUT: - return "CUDA_ERROR_LAUNCH_TIMEOUT"; - - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: - return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; - - case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; - - case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; - - case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: - return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; - - case CUDA_ERROR_CONTEXT_IS_DESTROYED: - return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; - - case CUDA_ERROR_ASSERT: - return "CUDA_ERROR_ASSERT"; - - case CUDA_ERROR_TOO_MANY_PEERS: - return "CUDA_ERROR_TOO_MANY_PEERS"; - - case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; - - case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; - - case CUDA_ERROR_HARDWARE_STACK_ERROR: - return "CUDA_ERROR_HARDWARE_STACK_ERROR"; - - case CUDA_ERROR_ILLEGAL_INSTRUCTION: - return 
"CUDA_ERROR_ILLEGAL_INSTRUCTION"; - - case CUDA_ERROR_MISALIGNED_ADDRESS: - return "CUDA_ERROR_MISALIGNED_ADDRESS"; - - case CUDA_ERROR_INVALID_ADDRESS_SPACE: - return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; - - case CUDA_ERROR_INVALID_PC: - return "CUDA_ERROR_INVALID_PC"; - - case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: - return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE"; - - case CUDA_ERROR_NOT_PERMITTED: - return "CUDA_ERROR_NOT_PERMITTED"; - - case CUDA_ERROR_NOT_SUPPORTED: - return "CUDA_ERROR_NOT_SUPPORTED"; - - case CUDA_ERROR_UNKNOWN: - return "CUDA_ERROR_UNKNOWN"; - } - - return ""; + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? ret : unknown; } #endif @@ -1067,18 +627,19 @@ inline int _ConvertSMVer2Cores(int major, int minor) { } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = { - {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class - {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class - {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class - {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class - {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class - {0x60, 64}, // Pascal Generation (SM 6.0) GP100 class - {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class - {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class - {0x70, 64}, // Volta Generation (SM 7.0) GV100 class - {0x72, 64}, // Volta Generation (SM 7.2) GV11b class + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, {-1, -1}}; int index = 0; @@ -1155,7 +716,7 @@ inline int gpuDeviceInit(int devID) { inline int gpuGetMaxGflopsDeviceId() { int current_device = 0, sm_per_multiproc = 0; int max_perf_device = 0; - int device_count = 0, best_SM_arch = 0; + int device_count = 0; int 
devices_prohibited = 0; uint64_t max_compute_perf = 0; @@ -1169,30 +730,6 @@ inline int gpuGetMaxGflopsDeviceId() { exit(EXIT_FAILURE); } - // Find the best major SM Architecture GPU device - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, - // then we can add it to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major > 0 && deviceProp.major < 9999) { - best_SM_arch = MAX(best_SM_arch, deviceProp.major); - } - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceId() CUDA error:" - " all devices have compute mode prohibited.\n"); - exit(EXIT_FAILURE); - } - // Find the best CUDA capable GPU device current_device = 0; @@ -1213,23 +750,23 @@ inline int gpuGetMaxGflopsDeviceId() { sm_per_multiproc * deviceProp.clockRate; if (compute_perf > max_compute_perf) { - // If we find GPU with SM major > 2, search only these - if (best_SM_arch > 2) { - // If our device==dest_SM_arch, choose this, or else pass - if (deviceProp.major == best_SM_arch) { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } else { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } + max_compute_perf = compute_perf; + max_perf_device = current_device; } + } else { + devices_prohibited++; } ++current_device; } + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + return max_perf_device; } diff --git a/Common/helper_cuda_drvapi.h b/Common/helper_cuda_drvapi.h index db43cff7..c7f08a32 100644 --- a/Common/helper_cuda_drvapi.h +++ b/Common/helper_cuda_drvapi.h @@ -122,18 +122,19 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) { } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = 
{ - {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class - {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class - {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class - {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class - {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class - {0x60, 64}, // Pascal Generation (SM 6.0) GP100 class - {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class - {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class - {0x70, 64}, // Volta Generation (SM 7.0) GV100 class - {0x72, 64}, // Volta Generation (SM 7.2) GV11b class + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, {-1, -1}}; int index = 0; diff --git a/README.md b/README.md index 39636d03..ea2a56bc 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,22 @@ # CUDA Samples -Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads). +Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads). ## Release Notes This section describes the release notes for the CUDA Samples on GitHub only. +### CUDA 10.0 +* Added `simpleCudaGraphs`. Demonstrates CUDA Graphs creation, instantiation and launch using Graphs APIs and Stream Capture APIs. +* Added `conjugateGradientCudaGraphs`. Demonstrates conjugate gradient solver on GPU using CUBLAS and CUSPARSE library calls captured and called using CUDA Graph APIs. +* Added `simpleVulkan`. Demonstrates Vulkan - CUDA Interop. +* Added `UnifiedMemoryPerf`. Demonstrates performance comparision of various memory types involved in system. +* Added `p2pBandwidthLatencyTest`. 
Demonstrates Peer-To-Peer (P2P) data transfers between pairs of GPUs and computes latency and bandwidth. +* Added `systemWideAtomics`. Demonstrates system wide atomic instructions. +* Added `simpleCUBLASXT`. Demonstrates CUBLAS-XT library which performs GEMM operations over multiple GPUs. +* Added Windows OS support to `conjugateGradientMultiDeviceCG` sample. +* Removed support of Visual Studio 2010 from all samples. + ### CUDA 9.2 This is the first release of CUDA Samples on GitHub: @@ -26,7 +37,7 @@ This is the first release of CUDA Samples on GitHub: ### Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html), and the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html). 
### Getting the CUDA Samples @@ -108,22 +119,27 @@ The samples makefiles can take advantage of certain options: ### Samples by OS #### Linux -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[deviceQuery](./Samples/deviceQuery)** | +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | ---|---|---|---| -**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | -**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[matrixMul](./Samples/matrixMul)** | +**[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | +**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | +**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | #### Windows -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | 
**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[deviceQuery](./Samples/deviceQuery)** | +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | ---|---|---|---| -**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | -**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[matrixMul](./Samples/matrixMul)** | +**[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | +**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | +**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | #### Mac OSX -**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | +**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | ---|---|---|---| 
+**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | -**[matrixMul](./Samples/matrixMul)** | +**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | ## Dependencies @@ -161,6 +177,10 @@ OpenGL is a graphics library used for 2D and 3D rendering. On systems which supp OpenGL ES is an embedded systems graphics library used for 2D and 3D rendering. On systems which support OpenGL ES, NVIDIA's OpenGL ES implementation is provided with the CUDA Driver. +#### Vulkan + +Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan targets high-performance realtime 3D graphics applications such as video games and interactive media across all platforms. On systems which support Vulkan, NVIDIA's Vulkan implementation is provided with the CUDA Driver. For building and running Vulkan applications one needs to install the [Vulkan SDK](https://www.lunarg.com/vulkan-sdk/). + #### OpenMP OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). diff --git a/Samples/UnifiedMemoryPerf/Makefile b/Samples/UnifiedMemoryPerf/Makefile new file mode 100644 index 00000000..628ebd40 --- /dev/null +++ b/Samples/UnifiedMemoryPerf/Makefile @@ -0,0 +1,306 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 75 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: UnifiedMemoryPerf + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +commonKernels.o:commonKernels.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +helperFunctions.o:helperFunctions.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +matrixMultiplyPerf.o:matrixMultiplyPerf.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +UnifiedMemoryPerf: commonKernels.o helperFunctions.o matrixMultiplyPerf.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./UnifiedMemoryPerf + +clean: + rm -f UnifiedMemoryPerf commonKernels.o helperFunctions.o matrixMultiplyPerf.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/UnifiedMemoryPerf + +clobber: clean diff --git a/Samples/UnifiedMemoryPerf/NsightEclipse.xml b/Samples/UnifiedMemoryPerf/NsightEclipse.xml new file mode 100644 index 00000000..32ab4ef2 --- /dev/null +++ b/Samples/UnifiedMemoryPerf/NsightEclipse.xml @@ -0,0 +1,84 @@ + + + + UnifiedMemoryPerf + + cudaMallocManaged + cudaStreamAttachMemAsync + cudaMemcpyAsync + cudaMallocHost + cudaMalloc + + + whole + + ./ + ../ + ../../common/inc + + + CUDA Systems Integration + Unified Memory + CUDA Streams and Events + Pinned System Paged Memory + + + CUDA + Unified Memory + Pinned Memory + Zero copy buffer + UVM + Streams + + + + + + true + matrixMultiplyPerf.cu + + UVM + + + 1:CUDA Basic Topics + 1:CUDA Systems 
Integration + 1:Unified Memory + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm75 + + + x86_64 + linux + + + x86_64 + macosx + + + windows7 + + + arm + + + aarch64 + + + ppc64le + linux + + + + 3.0 + + Unified and other CUDA Memories Performance + exe + diff --git a/Samples/UnifiedMemoryPerf/README.md b/Samples/UnifiedMemoryPerf/README.md new file mode 100644 index 00000000..da827faf --- /dev/null +++ b/Samples/UnifiedMemoryPerf/README.md @@ -0,0 +1,98 @@ +# UnifiedMemoryPerf - Unified and other CUDA Memories Performance + +## Description + +This sample demonstrates the performance comparision using matrix multiplication kernel of Unified Memory with/without hints and other types of memory like zero copy buffers, pageable, pagelocked memory performing synchronous and Asynchronous transfers on a single GPU. + +## Key Concepts + +CUDA Systems Integration, Unified Memory, CUDA Streams and Events, Pinned System Paged Memory + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l, aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cudaMalloc + +## Dependencies needed to build/run +[UVM](../../README.md#uvm) + +## Prerequisites + +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 
+ ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.sln b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.sln new file mode 100644 index 00000000..5496720e --- /dev/null +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj new file mode 100644 index 00000000..68259eda --- /dev/null +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj @@ -0,0 +1,110 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + UnifiedMemoryPerf_vs2012 + UnifiedMemoryPerf + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + 
cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/UnifiedMemoryPerf.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.sln b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.sln new file mode 100644 index 00000000..aa0fcdcc --- /dev/null +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj new file mode 100644 index 00000000..1bb100bf --- /dev/null +++ 
b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj @@ -0,0 +1,110 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + UnifiedMemoryPerf_vs2013 + UnifiedMemoryPerf + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/UnifiedMemoryPerf.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.sln b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.sln new file mode 100644 index 00000000..2b3a4e08 --- /dev/null +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj new file mode 100644 index 00000000..cfe0e652 --- /dev/null +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj @@ -0,0 +1,110 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + UnifiedMemoryPerf_vs2015 + UnifiedMemoryPerf + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/UnifiedMemoryPerf.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.sln b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.sln new file mode 100644 
index 00000000..701c3596 --- /dev/null +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj new file mode 100644 index 00000000..8aa9ef46 --- /dev/null +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj @@ -0,0 +1,111 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + UnifiedMemoryPerf_vs2017 + UnifiedMemoryPerf + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + 
$(OutDir)/UnifiedMemoryPerf.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + diff --git a/Samples/UnifiedMemoryPerf/commonDefs.hpp b/Samples/UnifiedMemoryPerf/commonDefs.hpp new file mode 100644 index 00000000..ef822ffe --- /dev/null +++ b/Samples/UnifiedMemoryPerf/commonDefs.hpp @@ -0,0 +1,88 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _COMMON_DEFS_ +#define _COMMON_DEFS_ +#include + +#define ONE_KB 1024 +#define ONE_MB (ONE_KB * ONE_KB) + +extern size_t maxSampleSizeInMb; +extern int numKernelRuns; +extern int verboseResults; + +extern unsigned int findNumSizesToTest(unsigned int minSize, + unsigned int maxSize, + unsigned int multiplier); + +// For Tracking the different memory allocation types +typedef enum memAllocType_enum { + MEMALLOC_TYPE_START, + USE_MANAGED_MEMORY_WITH_HINTS = MEMALLOC_TYPE_START, + USE_MANAGED_MEMORY_WITH_HINTS_ASYNC, + USE_MANAGED_MEMORY, + USE_ZERO_COPY, + USE_HOST_PAGEABLE_AND_DEVICE_MEMORY, + USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC, + USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY, + USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC, + MEMALLOC_TYPE_END = USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC, + MEMALLOC_TYPE_INVALID, + MEMALLOC_TYPE_COUNT = MEMALLOC_TYPE_INVALID +} MemAllocType; + +typedef enum bandwidthType_enum { + READ_BANDWIDTH, + WRITE_BANDWIDTH +} BandwidthType; + +extern const char *memAllocTypeStr[]; +extern const char *memAllocTypeShortStr[]; + +struct resultsData; +struct testResults; + +void createAndInitTestResults(struct testResults **results, + const char *testName, + unsigned int numMeasurements, + unsigned int numSizesToTest); +unsigned long *getPtrSizesToTest(struct testResults *results); + +void freeTestResultsAndAllResultsData(struct testResults *results); + +void createResultDataAndAddToTestResults(struct 
resultsData **ptrData, + struct testResults *results, + const char *resultsName, + bool printOnlyInVerbose, + bool reportAsBandwidth); +double *getPtrRunTimesInMs(struct resultsData *data, int allocType, + int sizeIndex); + +void printResults(struct testResults *results, + bool print_launch_transfer_results, bool print_std_deviation); +#endif diff --git a/Samples/UnifiedMemoryPerf/commonKernels.cu b/Samples/UnifiedMemoryPerf/commonKernels.cu new file mode 100644 index 00000000..a70dc9b6 --- /dev/null +++ b/Samples/UnifiedMemoryPerf/commonKernels.cu @@ -0,0 +1,33 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "commonKernels.hpp" + +__global__ void spinWhileLessThanOne(volatile unsigned int *latch) { + while (latch[0] < 1) + ; +} diff --git a/Samples/UnifiedMemoryPerf/commonKernels.hpp b/Samples/UnifiedMemoryPerf/commonKernels.hpp new file mode 100644 index 00000000..5b5df3ad --- /dev/null +++ b/Samples/UnifiedMemoryPerf/commonKernels.hpp @@ -0,0 +1,28 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +__global__ void spinWhileLessThanOne(volatile unsigned int *latch); diff --git a/Samples/UnifiedMemoryPerf/helperFunctions.cpp b/Samples/UnifiedMemoryPerf/helperFunctions.cpp new file mode 100644 index 00000000..5fc904ed --- /dev/null +++ b/Samples/UnifiedMemoryPerf/helperFunctions.cpp @@ -0,0 +1,303 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include "commonDefs.hpp" +#define CU_INIT_UUID +#include + +#define UNITS_Time "ms" +#define UNITS_BW "MB/s" +#define KB_str "KB" +#define MB_str "MB" + +struct resultsData { + char resultsName[64]; + struct testResults *results; + // this has MEMALLOC_TYPE_COUNT * results->numSizesToTest * + // results->numMeasurements elements + double **runTimesInMs[MEMALLOC_TYPE_COUNT]; + double *averageRunTimesInMs[MEMALLOC_TYPE_COUNT]; + double *stdDevRunTimesInMs[MEMALLOC_TYPE_COUNT]; + double *stdDevBandwidthInMBps[MEMALLOC_TYPE_COUNT]; + bool printOnlyInVerbose; + bool reportAsBandwidth; + struct resultsData *next; +}; + +struct testResults { + char testName[64]; + unsigned int numMeasurements; + unsigned long *sizesToTest; + unsigned int numSizesToTest; + struct resultsData *resultsDataHead; + struct resultsData *resultsDataTail; +}; + +unsigned int findNumSizesToTest(unsigned int minSize, unsigned int maxSize, + unsigned int multiplier) { + unsigned int numSizesToTest = 0; + while (minSize <= maxSize) { + numSizesToTest++; + minSize *= multiplier; + } + return numSizesToTest; +} + +int compareDoubles(const void *ptr1, const void *ptr2) { + return (*(double *)ptr1 > *(double *)ptr2) ? 1 : -1; +} + +static inline double getTimeOrBandwidth(double runTimeInMs, unsigned long size, + bool getBandwidth) { + return (getBandwidth) ? 
(1000 * (size / runTimeInMs)) / ONE_MB : runTimeInMs; +} + +void createAndInitTestResults(struct testResults **ptrResults, + const char *testName, + unsigned int numMeasurements, + unsigned int numSizesToTest) { + unsigned int i; + struct testResults *results; + results = (struct testResults *)malloc(sizeof(struct testResults)); + memset(results, 0, sizeof(struct testResults)); + strcpy(results->testName, testName); + results->numMeasurements = numMeasurements; + results->numSizesToTest = numSizesToTest; + results->sizesToTest = + (unsigned long *)malloc(numSizesToTest * sizeof(unsigned long)); + results->resultsDataHead = NULL; + results->resultsDataTail = NULL; + + *ptrResults = results; +} + +unsigned long *getPtrSizesToTest(struct testResults *results) { + return results->sizesToTest; +} + +void createResultDataAndAddToTestResults(struct resultsData **ptrData, + struct testResults *results, + const char *resultsName, + bool printOnlyInVerbose, + bool reportAsBandwidth) { + unsigned int i, j; + struct resultsData *data; + data = (struct resultsData *)malloc(sizeof(struct resultsData)); + memset(data, 0, sizeof(struct resultsData)); + strcpy(data->resultsName, resultsName); + data->results = results; + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + data->runTimesInMs[i] = + (double **)malloc(results->numSizesToTest * sizeof(double *)); + for (j = 0; j < results->numSizesToTest; j++) { + data->runTimesInMs[i][j] = + (double *)malloc(results->numMeasurements * sizeof(double)); + } + data->averageRunTimesInMs[i] = + (double *)malloc(results->numSizesToTest * sizeof(double)); + data->stdDevRunTimesInMs[i] = + (double *)malloc(results->numSizesToTest * sizeof(double)); + data->stdDevBandwidthInMBps[i] = + (double *)malloc(results->numSizesToTest * sizeof(double)); + } + data->printOnlyInVerbose = printOnlyInVerbose; + data->reportAsBandwidth = reportAsBandwidth; + data->next = NULL; + *ptrData = data; + if (results->resultsDataHead == NULL) { + results->resultsDataHead 
= data; + results->resultsDataTail = data; + } else { + results->resultsDataTail->next = data; + results->resultsDataTail = data; + } +} + +double *getPtrRunTimesInMs(struct resultsData *data, int allocType, + int sizeIndex) { + return data->runTimesInMs[allocType][sizeIndex]; +} + +void freeTestResultsAndAllResultsData(struct testResults *results) { + struct resultsData *data, *dataToFree; + unsigned int i, j; + for (data = results->resultsDataHead; data != NULL;) { + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + for (j = 0; j < results->numSizesToTest; j++) { + free(data->runTimesInMs[i][j]); + } + free(data->runTimesInMs[i]); + free(data->averageRunTimesInMs[i]); + free(data->stdDevRunTimesInMs[i]); + free(data->stdDevBandwidthInMBps[i]); + } + dataToFree = data; + data = data->next; + free(dataToFree); + } + free(results->sizesToTest); + free(results); +} + +void calculateAverageAndStdDev(double *pAverage, double *pStdDev, + double *allResults, unsigned int count) { + unsigned int i; + double average = 0.0; + double stdDev = 0.0; + for (i = 0; i < count; i++) { + average += allResults[i]; + } + average /= count; + for (i = 0; i < count; i++) { + stdDev += (allResults[i] - average) * (allResults[i] - average); + } + stdDev /= count; + stdDev = sqrt(stdDev); + *pAverage = average; + *pStdDev = (average == 0.0) ? 0.0 : ((100.0 * stdDev) / average); +} + +void calculateStdDevBandwidth(double *pStdDev, double *allResults, + unsigned int count, unsigned long size) { + unsigned int i; + double bandwidth; + double average = 0.0; + double stdDev = 0.0; + for (i = 0; i < count; i++) { + bandwidth = (1000 * (size / allResults[i])) / ONE_MB; + average += bandwidth; + } + average /= count; + for (i = 0; i < count; i++) { + bandwidth = (1000 * (size / allResults[i])) / ONE_MB; + stdDev += (bandwidth - average) * (bandwidth - average); + } + stdDev /= count; + stdDev = sqrt(stdDev); + *pStdDev = (average == 0.0) ? 
0.0 : ((100.0 * stdDev) / average); +} + +void printTimesInTableFormat(struct testResults *results, + struct resultsData *data, bool printAverage, + bool printStdDev) { + unsigned int i, j; + bool printStdDevBandwidth = printStdDev && data->reportAsBandwidth; + printf("Size_KB"); + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + printf("\t%7s", memAllocTypeShortStr[i]); + } + printf("\n"); + for (j = 0; j < results->numSizesToTest; j++) { + printf("%lu", results->sizesToTest[j] / ONE_KB); + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + printf(data->reportAsBandwidth ? "\t%7.2lf" : "\t%7.3lf", + printStdDevBandwidth + ? data->stdDevBandwidthInMBps[i][j] + : getTimeOrBandwidth( + printAverage ? data->averageRunTimesInMs[i][j] + : data->stdDevRunTimesInMs[i][j], + results->sizesToTest[j], data->reportAsBandwidth)); + } + printf("\n"); + } +} + +void printAllResultsInVerboseMode(struct testResults *results, + struct resultsData *data) { + unsigned int i, j, k; + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + printf("Verbose mode, printing all results for %s\n", memAllocTypeStr[i]); + printf("Instance"); + for (j = 0; j < results->numSizesToTest; j++) { + printf("\t%lu", results->sizesToTest[j] / ONE_KB); + } + printf("\n"); + for (k = 0; k < results->numMeasurements; k++) { + printf("%u", k); + for (j = 0; j < results->numSizesToTest; j++) { + printf(data->reportAsBandwidth ? 
"\t%7.2lf" : "\t%7.3lf", + getTimeOrBandwidth(data->runTimesInMs[i][j][k], + results->sizesToTest[j], + data->reportAsBandwidth)); + } + printf("\n"); + } + } +} + +void printResults(struct testResults *results, + bool print_launch_transfer_results, + bool print_std_deviation) { + char vulcanPrint[256]; + char resultNameNoSpaces[64]; + unsigned int i, j, k; + struct resultsData *resultsIter; + bool sizeGreaterThan1MB; + for (resultsIter = results->resultsDataHead; resultsIter != NULL; + resultsIter = resultsIter->next) { + if (!verboseResults && resultsIter->printOnlyInVerbose) { + continue; + } + if (!print_launch_transfer_results) { + if (!(strcmp(resultsIter->resultsName, "Overall Time") == 0)) { + continue; + } + } + // regular print + printf("\n%s For %s ", resultsIter->resultsName, results->testName); + printf("\n"); + for (j = 0; j < results->numSizesToTest; j++) { + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + calculateAverageAndStdDev(&resultsIter->averageRunTimesInMs[i][j], + &resultsIter->stdDevRunTimesInMs[i][j], + resultsIter->runTimesInMs[i][j], + results->numMeasurements); + if (resultsIter->reportAsBandwidth) { + calculateStdDevBandwidth(&resultsIter->stdDevBandwidthInMBps[i][j], + resultsIter->runTimesInMs[i][j], + results->numMeasurements, + results->sizesToTest[j]); + } + } + } + printf("\nPrinting Average of %u measurements in (%s)\n", + results->numMeasurements, + resultsIter->reportAsBandwidth ? 
UNITS_BW : UNITS_Time); + printTimesInTableFormat(results, resultsIter, true, false); + if (print_std_deviation) { + printf( + "\nPrinting Standard Deviation as %% of average of %u measurements\n", + results->numMeasurements); + printTimesInTableFormat(results, resultsIter, false, true); + } + if (verboseResults) { + printAllResultsInVerboseMode(results, resultsIter); + } + } +} diff --git a/Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu b/Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu new file mode 100644 index 00000000..071ac9dc --- /dev/null +++ b/Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu @@ -0,0 +1,697 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include "commonDefs.hpp" +#include "commonKernels.hpp" + +#define VERIFY_GPU_CORRECTNESS 0 + +size_t maxSampleSizeInMb = 64; +int numKernelRuns = 100; +int verboseResults = 0; + +const char *memAllocTypeStr[MEMALLOC_TYPE_COUNT] = { + "Managed_Memory_With_Hints", + "Managed_Memory_With_Hints_FullyAsync", + "Managed_Memory_NoHints", + "Zero_Copy", + "Memcpy_HostMalloc_DeviceCudaMalloc", + "MemcpyAsync_HostMalloc_DeviceCudaMalloc", + "Memcpy_HostCudaHostAlloc_DeviceCudaMalloc", + "MemcpyAsync_HostCudaHostAlloc_DeviceCudaMalloc"}; + +const char *memAllocTypeShortStr[MEMALLOC_TYPE_COUNT] = { + "UMhint", // Managed Memory With Hints + "UMhntAs", // Managed Memory With_Hints Async + "UMeasy", // Managed_Memory with No Hints + "0Copy", // Zero Copy + "MemCopy", // USE HOST PAGEABLE AND DEVICE_MEMORY + "CpAsync", // USE HOST PAGEABLE AND DEVICE_MEMORY ASYNC + "CpHpglk", // USE HOST PAGELOCKED AND DEVICE MEMORY + "CpPglAs" // USE HOST PAGELOCKED AND DEVICE MEMORY ASYNC +}; + +static float RandFloat(float low, float high) { + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; +} + +void fillMatrixWithRandomValues(float *matrix, unsigned int matrixDim) { + unsigned int i, j; + for (i = 0; i < matrixDim; ++i) { + for (j = 0; j < matrixDim; ++j) { + matrix[j + i * matrixDim] = RandFloat(0.0f, 10.0f); + } + } +} + +#if VERIFY_GPU_CORRECTNESS +void verifyMatrixMultiplyCorrectness(float *C, 
float *A, float *B, + unsigned int matrixDim) { + unsigned int i, j, k, numErrors = 0; + for (i = 0; i < matrixDim; ++i) { + for (j = 0; j < matrixDim; ++j) { + float result = 0.0f; + for (k = 0; k < matrixDim; ++k) { + result += A[k + i * matrixDim] * B[j + k * matrixDim]; + } + if (fabs(C[j + i * matrixDim] - result) > 0.001 * matrixDim) { + printf("At [%u, %u]: Expected %f, Found %f\n", i, j, result, + C[j + i * matrixDim]); + ++numErrors; + } + } + } + if (numErrors != 0) { + printf("%d value mismatches occured\n", numErrors); + fflush(stdout); + exit(EXIT_FAILURE); // exit since value mismatches occured + } +} +#endif + +void copyMatrix(float *dstMatrix, float *srcMatrix, unsigned int matrixDim) { + size_t size = matrixDim * matrixDim * sizeof(float); + memcpy(dstMatrix, srcMatrix, size); +} + +void verifyMatrixData(float *expectedData, float *observedData, + unsigned int matrixDim) { + unsigned int i, j, numErrors = 0; + for (i = 0; i < matrixDim; ++i) { + for (j = 0; j < matrixDim; ++j) { + if (expectedData[j + i * matrixDim] != observedData[j + i * matrixDim]) { + ++numErrors; + if (verboseResults) { + printf("At [%u, %u]: Expected %f, Found %f\n", i, j, + expectedData[j + i * matrixDim], + observedData[j + i * matrixDim]); + } + } + } + } + if (numErrors != 0) { + printf("%d value mismatches occured\n", numErrors); + fflush(stdout); + exit(EXIT_FAILURE); // exit since value mismatches occured + } +} + +#define BLOCK_SIZE 32 +__global__ void matrixMultiplyKernel(float *C, float *A, float *B, + unsigned int matrixDim) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + unsigned int wA = matrixDim; + unsigned int wB = matrixDim; + + // Index of the first sub-matrix of A processed by the block + int aBegin = matrixDim * BLOCK_SIZE * by; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the 
sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * bx; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[ty][tx] = A[a + wA * ty + tx]; + Bs[ty][tx] = B[b + wB * ty + tx]; + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix +#pragma unroll + + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[ty][k] * Bs[k][tx]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; + C[c + wB * ty + tx] = Csub; +} + +void runMatrixMultiplyKernel(unsigned int matrixDim, int allocType, + unsigned int numLoops, double *gpuLaunchCallsTimes, + double *gpuTransferToCallsTimes, + double *gpuTransferFromCallsTimes, + double *gpuLaunchAndTransferCallsTimes, + double *gpuLaunchTransferSyncTimes, + double *cpuAccessTimes, double *overallTimes, + int device_id) { + float *dptrA = NULL, *hptrA = NULL; + float *dptrB 
= NULL, *hptrB = NULL; + float *dptrC = NULL, *hptrC = NULL; + float *randValuesX = NULL, *randValuesY = NULL; + float *randValuesVerifyXmulY = NULL, *randValuesVerifyYmulX = NULL; + bool copyRequired = false, hintsRequired = false; + bool someTransferOpRequired; + bool isAsync = false; + cudaStream_t streamToRunOn; + unsigned int *latch; + size_t size = matrixDim * matrixDim * sizeof(float); + dim3 threads(32, 32); + dim3 grid(matrixDim / threads.x, matrixDim / threads.y); + StopWatchInterface *gpuLaunchCallsTimer = 0, *gpuTransferCallsTimer = 0; + StopWatchInterface *gpuSyncTimer = 0, *cpuAccessTimer = 0; + sdkCreateTimer(&gpuLaunchCallsTimer); + sdkCreateTimer(&gpuTransferCallsTimer); + sdkCreateTimer(&gpuSyncTimer); + sdkCreateTimer(&cpuAccessTimer); + unsigned int i; + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device_id)); + checkCudaErrors(cudaStreamCreate(&streamToRunOn)); + + randValuesX = (float *)malloc(size); + if (!randValuesX) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + randValuesY = (float *)malloc(size); + if (!randValuesY) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + randValuesVerifyXmulY = (float *)malloc(size); + if (!randValuesVerifyXmulY) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + randValuesVerifyYmulX = (float *)malloc(size); + if (!randValuesVerifyYmulX) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + checkCudaErrors(cudaMalloc(&dptrA, size)); + checkCudaErrors(cudaMalloc(&dptrB, size)); + checkCudaErrors(cudaMalloc(&dptrC, size)); + + fillMatrixWithRandomValues(randValuesX, matrixDim); + fillMatrixWithRandomValues(randValuesY, matrixDim); + + checkCudaErrors( + cudaMemcpyAsync(dptrA, randValuesX, size, cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpyAsync(dptrB, randValuesY, size, cudaMemcpyHostToDevice)); + matrixMultiplyKernel<<>>(dptrC, dptrA, dptrB, matrixDim); + 
checkCudaErrors(cudaMemcpyAsync(randValuesVerifyXmulY, dptrC, size, + cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaStreamSynchronize(NULL)); + matrixMultiplyKernel<<>>(dptrC, dptrB, dptrA, matrixDim); + checkCudaErrors(cudaMemcpyAsync(randValuesVerifyYmulX, dptrC, size, + cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaStreamSynchronize(NULL)); +#if VERIFY_GPU_CORRECTNESS + verifyMatrixMultiplyCorrectness(randValuesVerifyXmulY, randValuesX, + randValuesY, matrixDim); + verifyMatrixMultiplyCorrectness(randValuesVerifyYmulX, randValuesY, + randValuesX, matrixDim); +#endif + checkCudaErrors(cudaFree(dptrA)); + checkCudaErrors(cudaFree(dptrB)); + checkCudaErrors(cudaFree(dptrC)); + + checkCudaErrors(cudaMallocHost(&latch, sizeof(unsigned int))); + + switch (allocType) { + case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY: + case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC: + hptrA = (float *)malloc(size); + if (!hptrA) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + hptrB = (float *)malloc(size); + if (!hptrB) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + hptrC = (float *)malloc(size); + if (!hptrC) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + checkCudaErrors(cudaMalloc(&dptrA, size)); + checkCudaErrors(cudaMalloc(&dptrB, size)); + checkCudaErrors(cudaMalloc(&dptrC, size)); + copyRequired = true; + break; + + case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY: + case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC: + checkCudaErrors(cudaMallocHost(&hptrA, size)); + checkCudaErrors(cudaMallocHost(&hptrB, size)); + checkCudaErrors(cudaMallocHost(&hptrC, size)); + checkCudaErrors(cudaMalloc(&dptrA, size)); + checkCudaErrors(cudaMalloc(&dptrB, size)); + checkCudaErrors(cudaMalloc(&dptrC, size)); + copyRequired = true; + break; + + case USE_ZERO_COPY: + checkCudaErrors(cudaMallocHost(&hptrA, size)); + checkCudaErrors(cudaMallocHost(&hptrB, size)); + checkCudaErrors(cudaMallocHost(&hptrC, size)); + 
checkCudaErrors(cudaHostGetDevicePointer(&dptrA, hptrA, 0)); + checkCudaErrors(cudaHostGetDevicePointer(&dptrB, hptrB, 0)); + checkCudaErrors(cudaHostGetDevicePointer(&dptrC, hptrC, 0)); + break; + + case USE_MANAGED_MEMORY: + checkCudaErrors(cudaMallocManaged(&dptrA, size)); + checkCudaErrors(cudaMallocManaged(&dptrB, size)); + checkCudaErrors(cudaMallocManaged(&dptrC, size)); + hptrA = dptrA; + hptrB = dptrB; + hptrC = dptrC; + break; + + case USE_MANAGED_MEMORY_WITH_HINTS: + case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC: + if (deviceProp.concurrentManagedAccess) { + checkCudaErrors(cudaMallocManaged(&dptrA, size)); + checkCudaErrors(cudaMallocManaged(&dptrB, size)); + checkCudaErrors(cudaMallocManaged(&dptrC, size)); + checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId)); + checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId)); + checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId)); + } else { + checkCudaErrors(cudaMallocManaged(&dptrA, size, cudaMemAttachHost)); + checkCudaErrors(cudaMallocManaged(&dptrB, size, cudaMemAttachHost)); + checkCudaErrors(cudaMallocManaged(&dptrC, size, cudaMemAttachHost)); + } + hptrA = dptrA; + hptrB = dptrB; + hptrC = dptrC; + hintsRequired = true; + break; + + default: + exit(EXIT_FAILURE); // exit with error + } + + if (allocType == USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC || + allocType == USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC || + allocType == USE_MANAGED_MEMORY_WITH_HINTS_ASYNC) { + isAsync = true; + } + + someTransferOpRequired = copyRequired || hintsRequired; + + // fill buffers with 0 to avoid any first access page-fault overheads. + memset(hptrA, 0, size); + memset(hptrB, 0, size); + memset(hptrC, 0, size); + + for (i = 0; i < numLoops; i++) { + cpuAccessTimes[i] = 0.0; + gpuLaunchCallsTimes[i] = 0.0; + gpuTransferToCallsTimes[i] = 0.0; + gpuTransferFromCallsTimes[i] = 0.0; + + sdkStartTimer(&cpuAccessTimer); + { + copyMatrix(hptrA, (i & 0x1 == 0) ? 
randValuesX : randValuesY, matrixDim); + copyMatrix(hptrB, (i & 0x1 == 0) ? randValuesY : randValuesX, matrixDim); + } + sdkStopTimer(&cpuAccessTimer); + cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer); + sdkResetTimer(&cpuAccessTimer); + + if (isAsync && hintsRequired) { + *latch = 0; + // Prevent any work on stream from starting until all work is pushed + spinWhileLessThanOne<<<1, 1, 0, streamToRunOn>>>(latch); + } + + if (someTransferOpRequired) { + sdkStartTimer(&gpuTransferCallsTimer); + if (copyRequired) { + if (isAsync) { + checkCudaErrors(cudaMemcpyAsync( + dptrA, hptrA, size, cudaMemcpyHostToDevice, streamToRunOn)); + checkCudaErrors(cudaMemcpyAsync( + dptrB, hptrB, size, cudaMemcpyHostToDevice, streamToRunOn)); + } else { + checkCudaErrors( + cudaMemcpy(dptrA, hptrA, size, cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(dptrB, hptrB, size, cudaMemcpyHostToDevice)); + } + } + if (hintsRequired) { + if (deviceProp.concurrentManagedAccess) { + checkCudaErrors( + cudaMemPrefetchAsync(dptrA, size, device_id, streamToRunOn)); + checkCudaErrors( + cudaMemPrefetchAsync(dptrB, size, device_id, streamToRunOn)); + checkCudaErrors( + cudaMemPrefetchAsync(dptrC, size, device_id, streamToRunOn)); + } else { + checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, + cudaMemAttachGlobal)); + checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0, + cudaMemAttachGlobal)); + checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0, + cudaMemAttachGlobal)); + } + if (!isAsync) { + checkCudaErrors(cudaStreamSynchronize(streamToRunOn)); + } + } + + sdkStopTimer(&gpuTransferCallsTimer); + gpuTransferToCallsTimes[i] += + sdkGetAverageTimerValue(&gpuTransferCallsTimer); + sdkResetTimer(&gpuTransferCallsTimer); + } + + sdkStartTimer(&gpuLaunchCallsTimer); + { + matrixMultiplyKernel<<>>( + dptrC, dptrA, dptrB, matrixDim); + if (!isAsync) { + checkCudaErrors(cudaStreamSynchronize(streamToRunOn)); + } + } + 
sdkStopTimer(&gpuLaunchCallsTimer); + + gpuLaunchCallsTimes[i] += sdkGetAverageTimerValue(&gpuLaunchCallsTimer); + sdkResetTimer(&gpuLaunchCallsTimer); + + if (someTransferOpRequired) { + sdkStartTimer(&gpuTransferCallsTimer); + if (hintsRequired) { + if (deviceProp.concurrentManagedAccess) { + checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId)); + checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId)); + checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId)); + } else { + checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, + cudaMemAttachHost)); + checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0, + cudaMemAttachHost)); + checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0, + cudaMemAttachHost)); + } + if (!isAsync) { + checkCudaErrors(cudaStreamSynchronize(streamToRunOn)); + } + } + if (copyRequired) { + if (isAsync) { + checkCudaErrors(cudaMemcpyAsync( + hptrC, dptrC, size, cudaMemcpyDeviceToHost, streamToRunOn)); + } else { + checkCudaErrors( + cudaMemcpy(hptrC, dptrC, size, cudaMemcpyDeviceToHost)); + } + } + sdkStopTimer(&gpuTransferCallsTimer); + gpuTransferFromCallsTimes[i] += + sdkGetAverageTimerValue(&gpuTransferCallsTimer); + sdkResetTimer(&gpuTransferCallsTimer); + } + gpuLaunchAndTransferCallsTimes[i] = gpuLaunchCallsTimes[i] + + gpuTransferToCallsTimes[i] + + gpuTransferFromCallsTimes[i]; + gpuLaunchTransferSyncTimes[i] = gpuLaunchAndTransferCallsTimes[i]; + if (isAsync) { + sdkStartTimer(&gpuSyncTimer); + { + if (hintsRequired) { + *latch = 1; + } + checkCudaErrors(cudaStreamSynchronize(streamToRunOn)); + } + sdkStopTimer(&gpuSyncTimer); + gpuLaunchTransferSyncTimes[i] += sdkGetAverageTimerValue(&gpuSyncTimer); + sdkResetTimer(&gpuSyncTimer); + } + + sdkStartTimer(&cpuAccessTimer); + { + verifyMatrixData( + (i & 0x1 == 0) ? 
randValuesVerifyXmulY : randValuesVerifyYmulX, hptrC, + matrixDim); + } + sdkStopTimer(&cpuAccessTimer); + cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer); + sdkResetTimer(&cpuAccessTimer); + overallTimes[i] = cpuAccessTimes[i] + gpuLaunchTransferSyncTimes[i]; + } + + switch (allocType) { + case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY: + case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC: + free(hptrA); + free(hptrB); + free(hptrC); + checkCudaErrors(cudaFree(dptrA)); + checkCudaErrors(cudaFree(dptrB)); + checkCudaErrors(cudaFree(dptrC)); + break; + + case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY: + case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC: + checkCudaErrors(cudaFreeHost(hptrA)); + checkCudaErrors(cudaFreeHost(hptrB)); + checkCudaErrors(cudaFreeHost(hptrC)); + checkCudaErrors(cudaFree(dptrA)); + checkCudaErrors(cudaFree(dptrB)); + checkCudaErrors(cudaFree(dptrC)); + break; + + case USE_ZERO_COPY: + checkCudaErrors(cudaFreeHost(hptrA)); + checkCudaErrors(cudaFreeHost(hptrB)); + checkCudaErrors(cudaFreeHost(hptrC)); + break; + + case USE_MANAGED_MEMORY: + case USE_MANAGED_MEMORY_WITH_HINTS: + case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC: + checkCudaErrors(cudaFree(dptrA)); + checkCudaErrors(cudaFree(dptrB)); + checkCudaErrors(cudaFree(dptrC)); + break; + + default: + exit(EXIT_FAILURE); // exit due to error + } + + checkCudaErrors(cudaStreamDestroy(streamToRunOn)); + checkCudaErrors(cudaFreeHost(latch)); + free(randValuesX); + free(randValuesY); + free(randValuesVerifyXmulY); + free(randValuesVerifyYmulX); + sdkDeleteTimer(&gpuLaunchCallsTimer); + sdkDeleteTimer(&gpuTransferCallsTimer); + sdkDeleteTimer(&gpuSyncTimer); + sdkDeleteTimer(&cpuAccessTimer); +} + +void matrixMultiplyPerfRunner(bool reportAsBandwidth, + bool print_launch_transfer_results, + bool print_std_deviation, int device_id) { + int i; + unsigned int minMatrixDim = 32; + unsigned int multiplierDim = 2; + unsigned int matrixDim; + unsigned int minSize = minMatrixDim * minMatrixDim * 
sizeof(float); + unsigned int maxSize = + (maxSampleSizeInMb * ONE_MB) / + 4; // 3 buffers are used, but dividing by 4 (power of 2) + unsigned int multiplier = multiplierDim * multiplierDim; + unsigned int numSizesToTest; + + struct testResults *results; + struct resultsData *gpuLaunchCallsTimes; + struct resultsData *gpuTransferToCallsTimes; + struct resultsData *gpuTransferFromCallsTimes; + struct resultsData *gpuLaunchAndTransferCallsTimes; + struct resultsData *gpuLaunchTransferSyncTimes; + struct resultsData *cpuAccessTimes; + struct resultsData *overallTimes; + unsigned long *sizesToTest; + unsigned int j; + + numSizesToTest = findNumSizesToTest(minSize, maxSize, multiplier); + + createAndInitTestResults(&results, "matrixMultiplyPerf", numKernelRuns, + numSizesToTest); + + sizesToTest = getPtrSizesToTest(results); + + createResultDataAndAddToTestResults(&gpuLaunchCallsTimes, results, + "GPU Kernel Launch Call Time", false, + reportAsBandwidth); + createResultDataAndAddToTestResults(&gpuTransferToCallsTimes, results, + "CPU to GPU Transfer Calls Time", false, + reportAsBandwidth); + createResultDataAndAddToTestResults(&gpuTransferFromCallsTimes, results, + "GPU to CPU Transfer Calls Time", false, + reportAsBandwidth); + createResultDataAndAddToTestResults(&gpuLaunchAndTransferCallsTimes, results, + "GPU Launch and Transfer Calls Time", + false, reportAsBandwidth); + createResultDataAndAddToTestResults(&gpuLaunchTransferSyncTimes, results, + "GPU Launch Transfer and Sync Time", + false, reportAsBandwidth); + createResultDataAndAddToTestResults( + &cpuAccessTimes, results, "CPU Access Time", false, reportAsBandwidth); + createResultDataAndAddToTestResults(&overallTimes, results, "Overall Time", + false, reportAsBandwidth); + + printf("Running "); + for (matrixDim = minMatrixDim, j = 0; + matrixDim * matrixDim <= maxSize / sizeof(float); + matrixDim *= multiplierDim, ++j) { + sizesToTest[j] = matrixDim * matrixDim * sizeof(float); + for (i = MEMALLOC_TYPE_START; 
i <= MEMALLOC_TYPE_END; i++) { + printf("."); + fflush(stdout); + runMatrixMultiplyKernel( + matrixDim, i, numKernelRuns, + getPtrRunTimesInMs(gpuLaunchCallsTimes, i, j), + getPtrRunTimesInMs(gpuTransferToCallsTimes, i, j), + getPtrRunTimesInMs(gpuTransferFromCallsTimes, i, j), + getPtrRunTimesInMs(gpuLaunchAndTransferCallsTimes, i, j), + getPtrRunTimesInMs(gpuLaunchTransferSyncTimes, i, j), + getPtrRunTimesInMs(cpuAccessTimes, i, j), + getPtrRunTimesInMs(overallTimes, i, j), device_id); + } + } + printf("\n"); + printResults(results, print_launch_transfer_results, print_std_deviation); + freeTestResultsAndAllResultsData(results); +} + +static void usage() { + printf( + "./cudaMemoryTypesPerf [-device=] [-reportAsBandwidth] " + "[-print-launch-transfer-results] [-print-std-deviation] [-verbose]\n"); + printf("Options:\n"); + printf( + "-reportAsBandwidth: By default time taken is printed, this " + "option allows to instead print bandwidth.\n"); + printf( + "-print-launch-transfer-results: By default overall results are printed, " + "this option allows to print data transfers and kernel time as well.\n"); + printf( + "-print-std-deviation: Prints std deviation of the results.\n"); + printf( + "-kernel-iterations=: Number of times the kernel tests should " + "be run[default is 100 iterations].\n"); + printf( + "-device=: Allows to pass GPU Device ID on which " + "the tests will be run.\n"); + printf("-verbose: Prints highly verbose output.\n"); +} + +int main(int argc, char **argv) { + bool reportAsBandwidth = false; + bool print_launch_transfer_results = false; + bool print_std_deviation = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "help") || + checkCmdLineFlag(argc, (const char **)argv, "h")) { + usage(); + printf("&&&& %s WAIVED\n", argv[0]); + exit(EXIT_WAIVED); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "reportAsBandwidth")) { + reportAsBandwidth = true; + } + + if (checkCmdLineFlag(argc, (const char **)argv, + 
"print-launch-transfer-results")) { + print_launch_transfer_results = true; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "print-std-deviation")) { + print_std_deviation = true; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "kernel-iterations")) { + numKernelRuns = + getCmdLineArgumentInt(argc, (const char **)argv, "kernel-iterations"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) { + verboseResults = 1; + } + + int device_id = findCudaDevice(argc, (const char **)argv); + + matrixMultiplyPerfRunner(reportAsBandwidth, print_launch_transfer_results, + print_std_deviation, device_id); + + printf( + "\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n"); + exit(EXIT_SUCCESS); +} diff --git a/Samples/conjugateGradientCudaGraphs/Makefile b/Samples/conjugateGradientCudaGraphs/Makefile new file mode 100644 index 00000000..0130308e --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/Makefile @@ -0,0 +1,302 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= 
$(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 
+ ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release 
+endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 75 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +LIBRARIES += -lcublas_static -lcusparse_static -lculibos + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: conjugateGradientCudaGraphs + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +conjugateGradientCudaGraphs.o:conjugateGradientCudaGraphs.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +conjugateGradientCudaGraphs: conjugateGradientCudaGraphs.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ 
../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./conjugateGradientCudaGraphs + +clean: + rm -f conjugateGradientCudaGraphs conjugateGradientCudaGraphs.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/conjugateGradientCudaGraphs + +clobber: clean diff --git a/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml b/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml new file mode 100644 index 00000000..8d06fc36 --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml @@ -0,0 +1,84 @@ + + + + conjugateGradientCudaGraphs + + cudaStreamBeginCapture + cudaStreamEndCapture + cudaGraphCreate + cudaGraphLaunch + cudaGraphInstantiate + cudaGraphExecDestroy + cudaGraphDestroy + + + whole + + ./ + ../ + ../../common/inc + + + Linear Algebra + CUBLAS Library + CUSPARSE Library + + + CUDA + CUBLAS + CUSPARSE + Sparse Matrix + + + cublas_static + cusparse_static + culibos + + + + true + conjugateGradientCudaGraphs.cu + + CUBLAS + CUSPARSE + + + 1:CUDA Advanced Topics + 3:Linear Algebra + 1:CUDA Graphs + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm75 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + all + + Conjugate Gradient using Cuda Graphs + exe + diff --git a/Samples/conjugateGradientCudaGraphs/README.md b/Samples/conjugateGradientCudaGraphs/README.md new file mode 100644 index 00000000..5a829f54 --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/README.md @@ -0,0 +1,98 @@ +# conjugateGradientCudaGraphs - Conjugate Gradient using Cuda Graphs + +## Description + +This sample implements a conjugate gradient solver on GPU using CUBLAS and CUSPARSE library calls captured and called using CUDA Graph APIs. 
+ +## Key Concepts + +Linear Algebra, CUBLAS Library, CUSPARSE Library + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, cudaGraphInstantiate, cudaGraphExecDestroy, cudaGraphDestroy + +## Dependencies needed to build/run +[CUBLAS](../../README.md#cublas), [CUSPARSE](../../README.md#cusparse) + +## Prerequisites + +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs<version>.sln - for Visual Studio <version> +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
+ +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd <sample_dir> +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu new file mode 100644 index 00000000..b6b83fba --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu @@ -0,0 +1,466 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +/* + * This sample implements a conjugate gradient solver on GPU + * using CUBLAS and CUSPARSE with CUDA Graphs + * + */ + +// includes, system +#include +#include +#include + +/* Using updated (v2) interfaces to cublas */ +#include +#include +#include + +#include + +// Utilities and system includes +#include // helper function CUDA error checking and initialization +#include // helper for shared functions common to CUDA Samples + +namespace cg = cooperative_groups; + +const char *sSDKname = "conjugateGradientCudaGraphs"; + +#ifndef WITH_GRAPH +#define WITH_GRAPH 1 +#endif + +/* genTridiag: generate a random tridiagonal symmetric matrix */ +void genTridiag(int *I, int *J, float *val, int N, int nz) { + I[0] = 0, J[0] = 0, J[1] = 1; + val[0] = (float)rand() / RAND_MAX + 10.0f; + val[1] = (float)rand() / RAND_MAX; + int start; + + for (int i = 1; i < N; i++) { + if (i > 1) { + I[i] = I[i - 1] + 3; + } else { + I[1] = 2; + } + + start = (i - 1) * 3 + 2; + J[start] = i - 1; + J[start + 1] = i; + + if (i < N - 1) { + J[start + 2] = i + 1; + } + + val[start] = val[start - 1]; + val[start + 1] = (float)rand() / RAND_MAX + 10.0f; + + if (i < N - 1) { + val[start + 2] = (float)rand() / RAND_MAX; + } + } + + I[N] = nz; +} + +__global__ void initVectors(float *rhs, float *x, int N) { + size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + + for (size_t i = gid; i < N; i += gridDim.x * blockDim.x) { + rhs[i] = 1.0; + x[i] = 0.0; + } +} + +__global__ void gpuDotProduct(float *vecA, float *vecB, float *result, + int size) { + cg::thread_block cta = cg::this_thread_block(); + + int gid = blockIdx.x * blockDim.x + threadIdx.x; + extern __shared__ double tmp[]; + + double temp_sum = 0.0; + for (int i = gid; i < size; i += gridDim.x * blockDim.x) { + temp_sum += (double)(vecA[i] * vecB[i]); + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + double beta = temp_sum; + double temp; + + for (int 
i = tile32.size() / 2; i > 0; i >>= 1) { + if (tile32.thread_rank() < i) { + temp = tmp[cta.thread_rank() + i]; + beta += temp; + tmp[cta.thread_rank()] = beta; + } + cg::sync(tile32); + } + cg::sync(cta); + + if (cta.thread_rank() == 0) { + beta = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) { + beta += tmp[i]; + } + atomicAdd(result, (float)beta); + } +} + +__global__ void gpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, + float alpha, float *inputVecX, float *outputVecY) { + size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + for (size_t i = gid; i < num_rows; i += blockDim.x * gridDim.x) { + int row_elem = I[i]; + int next_row_elem = I[i + 1]; + int num_elems_this_row = next_row_elem - row_elem; + + float output = 0.0; + for (int j = 0; j < num_elems_this_row; j++) { + output += alpha * val[row_elem + j] * inputVecX[J[row_elem + j]]; + } + + outputVecY[i] = output; + } +} + +__global__ void r1_div_x(float *r1, float *r0, float *b) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid == 0) { + b[0] = r1[0] / r0[0]; + } +} + +__global__ void a_minus(float *a, float *na) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid == 0) { + na[0] = -(a[0]); + } +} + +int main(int argc, char **argv) { + int N = 0, nz = 0, *I = NULL, *J = NULL; + float *val = NULL; + const float tol = 1e-5f; + const int max_iter = 10000; + float *x; + float *rhs; + float r1; + + int *d_col, *d_row; + float *d_val, *d_x; + float *d_r, *d_p, *d_Ax; + int k; + float alpha, beta, alpham1; + + cudaStream_t stream1, streamForGraph; + + // This will pick the best possible CUDA capable device + cudaDeviceProp deviceProp; + int devID = findCudaDevice(argc, (const char **)argv); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } + + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + + // Statistics about the GPU device + printf( + "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", + 
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); + + /* Generate a random tridiagonal symmetric matrix in CSR format */ + N = 1048576; + nz = (N - 2) * 3 + 4; + I = (int *)malloc(sizeof(int) * (N + 1)); + J = (int *)malloc(sizeof(int) * nz); + val = (float *)malloc(sizeof(float) * nz); + genTridiag(I, J, val, N, nz); + + x = (float *)malloc(sizeof(float) * N); + rhs = (float *)malloc(sizeof(float) * N); + + for (int i = 0; i < N; i++) { + rhs[i] = 1.0; + x[i] = 0.0; + } + + /* Get handle to the CUBLAS context */ + cublasHandle_t cublasHandle = 0; + cublasStatus_t cublasStatus; + cublasStatus = cublasCreate(&cublasHandle); + + checkCudaErrors(cublasStatus); + + /* Get handle to the CUSPARSE context */ + cusparseHandle_t cusparseHandle = 0; + cusparseStatus_t cusparseStatus; + cusparseStatus = cusparseCreate(&cusparseHandle); + + checkCudaErrors(cusparseStatus); + + checkCudaErrors(cudaStreamCreate(&stream1)); + + checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_r, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_p, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Ax, N * sizeof(float))); + + float *d_r1, *d_r0, *d_dot, *d_a, *d_na, *d_b; + checkCudaErrors(cudaMalloc((void **)&d_r1, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_r0, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_dot, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_a, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_na, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(float))); + + cusparseMatDescr_t descr = 0; + checkCudaErrors(cusparseCreateMatDescr(&descr)); + + checkCudaErrors(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + 
checkCudaErrors(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + + int numBlocks = 0, blockSize = 0, numBlocks2 = 0, blockSize2 = 0; + checkCudaErrors( + cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, initVectors)); + + checkCudaErrors(cudaMemcpyAsync(d_col, J, nz * sizeof(int), + cudaMemcpyHostToDevice, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_row, I, (N + 1) * sizeof(int), + cudaMemcpyHostToDevice, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_val, val, nz * sizeof(float), + cudaMemcpyHostToDevice, stream1)); + + initVectors<<>>(d_r, d_x, N); + + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&numBlocks2, &blockSize2, + gpuSpMV)); + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, + gpuDotProduct)); + + alpha = 1.0; + alpham1 = -1.0; + beta = 0.0; + + checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); + checkCudaErrors( + cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, + &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax)); + + checkCudaErrors(cublasSetStream(cublasHandle, stream1)); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1)); + + checkCudaErrors( + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE)); + checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); + + k = 1; + // First Iteration when k=1 starts + checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1)); + checkCudaErrors( + cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, + &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax)); + + checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); + + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); + + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); + + a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); + + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); + + checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), + 
cudaMemcpyDeviceToDevice, stream1)); + + checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); + + checkCudaErrors(cudaMemcpyAsync(&r1, d_r1, sizeof(float), + cudaMemcpyDeviceToHost, stream1)); + checkCudaErrors(cudaStreamSynchronize(stream1)); + printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); + // First Iteration when k=1 ends + k++; + +#if WITH_GRAPH + cudaGraph_t initGraph; + checkCudaErrors(cudaStreamCreate(&streamForGraph)); + checkCudaErrors(cublasSetStream(cublasHandle, stream1)); + checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); + checkCudaErrors(cudaStreamBeginCapture(stream1)); + + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1)); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + +#if 0 // Use cusparseScsrmv API when it is cuda graph compliant + checkCudaErrors( + cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST)); + checkCudaErrors( + cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, + &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax)); +#else + gpuSpMV<<>>(d_row, d_col, d_val, nz, + N, alpha, d_p, d_Ax); +#endif + + checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1)); + // Use cublasSdot API when it is cuda graph compliant. 
+ // checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); + gpuDotProduct<<>>( + d_p, d_Ax, d_dot, N); + + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); + + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); + + a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); + + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); + + checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), + cudaMemcpyDeviceToDevice, stream1)); + checkCudaErrors(cudaMemsetAsync(d_r1, 0, sizeof(float), stream1)); + // Use cublasSdot API when it is cuda graph compliant. + // checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); + gpuDotProduct<<>>( + d_r, d_r, d_r1, N); + checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float), + cudaMemcpyDeviceToHost, stream1)); + + checkCudaErrors(cudaStreamEndCapture(stream1, &initGraph)); + cudaGraphExec_t graphExec; + checkCudaErrors(cudaGraphInstantiate(&graphExec, initGraph, NULL, NULL, 0)); +#endif + + checkCudaErrors(cublasSetStream(cublasHandle, stream1)); + checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); + + while (r1 > tol * tol && k <= max_iter) { +#if WITH_GRAPH + checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); +#else + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1)); + + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); + + checkCudaErrors(cusparseScsrmv( + cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, + descr, d_val, d_row, d_col, d_p, &beta, d_Ax)); + + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); + + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); + + 
checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); + + a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); + + checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), + cudaMemcpyDeviceToDevice, stream1)); + + checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); + checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float), + cudaMemcpyDeviceToHost, stream1)); + checkCudaErrors(cudaStreamSynchronize(stream1)); +#endif + printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); + k++; + } + +#if WITH_GRAPH + checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float), + cudaMemcpyDeviceToHost, streamForGraph)); + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); +#else + checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float), + cudaMemcpyDeviceToHost, stream1)); + checkCudaErrors(cudaStreamSynchronize(stream1)); +#endif + + float rsum, diff, err = 0.0; + + for (int i = 0; i < N; i++) { + rsum = 0.0; + + for (int j = I[i]; j < I[i + 1]; j++) { + rsum += val[j] * x[J[j]]; + } + + diff = fabs(rsum - rhs[i]); + + if (diff > err) { + err = diff; + } + } + +#if WITH_GRAPH + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphDestroy(initGraph)); + checkCudaErrors(cudaStreamDestroy(streamForGraph)); +#endif + checkCudaErrors(cudaStreamDestroy(stream1)); + cusparseDestroy(cusparseHandle); + cublasDestroy(cublasHandle); + + free(I); + free(J); + free(val); + free(x); + free(rhs); + cudaFree(d_col); + cudaFree(d_row); + cudaFree(d_val); + cudaFree(d_x); + cudaFree(d_r); + cudaFree(d_p); + cudaFree(d_Ax); + + printf("Test Summary: Error amount = %f\n", err); + exit((k <= max_iter) ? 
0 : 1); +} diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.sln b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.sln similarity index 78% rename from Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.sln rename to Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.sln index f34801c6..347c0144 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.sln +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiBlockCG", "conjugateGradientMultiBlockCG_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2010.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj similarity index 84% rename from Samples/simpleCUFFT/simpleCUFFT_vs2010.vcxproj rename to Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj index 97a37127..4a583fc9 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2010.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - simpleCUFFT_vs2010 - simpleCUFFT + conjugateGradientCudaGraphs_vs2012 + conjugateGradientCudaGraphs Application MultiByte + v110 true @@ -32,7 +33,7 @@ - + @@ -56,12 +57,12 @@ Console - 
cufft.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/simpleCUFFT.exe + $(OutDir)/conjugateGradientCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -96,11 +97,11 @@ - + - + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.sln b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.sln new file mode 100644 index 00000000..da17ae9e --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + 
GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj new file mode 100644 index 00000000..eeb90636 --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + conjugateGradientCudaGraphs_vs2013 + conjugateGradientCudaGraphs + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/conjugateGradientCudaGraphs.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.sln b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.sln new file mode 100644 index 00000000..2c1c5cde --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.sln @@ -0,0 +1,20 @@ + 
+Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj new file mode 100644 index 00000000..69312b05 --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + conjugateGradientCudaGraphs_vs2015 + conjugateGradientCudaGraphs + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + 
$(OutDir)/conjugateGradientCudaGraphs.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.sln b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.sln new file mode 100644 index 00000000..5b819947 --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj new file mode 100644 index 00000000..13de64fc --- /dev/null +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj @@ 
-0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + conjugateGradientCudaGraphs_vs2017 + conjugateGradientCudaGraphs + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/conjugateGradientCudaGraphs.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/conjugateGradientMultiBlockCG/Makefile b/Samples/conjugateGradientMultiBlockCG/Makefile index 9609d70b..2092da3b 100644 --- a/Samples/conjugateGradientMultiBlockCG/Makefile +++ b/Samples/conjugateGradientMultiBlockCG/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. 
+# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. 
Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -266,7 +264,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 60 61 70 +SMS ?= 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml b/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml index c730220c..067f3f07 100644 --- a/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml +++ b/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml @@ -42,6 +42,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/conjugateGradientMultiBlockCG/README.md 
b/Samples/conjugateGradientMultiBlockCG/README.md index 1c0924e0..101b692a 100644 --- a/Samples/conjugateGradientMultiBlockCG/README.md +++ b/Samples/conjugateGradientMultiBlockCG/README.md @@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiBlock Cooperative Group ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj index 0075650c..fdf29d91 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj index 8e0f6f87..3ff37342 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj index f821ff82..0a5ad150 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; 
-Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj index 7f3bf7e7..69e2bf9f 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/conjugateGradientMultiBlockCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/Makefile b/Samples/conjugateGradientMultiDeviceCG/Makefile index f6fb6065..cfeb8783 100644 --- a/Samples/conjugateGradientMultiDeviceCG/Makefile +++ b/Samples/conjugateGradientMultiDeviceCG/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
# -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -266,7 +264,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 60 61 70 +SMS ?= 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml index 7f60f0fc..b17237fc 100644 --- a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml +++ b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml @@ -49,6 +49,7 @@ sm60 sm61 sm70 + sm75 x86_64 @@ -58,6 +59,9 @@ ppc64le linux + + windows + 6.0 diff --git a/Samples/conjugateGradientMultiDeviceCG/README.md b/Samples/conjugateGradientMultiDeviceCG/README.md index 41bf4518..cc989f40 100644 --- a/Samples/conjugateGradientMultiDeviceCG/README.md +++ b/Samples/conjugateGradientMultiDeviceCG/README.md @@ -10,11 +10,11 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiDevice Cooperative Grou ## 
Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes -Linux +Linux, Windows ## Supported CPU Architecture @@ -30,11 +30,21 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + ### Linux The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: ``` diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu index 3d3c7076..f71233b4 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu @@ -414,7 +414,8 @@ void getIdenticalGPUs(int num_of_gpus, std::set &identicalGPUs) { deviceProp.minor != maxMajorMinor[1]) { identicalGPUs.erase(it); } - if (!deviceProp.cooperativeMultiDeviceLaunch) { + if (!deviceProp.cooperativeMultiDeviceLaunch || + !deviceProp.concurrentManagedAccess) { identicalGPUs.erase(it); } it++; @@ -449,7 +450,8 @@ int main(int argc, char **argv) { if (identicalGPUs.size() <= 1) { printf( "No Two or more GPUs with same architecture capable of " - "cooperativeMultiDeviceLaunch found. \nWaiving the sample\n"); + "cooperativeMultiDeviceLaunch & concurrentManagedAccess found. 
" + "\nWaiving the sample\n"); exit(EXIT_WAIVED); } @@ -617,9 +619,12 @@ int main(int argc, char **argv) { cudaCooperativeLaunchMultiDeviceNoPreSync | cudaCooperativeLaunchMultiDeviceNoPostSync)); - checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); - checkCudaErrors( - cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); + if (deviceProp.concurrentManagedAccess) { + checkCudaErrors( + cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); + checkCudaErrors( + cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); + } deviceId = identicalGPUs.begin(); device_count = 0; diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.sln b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.sln new file mode 100644 index 00000000..64ef36a2 --- /dev/null +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.vcxproj 
b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj similarity index 89% rename from Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.vcxproj rename to Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj index dba6bd6b..3e2d3377 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - conjugateGradientMultiBlockCG_vs2010 - conjugateGradientMultiBlockCG + conjugateGradientMultiDeviceCG_vs2012 + conjugateGradientMultiDeviceCG Application MultiByte + v110 true @@ -32,7 +33,7 @@ - + @@ -58,10 +59,10 @@ Console cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/conjugateGradientMultiBlockCG.exe + $(OutDir)/conjugateGradientMultiDeviceCG.exe - compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -97,11 +98,11 @@ - + - + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.sln b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.sln new file mode 100644 index 00000000..77fa91e8 --- /dev/null +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) 
= preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj new file mode 100644 index 00000000..c8afcd5e --- /dev/null +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + conjugateGradientMultiDeviceCG_vs2013 + conjugateGradientMultiDeviceCG + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/conjugateGradientMultiDeviceCG.exe + + + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + true + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded 
+ + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.sln b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.sln new file mode 100644 index 00000000..965ad779 --- /dev/null +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj new file mode 100644 index 00000000..8c4961b1 --- /dev/null +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + conjugateGradientMultiDeviceCG_vs2015 + conjugateGradientMultiDeviceCG + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + 
$(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/conjugateGradientMultiDeviceCG.exe + + + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + true + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.sln b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.sln new file mode 100644 index 00000000..caa50d74 --- /dev/null +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) 
= preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj new file mode 100644 index 00000000..a1198775 --- /dev/null +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj @@ -0,0 +1,109 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + conjugateGradientMultiDeviceCG_vs2017 + conjugateGradientMultiDeviceCG + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/conjugateGradientMultiDeviceCG.exe + + + compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + true + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/cudaTensorCoreGemm/Makefile b/Samples/cudaTensorCoreGemm/Makefile index 41a9dfb8..b44fe082 100644 --- a/Samples/cudaTensorCoreGemm/Makefile +++ b/Samples/cudaTensorCoreGemm/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
# -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 
227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -266,7 +264,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 70 +SMS ?= 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/cudaTensorCoreGemm/NsightEclipse.xml b/Samples/cudaTensorCoreGemm/NsightEclipse.xml index d51dd809..73020e63 100644 --- a/Samples/cudaTensorCoreGemm/NsightEclipse.xml +++ b/Samples/cudaTensorCoreGemm/NsightEclipse.xml @@ -43,6 +43,7 @@ In addition to that, it demonstrates the use of the new CUDA function attribute 1:CUDA Basic Topics sm70 + sm75 x86_64 
diff --git a/Samples/cudaTensorCoreGemm/README.md b/Samples/cudaTensorCoreGemm/README.md index 696e60f3..4cc8e332 100644 --- a/Samples/cudaTensorCoreGemm/README.md +++ b/Samples/cudaTensorCoreGemm/README.md @@ -14,7 +14,7 @@ Matrix Multiply, WMMA, Tensor Cores ## Supported SM Architectures -[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu index e564a74f..d2ce38ec 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu @@ -72,6 +72,21 @@ #include #include +// Externally configurable parameters. + +#ifndef CPU_DEBUG +// Set this to 1 to verify the correctness of the GPU-computed matrix. +#define CPU_DEBUG 0 +#endif + +#ifndef SHARED_MEMORY_LIMIT_64K +// Set this to 0 to use more than 64 Kb of shared memory to cache data, to +// improve the performance of the computations on GPU. +// Note that you need a GPU that can have more than 64 Kb of shared memory +// per multiprocessor. +#define SHARED_MEMORY_LIMIT_64K 1 +#endif + // GPU configuration. #define WARP_SIZE 32 @@ -82,6 +97,10 @@ #define N 16 #define K 16 +#define WMMA_M 16 +#define WMMA_N 16 +#define WMMA_K 16 + // GEMM configuration. 
#define M_TILES 256 @@ -99,7 +118,24 @@ #define WARPS_PER_BLOCK 8 #define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) +#if SHARED_MEMORY_LIMIT_64K +// With only 64 Kb shared memory available, we can fit two 8-tile chunks of +// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each +// (i.e. two 8x8 arrays of tiles of 16x16 half-typed elements per CTA). +// But we cannot account the 8 Kb total skew overhead, without which the +// performance would be severely impacted. So we choose to reduce the chunk size +// in half, i.e. the amount of A and B matrix data we cache in shared memory. +// Accordingly, this doubles the number of outer iterations across the global K +// dimension, which only slightly impacts the performance. +#define CHUNK_K 4 +#else #define CHUNK_K 8 +#endif + +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(half)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -194,14 +230,14 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C, const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; // This pointer is used to access the C and D matrix tiles this warp computes. - float *shmem_warp_tile_ptr = reinterpret_cast( - &shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * K * 2 + - (warpId % 2) * SHMEM_OFFSET); + float *shmem_warp_tile_ptr = (float *)&shmem[0][0] + + (warpId / 2) * SHMEM_STRIDE * K * 2 + + (warpId % 2) * SHMEM_OFFSET; // This pointer is used to stream the C and D matrices block-wide tile to and // from shared memory. float *shmem_warp_stream_ptr = - reinterpret_cast(&shmem[0][0] + warpId * SHMEM_STRIDE * K); + (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * K; // Adjust the beta scaler, as it'll be multiplied by alpha at the end of // each tile computation. 
Technically this is not generally correct (may @@ -292,23 +328,24 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C, // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K + - (laneId / (WARP_SIZE / 2)) * K_GLOBAL) + - (laneId % (WARP_SIZE / 2)); + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) + + (laneId % CHUNK_COPY_LINE_LANES); // Shift the second half of the warp to the next row / column in the // shared memory. - shmem_idx += laneId / (WARP_SIZE / 2); + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; #pragma unroll - for (int i = 0; i < (WARP_SIZE / 2); i++) { + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; + i++) { // Copy 16 bytes at once in each lane. - *((int4 *)&shmem[shmem_idx][0] + (laneId % (WARP_SIZE / 2))) = + *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *lane_ptr; // Advance the global memory pointer and the shared memory index. - lane_ptr = reinterpret_cast( - reinterpret_cast(lane_ptr + K_GLOBAL * 2)); - shmem_idx += 2; + lane_ptr = + (int4 *)((half *)lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); + shmem_idx += CHUNK_COPY_LINES_PER_WARP; } __syncthreads(); @@ -374,17 +411,98 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C, #pragma unroll for (int i = 0; i < K; i++) { - *(reinterpret_cast(dst_gmem_warp_stream_ptr + - GLOBAL_MEM_STRIDE * i) + - laneId) = - *(reinterpret_cast(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + - laneId); + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); } __syncthreads(); } } +// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming: +// 1) Matrices are packed in memory. +// 2) M, N and K are multiples of 16. +// 3) Neither A nor B are transposed. +// Note: This is a less performant version of the compute_gemm kernel. 
It is +// designed for +// demonstration purposes only to show the CUDA WMMA API use without +// relying on availability of the shared memory. +__global__ void simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld, + int n_ld, int k_ld, float alpha, float beta) { + // Leading dimensions. Packed with no transpositions. + int lda = m_ld; + int ldb = k_ld; + int ldc = n_ld; + + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Declare the fragments + wmma::fragment + a_frag; + wmma::fragment + b_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; + + wmma::fill_fragment(acc_frag, 0.0f); + + // Loop over k + for (int i = 0; i < k_ld; i += WMMA_K) { + int aCol = i; + int aRow = warpM * WMMA_M; + + int bCol = i; + int bRow = warpN * WMMA_N; + + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb); + + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } + } + + // Load in the current value of c, scale it by beta, and add this our result + // scaled by alpha + int cCol = warpN * WMMA_N; + int cRow = warpM * WMMA_M; + + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, + wmma::mem_row_major); + + for (int i = 0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } + + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, + wmma::mem_row_major); + } +} + +__host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha, + float beta, int numARows, int numAColumns, + int numBRows, int numBColumns, int numCRows, + int numCColumns) { + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + float 
temp = 0.0; + + for (int k = 0; k < numAColumns; k++) { + temp += A[i * numAColumns + k] * B[j * numBRows + k]; + } + + C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + } + } +} + int main(int argc, char **argv) { printf("Initializing...\n"); @@ -408,6 +526,10 @@ int main(int argc, char **argv) { float *A_h = NULL; float *B_h = NULL; float *C_h = NULL; +#if CPU_DEBUG + float *result_hD = NULL; + float *result_host = NULL; +#endif checkCudaErrors(cudaMallocManaged(reinterpret_cast(&A_h), sizeof(float) * M_GLOBAL * K_GLOBAL)); @@ -415,6 +537,12 @@ int main(int argc, char **argv) { sizeof(float) * K_GLOBAL * N_GLOBAL)); checkCudaErrors(cudaMallocManaged(reinterpret_cast(&C_h), sizeof(float) * M_GLOBAL * N_GLOBAL)); +#if CPU_DEBUG + checkCudaErrors(cudaMallocManaged((void **)&result_hD, + sizeof(float) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMallocManaged((void **)&result_host, + sizeof(float) * M_GLOBAL * N_GLOBAL)); +#endif half *A = NULL; half *B = NULL; @@ -446,16 +574,22 @@ int main(int argc, char **argv) { checkCudaErrors(cudaDeviceSynchronize()); enum { - SHMEM_SZ = - sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2 + // Compute the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache + // per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the + // maximum of those + // two numbers. 
+ SHMEM_SZ = MAX( + sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2, + M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * + (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float)) }; printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); - checkCudaErrors(cudaFuncSetAttribute( - compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); - - printf("Computing...\n"); + const float alpha = 1.1f; + const float beta = 1.2f; cudaEvent_t start, stop; @@ -463,16 +597,61 @@ int main(int argc, char **argv) { checkCudaErrors(cudaEventCreate(&stop)); checkCudaErrors(cudaEventRecord(start)); - const float alpha = 1.1f; - const float beta = 1.2f; + // If enough shared memory available on the GPU use high performant kernel + if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) { + printf("Computing... using high performance kernel compute_gemm \n"); - checkKernelErrors( - (compute_gemm<<>>(A, B, C, D, alpha, beta))); + checkCudaErrors(cudaFuncSetAttribute( + compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors( + (compute_gemm<<>>(A, B, C, D, alpha, beta))); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, + sizeof(float) * M_GLOBAL * N_GLOBAL, + cudaMemcpyDeviceToHost)); +#endif + } else { + dim3 gridDim; + dim3 blockDim; + + // blockDim.x must be a multple of warpSize + // 128x4 means we have 16 warps and a block computes a 64x64 output tile + blockDim.x = 128; + blockDim.y = 4; + + gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / + (WMMA_M * blockDim.x / 32); + gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y); + + printf("Computing... 
using simple_wmma_gemm kernel\n"); + simple_wmma_gemm<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, + K_GLOBAL, alpha, beta); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, + sizeof(float) * M_GLOBAL * N_GLOBAL, + cudaMemcpyDeviceToHost)); +#endif + } checkCudaErrors(cudaEventRecord(stop)); checkCudaErrors(cudaEventSynchronize(stop)); +#if CPU_DEBUG + printf("Verifying correctness of the computations...\n"); + + memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL); + + matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, + K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); + + for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { + if (fabs(result_hD[i] - result_host[i]) > 0.1f) + printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], + result_host[i]); + } +#endif + float milliseconds = 0; checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj index 577cd582..bf77cf93 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70; + compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj index 28e9b598..d8afbc90 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70; + compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git 
a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj index 176e377b..b6e3fb60 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70; + compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj index 70bb2e83..da8345fc 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/cudaTensorCoreGemm.exe - compute_70,sm_70; + compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/deviceQuery/Makefile b/Samples/deviceQuery/Makefile index d332a397..09ef57d6 100644 --- a/Samples/deviceQuery/Makefile +++ b/Samples/deviceQuery/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -248,7 +246,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 30 35 37 50 52 60 61 70 +SMS ?= 30 35 37 50 52 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/deviceQuery/NsightEclipse.xml b/Samples/deviceQuery/NsightEclipse.xml index 6c26a3ed..e93f0864 100644 --- a/Samples/deviceQuery/NsightEclipse.xml +++ b/Samples/deviceQuery/NsightEclipse.xml @@ -39,6 +39,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/deviceQuery/README.md b/Samples/deviceQuery/README.md index 5b2910dd..29182b74 100644 --- a/Samples/deviceQuery/README.md +++ b/Samples/deviceQuery/README.md @@ -10,7 +10,7 @@ CUDA Runtime API, Device Query ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) 
[SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj index c827a82f..43281e98 100644 --- a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj index f01ab104..73e0e3ee 100644 --- a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj index 3b5128bd..60fb078b 100644 --- a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + 
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj index fe6e0cc1..b6c0f478 100644 --- a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/deviceQuery.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/matrixMul/Makefile b/Samples/matrixMul/Makefile index d2d2eb90..e5ade9c2 100644 --- a/Samples/matrixMul/Makefile +++ b/Samples/matrixMul/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -248,7 +246,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 30 35 37 50 52 60 61 70 +SMS ?= 30 35 37 50 52 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/matrixMul/NsightEclipse.xml b/Samples/matrixMul/NsightEclipse.xml index 4132ceb3..38ea6b03 100644 --- a/Samples/matrixMul/NsightEclipse.xml +++ b/Samples/matrixMul/NsightEclipse.xml @@ -46,6 +46,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/matrixMul/README.md b/Samples/matrixMul/README.md index c6180676..ae3bdf71 100644 --- a/Samples/matrixMul/README.md +++ b/Samples/matrixMul/README.md @@ -10,7 +10,7 @@ CUDA Runtime API, Linear Algebra ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 
](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/matrixMul/matrixMul_vs2012.vcxproj b/Samples/matrixMul/matrixMul_vs2012.vcxproj index 660cc790..10980117 100644 --- a/Samples/matrixMul/matrixMul_vs2012.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2013.vcxproj b/Samples/matrixMul/matrixMul_vs2013.vcxproj index fb59ec7e..fc8f1580 100644 --- a/Samples/matrixMul/matrixMul_vs2013.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2015.vcxproj b/Samples/matrixMul/matrixMul_vs2015.vcxproj index 88350f3a..135f764c 100644 --- a/Samples/matrixMul/matrixMul_vs2015.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" 
%(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2017.vcxproj b/Samples/matrixMul/matrixMul_vs2017.vcxproj index 7dcf0770..d145fa07 100644 --- a/Samples/matrixMul/matrixMul_vs2017.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/matrixMul.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/matrixMulDrv/Makefile b/Samples/matrixMulDrv/Makefile index fe6bc157..186de078 100644 --- a/Samples/matrixMulDrv/Makefile +++ b/Samples/matrixMulDrv/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. 
-# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ diff --git a/Samples/matrixMulDrv/README.md b/Samples/matrixMulDrv/README.md index a510b31c..e22bee66 100644 --- a/Samples/matrixMulDrv/README.md +++ b/Samples/matrixMulDrv/README.md @@ -10,7 +10,7 @@ CUDA Driver API, Matrix Multiply ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) 
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj index 10c5d339..e13d4bad 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj index 197edc7a..827374d7 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj index b4cc9ce1..6bff3567 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj index 14a248bb..bd5d4078 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -107,6 +107,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/Makefile b/Samples/p2pBandwidthLatencyTest/Makefile new file mode 100644 index 00000000..07a4c9d5 --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/Makefile @@ -0,0 +1,300 @@ 
+################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 75 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode 
arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: p2pBandwidthLatencyTest + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +p2pBandwidthLatencyTest.o:p2pBandwidthLatencyTest.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +p2pBandwidthLatencyTest: p2pBandwidthLatencyTest.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./p2pBandwidthLatencyTest + +clean: + rm -f p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/p2pBandwidthLatencyTest + +clobber: clean diff --git a/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml b/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml new file mode 100644 index 00000000..383a2972 --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml @@ -0,0 +1,77 @@ + + + + p2pBandwidthLatencyTest + + cudaDeviceCanAccessPeer + cudaDeviceEnablePeerAccess + cudaDeviceDisablePeerAccess + cudaEventCreateWithFlags + cudaEventElapsedTime + cudaMemcpy + + + whole + + ./ + ../ + ../../common/inc + + + Performance Strategies + Asynchronous Data Transfers + Unified Virtual Address Space + Peer to Peer Data Transfers + Multi-GPU + + + CUDA + Performance + multi-GPU support + peer to peer + + + + + + true + p2pBandwidthLatencyTest.cu + + 1:CUDA Basic Topics + 1:Performance Strategies + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm75 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + 
+ all + + Peer-to-Peer Bandwidth Latency Test with Multi-GPUs + exe + diff --git a/Samples/p2pBandwidthLatencyTest/README.md b/Samples/p2pBandwidthLatencyTest/README.md new file mode 100644 index 00000000..47f0ca01 --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/README.md @@ -0,0 +1,94 @@ +# p2pBandwidthLatencyTest - Peer-to-Peer Bandwidth Latency Test with Multi-GPUs + +## Description + +This application demonstrates the CUDA Peer-To-Peer (P2P) data transfers between pairs of GPUs and computes latency and bandwidth. Tests on GPU pairs using P2P and without P2P are tested. + +## Key Concepts + +Performance Strategies, Asynchronous Data Transfers, Unified Virtual Address Space, Peer to Peer Data Transfers, Multi-GPU + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess, cudaEventCreateWithFlags, cudaEventElapsedTime, cudaMemcpy + +## Prerequisites + +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu new file mode 100644 index 00000000..f2e0c1e5 --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -0,0 +1,682 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include + +#include +#include + +using namespace std; + +const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test"; + +typedef enum { + P2P_WRITE = 0, + P2P_READ = 1, +} P2PDataTransfer; + +typedef enum { + CE = 0, + SM = 1, +} P2PEngine; + +P2PEngine p2p_mechanism = CE; // By default use Copy Engine + +// Macro for checking cuda errors following a cuda launch or api call +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } +__global__ void delay(volatile int *flag, + unsigned long long timeout_clocks = 10000000) { + // Wait until the application notifies us that it has completed queuing up the + // experiment, or timeout and exit, allowing the application to make progress + long long int start_clock, sample_clock; + start_clock = clock64(); + + while (!*flag) { + sample_clock = clock64(); + + if (sample_clock - start_clock > timeout_clocks) { + break; + } + } +} + +// This kernel is for demonstration purposes only, not a performant kernel for +// p2p transfers. 
+__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src, + size_t num_elems) { + size_t globalId = blockIdx.x * blockDim.x + threadIdx.x; + size_t gridSize = blockDim.x * gridDim.x; + +#pragma unroll(5) + for (size_t i = globalId; i < num_elems; i += gridSize) { + dest[i] = src[i]; + } +} + +/////////////////////////////////////////////////////////////////////////// +// Print help screen +/////////////////////////////////////////////////////////////////////////// +void printHelp(void) { + printf("Usage: p2pBandwidthLatencyTest [OPTION]...\n"); + printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n"); + printf("\n"); + + printf("Options:\n"); + printf("--help\t\tDisplay this help menu\n"); + printf( + "--p2p_read\tUse P2P reads for data transfers between GPU pairs and show " + "corresponding results.\n \t\tDefault used is P2P write operation.\n"); + printf("--sm_copy\tUse SM intiated p2p transfers instead of Copy Engine\n"); +} + +void checkP2Paccess(int numGPUs) { + for (int i = 0; i < numGPUs; i++) { + cudaSetDevice(i); + cudaCheckError(); + + for (int j = 0; j < numGPUs; j++) { + int access; + if (i != j) { + cudaDeviceCanAccessPeer(&access, i, j); + cudaCheckError(); + printf("Device=%d %s Access Peer Device=%d\n", i, + access ? 
"CAN" : "CANNOT", j); + } + } + } + printf( + "\n***NOTE: In case a device doesn't have P2P access to other one, it " + "falls back to normal memcopy procedure.\nSo you can see lesser " + "Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n"); +} + +void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice, + int num_elems, int repeat, bool p2paccess, + cudaStream_t streamToRun) { + int blockSize = 0; + int numBlocks = 0; + + cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p); + cudaCheckError(); + + if (p2p_mechanism == SM && p2paccess) { + for (int r = 0; r < repeat; r++) { + copyp2p<<>>( + (int4 *)dest, (int4 *)src, num_elems / 4); + } + } else { + for (int r = 0; r < repeat; r++) { + cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice, + sizeof(int) * num_elems, streamToRun); + } + } +} + +void outputBandwidthMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) { + int numElems = 10000000; + int repeat = 5; + volatile int *flag = NULL; + vector buffers(numGPUs); + vector buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy + vector start(numGPUs); + vector stop(numGPUs); + vector stream(numGPUs); + + cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); + cudaCheckError(); + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); + cudaMalloc(&buffers[d], numElems * sizeof(int)); + cudaCheckError(); + cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); + cudaCheckError(); + cudaEventCreate(&start[d]); + cudaCheckError(); + cudaEventCreate(&stop[d]); + cudaCheckError(); + } + + vector bandwidthMatrix(numGPUs * numGPUs); + + for (int i = 0; i < numGPUs; i++) { + cudaSetDevice(i); + + for (int j = 0; j < numGPUs; j++) { + int access = 0; + if (p2p) { + cudaDeviceCanAccessPeer(&access, i, j); + if (access) { + cudaDeviceEnablePeerAccess(j, 0); + cudaCheckError(); + cudaSetDevice(j); + cudaCheckError(); + 
cudaDeviceEnablePeerAccess(i, 0); + cudaCheckError(); + cudaSetDevice(i); + cudaCheckError(); + } + } + + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + // Block the stream until all the work is queued up + // DANGER! - cudaMemcpy*Async may infinitely block waiting for + // room to push the operation, so keep the number of repeatitions + // relatively low. Higher repeatitions will cause the delay kernel + // to timeout and lead to unstable results. + *flag = 0; + delay<<<1, 1, 0, stream[i]>>>(flag); + cudaCheckError(); + cudaEventRecord(start[i], stream[i]); + cudaCheckError(); + + if (i == j) { + // Perform intra-GPU, D2D copies + performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, + access, stream[i]); + + } else { + if (p2p_method == P2P_WRITE) { + performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, + stream[i]); + } else { + performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, + stream[i]); + } + } + + cudaEventRecord(stop[i], stream[i]); + cudaCheckError(); + + // Release the queued events + *flag = 1; + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + float time_ms; + cudaEventElapsedTime(&time_ms, start[i], stop[i]); + double time_s = time_ms / 1e3; + + double gb = numElems * sizeof(int) * repeat / (double)1e9; + if (i == j) { + gb *= 2; // must count both the read and the write here + } + bandwidthMatrix[i * numGPUs + j] = gb / time_s; + if (p2p && access) { + cudaDeviceDisablePeerAccess(j); + cudaSetDevice(j); + cudaDeviceDisablePeerAccess(i); + cudaSetDevice(i); + cudaCheckError(); + } + } + } + + printf(" D\\D"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; j++) { + printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaFree(buffers[d]); + cudaFree(buffersD2D[d]); + 
cudaCheckError(); + cudaEventDestroy(start[d]); + cudaCheckError(); + cudaEventDestroy(stop[d]); + cudaCheckError(); + cudaStreamDestroy(stream[d]); + cudaCheckError(); + } + + cudaFreeHost((void *)flag); + cudaCheckError(); +} + +void outputBidirectionalBandwidthMatrix(int numGPUs, bool p2p) { + int numElems = 10000000; + int repeat = 5; + volatile int *flag = NULL; + vector buffers(numGPUs); + vector buffersD2D(numGPUs); + vector start(numGPUs); + vector stop(numGPUs); + vector stream0(numGPUs); + vector stream1(numGPUs); + + cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); + cudaCheckError(); + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaMalloc(&buffers[d], numElems * sizeof(int)); + cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); + cudaCheckError(); + cudaEventCreate(&start[d]); + cudaCheckError(); + cudaEventCreate(&stop[d]); + cudaCheckError(); + cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking); + cudaCheckError(); + cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking); + cudaCheckError(); + } + + vector bandwidthMatrix(numGPUs * numGPUs); + + for (int i = 0; i < numGPUs; i++) { + cudaSetDevice(i); + + for (int j = 0; j < numGPUs; j++) { + int access = 0; + if (p2p) { + cudaDeviceCanAccessPeer(&access, i, j); + if (access) { + cudaSetDevice(i); + cudaDeviceEnablePeerAccess(j, 0); + cudaCheckError(); + cudaSetDevice(j); + cudaDeviceEnablePeerAccess(i, 0); + cudaCheckError(); + } + } + + cudaSetDevice(i); + cudaStreamSynchronize(stream0[i]); + cudaStreamSynchronize(stream1[j]); + cudaCheckError(); + + // Block the stream until all the work is queued up + // DANGER! - cudaMemcpy*Async may infinitely block waiting for + // room to push the operation, so keep the number of repeatitions + // relatively low. Higher repeatitions will cause the delay kernel + // to timeout and lead to unstable results. 
+ *flag = 0; + cudaSetDevice(i); + // No need to block stream1 since it'll be blocked on stream0's event + delay<<<1, 1, 0, stream0[i]>>>(flag); + cudaCheckError(); + + // Force stream1 not to start until stream0 does, in order to ensure + // the events on stream0 fully encompass the time needed for all + // operations + cudaEventRecord(start[i], stream0[i]); + cudaStreamWaitEvent(stream1[j], start[i], 0); + + if (i == j) { + // For intra-GPU perform 2 memcopies buffersD2D <-> buffers + performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, + access, stream0[i]); + performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat, + access, stream1[i]); + } else { + if (access && p2p_mechanism == SM) { + cudaSetDevice(j); + } + performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, + stream1[j]); + if (access && p2p_mechanism == SM) { + cudaSetDevice(i); + } + performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, + stream0[i]); + } + + // Notify stream0 that stream1 is complete and record the time of + // the total transaction + cudaEventRecord(stop[j], stream1[j]); + cudaStreamWaitEvent(stream0[i], stop[j], 0); + cudaEventRecord(stop[i], stream0[i]); + + // Release the queued operations + *flag = 1; + cudaStreamSynchronize(stream0[i]); + cudaStreamSynchronize(stream1[j]); + cudaCheckError(); + + float time_ms; + cudaEventElapsedTime(&time_ms, start[i], stop[i]); + double time_s = time_ms / 1e3; + + double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9; + if (i == j) { + gb *= 2; // must count both the read and the write here + } + bandwidthMatrix[i * numGPUs + j] = gb / time_s; + if (p2p && access) { + cudaSetDevice(i); + cudaDeviceDisablePeerAccess(j); + cudaSetDevice(j); + cudaDeviceDisablePeerAccess(i); + } + } + } + + printf(" D\\D"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; 
j++) { + printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaFree(buffers[d]); + cudaFree(buffersD2D[d]); + cudaCheckError(); + cudaEventDestroy(start[d]); + cudaCheckError(); + cudaEventDestroy(stop[d]); + cudaCheckError(); + cudaStreamDestroy(stream0[d]); + cudaCheckError(); + cudaStreamDestroy(stream1[d]); + cudaCheckError(); + } + + cudaFreeHost((void *)flag); + cudaCheckError(); +} + +void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) { + int repeat = 100; + volatile int *flag = NULL; + StopWatchInterface *stopWatch = NULL; + vector buffers(numGPUs); + vector buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy + vector stream(numGPUs); + vector start(numGPUs); + vector stop(numGPUs); + + cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); + cudaCheckError(); + + if (!sdkCreateTimer(&stopWatch)) { + printf("Failed to create stop watch\n"); + exit(EXIT_FAILURE); + } + sdkStartTimer(&stopWatch); + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); + cudaMalloc(&buffers[d], sizeof(int)); + cudaMalloc(&buffersD2D[d], sizeof(int)); + cudaCheckError(); + cudaEventCreate(&start[d]); + cudaCheckError(); + cudaEventCreate(&stop[d]); + cudaCheckError(); + } + + vector gpuLatencyMatrix(numGPUs * numGPUs); + vector cpuLatencyMatrix(numGPUs * numGPUs); + + for (int i = 0; i < numGPUs; i++) { + cudaSetDevice(i); + + for (int j = 0; j < numGPUs; j++) { + int access = 0; + if (p2p) { + cudaDeviceCanAccessPeer(&access, i, j); + if (access) { + cudaDeviceEnablePeerAccess(j, 0); + cudaCheckError(); + cudaSetDevice(j); + cudaDeviceEnablePeerAccess(i, 0); + cudaSetDevice(i); + cudaCheckError(); + } + } + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + // Block the stream until all the work is queued up + // DANGER! 
- cudaMemcpy*Async may infinitely block waiting for + // room to push the operation, so keep the number of repeatitions + // relatively low. Higher repeatitions will cause the delay kernel + // to timeout and lead to unstable results. + *flag = 0; + delay<<<1, 1, 0, stream[i]>>>(flag); + cudaCheckError(); + cudaEventRecord(start[i], stream[i]); + + sdkResetTimer(&stopWatch); + if (i == j) { + // Perform intra-GPU, D2D copies + performP2PCopy(buffers[i], i, buffersD2D[i], i, 1, repeat, access, + stream[i]); + } else { + if (p2p_method == P2P_WRITE) { + performP2PCopy(buffers[j], j, buffers[i], i, 1, repeat, access, + stream[i]); + } else { + performP2PCopy(buffers[i], i, buffers[j], j, 1, repeat, access, + stream[i]); + } + } + float cpu_time_ms = sdkGetTimerValue(&stopWatch); + + cudaEventRecord(stop[i], stream[i]); + // Now that the work has been queued up, release the stream + *flag = 1; + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + float gpu_time_ms; + cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]); + + gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat; + cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat; + if (p2p && access) { + cudaDeviceDisablePeerAccess(j); + cudaSetDevice(j); + cudaDeviceDisablePeerAccess(i); + cudaSetDevice(i); + cudaCheckError(); + } + } + } + + printf(" GPU"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; j++) { + printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + printf("\n CPU"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; j++) { + printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaFree(buffers[d]); + 
cudaFree(buffersD2D[d]); + cudaCheckError(); + cudaEventDestroy(start[d]); + cudaCheckError(); + cudaEventDestroy(stop[d]); + cudaCheckError(); + cudaStreamDestroy(stream[d]); + cudaCheckError(); + } + + sdkDeleteTimer(&stopWatch); + + cudaFreeHost((void *)flag); + cudaCheckError(); +} + +int main(int argc, char **argv) { + int numGPUs; + P2PDataTransfer p2p_method = P2P_WRITE; + + cudaGetDeviceCount(&numGPUs); + cudaCheckError(); + + // process command line args + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printHelp(); + return 0; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) { + p2p_method = P2P_READ; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) { + p2p_mechanism = SM; + } + + printf("[%s]\n", sSampleName); + + // output devices + for (int i = 0; i < numGPUs; i++) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + cudaCheckError(); + printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i, + prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID); + } + + checkP2Paccess(numGPUs); + + // Check peer-to-peer connectivity + printf("P2P Connectivity Matrix\n"); + printf(" D\\D"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d", j); + } + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d\t", i); + for (int j = 0; j < numGPUs; j++) { + if (i != j) { + int access; + cudaDeviceCanAccessPeer(&access, i, j); + cudaCheckError(); + printf("%6d", (access) ? 
1 : 0); + } else { + printf("%6d", 1); + } + } + printf("\n"); + } + + printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n"); + outputBandwidthMatrix(numGPUs, false, P2P_WRITE); + printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n"); + outputBandwidthMatrix(numGPUs, true, P2P_WRITE); + if (p2p_method == P2P_READ) { + printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n"); + outputBandwidthMatrix(numGPUs, true, p2p_method); + } + printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n"); + outputBidirectionalBandwidthMatrix(numGPUs, false); + printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n"); + outputBidirectionalBandwidthMatrix(numGPUs, true); + + printf("P2P=Disabled Latency Matrix (us)\n"); + outputLatencyMatrix(numGPUs, false, P2P_WRITE); + printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n"); + outputLatencyMatrix(numGPUs, true, P2P_WRITE); + if (p2p_method == P2P_READ) { + printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n"); + outputLatencyMatrix(numGPUs, true, p2p_method); + } + + printf( + "\nNOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n"); + + exit(EXIT_SUCCESS); +} diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.sln b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.sln similarity index 76% rename from Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.sln rename to Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.sln index 8947cbf0..9cfed601 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.sln +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warpAggregatedAtomicsCG", "warpAggregatedAtomicsCG_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj similarity index 90% rename from Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.vcxproj rename to Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj index 9e18d8bd..c763b1da 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - simpleVoteIntrinsics_vs2010 - simpleVoteIntrinsics + p2pBandwidthLatencyTest_vs2012 + p2pBandwidthLatencyTest Application MultiByte + v110 true @@ -32,7 +33,7 @@ - + @@ -58,10 +59,10 @@ Console 
cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/simpleVoteIntrinsics.exe + $(OutDir)/p2pBandwidthLatencyTest.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -96,11 +97,11 @@ - - + + - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.sln b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.sln similarity index 72% rename from Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.sln rename to Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.sln index af130be1..cac60a5d 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.sln +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaTensorCoreGemm", "cudaTensorCoreGemm_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj new file mode 100644 index 00000000..2fa7ab33 --- /dev/null +++ 
b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + p2pBandwidthLatencyTest_vs2013 + p2pBandwidthLatencyTest + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/p2pBandwidthLatencyTest.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.sln b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.sln new file mode 100644 index 00000000..c53f00c5 --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + 
GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj new file mode 100644 index 00000000..e6db5568 --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + p2pBandwidthLatencyTest_vs2015 + p2pBandwidthLatencyTest + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/p2pBandwidthLatencyTest.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git 
a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.sln b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.sln new file mode 100644 index 00000000..88a5563a --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj new file mode 100644 index 00000000..7b8ec8f6 --- /dev/null +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + p2pBandwidthLatencyTest_vs2017 + p2pBandwidthLatencyTest + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + 
Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/p2pBandwidthLatencyTest.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/shfl_scan/Makefile b/Samples/shfl_scan/Makefile index 72b4a970..fcb778ce 100644 --- a/Samples/shfl_scan/Makefile +++ b/Samples/shfl_scan/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. 
IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -248,7 +246,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 30 35 37 50 52 60 61 70 +SMS ?= 30 35 37 50 52 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/shfl_scan/NsightEclipse.xml b/Samples/shfl_scan/NsightEclipse.xml index d996aa26..f7e1801d 100644 --- a/Samples/shfl_scan/NsightEclipse.xml +++ b/Samples/shfl_scan/NsightEclipse.xml @@ -42,6 +42,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/shfl_scan/README.md b/Samples/shfl_scan/README.md index b9da8eca..f2726ddb 100644 --- a/Samples/shfl_scan/README.md +++ b/Samples/shfl_scan/README.md @@ -10,7 +10,7 @@ Data-Parallel Algorithms, Performance Strategies ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 
](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/shfl_scan/shfl_scan_vs2010.vcxproj b/Samples/shfl_scan/shfl_scan_vs2010.vcxproj deleted file mode 100644 index e88b66ed..00000000 --- a/Samples/shfl_scan/shfl_scan_vs2010.vcxproj +++ /dev/null @@ -1,107 +0,0 @@ - - - - $(VCTargetsPath)\BuildCustomizations - - - - Debug - x64 - - - Release - x64 - - - - {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - shfl_scan_vs2010 - shfl_scan - - - - - Application - MultiByte - - - true - - - true - - - - - - - - - - - $(Platform)/$(Configuration)/ - $(IncludePath) - AllRules.ruleset - - - - - ../../bin/win64/$(Configuration)/ - - - - Level3 - WIN32;_MBCS;%(PreprocessorDefinitions) - ./;$(CudaToolkitDir)/include;../../Common; - - - Console - cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - $(CudaToolkitLibDir); - $(OutDir)/shfl_scan.exe - - - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; - -Xcompiler "/wd 4819" %(AdditionalOptions) - ./;../../Common - WIN32 - - - - - Disabled - MultiThreadedDebug - - - true - Default - - - MTd - 64 - - - - - MaxSpeed - MultiThreaded - - - false - UseLinkTimeCodeGeneration - - - MT - 64 - - - - - - - - - - - - diff --git a/Samples/shfl_scan/shfl_scan_vs2012.vcxproj b/Samples/shfl_scan/shfl_scan_vs2012.vcxproj index ec988cd6..ffb126f9 100644 --- 
a/Samples/shfl_scan/shfl_scan_vs2012.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2013.vcxproj b/Samples/shfl_scan/shfl_scan_vs2013.vcxproj index b914aa7b..08f212cb 100644 --- a/Samples/shfl_scan/shfl_scan_vs2013.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2015.vcxproj b/Samples/shfl_scan/shfl_scan_vs2015.vcxproj index a87ad796..0debbb65 100644 --- a/Samples/shfl_scan/shfl_scan_vs2015.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj 
b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj index ea82b089..93573298 100644 --- a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/shfl_scan.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS/Makefile b/Samples/simpleCUBLAS/Makefile index 054515ee..47249152 100644 --- a/Samples/simpleCUBLAS/Makefile +++ b/Samples/simpleCUBLAS/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. 
IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ diff --git a/Samples/simpleCUBLAS/NsightEclipse.xml b/Samples/simpleCUBLAS/NsightEclipse.xml index d8f26a64..06b96ee0 100644 --- a/Samples/simpleCUBLAS/NsightEclipse.xml +++ b/Samples/simpleCUBLAS/NsightEclipse.xml @@ -41,6 +41,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/simpleCUBLAS/README.md b/Samples/simpleCUBLAS/README.md index acaa26a3..7acb9182 100644 --- a/Samples/simpleCUBLAS/README.md +++ b/Samples/simpleCUBLAS/README.md @@ -10,7 +10,7 @@ Image Processing, CUBLAS Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) 
for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj index d1c4aacc..5b9f1601 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj index 60d45caf..ba1a17c4 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj index 6804f433..24a93d51 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj index 31a5f194..d2beef3d 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleCUBLASXT/Makefile b/Samples/simpleCUBLASXT/Makefile new file mode 100644 index 00000000..6fea5c80 --- /dev/null +++ b/Samples/simpleCUBLASXT/Makefile @@ -0,0 +1,308 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - simpleCUBLASXT is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +ifeq ($(SMS),) +# Generate PTX code from SM 30 +GENCODE_FLAGS += -gencode arch=compute_30,code=compute_30 +endif + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee 
forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +LIBRARIES += -lcublas + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleCUBLASXT + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +simpleCUBLASXT.o:simpleCUBLASXT.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleCUBLASXT: simpleCUBLASXT.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleCUBLASXT + +clean: + rm -f simpleCUBLASXT simpleCUBLASXT.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleCUBLASXT + +clobber: clean diff --git a/Samples/simpleCUBLASXT/NsightEclipse.xml b/Samples/simpleCUBLASXT/NsightEclipse.xml new file mode 100644 index 00000000..37cd8805 --- /dev/null +++ b/Samples/simpleCUBLASXT/NsightEclipse.xml @@ -0,0 +1,69 @@ + + + + simpleCUBLASXT + + whole + true + + ./ + ../ + ../../common/inc + + + CUBLAS-XT Library + + + CUDA + CUBLAS + Linear Algebra + + + cublas + + + + true + simpleCUBLASXT.cpp + + CUBLAS + + + 1:CUDA Basic Topics + 3:Linear Algebra + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm75 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + aarch64 + + + ppc64le + linux + + + + all + + Simple CUBLAS XT + exe + diff --git a/Samples/simpleCUBLASXT/README.md b/Samples/simpleCUBLASXT/README.md new file mode 100644 index 00000000..63260e6d --- /dev/null +++ b/Samples/simpleCUBLASXT/README.md @@ -0,0 +1,95 @@ +# 
simpleCUBLASXT - Simple CUBLAS XT + +## Description + +Example of using CUBLAS-XT library which performs GEMM operations over Multiple GPUs. + +## Key Concepts + +CUBLAS-XT Library + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +## Dependencies needed to build/run +[CUBLAS](../../README.md#cublas) + +## Prerequisites + +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT.cpp b/Samples/simpleCUBLASXT/simpleCUBLASXT.cpp new file mode 100644 index 00000000..d08b1e9f --- /dev/null +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT.cpp @@ -0,0 +1,301 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* This example demonstrates how to use the CUBLAS library + * by scaling an array of floating-point values on the device + * and comparing the result to the same operation performed + * on the host. 
+ */ + +/* Includes, system */ +#include +#include +#include + +/* Includes, cuda */ +#include +#include +#include + +/* Matrix size */ +//#define N (275) +#define N (1024) +// Restricting the max used GPUs as input matrix is not so large +#define MAX_NUM_OF_GPUS 2 + +/* Host implementation of a simple version of sgemm */ +static void simple_sgemm(int n, float alpha, const float *A, const float *B, + float beta, float *C) { + int i; + int j; + int k; + + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + float prod = 0; + + for (k = 0; k < n; ++k) { + prod += A[k * n + i] * B[j * n + k]; + } + + C[j * n + i] = alpha * prod + beta * C[j * n + i]; + } + } +} + +void findMultipleBestGPUs(int &num_of_devices, int *device_ids) { + // Find the best CUDA capable GPU device + int current_device = 0; + + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + typedef struct gpu_perf_t { + uint64_t compute_perf; + int device_id; + } gpu_perf; + + gpu_perf *gpu_stats = (gpu_perf *)malloc(sizeof(gpu_perf) * device_count); + + cudaDeviceProp deviceProp; + int devices_prohibited = 0; + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + int sm_per_multiproc; + if (deviceProp.computeMode != cudaComputeModeProhibited) { + if (deviceProp.major == 9999 && deviceProp.minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + } + + gpu_stats[current_device].compute_perf = + (uint64_t)deviceProp.multiProcessorCount * sm_per_multiproc * + deviceProp.clockRate; + gpu_stats[current_device].device_id = current_device; + + } else { + devices_prohibited++; + } + + ++current_device; + } + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + 
exit(EXIT_FAILURE); + } else { + gpu_perf temp_elem; + // Sort the GPUs by highest compute perf. + for (int i = 0; i < current_device - 1; i++) { + for (int j = 0; j < current_device - i - 1; j++) { + if (gpu_stats[j].compute_perf < gpu_stats[j + 1].compute_perf) { + temp_elem = gpu_stats[j]; + gpu_stats[j] = gpu_stats[j + 1]; + gpu_stats[j + 1] = temp_elem; + } + } + } + + for (int i = 0; i < num_of_devices; i++) { + device_ids[i] = gpu_stats[i].device_id; + } + } + free(gpu_stats); +} + +/* Main */ +int main(int argc, char **argv) { + cublasStatus_t status; + float *h_A; + float *h_B; + float *h_C; + float *h_C_ref; + float *d_A = 0; + float *d_B = 0; + float *d_C = 0; + float alpha = 1.0f; + float beta = 0.0f; + int n2 = N * N; + int i; + float error_norm; + float ref_norm; + float diff; + cublasXtHandle_t handle; + int *devices = NULL; + + int num_of_devices = 0; + + checkCudaErrors(cudaGetDeviceCount(&num_of_devices)); + + if (num_of_devices > MAX_NUM_OF_GPUS) { + num_of_devices = MAX_NUM_OF_GPUS; + } + devices = (int *)malloc(sizeof(int) * num_of_devices); + + findMultipleBestGPUs(num_of_devices, devices); + cudaDeviceProp deviceProp; + printf("Using %d GPUs\n", num_of_devices); + for (i = 0; i < num_of_devices; i++) { + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devices[i])); + printf("GPU ID = %d, Name = %s \n", devices[i], deviceProp.name); + } + + /* Initialize CUBLAS */ + printf("simpleCUBLASXT test running..\n"); + + status = cublasXtCreate(&handle); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! CUBLASXT initialization error\n"); + return EXIT_FAILURE; + } + + /* Select devices for use in CUBLASXT math functions */ + status = cublasXtDeviceSelect(handle, num_of_devices, devices); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! 
CUBLASXT device selection error\n"); + return EXIT_FAILURE; + } + + /* Optional: Set a block size for CUBLASXT math functions */ + status = cublasXtSetBlockDim(handle, 64); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! CUBLASXT set block dimension error\n"); + return EXIT_FAILURE; + } + + /* Allocate host memory for the matrices */ + h_A = (float *)malloc(n2 * sizeof(h_A[0])); + + if (h_A == 0) { + fprintf(stderr, "!!!! host memory allocation error (A)\n"); + return EXIT_FAILURE; + } + + h_B = (float *)malloc(n2 * sizeof(h_B[0])); + + if (h_B == 0) { + fprintf(stderr, "!!!! host memory allocation error (B)\n"); + return EXIT_FAILURE; + } + + h_C_ref = (float *)malloc(n2 * sizeof(h_C[0])); + + if (h_C_ref == 0) { + fprintf(stderr, "!!!! host memory allocation error (C_ref)\n"); + return EXIT_FAILURE; + } + + h_C = (float *)malloc(n2 * sizeof(h_C[0])); + + if (h_C == 0) { + fprintf(stderr, "!!!! host memory allocation error (C)\n"); + return EXIT_FAILURE; + } + + /* Fill the matrices with test data */ + for (i = 0; i < n2; i++) { + h_A[i] = rand() / (float)RAND_MAX; + h_B[i] = rand() / (float)RAND_MAX; + h_C[i] = rand() / (float)RAND_MAX; + h_C_ref[i] = h_C[i]; + } + + /* Performs operation using plain C code */ + simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref); + + /* Performs operation using cublas */ + status = cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, h_A, + N, h_B, N, &beta, h_C, N); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! kernel execution error.\n"); + return EXIT_FAILURE; + } + + /* Check result against reference */ + error_norm = 0; + ref_norm = 0; + + for (i = 0; i < n2; ++i) { + diff = h_C_ref[i] - h_C[i]; + error_norm += diff * diff; + ref_norm += h_C_ref[i] * h_C_ref[i]; + } + + error_norm = (float)sqrt((double)error_norm); + ref_norm = (float)sqrt((double)ref_norm); + + if (fabs(ref_norm) < 1e-7) { + fprintf(stderr, "!!!! 
reference norm is 0\n"); + return EXIT_FAILURE; + } + + /* Memory clean up */ + free(h_A); + free(h_B); + free(h_C); + free(h_C_ref); + + if (cudaFree(d_A) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (A)\n"); + return EXIT_FAILURE; + } + + if (cudaFree(d_B) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (B)\n"); + return EXIT_FAILURE; + } + + if (cudaFree(d_C) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (C)\n"); + return EXIT_FAILURE; + } + + /* Shutdown */ + status = cublasXtDestroy(handle); + + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! shutdown error (A)\n"); + return EXIT_FAILURE; + } + + if (error_norm / ref_norm < 1e-6f) { + printf("simpleCUBLASXT test passed.\n"); + exit(EXIT_SUCCESS); + } else { + printf("simpleCUBLASXT test failed.\n"); + exit(EXIT_FAILURE); + } +} diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.sln b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.sln similarity index 81% rename from Samples/simpleCUBLAS/simpleCUBLAS_vs2010.sln rename to Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.sln index fcad3e9e..3d53b50e 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.sln +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLAS", "simpleCUBLAS_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLASXT", "simpleCUBLASXT_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj similarity index 91% rename from Samples/simpleCUBLAS/simpleCUBLAS_vs2010.vcxproj rename to 
Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj index 79b747f9..60731c2a 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2010.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - simpleCUBLAS_vs2010 - simpleCUBLAS + simpleCUBLASXT_vs2012 + simpleCUBLASXT Application MultiByte + v110 true @@ -32,7 +33,7 @@ - + @@ -58,7 +59,7 @@ Console cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/simpleCUBLAS.exe + $(OutDir)/simpleCUBLASXT.exe compute_30,compute_30; @@ -96,11 +97,11 @@ - + - + diff --git a/Samples/deviceQuery/deviceQuery_vs2010.sln b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.sln similarity index 73% rename from Samples/deviceQuery/deviceQuery_vs2010.sln rename to Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.sln index 7aa6dd37..0deb06af 100644 --- a/Samples/deviceQuery/deviceQuery_vs2010.sln +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deviceQuery", "deviceQuery_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLASXT", "simpleCUBLASXT_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj similarity index 83% rename from Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.vcxproj rename to Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj index 6daebbb1..861f6692 100644 --- 
a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2010.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - cudaTensorCoreGemm_vs2010 - cudaTensorCoreGemm + simpleCUBLASXT_vs2013 + simpleCUBLASXT Application MultiByte + v120 true @@ -32,7 +33,7 @@ - + @@ -52,16 +53,16 @@ Level3 WIN32;_MBCS;%(PreprocessorDefinitions) - ./;$(CudaToolkitDir)/include;../../Common; + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); Console - cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/cudaTensorCoreGemm.exe + $(OutDir)/simpleCUBLASXT.exe - compute_70,sm_70; + compute_30,compute_30; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -96,11 +97,11 @@ - + - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.sln b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.sln new file mode 100644 index 00000000..9fd28bea --- /dev/null +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLASXT", "simpleCUBLASXT_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj similarity index 83% rename from Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.vcxproj rename to Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj index 4118fa81..747e9a4b 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - vectorAdd_nvrtc_vs2010 - vectorAdd_nvrtc + simpleCUBLASXT_vs2015 + simpleCUBLASXT Application MultiByte + v140 true @@ -32,7 +33,7 @@ - + @@ -52,16 +53,16 @@ Level3 WIN32;_MBCS;%(PreprocessorDefinitions) - ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include; + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); Console - cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/vectorAdd_nvrtc.exe + $(OutDir)/simpleCUBLASXT.exe - + compute_30,compute_30; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -96,11 +97,11 @@ - - + + - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.sln b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.sln new file mode 100644 index 00000000..010dbc51 --- /dev/null +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual 
Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUBLASXT", "simpleCUBLASXT_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2010.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj similarity index 82% rename from Samples/matrixMulDrv/matrixMulDrv_vs2010.vcxproj rename to Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj index 96dfcbcc..07c44de1 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2010.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj @@ -15,14 +15,16 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - matrixMulDrv_vs2010 - matrixMulDrv + simpleCUBLASXT_vs2017 + simpleCUBLASXT Application MultiByte + v141 + 10.0.15063.0 true @@ -32,7 +34,7 @@ - + @@ -52,13 +54,13 @@ Level3 WIN32;_MBCS;%(PreprocessorDefinitions) - ./;$(CudaToolkitDir)/include;../../Common; + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); Console - cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + 
cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/matrixMulDrv.exe + $(OutDir)/simpleCUBLASXT.exe compute_30,compute_30; @@ -96,15 +98,11 @@ - - - data/%(Filename)64.ptx - ptx - - + + - + diff --git a/Samples/simpleCUFFT/Makefile b/Samples/simpleCUFFT/Makefile index a25ca0b9..f2a261a9 100644 --- a/Samples/simpleCUFFT/Makefile +++ b/Samples/simpleCUFFT/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -248,7 +246,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 30 35 37 50 52 60 61 70 +SMS ?= 30 35 37 50 52 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/simpleCUFFT/NsightEclipse.xml b/Samples/simpleCUFFT/NsightEclipse.xml index cac090a0..27a3e9ff 100644 --- a/Samples/simpleCUFFT/NsightEclipse.xml +++ b/Samples/simpleCUFFT/NsightEclipse.xml @@ -39,6 +39,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/simpleCUFFT/README.md b/Samples/simpleCUFFT/README.md index f3de5071..95724675 100644 --- a/Samples/simpleCUFFT/README.md +++ b/Samples/simpleCUFFT/README.md @@ -10,7 +10,7 @@ Image Processing, CUFFT Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 
](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2010.sln b/Samples/simpleCUFFT/simpleCUFFT_vs2010.sln deleted file mode 100644 index 1bf9f623..00000000 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2010.sln +++ /dev/null @@ -1,20 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCUFFT", "simpleCUFFT_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj index cf19ef62..a29e47cf 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/simpleCUFFT.exe - 
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj index 384b3aa6..cb78fb43 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj index 6f5f651d..4d4ef344 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj index bb012d34..1f5b2dfc 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj +++ 
b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/simpleCUFFT.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleCudaGraphs/Makefile b/Samples/simpleCudaGraphs/Makefile new file mode 100644 index 00000000..9341732a --- /dev/null +++ b/Samples/simpleCudaGraphs/Makefile @@ -0,0 +1,300 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= 
$(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 
+ ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release 
+endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 75 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleCudaGraphs + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +simpleCudaGraphs.o:simpleCudaGraphs.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleCudaGraphs: simpleCudaGraphs.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleCudaGraphs + +clean: + rm -f 
simpleCudaGraphs simpleCudaGraphs.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleCudaGraphs + +clobber: clean diff --git a/Samples/simpleCudaGraphs/NsightEclipse.xml b/Samples/simpleCudaGraphs/NsightEclipse.xml new file mode 100644 index 00000000..8c4e9870 --- /dev/null +++ b/Samples/simpleCudaGraphs/NsightEclipse.xml @@ -0,0 +1,78 @@ + + + + simpleCudaGraphs + + cudaStreamBeginCapture + cudaStreamEndCapture + cudaLaunchHostFunc + cudaGraphCreate + cudaGraphLaunch + cudaGraphInstantiate + cudaGraphAddHostNode + cudaGraphAddMemcpyNode + cudaGraphAddKernelNode + cudaGraphAddMemsetNode + cudaGraphExecDestroy + cudaGraphDestroy + + + whole + + ./ + ../ + ../../common/inc + + + CUDA Graphs + Stream Capture + + + CUDA + GPGPU + CUDA Graphs + + + + + + true + simpleCudaGraphs.cu + + 1:CUDA + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm75 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + 3.0 + + Simple CUDA Graphs + exe + diff --git a/Samples/simpleCudaGraphs/README.md b/Samples/simpleCudaGraphs/README.md new file mode 100644 index 00000000..a4a226b4 --- /dev/null +++ b/Samples/simpleCudaGraphs/README.md @@ -0,0 +1,94 @@ +# simpleCudaGraphs - Simple CUDA Graphs + +## Description + +A demonstration of CUDA Graphs creation, instantiation and launch using Graphs APIs and Stream Capture APIs. 
+ +## Key Concepts + +CUDA Graphs, Stream Capture + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaStreamBeginCapture, cudaStreamEndCapture, cudaLaunchHostFunc, cudaGraphCreate, cudaGraphLaunch, cudaGraphInstantiate, cudaGraphAddHostNode, cudaGraphAddMemcpyNode, cudaGraphAddKernelNode, cudaGraphAddMemsetNode, cudaGraphExecDestroy, cudaGraphDestroy + +## Prerequisites + +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs.cu b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu new file mode 100644 index 00000000..6db123f8 --- /dev/null +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu @@ -0,0 +1,399 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include +#include +#include +#include + +namespace cg = cooperative_groups; + +#define THREADS_PER_BLOCK 512 +#define GRAPH_LAUNCH_ITERATIONS 3 + +typedef struct callBackData { + const char *fn_name; + double *data; +} callBackData_t; + +__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, + size_t outputSize) { + __shared__ double tmp[THREADS_PER_BLOCK]; + + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double)inputVec[i]; + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + double beta = temp_sum; + double temp; + + for (int i = tile32.size() / 2; i > 0; i >>= 1) { + if (tile32.thread_rank() < i) { + temp = tmp[cta.thread_rank() + i]; + beta += temp; + tmp[cta.thread_rank()] = beta; + } + cg::sync(tile32); + } + cg::sync(cta); + + if (cta.thread_rank() == 0 && blockIdx.x < outputSize) { + beta = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) { + beta += tmp[i]; + } + outputVec[blockIdx.x] = beta; + } +} + +__global__ void reduceFinal(double *inputVec, double *result, + size_t inputSize) { + __shared__ double tmp[THREADS_PER_BLOCK]; + + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double)inputVec[i]; + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // do reduction in shared mem + if ((blockDim.x >= 512) && (cta.thread_rank() < 256)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 256]; + } + + cg::sync(cta); + + if ((blockDim.x >= 256) && (cta.thread_rank() < 128)) { + 
tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 128]; + } + + cg::sync(cta); + + if ((blockDim.x >= 128) && (cta.thread_rank() < 64)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 64]; + } + + cg::sync(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockDim.x >= 64) temp_sum += tmp[cta.thread_rank() + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + temp_sum += tile32.shfl_down(temp_sum, offset); + } + } + // write result for this block to global mem + if (cta.thread_rank() == 0) result[0] = temp_sum; +} + +void init_input(float *a, size_t size) { + for (size_t i = 0; i < size; i++) a[i] = (rand() & 0xFF) / (float)RAND_MAX; +} + +void CUDART_CB myHostNodeCallback(void *data) { + // Check status of GPU after stream operations are done + callBackData_t *tmp = (callBackData_t *)(data); + // checkCudaErrors(tmp->status); + + double *result = (double *)(tmp->data); + char *function = (char *)(tmp->fn_name); + printf("[%s] Host callback final reduced sum = %lf\n", function, *result); + *result = 0.0; // reset the result +} + +void cudaGraphsManual(float *inputVec_h, float *inputVec_d, double *outputVec_d, + double *result_d, size_t inputSize, size_t numOfBlocks) { + cudaStream_t streamForGraph; + cudaGraph_t graph; + std::vector nodeDependencies; + cudaGraphNode_t memcpyNode, kernelNode, memsetNode; + double result_h = 0.0; + + checkCudaErrors(cudaStreamCreate(&streamForGraph)); + + cudaKernelNodeParams kernelNodeParams = {0}; + cudaMemcpy3DParms memcpyParams = {0}; + cudaMemsetParams memsetParams = {0}; + + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = + make_cudaPitchedPtr(inputVec_h, sizeof(float) * inputSize, inputSize, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = + 
make_cudaPitchedPtr(inputVec_d, sizeof(float) * inputSize, inputSize, 1); + memcpyParams.extent = make_cudaExtent(sizeof(float) * inputSize, 1, 1); + memcpyParams.kind = cudaMemcpyHostToDevice; + + memsetParams.dst = (void *)outputVec_d; + memsetParams.value = 0; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(float); // elementSize can be max 4 bytes + memsetParams.width = numOfBlocks * 2; + memsetParams.height = 1; + + checkCudaErrors(cudaGraphCreate(&graph, 0)); + checkCudaErrors( + cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams)); + checkCudaErrors( + cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); + + nodeDependencies.push_back(memsetNode); + nodeDependencies.push_back(memcpyNode); + + void *kernelArgs[4] = {(void *)&inputVec_d, (void *)&outputVec_d, &inputSize, + &numOfBlocks}; + + kernelNodeParams.func = (void *)reduce; + kernelNodeParams.gridDim = dim3(numOfBlocks, 1, 1); + kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void **)kernelArgs; + kernelNodeParams.extra = NULL; + + checkCudaErrors( + cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), + nodeDependencies.size(), &kernelNodeParams)); + + nodeDependencies.clear(); + nodeDependencies.push_back(kernelNode); + + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = result_d; + memsetParams.value = 0; + memsetParams.elementSize = sizeof(float); + memsetParams.width = 2; + memsetParams.height = 1; + checkCudaErrors( + cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); + + nodeDependencies.push_back(memsetNode); + + memset(&kernelNodeParams, 0, sizeof(kernelNodeParams)); + kernelNodeParams.func = (void *)reduceFinal; + kernelNodeParams.gridDim = dim3(1, 1, 1); + kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + void *kernelArgs2[3] = {(void *)&outputVec_d, (void 
*)&result_d, + &numOfBlocks}; + kernelNodeParams.kernelParams = kernelArgs2; + kernelNodeParams.extra = NULL; + + checkCudaErrors( + cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), + nodeDependencies.size(), &kernelNodeParams)); + nodeDependencies.clear(); + nodeDependencies.push_back(kernelNode); + + memset(&memcpyParams, 0, sizeof(memcpyParams)); + + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(result_d, sizeof(double), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(&result_h, sizeof(double), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + checkCudaErrors( + cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), + nodeDependencies.size(), &memcpyParams)); + nodeDependencies.clear(); + nodeDependencies.push_back(memcpyNode); + + cudaGraphNode_t hostNode; + cudaHostNodeParams hostParams = {0}; + hostParams.fn = myHostNodeCallback; + callBackData_t hostFnData; + hostFnData.data = &result_h; + hostFnData.fn_name = "cudaGraphsManual"; + hostParams.userData = &hostFnData; + + checkCudaErrors(cudaGraphAddHostNode(&hostNode, graph, + nodeDependencies.data(), + nodeDependencies.size(), &hostParams)); + + cudaGraphNode_t *nodes = NULL; + size_t numNodes = 0; + checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); + printf("\nNum of nodes in the graph created manually = %zu\n", numNodes); + + cudaGraphExec_t graphExec; + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + + cudaGraph_t clonedGraph; + cudaGraphExec_t clonedGraphExec; + checkCudaErrors(cudaGraphClone(&clonedGraph, graph)); + checkCudaErrors( + cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0)); + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(graphExec, 
streamForGraph)); + } + + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + printf("Cloned Graph Output.. \n"); + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph)); + } + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec)); + checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphDestroy(clonedGraph)); + checkCudaErrors(cudaStreamDestroy(streamForGraph)); +} + +void cudaGraphsUsingStreamCapture(float *inputVec_h, float *inputVec_d, + double *outputVec_d, double *result_d, + size_t inputSize, size_t numOfBlocks) { + cudaStream_t stream1, stream2, streamForGraph; + cudaEvent_t reduceKernelEvent; + cudaGraph_t graph; + double result_h = 0.0; + + checkCudaErrors(cudaStreamCreate(&stream1)); + checkCudaErrors(cudaStreamCreate(&stream2)); + checkCudaErrors(cudaStreamCreate(&streamForGraph)); + checkCudaErrors(cudaEventCreate(&reduceKernelEvent)); + + checkCudaErrors(cudaStreamBeginCapture(stream1)); + + checkCudaErrors(cudaMemcpyAsync(inputVec_d, inputVec_h, + sizeof(float) * inputSize, cudaMemcpyDefault, + stream1)); + checkCudaErrors( + cudaMemsetAsync(outputVec_d, 0, sizeof(double) * numOfBlocks, stream1)); + reduce<<>>( + inputVec_d, outputVec_d, inputSize, numOfBlocks); + checkCudaErrors(cudaEventRecord(reduceKernelEvent, stream1)); + + checkCudaErrors(cudaStreamWaitEvent(stream2, reduceKernelEvent, 0)); + checkCudaErrors(cudaMemsetAsync(result_d, 0, sizeof(double), stream2)); + reduceFinal<<<1, THREADS_PER_BLOCK, 0, stream2>>>(outputVec_d, result_d, + numOfBlocks); + checkCudaErrors(cudaMemcpyAsync(&result_h, result_d, sizeof(double), + cudaMemcpyDefault, stream2)); + checkCudaErrors(cudaEventRecord(reduceKernelEvent, stream2)); + checkCudaErrors(cudaStreamWaitEvent(stream1, reduceKernelEvent, 0)); + + callBackData_t hostFnData = {0}; + hostFnData.data = 
&result_h; + hostFnData.fn_name = "cudaGraphsUsingStreamCapture"; + cudaHostFn_t fn = myHostNodeCallback; + checkCudaErrors(cudaLaunchHostFunc(stream1, fn, &hostFnData)); + checkCudaErrors(cudaStreamEndCapture(stream1, &graph)); + + cudaGraphNode_t *nodes = NULL; + size_t numNodes = 0; + checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); + printf("\nNum of nodes in the graph created using stream capture API = %zu\n", + numNodes); + + cudaGraphExec_t graphExec; + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + + cudaGraph_t clonedGraph; + cudaGraphExec_t clonedGraphExec; + checkCudaErrors(cudaGraphClone(&clonedGraph, graph)); + checkCudaErrors( + cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0)); + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); + } + + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + printf("Cloned Graph Output.. \n"); + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph)); + } + + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec)); + checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphDestroy(clonedGraph)); + checkCudaErrors(cudaStreamDestroy(stream1)); + checkCudaErrors(cudaStreamDestroy(stream2)); + checkCudaErrors(cudaStreamDestroy(streamForGraph)); +} + +int main(int argc, char **argv) { + size_t size = 1 << 24; // number of elements to reduce + size_t maxBlocks = 512; + + // This will pick the best possible CUDA capable device + int devID = findCudaDevice(argc, (const char **)argv); + + printf("%zu elements\n", size); + printf("threads per block = %d\n", THREADS_PER_BLOCK); + printf("Graph Launch iterations = %d\n", GRAPH_LAUNCH_ITERATIONS); + + float *inputVec_d = NULL, *inputVec_h = NULL; + double *outputVec_d = 
NULL, *result_d; + + inputVec_h = (float *)malloc(sizeof(float) * size); + checkCudaErrors(cudaMalloc(&inputVec_d, sizeof(float) * size)); + checkCudaErrors(cudaMalloc(&outputVec_d, sizeof(double) * maxBlocks)); + checkCudaErrors(cudaMalloc(&result_d, sizeof(double))); + + init_input(inputVec_h, size); + + cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, + maxBlocks); + cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, + size, maxBlocks); + + checkCudaErrors(cudaFree(inputVec_d)); + checkCudaErrors(cudaFree(outputVec_d)); + checkCudaErrors(cudaFree(result_d)); + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.sln b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.sln new file mode 100644 index 00000000..855904cc --- /dev/null +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCudaGraphs", "simpleCudaGraphs_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/deviceQuery/deviceQuery_vs2010.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj similarity index 90% rename from Samples/deviceQuery/deviceQuery_vs2010.vcxproj 
rename to Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj index 06258954..ec603e11 100644 --- a/Samples/deviceQuery/deviceQuery_vs2010.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - deviceQuery_vs2010 - deviceQuery + simpleCudaGraphs_vs2012 + simpleCudaGraphs Application MultiByte + v110 true @@ -32,7 +33,7 @@ - + @@ -58,10 +59,10 @@ Console cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/deviceQuery.exe + $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -96,11 +97,11 @@ - + - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.sln b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.sln new file mode 100644 index 00000000..4e87fbe9 --- /dev/null +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCudaGraphs", "simpleCudaGraphs_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj similarity index 90% rename from Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.vcxproj rename to Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj index 8ed53622..def552f5 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2010.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - warpAggregatedAtomicsCG_vs2010 - warpAggregatedAtomicsCG + simpleCudaGraphs_vs2013 + simpleCudaGraphs Application MultiByte + v120 true @@ -32,7 +33,7 @@ - + @@ -58,10 +59,10 @@ Console cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/warpAggregatedAtomicsCG.exe + $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -96,11 +97,11 @@ - + - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.sln b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.sln new file mode 100644 index 00000000..290da8ed --- /dev/null +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# 
Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCudaGraphs", "simpleCudaGraphs_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/matrixMul/matrixMul_vs2010.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj similarity index 90% rename from Samples/matrixMul/matrixMul_vs2010.vcxproj rename to Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj index dc6c0653..d3632742 100644 --- a/Samples/matrixMul/matrixMul_vs2010.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj @@ -15,14 +15,15 @@ {997E0757-EA74-4A4E-A0FC-47D8C8831A15} - matrixMul_vs2010 - matrixMul + simpleCudaGraphs_vs2015 + simpleCudaGraphs Application MultiByte + v140 true @@ -32,7 +33,7 @@ - + @@ -58,10 +59,10 @@ Console cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(CudaToolkitLibDir); - $(OutDir)/matrixMul.exe + $(OutDir)/simpleCudaGraphs.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler 
"/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -96,11 +97,11 @@ - + - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.sln b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.sln new file mode 100644 index 00000000..8d248249 --- /dev/null +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleCudaGraphs", "simpleCudaGraphs_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj new file mode 100644 index 00000000..57c7e22a --- /dev/null +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleCudaGraphs_vs2017 + simpleCudaGraphs + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + 
cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleCudaGraphs.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/simpleVoteIntrinsics/Makefile b/Samples/simpleVoteIntrinsics/Makefile index 0a44a02e..288a0289 100644 --- a/Samples/simpleVoteIntrinsics/Makefile +++ b/Samples/simpleVoteIntrinsics/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. 
-# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -248,7 +246,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 30 35 37 50 52 60 61 70 +SMS ?= 30 35 37 50 52 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/simpleVoteIntrinsics/NsightEclipse.xml b/Samples/simpleVoteIntrinsics/NsightEclipse.xml index 12a5b12c..3e5cadfe 100644 --- a/Samples/simpleVoteIntrinsics/NsightEclipse.xml +++ b/Samples/simpleVoteIntrinsics/NsightEclipse.xml @@ -40,6 +40,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/simpleVoteIntrinsics/README.md b/Samples/simpleVoteIntrinsics/README.md index bd68536c..b9a4d9e6 100644 --- a/Samples/simpleVoteIntrinsics/README.md +++ b/Samples/simpleVoteIntrinsics/README.md @@ -10,7 +10,7 @@ Vote Intrinsics ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 
](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.sln b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.sln deleted file mode 100644 index 63a2e03e..00000000 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2010.sln +++ /dev/null @@ -1,20 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleVoteIntrinsics", "simpleVoteIntrinsics_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj index 5ce0436f..ef5ca072 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git 
a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj index 3028179d..c7dcd465 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj index d8abe2a9..c58c4a88 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj index cd26d831..184ad6a3 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/simpleVoteIntrinsics.exe - 
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleVulkan/Build_instructions.txt b/Samples/simpleVulkan/Build_instructions.txt new file mode 100644 index 00000000..2b19ed36 --- /dev/null +++ b/Samples/simpleVulkan/Build_instructions.txt @@ -0,0 +1,18 @@ +For Windows: +Follow these steps once you have installed Vulkan SDK for Windows from https://www.lunarg.com/vulkan-sdk/ +-- Install GLFW3 library at suitable location +-- Open the simpleVulkan VS project file. +To add the GLFW3 library path +-- Right click on Project name "simpleVulkan" click on "Properties" +-- In Property pages window go to Linker -> General. Here in "Additional Libraries Directories" edit and add path to glfw3dll.lib +To add the GLFW3 headers path +-- Right click on Project name "simpleVulkan" click on "Properties" +-- In Property pages window go to "VC++ Directories" section. Here in "Include Directories" edit and add path to GLFW3 headers include directory location. +** Make sure to add path to glfw3.dll in your PATH environment variable** + + +For Linux: +-- Install the Vulkan SDK from https://www.lunarg.com/vulkan-sdk/ and follow environment setup instructions. +-- Install GLFW3 library through your OS package repository. 
For example: apt-get for Ubuntu and dnf for RHEL/CentOS +-- Install "libxcb1-dev" and "xorg-dev" as GLFW3 is depended on it +-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH \ No newline at end of file diff --git a/Samples/simpleVulkan/Makefile b/Samples/simpleVulkan/Makefile new file mode 100644 index 00000000..f4745812 --- /dev/null +++ b/Samples/simpleVulkan/Makefile @@ -0,0 +1,361 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= 
$(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 
+ ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release 
+endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - simpleVulkan is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - simpleVulkan is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on aarch64 +ifeq ($(TARGET_ARCH),aarch64) + $(info >>> WARNING - simpleVulkan is not supported on aarch64 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Makefile include to help find Vulkan SDK and dependencies +include ./findvulkan.mk + +# Vulkan specific libraries +ifeq ($(TARGET_OS),linux) + LIBRARIES += -L $(VULKAN_SDK_PATH)/lib + LIBRARIES += `pkg-config --static --libs glfw3` -lvulkan + INCLUDES += `pkg-config --static --cflags glfw3` -I$(VULKAN_SDK_PATH)/include +endif + +#Detect if installed version of GCC supports C++11 +ifeq ($(TARGET_OS),linux) + empty := + space := $(empty) $(empty) + GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`) +#Create version number without "." + GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.) + GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.) 
+# Make sure the version number has at least 3 decimals + GCCVERSION += 00 +# Remove spaces from the version number + GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION)) +# Crop the version number to 3 decimals. + GCCVERSION := $(shell expr `echo $(GCCVERSION)` | cut -b1-3) +#$(warning $(GCCVERSION)) + + IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 470) + + ifeq ($(IS_MIN_VERSION), 1) + $(info >>> GCC Version is greater or equal to 4.7.0 <<<) + else + $(info >>> Waiving build. Minimum GCC version required for C++11 is 4.7.0 <<<) + SAMPLE_ENABLED := 0 + endif +endif + +# Gencode arguments +SMS ?= 30 35 37 50 52 60 61 70 75 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += --std=c++11 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: simpleVulkan + $(EXEC) $(VULKAN_SDK_PATH)/bin/glslangValidator -V shader_sine.vert + $(EXEC) $(VULKAN_SDK_PATH)/bin/glslangValidator -V shader_sine.frag + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +vulkanCUDASinewave.o:vulkanCUDASinewave.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleVulkan: vulkanCUDASinewave.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + 
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleVulkan + +clean: + rm -f simpleVulkan vulkanCUDASinewave.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleVulkan + rm -rf vert.spv + rm -rf frag.spv + +clobber: clean diff --git a/Samples/simpleVulkan/NsightEclipse.xml b/Samples/simpleVulkan/NsightEclipse.xml new file mode 100644 index 00000000..7682326e --- /dev/null +++ b/Samples/simpleVulkan/NsightEclipse.xml @@ -0,0 +1,82 @@ + + + + simpleVulkan + + --std=c++11 + + + cudaImportExternalMemory + cudaExternalMemoryGetMappedBuffer + cudaImportExternalSemaphore + cudaImportExternalSemaphore + cudaSignalExternalSemaphoresAsync + cudaWaitExternalSemaphoresAsync + cudaDestroyExternalSemaphore + cudaDestroyExternalMemory + + + whole + + ./ + ../ + ../../common/inc + + + Graphics Interop + CUDA Vulkan Interop + Data Parallel Algorithms + + + CUDA + CPP11 + sine-wave + Vulkan + + + + + + true + + $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert + $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag + $(VULKAN_SDK_PATH)/bin/glslangValidator -V shader_sine.vert + $(VULKAN_SDK_PATH)/bin/glslangValidator -V shader_sine.frag + rm -rf vert.spv + rm -rf frag.spv + + vulkanCUDASinewave.cu + + X11 + VULKAN + + + 2:Graphics Interop + 1:CUDA Advanced Topics + 1:CUDA Vulkan Interop + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm75 + + + x86_64 + linux + + + windows7 + + + + all + + Vulkan CUDA Interop Sinewave + exe + diff --git a/Samples/simpleVulkan/README.md b/Samples/simpleVulkan/README.md new file mode 100644 index 00000000..766f826f --- /dev/null +++ b/Samples/simpleVulkan/README.md @@ -0,0 +1,74 @@ +# simpleVulkan - Vulkan CUDA Interop Sinewave + +## Description + +This sample demonstrates Vulkan CUDA Interop. 
CUDA imports the Vulkan vertex buffer and operates on it to create sinewave, and synchronizes with Vulkan through vulkan semaphores imported by CUDA. This sample depends on Vulkan SDK, GLFW3 libraries, for building this sample please refer to "Build_instructions.txt" provided in this sample's directory + +## Key Concepts + +Graphics Interop, CUDA Vulkan Interop, Data Parallel Algorithms + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaImportExternalSemaphore, cudaImportExternalSemaphore, cudaSignalExternalSemaphoresAsync, cudaWaitExternalSemaphoresAsync, cudaDestroyExternalSemaphore, cudaDestroyExternalMemory + +## Dependencies needed to build/run +[X11](../../README.md#x11), [VULKAN](../../README.md#vulkan) + +## Prerequisites + +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/simpleVulkan/findvulkan.mk b/Samples/simpleVulkan/findvulkan.mk new file mode 100644 index 00000000..40abd39c --- /dev/null +++ b/Samples/simpleVulkan/findvulkan.mk @@ -0,0 +1,143 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# findvulkan.mk is used to find the necessary Vulkan Libraries for specific distributions +# this is supported on Linux +# +################################################################################ + +# Determine OS platform and unix distribution +ifeq ("$(TARGET_OS)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + ifeq ("$(DISTRO)","") + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + # ensure data from /etc/issue is valid + ifneq (,$(filter-out $(DISTRO),ubuntu fedora red rhel centos suse)) + DISTRO = + endif + ifeq ("$(DISTRO)","") + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + endif + endif +endif + +ifeq ("$(TARGET_OS)","linux") + # Each set of Linux Distros have different paths for where to find their GLM/GLFW3 libraries reside + UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu 
>/dev/null 2>&1; echo $$?) + FEDORA = $(shell echo $(DISTRO) | grep -i fedora >/dev/null 2>&1; echo $$?) + RHEL = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?) + CENTOS = $(shell echo $(DISTRO) | grep -i centos >/dev/null 2>&1; echo $$?) + SUSE = $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?) + ifeq ("$(UBUNTU)","0") + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + GLPATH := /usr/arm-linux-gnueabihf/lib + GLLINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf + GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64) + GLPATH := /usr/aarch64-linux-gnu/lib + GLLINK := -L/usr/aarch64-linux-gnu/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib + GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu + GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + endif + else + UBUNTU_PKG_NAME = $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' | grep '^ii' | awk '{print $$2}' | head -1) + ifneq ("$(UBUNTU_PKG_NAME)","") + GLPATH ?= /usr/lib/$(UBUNTU_PKG_NAME) + GLLINK ?= -L/usr/lib/$(UBUNTU_PKG_NAME) + endif + + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(SUSE)","0") + GLPATH ?= /usr/X11R6/lib64 + GLLINK ?= -L/usr/X11R6/lib64 + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(FEDORA)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(RHEL)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + ifeq ("$(CENTOS)","0") + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + endif + + VULKAN_SDK_PATH ?= ${VULKAN_SDK} + + ifeq ("$(VULKAN_SDK_PATH)","") + $(info >>> WARNING - Vulkan SDK not found, please install Vulkan SDK <<<) + SAMPLE_ENABLED := 0 + endif + + VULKAN_SDK_LIB := $(shell find -L $(VULKAN_SDK_PATH)/lib -name libvulkan.so -print 
2>/dev/null) + X11LIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libX11.so -print 2>/dev/null) + + ifeq ("$(VULKAN_SDK_LIB)","") + $(info >>> WARNING - libvulkan.so not found, please install libvulkan.so <<<) + SAMPLE_ENABLED := 0 + endif + + ifeq ("$(X11LIB)","") + $(info >>> WARNING - libX11.so not found, please install libX11.so <<<) + SAMPLE_ENABLED := 0 + endif + + HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include + HEADER_SEARCH_PATH += $(TARGET_FS)/usr/local/include + ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include + else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux) + HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include + endif + + VULKANHEADER := $(shell find -L $(VULKAN_SDK_PATH)/include -name vulkan.h -print 2>/dev/null) + + ifeq ("$(VULKANHEADER)","") + $(info >>> WARNING - vulkan.h not found, please install vulkan.h <<<) + SAMPLE_ENABLED := 0 + endif +else +endif + diff --git a/Samples/simpleVulkan/linmath.h b/Samples/simpleVulkan/linmath.h new file mode 100644 index 00000000..b4d386cc --- /dev/null +++ b/Samples/simpleVulkan/linmath.h @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2015-2016 The Khronos Group Inc. + * Copyright (c) 2015-2016 Valve Corporation + * Copyright (c) 2015-2016 LunarG, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Relicensed from the WTFPL (http://www.wtfpl.net/faq/). 
+ */ + +#ifndef LINMATH_H +#define LINMATH_H + +#include + +// Converts degrees to radians. +#define degreesToRadians(angleDegrees) (angleDegrees * M_PI / 180.0) + +// Converts radians to degrees. +#define radiansToDegrees(angleRadians) (angleRadians * 180.0 / M_PI) + +typedef float vec3[3]; +static inline void vec3_add(vec3 r, vec3 const a, vec3 const b) { + int i; + for (i = 0; i < 3; ++i) r[i] = a[i] + b[i]; +} +static inline void vec3_sub(vec3 r, vec3 const a, vec3 const b) { + int i; + for (i = 0; i < 3; ++i) r[i] = a[i] - b[i]; +} +static inline void vec3_scale(vec3 r, vec3 const v, float const s) { + int i; + for (i = 0; i < 3; ++i) r[i] = v[i] * s; +} +static inline float vec3_mul_inner(vec3 const a, vec3 const b) { + float p = 0.f; + int i; + for (i = 0; i < 3; ++i) p += b[i] * a[i]; + return p; +} +static inline void vec3_mul_cross(vec3 r, vec3 const a, vec3 const b) { + r[0] = a[1] * b[2] - a[2] * b[1]; + r[1] = a[2] * b[0] - a[0] * b[2]; + r[2] = a[0] * b[1] - a[1] * b[0]; +} +static inline float vec3_len(vec3 const v) { return sqrtf(vec3_mul_inner(v, v)); } +static inline void vec3_norm(vec3 r, vec3 const v) { + float k = 1.f / vec3_len(v); + vec3_scale(r, v, k); +} +static inline void vec3_reflect(vec3 r, vec3 const v, vec3 const n) { + float p = 2.f * vec3_mul_inner(v, n); + int i; + for (i = 0; i < 3; ++i) r[i] = v[i] - p * n[i]; +} + +typedef float vec4[4]; +static inline void vec4_add(vec4 r, vec4 const a, vec4 const b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; +} +static inline void vec4_sub(vec4 r, vec4 const a, vec4 const b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; +} +static inline void vec4_scale(vec4 r, vec4 v, float s) { + int i; + for (i = 0; i < 4; ++i) r[i] = v[i] * s; +} +static inline float vec4_mul_inner(vec4 a, vec4 b) { + float p = 0.f; + int i; + for (i = 0; i < 4; ++i) p += b[i] * a[i]; + return p; +} +static inline void vec4_mul_cross(vec4 r, vec4 a, vec4 b) { + r[0] = a[1] * b[2] - a[2] * b[1]; + 
r[1] = a[2] * b[0] - a[0] * b[2]; + r[2] = a[0] * b[1] - a[1] * b[0]; + r[3] = 1.f; +} +static inline float vec4_len(vec4 v) { return sqrtf(vec4_mul_inner(v, v)); } +static inline void vec4_norm(vec4 r, vec4 v) { + float k = 1.f / vec4_len(v); + vec4_scale(r, v, k); +} +static inline void vec4_reflect(vec4 r, vec4 v, vec4 n) { + float p = 2.f * vec4_mul_inner(v, n); + int i; + for (i = 0; i < 4; ++i) r[i] = v[i] - p * n[i]; +} + +typedef vec4 mat4x4[4]; +static inline void mat4x4_identity(mat4x4 M) { + int i, j; + for (i = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) M[i][j] = i == j ? 1.f : 0.f; +} +static inline void mat4x4_dup(mat4x4 M, mat4x4 N) { + int i, j; + for (i = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) M[i][j] = N[i][j]; +} +static inline void mat4x4_row(vec4 r, mat4x4 M, int i) { + int k; + for (k = 0; k < 4; ++k) r[k] = M[k][i]; +} +static inline void mat4x4_col(vec4 r, mat4x4 M, int i) { + int k; + for (k = 0; k < 4; ++k) r[k] = M[i][k]; +} +static inline void mat4x4_transpose(mat4x4 M, mat4x4 N) { + int i, j; + for (j = 0; j < 4; ++j) + for (i = 0; i < 4; ++i) M[i][j] = N[j][i]; +} +static inline void mat4x4_add(mat4x4 M, mat4x4 a, mat4x4 b) { + int i; + for (i = 0; i < 4; ++i) vec4_add(M[i], a[i], b[i]); +} +static inline void mat4x4_sub(mat4x4 M, mat4x4 a, mat4x4 b) { + int i; + for (i = 0; i < 4; ++i) vec4_sub(M[i], a[i], b[i]); +} +static inline void mat4x4_scale(mat4x4 M, mat4x4 a, float k) { + int i; + for (i = 0; i < 4; ++i) vec4_scale(M[i], a[i], k); +} +static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, float z) { + int i; + vec4_scale(M[0], a[0], x); + vec4_scale(M[1], a[1], y); + vec4_scale(M[2], a[2], z); + for (i = 0; i < 4; ++i) { + M[3][i] = a[3][i]; + } +} +static inline void mat4x4_mul(mat4x4 M, mat4x4 a, mat4x4 b) { + int k, r, c; + for (c = 0; c < 4; ++c) + for (r = 0; r < 4; ++r) { + M[c][r] = 0.f; + for (k = 0; k < 4; ++k) M[c][r] += a[k][r] * b[c][k]; + } +} +static inline void mat4x4_mul_vec4(vec4 r, 
mat4x4 M, vec4 v) { + int i, j; + for (j = 0; j < 4; ++j) { + r[j] = 0.f; + for (i = 0; i < 4; ++i) r[j] += M[i][j] * v[i]; + } +} +static inline void mat4x4_translate(mat4x4 T, float x, float y, float z) { + mat4x4_identity(T); + T[3][0] = x; + T[3][1] = y; + T[3][2] = z; +} +static inline void mat4x4_translate_in_place(mat4x4 M, float x, float y, float z) { + vec4 t = {x, y, z, 0}; + vec4 r; + int i; + for (i = 0; i < 4; ++i) { + mat4x4_row(r, M, i); + M[3][i] += vec4_mul_inner(r, t); + } +} +static inline void mat4x4_from_vec3_mul_outer(mat4x4 M, vec3 a, vec3 b) { + int i, j; + for (i = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) M[i][j] = i < 3 && j < 3 ? a[i] * b[j] : 0.f; +} +static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, float angle) { + float s = sinf(angle); + float c = cosf(angle); + vec3 u = {x, y, z}; + + if (vec3_len(u) > 1e-4) { + vec3_norm(u, u); + mat4x4 T; + mat4x4_from_vec3_mul_outer(T, u, u); + + mat4x4 S = {{0, u[2], -u[1], 0}, {-u[2], 0, u[0], 0}, {u[1], -u[0], 0, 0}, {0, 0, 0, 0}}; + mat4x4_scale(S, S, s); + + mat4x4 C; + mat4x4_identity(C); + mat4x4_sub(C, C, T); + + mat4x4_scale(C, C, c); + + mat4x4_add(T, T, C); + mat4x4_add(T, T, S); + + T[3][3] = 1.; + mat4x4_mul(R, M, T); + } else { + mat4x4_dup(R, M); + } +} +static inline void mat4x4_rotate_X(mat4x4 Q, mat4x4 M, float angle) { + float s = sinf(angle); + float c = cosf(angle); + mat4x4 R = {{1.f, 0.f, 0.f, 0.f}, {0.f, c, s, 0.f}, {0.f, -s, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; + mat4x4_mul(Q, M, R); +} +static inline void mat4x4_rotate_Y(mat4x4 Q, mat4x4 M, float angle) { + float s = sinf(angle); + float c = cosf(angle); + mat4x4 R = {{c, 0.f, s, 0.f}, {0.f, 1.f, 0.f, 0.f}, {-s, 0.f, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; + mat4x4_mul(Q, M, R); +} +static inline void mat4x4_rotate_Z(mat4x4 Q, mat4x4 M, float angle) { + float s = sinf(angle); + float c = cosf(angle); + mat4x4 R = {{c, s, 0.f, 0.f}, {-s, c, 0.f, 0.f}, {0.f, 0.f, 1.f, 0.f}, {0.f, 0.f, 0.f, 1.f}}; + 
mat4x4_mul(Q, M, R); +} +static inline void mat4x4_invert(mat4x4 T, mat4x4 M) { + float s[6]; + float c[6]; + s[0] = M[0][0] * M[1][1] - M[1][0] * M[0][1]; + s[1] = M[0][0] * M[1][2] - M[1][0] * M[0][2]; + s[2] = M[0][0] * M[1][3] - M[1][0] * M[0][3]; + s[3] = M[0][1] * M[1][2] - M[1][1] * M[0][2]; + s[4] = M[0][1] * M[1][3] - M[1][1] * M[0][3]; + s[5] = M[0][2] * M[1][3] - M[1][2] * M[0][3]; + + c[0] = M[2][0] * M[3][1] - M[3][0] * M[2][1]; + c[1] = M[2][0] * M[3][2] - M[3][0] * M[2][2]; + c[2] = M[2][0] * M[3][3] - M[3][0] * M[2][3]; + c[3] = M[2][1] * M[3][2] - M[3][1] * M[2][2]; + c[4] = M[2][1] * M[3][3] - M[3][1] * M[2][3]; + c[5] = M[2][2] * M[3][3] - M[3][2] * M[2][3]; + + /* Assumes it is invertible */ + float idet = 1.0f / (s[0] * c[5] - s[1] * c[4] + s[2] * c[3] + s[3] * c[2] - s[4] * c[1] + s[5] * c[0]); + + T[0][0] = (M[1][1] * c[5] - M[1][2] * c[4] + M[1][3] * c[3]) * idet; + T[0][1] = (-M[0][1] * c[5] + M[0][2] * c[4] - M[0][3] * c[3]) * idet; + T[0][2] = (M[3][1] * s[5] - M[3][2] * s[4] + M[3][3] * s[3]) * idet; + T[0][3] = (-M[2][1] * s[5] + M[2][2] * s[4] - M[2][3] * s[3]) * idet; + + T[1][0] = (-M[1][0] * c[5] + M[1][2] * c[2] - M[1][3] * c[1]) * idet; + T[1][1] = (M[0][0] * c[5] - M[0][2] * c[2] + M[0][3] * c[1]) * idet; + T[1][2] = (-M[3][0] * s[5] + M[3][2] * s[2] - M[3][3] * s[1]) * idet; + T[1][3] = (M[2][0] * s[5] - M[2][2] * s[2] + M[2][3] * s[1]) * idet; + + T[2][0] = (M[1][0] * c[4] - M[1][1] * c[2] + M[1][3] * c[0]) * idet; + T[2][1] = (-M[0][0] * c[4] + M[0][1] * c[2] - M[0][3] * c[0]) * idet; + T[2][2] = (M[3][0] * s[4] - M[3][1] * s[2] + M[3][3] * s[0]) * idet; + T[2][3] = (-M[2][0] * s[4] + M[2][1] * s[2] - M[2][3] * s[0]) * idet; + + T[3][0] = (-M[1][0] * c[3] + M[1][1] * c[1] - M[1][2] * c[0]) * idet; + T[3][1] = (M[0][0] * c[3] - M[0][1] * c[1] + M[0][2] * c[0]) * idet; + T[3][2] = (-M[3][0] * s[3] + M[3][1] * s[1] - M[3][2] * s[0]) * idet; + T[3][3] = (M[2][0] * s[3] - M[2][1] * s[1] + M[2][2] * s[0]) * idet; +} +static inline 
void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) { + mat4x4_dup(R, M); + float s = 1.; + vec3 h; + + vec3_norm(R[2], R[2]); + + s = vec3_mul_inner(R[1], R[2]); + vec3_scale(h, R[2], s); + vec3_sub(R[1], R[1], h); + vec3_norm(R[2], R[2]); + + s = vec3_mul_inner(R[1], R[2]); + vec3_scale(h, R[2], s); + vec3_sub(R[1], R[1], h); + vec3_norm(R[1], R[1]); + + s = vec3_mul_inner(R[0], R[1]); + vec3_scale(h, R[1], s); + vec3_sub(R[0], R[0], h); + vec3_norm(R[0], R[0]); +} + +static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, float n, float f) { + M[0][0] = 2.f * n / (r - l); + M[0][1] = M[0][2] = M[0][3] = 0.f; + + M[1][1] = 2.f * n / (t - b); + M[1][0] = M[1][2] = M[1][3] = 0.f; + + M[2][0] = (r + l) / (r - l); + M[2][1] = (t + b) / (t - b); + M[2][2] = -(f + n) / (f - n); + M[2][3] = -1.f; + + M[3][2] = -2.f * (f * n) / (f - n); + M[3][0] = M[3][1] = M[3][3] = 0.f; +} +static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, float n, float f) { + M[0][0] = 2.f / (r - l); + M[0][1] = M[0][2] = M[0][3] = 0.f; + + M[1][1] = 2.f / (t - b); + M[1][0] = M[1][2] = M[1][3] = 0.f; + + M[2][2] = -2.f / (f - n); + M[2][0] = M[2][1] = M[2][3] = 0.f; + + M[3][0] = -(r + l) / (r - l); + M[3][1] = -(t + b) / (t - b); + M[3][2] = -(f + n) / (f - n); + M[3][3] = 1.f; +} +static inline void mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float n, float f) { + /* NOTE: Degrees are an unhandy unit to work with. + * linmath.h uses radians for everything! 
*/ + float const a = (float)(1.f / tan(y_fov / 2.f)); + + m[0][0] = a / aspect; + m[0][1] = 0.f; + m[0][2] = 0.f; + m[0][3] = 0.f; + + m[1][0] = 0.f; + m[1][1] = a; + m[1][2] = 0.f; + m[1][3] = 0.f; + + m[2][0] = 0.f; + m[2][1] = 0.f; + m[2][2] = -((f + n) / (f - n)); + m[2][3] = -1.f; + + m[3][0] = 0.f; + m[3][1] = 0.f; + m[3][2] = -((2.f * f * n) / (f - n)); + m[3][3] = 0.f; +} +static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) { + /* Adapted from Android's OpenGL Matrix.java. */ + /* See the OpenGL GLUT documentation for gluLookAt for a description */ + /* of the algorithm. We implement it in a straightforward way: */ + + /* TODO: The negation of of can be spared by swapping the order of + * operands in the following cross products in the right way. */ + vec3 f; + vec3_sub(f, center, eye); + vec3_norm(f, f); + + vec3 s; + vec3_mul_cross(s, f, up); + vec3_norm(s, s); + + vec3 t; + vec3_mul_cross(t, s, f); + + m[0][0] = s[0]; + m[0][1] = t[0]; + m[0][2] = -f[0]; + m[0][3] = 0.f; + + m[1][0] = s[1]; + m[1][1] = t[1]; + m[1][2] = -f[1]; + m[1][3] = 0.f; + + m[2][0] = s[2]; + m[2][1] = t[2]; + m[2][2] = -f[2]; + m[2][3] = 0.f; + + m[3][0] = 0.f; + m[3][1] = 0.f; + m[3][2] = 0.f; + m[3][3] = 1.f; + + mat4x4_translate_in_place(m, -eye[0], -eye[1], -eye[2]); +} + +typedef float quat[4]; +static inline void quat_identity(quat q) { + q[0] = q[1] = q[2] = 0.f; + q[3] = 1.f; +} +static inline void quat_add(quat r, quat a, quat b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; +} +static inline void quat_sub(quat r, quat a, quat b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; +} +static inline void quat_mul(quat r, quat p, quat q) { + vec3 w; + vec3_mul_cross(r, p, q); + vec3_scale(w, p, q[3]); + vec3_add(r, r, w); + vec3_scale(w, q, p[3]); + vec3_add(r, r, w); + r[3] = p[3] * q[3] - vec3_mul_inner(p, q); +} +static inline void quat_scale(quat r, quat v, float s) { + int i; + for (i = 0; i < 4; ++i) r[i] = v[i] * s; +} +static 
inline float quat_inner_product(quat a, quat b) { + float p = 0.f; + int i; + for (i = 0; i < 4; ++i) p += b[i] * a[i]; + return p; +} +static inline void quat_conj(quat r, quat q) { + int i; + for (i = 0; i < 3; ++i) r[i] = -q[i]; + r[3] = q[3]; +} +#define quat_norm vec4_norm +static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) { + quat v_ = {v[0], v[1], v[2], 0.f}; + + quat_conj(r, q); + quat_norm(r, r); + quat_mul(r, v_, r); + quat_mul(r, q, r); +} +static inline void mat4x4_from_quat(mat4x4 M, quat q) { + float a = q[3]; + float b = q[0]; + float c = q[1]; + float d = q[2]; + float a2 = a * a; + float b2 = b * b; + float c2 = c * c; + float d2 = d * d; + + M[0][0] = a2 + b2 - c2 - d2; + M[0][1] = 2.f * (b * c + a * d); + M[0][2] = 2.f * (b * d - a * c); + M[0][3] = 0.f; + + M[1][0] = 2 * (b * c - a * d); + M[1][1] = a2 - b2 + c2 - d2; + M[1][2] = 2.f * (c * d + a * b); + M[1][3] = 0.f; + + M[2][0] = 2.f * (b * d + a * c); + M[2][1] = 2.f * (c * d - a * b); + M[2][2] = a2 - b2 - c2 + d2; + M[2][3] = 0.f; + + M[3][0] = M[3][1] = M[3][2] = 0.f; + M[3][3] = 1.f; +} + +static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) { + /* XXX: The way this is written only works for othogonal matrices. */ + /* TODO: Take care of non-orthogonal case. 
*/ + quat_mul_vec3(R[0], q, M[0]); + quat_mul_vec3(R[1], q, M[1]); + quat_mul_vec3(R[2], q, M[2]); + + R[3][0] = R[3][1] = R[3][2] = 0.f; + R[3][3] = 1.f; +} +static inline void quat_from_mat4x4(quat q, mat4x4 M) { + float r = 0.f; + int i; + + int perm[] = {0, 1, 2, 0, 1}; + int *p = perm; + + for (i = 0; i < 3; i++) { + float m = M[i][i]; + if (m < r) continue; + m = r; + p = &perm[i]; + } + + r = sqrtf(1.f + M[p[0]][p[0]] - M[p[1]][p[1]] - M[p[2]][p[2]]); + + if (r < 1e-6) { + q[0] = 1.f; + q[1] = q[2] = q[3] = 0.f; + return; + } + + q[0] = r / 2.f; + q[1] = (M[p[0]][p[1]] - M[p[1]][p[0]]) / (2.f * r); + q[2] = (M[p[2]][p[0]] - M[p[0]][p[2]]) / (2.f * r); + q[3] = (M[p[2]][p[1]] - M[p[1]][p[2]]) / (2.f * r); +} + +#endif diff --git a/Samples/simpleVulkan/shader_sine.frag b/Samples/simpleVulkan/shader_sine.frag new file mode 100644 index 00000000..1730b4f1 --- /dev/null +++ b/Samples/simpleVulkan/shader_sine.frag @@ -0,0 +1,10 @@ +#version 450 +#extension GL_ARB_separate_shader_objects : enable + +layout(location = 0) in vec3 fragColor; + +layout(location = 0) out vec4 outColor; + +void main() { + outColor = vec4(fragColor, 1.0); +} \ No newline at end of file diff --git a/Samples/simpleVulkan/shader_sine.vert b/Samples/simpleVulkan/shader_sine.vert new file mode 100644 index 00000000..80196343 --- /dev/null +++ b/Samples/simpleVulkan/shader_sine.vert @@ -0,0 +1,23 @@ +#version 450 +#extension GL_ARB_separate_shader_objects : enable + + +layout(binding = 0) uniform UniformBufferObject { + mat4 model; + mat4 view; + mat4 proj; +} ubo; + +layout(location = 0) in vec4 inPosition; +layout(location = 1) in vec3 inColor; + +layout(location = 0) out vec3 fragColor; + +out gl_PerVertex { + vec4 gl_Position; +}; + +void main() { + gl_Position = ubo.proj * ubo.view * ubo.model * inPosition; + fragColor = inColor; +} \ No newline at end of file diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2010.sln b/Samples/simpleVulkan/simpleVulkan_vs2013.sln similarity index 75% 
rename from Samples/matrixMulDrv/matrixMulDrv_vs2010.sln rename to Samples/simpleVulkan/simpleVulkan_vs2013.sln index 0c4ee250..e5c843d0 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2010.sln +++ b/Samples/simpleVulkan/simpleVulkan_vs2013.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMulDrv", "matrixMulDrv_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleVulkan", "simpleVulkan_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj new file mode 100644 index 00000000..0f630f8c --- /dev/null +++ b/Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj @@ -0,0 +1,122 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleVulkan_vs2013 + simpleVulkan + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(VULKAN_SDK)/include; + + + Console + cudart_static.lib;vulkan-1.lib;glfw3dll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir);../../common/lib/$(PlatformName);$(VULKAN_SDK)/Lib; + $(OutDir)/simpleVulkan.exe + + + 
compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert +$(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag + + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + + + + + + + + diff --git a/Samples/shfl_scan/shfl_scan_vs2010.sln b/Samples/simpleVulkan/simpleVulkan_vs2015.sln similarity index 74% rename from Samples/shfl_scan/shfl_scan_vs2010.sln rename to Samples/simpleVulkan/simpleVulkan_vs2015.sln index 0ac6b8a7..50a142aa 100644 --- a/Samples/shfl_scan/shfl_scan_vs2010.sln +++ b/Samples/simpleVulkan/simpleVulkan_vs2015.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "shfl_scan", "shfl_scan_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleVulkan", "simpleVulkan_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj new file mode 100644 index 00000000..0e4e9d64 --- /dev/null +++ b/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj @@ -0,0 +1,122 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleVulkan_vs2015 + simpleVulkan + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + 
AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(VULKAN_SDK)/include; + + + Console + cudart_static.lib;vulkan-1.lib;glfw3dll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir);../../common/lib/$(PlatformName);$(VULKAN_SDK)/Lib; + $(OutDir)/simpleVulkan.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert +$(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag + + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + + + + + + + + diff --git a/Samples/matrixMul/matrixMul_vs2010.sln b/Samples/simpleVulkan/simpleVulkan_vs2017.sln similarity index 74% rename from Samples/matrixMul/matrixMul_vs2010.sln rename to Samples/simpleVulkan/simpleVulkan_vs2017.sln index 99cfcad9..6fadb97a 100644 --- a/Samples/matrixMul/matrixMul_vs2010.sln +++ b/Samples/simpleVulkan/simpleVulkan_vs2017.sln @@ -1,7 +1,7 @@  -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrixMul", "matrixMul_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleVulkan", "simpleVulkan_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git 
a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj new file mode 100644 index 00000000..7c45957f --- /dev/null +++ b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj @@ -0,0 +1,123 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleVulkan_vs2017 + simpleVulkan + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(VULKAN_SDK)/include; + + + Console + cudart_static.lib;vulkan-1.lib;glfw3dll.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir);../../common/lib/$(PlatformName);$(VULKAN_SDK)/Lib; + $(OutDir)/simpleVulkan.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert +$(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag + + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + + + + + + + + + + diff --git a/Samples/simpleVulkan/vulkanCUDASinewave.cu b/Samples/simpleVulkan/vulkanCUDASinewave.cu new file mode 100644 index 00000000..6de350d7 --- /dev/null +++ b/Samples/simpleVulkan/vulkanCUDASinewave.cu @@ -0,0 +1,1863 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define GLFW_INCLUDE_VULKAN +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN64 +#include +#include +#include +#include +#include +#define _USE_MATH_DEFINES +#endif + +#include +#include +#include + +#include "linmath.h" + +#define WIDTH 800 +#define HEIGHT 600 + +#define VULKAN_VALIDATION 0 + +const std::vector validationLayers = { + "VK_LAYER_LUNARG_standard_validation"}; + +#if VULKAN_VALIDATION +const bool enableValidationLayers = true; +#else +const bool enableValidationLayers = false; +#endif + +struct QueueFamilyIndices { + int graphicsFamily = -1; + int presentFamily = -1; + + bool isComplete() { return graphicsFamily >= 0 && presentFamily >= 0; } +}; + +const std::vector deviceExtensions = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME, +#ifdef _WIN64 + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, +#else + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, +#endif +}; + +#ifdef _WIN64 +class WindowsSecurityAttributes { + protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; + + public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES* operator&(); + ~WindowsSecurityAttributes(); +}; + +WindowsSecurityAttributes::WindowsSecurityAttributes() { + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( + 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void**)); + // CHECK_NEQ(m_winPSecurityDescriptor, (PSECURITY_DESCRIPTOR)NULL); + + PSID* ppSID = + (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*)); + + InitializeSecurityDescriptor(m_winPSecurityDescriptor, + SECURITY_DESCRIPTOR_REVISION); + + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = + 
SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, + 0, 0, 0, 0, 0, ppSID); + + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = + STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; + + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; +} + +SECURITY_ATTRIBUTES* WindowsSecurityAttributes::operator&() { + return &m_winSecurityAttributes; +} + +WindowsSecurityAttributes::~WindowsSecurityAttributes() { + PSID* ppSID = + (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*)); + + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); +} +#endif + +struct UniformBufferObject { + mat4x4 model; + mat4x4 view; + mat4x4 proj; +}; + +struct SwapChainSupportDetails { + VkSurfaceCapabilitiesKHR capabilities; + std::vector formats; + std::vector presentModes; +}; + +struct Vertex { + vec4 pos; + vec3 color; + + static VkVertexInputBindingDescription getBindingDescription() { + VkVertexInputBindingDescription bindingDescription = {}; + + bindingDescription.binding = 0; + bindingDescription.stride = sizeof(Vertex); + bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + return bindingDescription; + } + + static std::array + getAttributeDescriptions() { + std::array 
attributeDescriptions = {}; + attributeDescriptions[0].binding = 0; + attributeDescriptions[0].location = 0; + attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT; + attributeDescriptions[0].offset = offsetof(Vertex, pos); + + attributeDescriptions[1].binding = 0; + attributeDescriptions[1].location = 1; + attributeDescriptions[1].format = VK_FORMAT_R32G32B32_SFLOAT; + attributeDescriptions[1].offset = offsetof(Vertex, color); + return attributeDescriptions; + } +}; + +size_t mesh_width = 0, mesh_height = 0; +std::string execution_path; + +__global__ void sinewave_gen_kernel(Vertex* vertices, unsigned int width, + unsigned int height, float time) { + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + + // calculate uv coordinates + float u = x / (float)width; + float v = y / (float)height; + u = u * 2.0f - 1.0f; + v = v * 2.0f - 1.0f; + + // calculate simple sine wave pattern + float freq = 4.0f; + float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; + + if (y < height && x < width) { + // write output vertex + vertices[y * width + x].pos[0] = u; + vertices[y * width + x].pos[1] = w; + vertices[y * width + x].pos[2] = v; + vertices[y * width + x].pos[3] = 1.0f; + vertices[y * width + x].color[0] = 1.0f; + vertices[y * width + x].color[1] = 0.0f; + vertices[y * width + x].color[2] = 0.0f; + } +} + +class vulkanCudaApp { + public: + void run() { + initWindow(); + initVulkan(); + initCuda(); + mainLoop(); + cleanup(); + } + + private: + GLFWwindow* window; + VkInstance instance; + VkPhysicalDevice physicalDevice = VK_NULL_HANDLE; + uint8_t vkDeviceUUID[VK_UUID_SIZE]; + VkDevice device; + VkQueue graphicsQueue; + VkQueue presentQueue; + VkSurfaceKHR surface; + VkSwapchainKHR swapChain; + std::vector swapChainImages; + VkFormat swapChainImageFormat; + VkExtent2D swapChainExtent; + std::vector swapChainImageViews; + VkDescriptorSetLayout descriptorSetLayout; + VkDescriptorPool 
descriptorPool; + VkDescriptorSet descriptorSet; + VkPipelineLayout pipelineLayout; + VkRenderPass renderPass; + VkPipeline graphicsPipeline; + std::vector swapChainFramebuffers; + VkCommandPool commandPool; + VkBuffer vertexBuffer; + VkDeviceMemory vertexBufferMemory; + VkBuffer uniformBuffer; + VkDeviceMemory uniformBufferMemory; + std::vector commandBuffers; + VkSemaphore imageAvailableSemaphore; + VkSemaphore renderFinishedSemaphore; + VkSemaphore cudaUpdateVkVertexBufSemaphore; + VkSemaphore vkUpdateCudaVertexBufSemaphore; + + size_t vertexBufSize = 0; + bool startSubmit = 0; + double AnimTime = 1.0f; + + VkDebugReportCallbackEXT callback; + +#ifdef _WIN64 + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; +#else + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; +#endif + + PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; + + // CUDA stuff + cudaExternalMemory_t cudaExtMemVertexBuffer; + cudaExternalSemaphore_t cudaExtCudaUpdateVkVertexBufSemaphore; + cudaExternalSemaphore_t cudaExtVkUpdateCudaVertexBufSemaphore; + void* cudaDevVertptr = NULL; + cudaStream_t streamToRun; + + bool checkValidationLayerSupport() { + uint32_t layerCount; + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + + std::vector availableLayers(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + + for (const char* layerName : validationLayers) { + bool layerFound = false; + + for (const auto& layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } + } + + return true; + } + + static VKAPI_ATTR VkBool32 VKAPI_CALL + debugCallback(VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objType, + uint64_t obj, size_t location, int32_t code, + const char* layerPrefix, const char* msg, void* userData) 
{ + std::cerr << "validation layer: " << msg << std::endl; + + return VK_FALSE; + } + + VkResult CreateDebugReportCallbackEXT( + VkInstance instance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback) { + auto func = (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance, "vkCreateDebugReportCallbackEXT"); + if (func != nullptr) { + return func(instance, pCreateInfo, pAllocator, pCallback); + } else { + return VK_ERROR_EXTENSION_NOT_PRESENT; + } + } + + void DestroyDebugReportCallbackEXT(VkInstance instance, + VkDebugReportCallbackEXT callback, + const VkAllocationCallbacks* pAllocator) { + auto func = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance, "vkDestroyDebugReportCallbackEXT"); + if (func != nullptr) { + func(instance, callback, pAllocator); + } + } + + void setupDebugCallback() { + if (!enableValidationLayers) return; + + VkDebugReportCallbackCreateInfoEXT createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT; + createInfo.flags = + VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT; + createInfo.pfnCallback = debugCallback; + + if (CreateDebugReportCallbackEXT(instance, &createInfo, nullptr, + &callback) != VK_SUCCESS) { + throw std::runtime_error("failed to set up debug callback!"); + } + } + void initWindow() { + glfwInit(); + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); + window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan-CUDA Interop Sinewave", + nullptr, nullptr); + } + + void createInstance() { + if (enableValidationLayers && !checkValidationLayerSupport()) { + throw std::runtime_error( + "validation layers requested, but not available!"); + } + + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = "Vulkan CUDA Sinewave"; + appInfo.applicationVersion = 
VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_0; + + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; + + uint32_t glfwExtensionCount = 0; + const char** glfwExtensions; + + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + + std::vector enabledExtensionNameList; + enabledExtensionNameList.push_back( + VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); + + for (int i = 0; i < glfwExtensionCount; i++) { + enabledExtensionNameList.push_back(glfwExtensions[i]); + } + if (enableValidationLayers) { + enabledExtensionNameList.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); + createInfo.enabledLayerCount = + static_cast(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + } else { + createInfo.enabledLayerCount = 0; + } + + createInfo.enabledExtensionCount = enabledExtensionNameList.size(); + createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); + + if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) { + throw std::runtime_error("failed to create instance!"); + } else { + std::cout << "Instance created successfully!!\n"; + } + + fpGetPhysicalDeviceProperties2 = + (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr( + instance, "vkGetPhysicalDeviceProperties2"); + if (fpGetPhysicalDeviceProperties2 == NULL) { + throw std::runtime_error( + "Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not " + "found.\n"); + } + +#ifdef _WIN64 + fpGetMemoryWin32HandleKHR = + (PFN_vkGetMemoryWin32HandleKHR)vkGetInstanceProcAddr( + instance, "vkGetMemoryWin32HandleKHR"); + if 
(fpGetMemoryWin32HandleKHR == NULL) { + throw std::runtime_error( + "Vulkan: Proc address for \"vkGetMemoryWin32HandleKHR\" not " + "found.\n"); + } +#else + fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetInstanceProcAddr( + instance, "vkGetMemoryFdKHR"); + if (fpGetMemoryFdKHR == NULL) { + throw std::runtime_error( + "Vulkan: Proc address for \"vkGetMemoryFdKHR\" not found.\n"); + } +#endif + } + + void initVulkan() { + createInstance(); + setupDebugCallback(); + createSurface(); + pickPhysicalDevice(); + createLogicalDevice(); + getKhrExtensionsFn(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createFramebuffers(); + createCommandPool(); + createVertexBuffer(); + createUniformBuffer(); + createDescriptorPool(); + createDescriptorSet(); + createCommandBuffers(); + createSyncObjects(); + createSyncObjectsExt(); + } + + void initCuda() { + setCudaVkDevice(); + cudaVkImportVertexMem(); + cudaInitVertexMem(); + cudaVkImportSemaphore(); + } + + void createSurface() { + if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != + VK_SUCCESS) { + throw std::runtime_error("failed to create window surface!"); + } + } + + void pickPhysicalDevice() { + uint32_t deviceCount = 0; + + vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr); + + if (deviceCount == 0) { + throw std::runtime_error("failed to find GPUs with Vulkan support!"); + } + + std::vector devices(deviceCount); + vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data()); + + for (const auto& device : devices) { + if (isDeviceSuitable(device)) { + physicalDevice = device; + break; + } + } + if (physicalDevice == VK_NULL_HANDLE) { + throw std::runtime_error("failed to find a suitable GPU!"); + } + + std::cout << "Selected physical device = " << physicalDevice << std::endl; + + VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; + vkPhysicalDeviceIDProperties.sType = + 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + vkPhysicalDeviceIDProperties.pNext = NULL; + + VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; + vkPhysicalDeviceProperties2.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; + + fpGetPhysicalDeviceProperties2(physicalDevice, + &vkPhysicalDeviceProperties2); + + memcpy(vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, + sizeof(vkDeviceUUID)); + } + + int setCudaVkDevice() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the GPU which is selected by Vulkan + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); + + if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + // Compare the cuda device UUID with vulkan UUID + int ret = memcmp(&deviceProp.uuid, &vkDeviceUUID, VK_UUID_SIZE); + if (ret == 0) { + checkCudaErrors(cudaSetDevice(current_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, deviceProp.name, deviceProp.major, + deviceProp.minor); + + return current_device; + } + + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No Vulkan-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; + } + + bool isDeviceSuitable(VkPhysicalDevice device) { + QueueFamilyIndices indices = findQueueFamilies(device); + bool extensionsSupported = checkDeviceExtensionSupport(device); + + bool swapChainAdequate = false; + if (extensionsSupported) { + SwapChainSupportDetails swapChainSupport = 
querySwapChainSupport(device); + swapChainAdequate = !swapChainSupport.formats.empty() && + !swapChainSupport.presentModes.empty(); + } + + return indices.isComplete() && extensionsSupported && swapChainAdequate; + } + + bool checkDeviceExtensionSupport(VkPhysicalDevice device) { + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, + nullptr); + + std::vector availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, + availableExtensions.data()); + + std::set requiredExtensions(deviceExtensions.begin(), + deviceExtensions.end()); + + for (const auto& extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); + } + + return requiredExtensions.empty(); + } + + QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) { + QueueFamilyIndices indices; + uint32_t queueFamilyCount = 0; + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, + nullptr); + + std::vector queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, + queueFamilies.data()); + + int i = 0; + for (const auto& queueFamily : queueFamilies) { + if (queueFamily.queueCount > 0 && + queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) { + indices.graphicsFamily = i; + } + + VkBool32 presentSupport = false; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + + if (queueFamily.queueCount > 0 && presentSupport) { + indices.presentFamily = i; + } + + if (indices.isComplete()) { + break; + } + i++; + } + return indices; + } + + SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) { + SwapChainSupportDetails details; + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, + &details.capabilities); + + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, + nullptr); + + if (formatCount != 0) { + details.formats.resize(formatCount); + 
vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, + details.formats.data()); + } + + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, + &presentModeCount, nullptr); + + if (presentModeCount != 0) { + details.presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR( + device, surface, &presentModeCount, details.presentModes.data()); + } + + return details; + } + + VkSurfaceFormatKHR chooseSwapSurfaceFormat( + const std::vector& availableFormats) { + if (availableFormats.size() == 1 && + availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; + } + + for (const auto& availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && + availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } + + return availableFormats[0]; + } + + VkPresentModeKHR chooseSwapPresentMode( + const std::vector availablePresentModes) { + VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto& availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } + } + + return bestMode; + } + + VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities) { + if (capabilities.currentExtent.width != + std::numeric_limits::max()) { + return capabilities.currentExtent; + } else { + VkExtent2D actualExtent = {WIDTH, HEIGHT}; + + actualExtent.width = std::max( + capabilities.minImageExtent.width, + std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max( + capabilities.minImageExtent.height, + std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } + } + + void createLogicalDevice() { 
+ QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + + std::vector queueCreateInfos; + std::set uniqueQueueFamilies = {indices.graphicsFamily, + indices.presentFamily}; + + float queuePriority = 1.0f; + for (int queueFamily : uniqueQueueFamilies) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = queueFamily; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = queueCreateInfos.size(); + + createInfo.pEnabledFeatures = &deviceFeatures; + std::vector enabledExtensionNameList; + + for (int i = 0; i < deviceExtensions.size(); i++) { + enabledExtensionNameList.push_back(deviceExtensions[i]); + } + if (enableValidationLayers) { + createInfo.enabledLayerCount = + static_cast(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + } else { + createInfo.enabledLayerCount = 0; + } + createInfo.enabledExtensionCount = + static_cast(enabledExtensionNameList.size()); + createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); + + if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != + VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); + } + vkGetDeviceQueue(device, indices.graphicsFamily, 0, &graphicsQueue); + vkGetDeviceQueue(device, indices.presentFamily, 0, &presentQueue); + } + + void createSwapChain() { + SwapChainSupportDetails swapChainSupport = + querySwapChainSupport(physicalDevice); + + VkSurfaceFormatKHR surfaceFormat = + chooseSwapSurfaceFormat(swapChainSupport.formats); + VkPresentModeKHR presentMode = + 
chooseSwapPresentMode(swapChainSupport.presentModes); + VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities); + + uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; + if (swapChainSupport.capabilities.maxImageCount > 0 && + imageCount > swapChainSupport.capabilities.maxImageCount) { + imageCount = swapChainSupport.capabilities.maxImageCount; + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = surface; + createInfo.minImageCount = imageCount; + createInfo.imageFormat = surfaceFormat.format; + createInfo.imageColorSpace = surfaceFormat.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + uint32_t queueFamilyIndices[] = {(uint32_t)indices.graphicsFamily, + (uint32_t)indices.presentFamily}; + + if (indices.graphicsFamily != indices.presentFamily) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = 2; + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + createInfo.queueFamilyIndexCount = 0; // Optional + createInfo.pQueueFamilyIndices = nullptr; // Optional + } + + createInfo.preTransform = swapChainSupport.capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + createInfo.oldSwapchain = VK_NULL_HANDLE; + + if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != + VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } else { + std::cout << "Swapchain created!!\n"; + } + + vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr); + swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(device, swapChain, 
&imageCount, + swapChainImages.data()); + + swapChainImageFormat = surfaceFormat.format; + swapChainExtent = extent; + } + + void createImageViews() { + swapChainImageViews.resize(swapChainImages.size()); + + for (size_t i = 0; i < swapChainImages.size(); i++) { + VkImageViewCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + createInfo.image = swapChainImages[i]; + createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + createInfo.format = swapChainImageFormat; + + createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + createInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + createInfo.subresourceRange.baseMipLevel = 0; + createInfo.subresourceRange.levelCount = 1; + createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + + if (vkCreateImageView(device, &createInfo, nullptr, + &swapChainImageViews[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create image views!"); + } + } + } + + void createDescriptorSetLayout() { + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + uboLayoutBinding.pImmutableSamplers = nullptr; // Optional + + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &uboLayoutBinding; + + if (vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, + &descriptorSetLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor set layout!"); + } + } + + void createGraphicsPipeline() { + auto vertShaderCode = readFile("vert.spv"); 
+ auto fragShaderCode = readFile("frag.spv"); + + VkShaderModule vertShaderModule; + VkShaderModule fragShaderModule; + + vertShaderModule = createShaderModule(vertShaderCode); + fragShaderModule = createShaderModule(fragShaderCode); + + VkPipelineShaderStageCreateInfo vertShaderStageInfo = {}; + vertShaderStageInfo.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT; + vertShaderStageInfo.module = vertShaderModule; + vertShaderStageInfo.pName = "main"; + + VkPipelineShaderStageCreateInfo fragShaderStageInfo = {}; + fragShaderStageInfo.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT; + fragShaderStageInfo.module = fragShaderModule; + fragShaderStageInfo.pName = "main"; + + VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, + fragShaderStageInfo}; + + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; + vertexInputInfo.sType = + VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + auto bindingDescription = Vertex::getBindingDescription(); + auto attributeDescriptions = Vertex::getAttributeDescriptions(); + vertexInputInfo.vertexBindingDescriptionCount = 1; + vertexInputInfo.pVertexBindingDescriptions = &bindingDescription; + vertexInputInfo.vertexAttributeDescriptionCount = + static_cast(attributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + inputAssembly.sType = + VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + inputAssembly.primitiveRestartEnable = VK_FALSE; + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)swapChainExtent.width; + viewport.height = (float)swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D 
scissor = {}; + scissor.offset = {0, 0}; + scissor.extent = swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = + VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_FILL; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_BACK_BIT; + rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + rasterizer.depthBiasConstantFactor = 0.0f; // Optional + rasterizer.depthBiasClamp = 0.0f; // Optional + rasterizer.depthBiasSlopeFactor = 0.0f; // Optional + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = + VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + colorBlendAttachment.srcColorBlendFactor = VK_BLEND_FACTOR_ONE; // Optional + colorBlendAttachment.dstColorBlendFactor = + VK_BLEND_FACTOR_ZERO; // Optional + colorBlendAttachment.colorBlendOp = VK_BLEND_OP_ADD; // Optional + colorBlendAttachment.srcAlphaBlendFactor = 
VK_BLEND_FACTOR_ONE; // Optional + colorBlendAttachment.dstAlphaBlendFactor = + VK_BLEND_FACTOR_ZERO; // Optional + colorBlendAttachment.alphaBlendOp = VK_BLEND_OP_ADD; // Optional + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; // Optional + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; // Optional + colorBlending.blendConstants[1] = 0.0f; // Optional + colorBlending.blendConstants[2] = 0.0f; // Optional + colorBlending.blendConstants[3] = 0.0f; // Optional + +#if 0 + VkDynamicState dynamicStates[] = { + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_LINE_WIDTH + }; + + VkPipelineDynamicStateCreateInfo dynamicState = {}; + dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamicState.dynamicStateCount = 2; + dynamicState.pDynamicStates = dynamicStates; +#endif + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, + &pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = 2; + pipelineInfo.pStages = shaderStages; + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState 
= &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = nullptr; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + pipelineInfo.layout = pipelineLayout; + pipelineInfo.renderPass = renderPass; + pipelineInfo.subpass = 0; + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, + nullptr, &graphicsPipeline) != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } else { + std::cout << "Pipeline created successfully!!\n"; + } + vkDestroyShaderModule(device, fragShaderModule, nullptr); + vkDestroyShaderModule(device, vertShaderModule, nullptr); + } + + void createRenderPass() { + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = swapChainImageFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = 1; + renderPassInfo.pAttachments = &colorAttachment; + renderPassInfo.subpassCount = 1; + 
renderPassInfo.pSubpasses = &subpass; + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != + VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } + } + + void createFramebuffers() { + swapChainFramebuffers.resize(swapChainImageViews.size()); + + for (size_t i = 0; i < swapChainImageViews.size(); i++) { + VkImageView attachments[] = {swapChainImageViews[i]}; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = renderPass; + framebufferInfo.attachmentCount = 1; + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = swapChainExtent.width; + framebufferInfo.height = swapChainExtent.height; + framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, + &swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); + } + } + } + + void createCommandPool() { + QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice); + + VkCommandPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily; + poolInfo.flags = 0; // Optional + + if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != + VK_SUCCESS) { + throw std::runtime_error("failed to create command pool!"); + } + } + + void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + 
VkMemoryPropertyFlags properties, VkBuffer& buffer, + VkDeviceMemory& bufferMemory) { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(device, buffer, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = + findMemoryType(memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } + + vkBindBufferMemory(device, buffer, bufferMemory, 0); + } + + void createBufferExtMem(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, + VkBuffer& buffer, VkDeviceMemory& bufferMemory) { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(device, buffer, &memRequirements); + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + 
vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportMemoryAllocateInfoKHR.pNext = + extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR + ? &vulkanExportMemoryWin32HandleInfoKHR + : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; +#else + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = + findMemoryType(memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } + + vkBindBufferMemory(device, buffer, bufferMemory, 0); + } + + void createVertexBuffer() { + mesh_width = swapChainExtent.width / 2; + mesh_height = swapChainExtent.height / 2; + vertexBufSize = mesh_height * mesh_width; + + VkDeviceSize bufferSize = sizeof(Vertex) * vertexBufSize; +#ifdef _WIN64 + if (IsWindows8OrGreater()) { + createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, + vertexBuffer, vertexBufferMemory); + } else { + createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, 
+ vertexBuffer, vertexBufferMemory); + } +#else + createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + vertexBuffer, vertexBufferMemory); +#endif + } + + void cudaInitVertexMem() { + checkCudaErrors(cudaStreamCreate(&streamToRun)); + + dim3 block(16, 16, 1); + dim3 grid(mesh_width / 16, mesh_height / 16, 1); + Vertex* vertices = (Vertex*)cudaDevVertptr; + sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(vertices, mesh_width, + mesh_height, 1.0); + checkCudaErrors(cudaStreamSynchronize(streamToRun)); + } + + void createUniformBuffer() { + VkDeviceSize bufferSize = sizeof(UniformBufferObject); + createBuffer(bufferSize, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + uniformBuffer, uniformBufferMemory); + } + + uint32_t findMemoryType(uint32_t typeFilter, + VkMemoryPropertyFlags properties) { + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & + properties) == properties) { + return i; + } + } + + throw std::runtime_error("failed to find suitable memory type!"); + } + + void getKhrExtensionsFn() { +#ifdef _WIN64 + + fpGetSemaphoreWin32HandleKHR = + (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr( + device, "vkGetSemaphoreWin32HandleKHR"); + if (fpGetSemaphoreWin32HandleKHR == NULL) { + throw std::runtime_error( + "Vulkan: Proc address for \"vkGetSemaphoreWin32HandleKHR\" not " + "found.\n"); + } +#else + fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr( + device, "vkGetSemaphoreFdKHR"); + if (fpGetSemaphoreFdKHR == NULL) { + throw std::runtime_error( + "Vulkan: Proc address for \"vkGetSemaphoreFdKHR\" not found.\n"); + } +#endif + } + + void createCommandBuffers() { + 
commandBuffers.resize(swapChainFramebuffers.size()); + + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = commandPool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = (uint32_t)commandBuffers.size(); + + if (vkAllocateCommandBuffers(device, &allocInfo, commandBuffers.data()) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate command buffers!"); + } + + for (size_t i = 0; i < commandBuffers.size(); i++) { + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + beginInfo.pInheritanceInfo = nullptr; // Optional + + if (vkBeginCommandBuffer(commandBuffers[i], &beginInfo) != VK_SUCCESS) { + throw std::runtime_error("failed to begin recording command buffer!"); + } + + VkRenderPassBeginInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassInfo.renderPass = renderPass; + renderPassInfo.framebuffer = swapChainFramebuffers[i]; + renderPassInfo.renderArea.offset = {0, 0}; + renderPassInfo.renderArea.extent = swapChainExtent; + + VkClearValue clearColor = {0.0f, 0.0f, 0.0f, 1.0f}; + renderPassInfo.clearValueCount = 1; + renderPassInfo.pClearValues = &clearColor; + + vkCmdBeginRenderPass(commandBuffers[i], &renderPassInfo, + VK_SUBPASS_CONTENTS_INLINE); + vkCmdBindPipeline(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, + graphicsPipeline); + VkBuffer vertexBuffers[] = {vertexBuffer}; + VkDeviceSize offsets[] = {0}; + vkCmdBindVertexBuffers(commandBuffers[i], 0, 1, vertexBuffers, offsets); + vkCmdBindDescriptorSets(commandBuffers[i], + VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, + 0, 1, &descriptorSet, 0, nullptr); + vkCmdDraw(commandBuffers[i], static_cast<uint32_t>(vertexBufSize), 1, 0, + 0); + vkCmdEndRenderPass(commandBuffers[i]); + if 
(vkEndCommandBuffer(commandBuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to record command buffer!"); + } + } + } + + VkShaderModule createShaderModule(const std::vector<char>& code) { + VkShaderModuleCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = code.size(); + createInfo.pCode = reinterpret_cast<const uint32_t*>(code.data()); + + VkShaderModule shaderModule; + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != + VK_SUCCESS) { + throw std::runtime_error("failed to create shader module!"); + } + + return shaderModule; + } + + static std::vector<char> readFile(const std::string& filename) { + char* file_path = sdkFindFilePath(filename.c_str(), execution_path.c_str()); + + std::ifstream file(file_path, std::ios::ate | std::ios::binary); + + if (!file.is_open()) { + throw std::runtime_error("failed to open shader spv file!\n"); + } + size_t fileSize = (size_t)file.tellg(); + std::vector<char> buffer(fileSize); + file.seekg(0); + file.read(buffer.data(), fileSize); + file.close(); + + return buffer; + } + + void mainLoop() { + updateUniformBuffer(); + + while (!glfwWindowShouldClose(window)) { + glfwPollEvents(); + drawFrame(); + } + + vkDeviceWaitIdle(device); + } + + void updateUniformBuffer() { + UniformBufferObject ubo = {}; + + mat4x4_identity(ubo.model); + mat4x4 Model; + mat4x4_dup(Model, ubo.model); + mat4x4_rotate(ubo.model, Model, 1.0f, 0.0f, 1.0f, degreesToRadians(45.0f)); + + vec3 eye = {2.0f, 2.0f, 2.0f}; + vec3 center = {0.0f, 0.0f, 0.0f}; + vec3 up = {0.0f, 0.0f, 1.0f}; + mat4x4_look_at(ubo.view, eye, center, up); + mat4x4_perspective(ubo.proj, degreesToRadians(45.0f), + swapChainExtent.width / (float)swapChainExtent.height, + 0.1f, 10.0f); + ubo.proj[1][1] *= -1; + void* data; + vkMapMemory(device, uniformBufferMemory, 0, sizeof(ubo), 0, &data); + memcpy(data, &ubo, sizeof(ubo)); + vkUnmapMemory(device, uniformBufferMemory); + } + + void createDescriptorPool() { + 
VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = 1; + + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = 1; + + if (vkCreateDescriptorPool(device, &poolInfo, nullptr, &descriptorPool) != + VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } + } + + void createDescriptorSet() { + VkDescriptorSetLayout layouts[] = {descriptorSetLayout}; + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = descriptorPool; + allocInfo.descriptorSetCount = 1; + allocInfo.pSetLayouts = layouts; + + if (vkAllocateDescriptorSets(device, &allocInfo, &descriptorSet) != + VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor set!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.buffer = uniformBuffer; + bufferInfo.offset = 0; + bufferInfo.range = sizeof(UniformBufferObject); + + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstSet = descriptorSet; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, nullptr); + } + + void drawFrame() { + uint32_t imageIndex; + vkAcquireNextImageKHR(device, swapChain, + std::numeric_limits<uint64_t>::max(), + imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex); + + if (!startSubmit) { + submitVulkan(imageIndex); + startSubmit = 1; + } else { + submitVulkanCuda(imageIndex); + } + + 
VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphore}; + + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = signalSemaphores; + + VkSwapchainKHR swapChains[] = {swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + presentInfo.pResults = nullptr; // Optional + + vkQueuePresentKHR(presentQueue, &presentInfo); + + cudaUpdateVertexBuffer(); + // Added sleep of 5 millisecs so that CPU does not submit too much work to + // GPU + std::this_thread::sleep_for(std::chrono::microseconds(5000)); + } + + void submitVulkan(uint32_t imageIndex) { + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + VkSemaphore waitSemaphores[] = {imageAvailableSemaphore}; + VkPipelineStageFlags waitStages[] = { + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; + submitInfo.waitSemaphoreCount = 1; + submitInfo.pWaitSemaphores = waitSemaphores; + submitInfo.pWaitDstStageMask = waitStages; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, + vkUpdateCudaVertexBufSemaphore}; + + submitInfo.signalSemaphoreCount = 2; + submitInfo.pSignalSemaphores = signalSemaphores; + + if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != + VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + } + + void submitVulkanCuda(uint32_t imageIndex) { + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + VkSemaphore waitSemaphores[] = {imageAvailableSemaphore, + cudaUpdateVkVertexBufSemaphore}; + VkPipelineStageFlags waitStages[] = { + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT}; + submitInfo.waitSemaphoreCount = 2; + submitInfo.pWaitSemaphores = waitSemaphores; + 
submitInfo.pWaitDstStageMask = waitStages; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, + vkUpdateCudaVertexBufSemaphore}; + + submitInfo.signalSemaphoreCount = 2; + submitInfo.pSignalSemaphores = signalSemaphores; + + if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != + VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + } + + void createSyncObjects() { + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + + if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, + &imageAvailableSemaphore) != VK_SUCCESS || + vkCreateSemaphore(device, &semaphoreInfo, nullptr, + &renderFinishedSemaphore) != VK_SUCCESS) { + throw std::runtime_error( + "failed to create synchronization objects for a frame!"); + } + } + + void createSyncObjectsExt() { + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + + memset(&semaphoreInfo, 0, sizeof(semaphoreInfo)); + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportSemaphoreWin32HandleInfoKHR + vulkanExportSemaphoreWin32HandleInfoKHR = {}; + vulkanExportSemaphoreWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; + vulkanExportSemaphoreWin32HandleInfoKHR.pNext = NULL; + vulkanExportSemaphoreWin32HandleInfoKHR.pAttributes = + &winSecurityAttributes; + vulkanExportSemaphoreWin32HandleInfoKHR.dwAccess = + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportSemaphoreCreateInfoKHR vulkanExportSemaphoreCreateInfo = {}; + vulkanExportSemaphoreCreateInfo.sType = + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; +#ifdef _WIN64 + 
vulkanExportSemaphoreCreateInfo.pNext = + IsWindows8OrGreater() ? &vulkanExportSemaphoreWin32HandleInfoKHR : NULL; + vulkanExportSemaphoreCreateInfo.handleTypes = + IsWindows8OrGreater() + ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + vulkanExportSemaphoreCreateInfo.pNext = NULL; + vulkanExportSemaphoreCreateInfo.handleTypes = + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif + semaphoreInfo.pNext = &vulkanExportSemaphoreCreateInfo; + + if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, + &cudaUpdateVkVertexBufSemaphore) != VK_SUCCESS || + vkCreateSemaphore(device, &semaphoreInfo, nullptr, + &vkUpdateCudaVertexBufSemaphore) != VK_SUCCESS) { + throw std::runtime_error( + "failed to create synchronization objects for a CUDA-Vulkan!"); + } + } + + void cudaVkImportVertexMem() { + cudaExternalMemoryHandleDesc cudaExtMemHandleDesc; + memset(&cudaExtMemHandleDesc, 0, sizeof(cudaExtMemHandleDesc)); +#ifdef _WIN64 + cudaExtMemHandleDesc.type = + IsWindows8OrGreater() ? cudaExternalMemoryHandleTypeOpaqueWin32 + : cudaExternalMemoryHandleTypeOpaqueWin32Kmt; + cudaExtMemHandleDesc.handle.win32.handle = getVkMemHandle( + IsWindows8OrGreater() + ? 
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT); +#else + cudaExtMemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; + cudaExtMemHandleDesc.handle.fd = + getVkMemHandle(VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT); +#endif + cudaExtMemHandleDesc.size = sizeof(Vertex) * vertexBufSize; + + checkCudaErrors(cudaImportExternalMemory(&cudaExtMemVertexBuffer, + &cudaExtMemHandleDesc)); + + cudaExternalMemoryBufferDesc cudaExtBufferDesc; + cudaExtBufferDesc.offset = 0; + cudaExtBufferDesc.size = sizeof(Vertex) * vertexBufSize; + cudaExtBufferDesc.flags = 0; + + checkCudaErrors(cudaExternalMemoryGetMappedBuffer( + &cudaDevVertptr, cudaExtMemVertexBuffer, &cudaExtBufferDesc)); + printf("CUDA Imported Vulkan vertex buffer\n"); + } + + void cudaVkImportSemaphore() { + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc; + memset(&externalSemaphoreHandleDesc, 0, + sizeof(externalSemaphoreHandleDesc)); +#ifdef _WIN64 + externalSemaphoreHandleDesc.type = + IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 + : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle( + IsWindows8OrGreater() + ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, + cudaUpdateVkVertexBufSemaphore); +#else + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + externalSemaphoreHandleDesc.handle.fd = + getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, + cudaUpdateVkVertexBufSemaphore); +#endif + externalSemaphoreHandleDesc.flags = 0; + + checkCudaErrors(cudaImportExternalSemaphore( + &cudaExtCudaUpdateVkVertexBufSemaphore, &externalSemaphoreHandleDesc)); + + memset(&externalSemaphoreHandleDesc, 0, + sizeof(externalSemaphoreHandleDesc)); +#ifdef _WIN64 + externalSemaphoreHandleDesc.type = + IsWindows8OrGreater() ? 
cudaExternalSemaphoreHandleTypeOpaqueWin32 + : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + ; + externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle( + IsWindows8OrGreater() + ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, + vkUpdateCudaVertexBufSemaphore); +#else + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + externalSemaphoreHandleDesc.handle.fd = + getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, + vkUpdateCudaVertexBufSemaphore); +#endif + externalSemaphoreHandleDesc.flags = 0; + checkCudaErrors(cudaImportExternalSemaphore( + &cudaExtVkUpdateCudaVertexBufSemaphore, &externalSemaphoreHandleDesc)); + printf("CUDA Imported Vulkan semaphore\n"); + } + +#ifdef _WIN64 // For windows + HANDLE getVkMemHandle( + VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) { + HANDLE handle; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = vertexBufferMemory; + vkMemoryGetWin32HandleInfoKHR.handleType = + (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType; + + fpGetMemoryWin32HandleKHR(device, &vkMemoryGetWin32HandleInfoKHR, &handle); + return handle; + } +#else + int getVkMemHandle( + VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) { + if (externalMemoryHandleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT) { + int fd; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = vertexBufferMemory; + vkMemoryGetFdInfoKHR.handleType = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + fpGetMemoryFdKHR(device, &vkMemoryGetFdInfoKHR, &fd); + + return fd; + } + return 
-1; + } +#endif + +#ifdef _WIN64 + HANDLE getVkSemaphoreHandle( + VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, + VkSemaphore& semVkCuda) { + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR vulkanSemaphoreGetWin32HandleInfoKHR = {}; + vulkanSemaphoreGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + vulkanSemaphoreGetWin32HandleInfoKHR.pNext = NULL; + vulkanSemaphoreGetWin32HandleInfoKHR.semaphore = semVkCuda; + vulkanSemaphoreGetWin32HandleInfoKHR.handleType = + externalSemaphoreHandleType; + + fpGetSemaphoreWin32HandleKHR(device, &vulkanSemaphoreGetWin32HandleInfoKHR, + &handle); + + return handle; + } +#else + int getVkSemaphoreHandle( + VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, + VkSemaphore& semVkCuda) { + if (externalSemaphoreHandleType == + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + int fd; + + VkSemaphoreGetFdInfoKHR vulkanSemaphoreGetFdInfoKHR = {}; + vulkanSemaphoreGetFdInfoKHR.sType = + VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + vulkanSemaphoreGetFdInfoKHR.pNext = NULL; + vulkanSemaphoreGetFdInfoKHR.semaphore = semVkCuda; + vulkanSemaphoreGetFdInfoKHR.handleType = + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + fpGetSemaphoreFdKHR(device, &vulkanSemaphoreGetFdInfoKHR, &fd); + + return fd; + } + return -1; + } +#endif + + void cudaVkSemaphoreSignal(cudaExternalSemaphore_t& extSemaphore) { + cudaExternalSemaphoreSignalParams extSemaphoreSignalParams; + memset(&extSemaphoreSignalParams, 0, sizeof(extSemaphoreSignalParams)); + + extSemaphoreSignalParams.params.fence.value = 0; + extSemaphoreSignalParams.flags = 0; + checkCudaErrors(cudaSignalExternalSemaphoresAsync( + &extSemaphore, &extSemaphoreSignalParams, 1, streamToRun)); + } + + void cudaVkSemaphoreWait(cudaExternalSemaphore_t& extSemaphore) { + cudaExternalSemaphoreWaitParams extSemaphoreWaitParams; + + memset(&extSemaphoreWaitParams, 0, sizeof(extSemaphoreWaitParams)); + + 
extSemaphoreWaitParams.params.fence.value = 0; + extSemaphoreWaitParams.flags = 0; + + checkCudaErrors(cudaWaitExternalSemaphoresAsync( + &extSemaphore, &extSemaphoreWaitParams, 1, streamToRun)); + } + + void cudaUpdateVertexBuffer() { + cudaVkSemaphoreWait(cudaExtVkUpdateCudaVertexBufSemaphore); + + dim3 block(16, 16, 1); + dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); + Vertex* pos = (Vertex*)cudaDevVertptr; + AnimTime += 0.01f; + sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(pos, mesh_width, + mesh_height, AnimTime); + cudaVkSemaphoreSignal(cudaExtCudaUpdateVkVertexBufSemaphore); + } + + void cleanup() { + if (enableValidationLayers) { + DestroyDebugReportCallbackEXT(instance, callback, nullptr); + } + + vkDestroySemaphore(device, renderFinishedSemaphore, nullptr); + vkDestroySemaphore(device, imageAvailableSemaphore, nullptr); + checkCudaErrors( + cudaDestroyExternalSemaphore(cudaExtCudaUpdateVkVertexBufSemaphore)); + vkDestroySemaphore(device, cudaUpdateVkVertexBufSemaphore, nullptr); + checkCudaErrors( + cudaDestroyExternalSemaphore(cudaExtVkUpdateCudaVertexBufSemaphore)); + vkDestroySemaphore(device, vkUpdateCudaVertexBufSemaphore, nullptr); + + vkDestroyCommandPool(device, commandPool, nullptr); + for (auto framebuffer : swapChainFramebuffers) { + vkDestroyFramebuffer(device, framebuffer, nullptr); + } + for (auto imageView : swapChainImageViews) { + vkDestroyImageView(device, imageView, nullptr); + } + vkDestroyPipeline(device, graphicsPipeline, nullptr); + vkDestroyPipelineLayout(device, pipelineLayout, nullptr); + vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); + vkDestroyBuffer(device, uniformBuffer, nullptr); + vkFreeMemory(device, uniformBufferMemory, nullptr); + vkDestroyRenderPass(device, renderPass, nullptr); + vkDestroySwapchainKHR(device, swapChain, nullptr); + checkCudaErrors(cudaDestroyExternalMemory(cudaExtMemVertexBuffer)); + vkDestroyBuffer(device, vertexBuffer, nullptr); + vkFreeMemory(device, vertexBufferMemory, nullptr); + 
vkDestroyDescriptorPool(device, descriptorPool, nullptr); + vkDestroyDevice(device, nullptr); + vkDestroySurfaceKHR(instance, surface, nullptr); + vkDestroyInstance(instance, nullptr); + glfwDestroyWindow(window); + glfwTerminate(); + } +}; + +int main(int argc, char* argv[]) { + execution_path = argv[0]; + vulkanCudaApp app; + + try { + app.run(); + } catch (const std::runtime_error& e) { + std::cerr << e.what() << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/Samples/systemWideAtomics/Makefile b/Samples/systemWideAtomics/Makefile new file mode 100644 index 00000000..ac55d319 --- /dev/null +++ b/Samples/systemWideAtomics/Makefile @@ -0,0 +1,318 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= 
$(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 
+ ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release 
+endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - systemWideAtomics is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - systemWideAtomics is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on aarch64 +ifeq ($(TARGET_ARCH),aarch64) + $(info >>> WARNING - systemWideAtomics is not supported on aarch64 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +SMS ?= 60 61 70 75 + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: systemWideAtomics + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample 
will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +systemWideAtomics.o:systemWideAtomics.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +systemWideAtomics: systemWideAtomics.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./systemWideAtomics + +clean: + rm -f systemWideAtomics systemWideAtomics.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/systemWideAtomics + +clobber: clean diff --git a/Samples/systemWideAtomics/NsightEclipse.xml b/Samples/systemWideAtomics/NsightEclipse.xml new file mode 100644 index 00000000..b977c6da --- /dev/null +++ b/Samples/systemWideAtomics/NsightEclipse.xml @@ -0,0 +1,58 @@ + + + + systemWideAtomics + + cudaMalloc + cudaFree + cudaMemcpy + cudaFreeHost + + + whole + + ./ + ../ + ../../common/inc + + + Atomic Intrinsics + Unified Memory + + + CUDA + GPGPU + system-wide atomic + + + + + + true + systemWideAtomics.cu + + UVM + + + 1:CUDA Basic Topics + + sm60 + sm61 + sm70 + sm75 + + + x86_64 + linux + + + ppc64le + linux + + + + 6.0 + + System wide Atomics + exe + diff --git a/Samples/systemWideAtomics/README.md b/Samples/systemWideAtomics/README.md new file mode 100644 index 00000000..462eb23c --- /dev/null +++ b/Samples/systemWideAtomics/README.md @@ -0,0 +1,64 @@ +# systemWideAtomics - System wide Atomics + +## Description + +A simple demonstration of system wide atomic instructions. 
+ +## Key Concepts + +Atomic Intrinsics, Unified Memory + +## Supported SM Architectures + +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux + +## Supported CPU Architecture + +x86_64, ppc64le + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost + +## Dependencies needed to build/run +[UVM](../../README.md#uvm) + +## Prerequisites + +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/systemWideAtomics/systemWideAtomics.cu b/Samples/systemWideAtomics/systemWideAtomics.cu new file mode 100644 index 00000000..cb022818 --- /dev/null +++ b/Samples/systemWideAtomics/systemWideAtomics.cu @@ -0,0 +1,342 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* A program demonstrating trivial use of system-wide atomics on migratable + * memory. + */ + +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <cuda_runtime.h> +#include <helper_functions.h> +#include <helper_cuda.h> + +#define min(a, b) (a) < (b) ? (a) : (b) +#define max(a, b) (a) > (b) ? (a) : (b) + +#define LOOP_NUM 50 + +__global__ void atomicKernel(int *atom_arr) { + unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; + + for (int i = 0; i < LOOP_NUM; i++) { + // Atomic addition + atomicAdd_system(&atom_arr[0], 10); + + // Atomic exchange + atomicExch_system(&atom_arr[1], tid); + + // Atomic maximum + atomicMax_system(&atom_arr[2], tid); + + // Atomic minimum + atomicMin_system(&atom_arr[3], tid); + + // Atomic increment (modulo 17+1) + atomicInc_system((unsigned int *)&atom_arr[4], 17); + + // Atomic decrement + atomicDec_system((unsigned int *)&atom_arr[5], 137); + + // Atomic compare-and-swap + atomicCAS_system(&atom_arr[6], tid - 1, tid); + + // Bitwise atomic instructions + + // Atomic AND + atomicAnd_system(&atom_arr[7], 2 * tid + 7); + + // Atomic OR + atomicOr_system(&atom_arr[8], 1 << tid); + + // Atomic XOR + atomicXor_system(&atom_arr[9], tid); + } +} + +void atomicKernel_CPU(int *atom_arr, int no_of_threads) { + for (int i = no_of_threads; i < 2 * no_of_threads; i++) { + for (int j = 0; j < LOOP_NUM; j++) { + // Atomic addition + __sync_fetch_and_add(&atom_arr[0], 10); + + // Atomic exchange + __sync_lock_test_and_set(&atom_arr[1], i); + + // Atomic maximum + int 
old, expected; + do { + expected = atom_arr[2]; + old = __sync_val_compare_and_swap(&atom_arr[2], expected, + max(expected, i)); + } while (old != expected); + + // Atomic minimum + do { + expected = atom_arr[3]; + old = __sync_val_compare_and_swap(&atom_arr[3], expected, + min(expected, i)); + } while (old != expected); + + // Atomic increment (modulo 17+1) + int limit = 17; + do { + expected = atom_arr[4]; + old = __sync_val_compare_and_swap( + &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1); + } while (old != expected); + + // Atomic decrement + limit = 137; + do { + expected = atom_arr[5]; + old = __sync_val_compare_and_swap( + &atom_arr[5], expected, + ((expected == 0) || (expected > limit)) ? limit : expected - 1); + } while (old != expected); + + // Atomic compare-and-swap + __sync_val_compare_and_swap(&atom_arr[6], i - 1, i); + + // Bitwise atomic instructions + + // Atomic AND + __sync_fetch_and_and(&atom_arr[7], 2 * i + 7); + + // Atomic OR + __sync_fetch_and_or(&atom_arr[8], 1 << i); + + // Atomic XOR + // 11th element should be 0xff + __sync_fetch_and_xor(&atom_arr[9], i); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Compute reference data set +//! Each element is multiplied with the number of threads / array length +//! @param reference reference data, computed but preallocated +//! @param idata input data as provided to device +//! 
@param len number of elements in reference / idata +//////////////////////////////////////////////////////////////////////////////// +int verify(int *testData, const int len) { + int val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val += 10; + } + + if (val != testData[0]) { + printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]); + return false; + } + + val = 0; + + bool found = false; + + for (int i = 0; i < len; ++i) { + // second element should be a member of [0, len) + if (i == testData[1]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicExch failed\n"); + return false; + } + + val = -(1 << 8); + + for (int i = 0; i < len; ++i) { + // third element should be len-1 + val = max(val, i); + } + + if (val != testData[2]) { + printf("atomicMax failed\n"); + return false; + } + + val = 1 << 8; + + for (int i = 0; i < len; ++i) { + val = min(val, i); + } + + if (val != testData[3]) { + printf("atomicMin failed\n"); + return false; + } + + int limit = 17; + val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val = (val >= limit) ? 0 : val + 1; + } + + if (val != testData[4]) { + printf("atomicInc failed\n"); + return false; + } + + limit = 137; + val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val = ((val == 0) || (val > limit)) ? 
limit : val - 1; + } + + if (val != testData[5]) { + printf("atomicDec failed\n"); + return false; + } + + found = false; + + for (int i = 0; i < len; ++i) { + // seventh element should be a member of [0, len) + if (i == testData[6]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicCAS failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 8th element should be 1 + val &= (2 * i + 7); + } + + if (val != testData[7]) { + printf("atomicAnd failed\n"); + return false; + } + + val = 0; + + for (int i = 0; i < len; ++i) { + // 9th element should be 0xff + val |= (1 << i); + } + + if (val != testData[8]) { + printf("atomicOr failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 11th element should be 0xff + val ^= i; + } + + if (val != testData[9]) { + printf("atomicXor failed\n"); + return false; + } + + return true; +} + +int main(int argc, char **argv) { + // set device + cudaDeviceProp device_prop; + int dev_id = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); + + if (!device_prop.managedMemory) { + // This samples requires being run on a device that supports Unified Memory + fprintf(stderr, "Unified Memory not supported on this device\n"); + exit(EXIT_WAIVED); + } + + if (device_prop.computeMode == cudaComputeModeProhibited) { + // This sample requires being run with a default or process exclusive mode + fprintf(stderr, + "This sample requires a device in either default or process " + "exclusive mode\n"); + exit(EXIT_WAIVED); + } + + if (device_prop.major < 6) { + printf( + "%s: requires a minimum CUDA compute 6.0 capability, waiving " + "testing.\n", + argv[0]); + exit(EXIT_WAIVED); + } + + unsigned int numThreads = 256; + unsigned int numBlocks = 64; + unsigned int numData = 10; + + int *atom_arr; + + if (device_prop.pageableMemoryAccess) { + printf("CAN access pageable memory\n"); + atom_arr = (int 
*)malloc(sizeof(int) * numData); + } else { + printf("CANNOT access pageable memory\n"); + checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData)); + } + + for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0; + + // To make the AND and XOR tests generate something other than 0... + atom_arr[7] = atom_arr[9] = 0xff; + + atomicKernel<<<numBlocks, numThreads>>>(atom_arr); + atomicKernel_CPU(atom_arr, numBlocks * numThreads); + + checkCudaErrors(cudaDeviceSynchronize()); + + // Compute & verify reference solution + int testResult = verify(atom_arr, 2 * numThreads * numBlocks); + + if (device_prop.pageableMemoryAccess) { + free(atom_arr); + } else { + cudaFree(atom_arr); + } + + printf("systemWideAtomics completed, returned %s \n", + testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/Samples/vectorAdd_nvrtc/Makefile b/Samples/vectorAdd_nvrtc/Makefile index 08ab2bd2..5561b18a 100644 --- a/Samples/vectorAdd_nvrtc/Makefile +++ b/Samples/vectorAdd_nvrtc/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
# -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ diff --git a/Samples/vectorAdd_nvrtc/README.md b/Samples/vectorAdd_nvrtc/README.md index 93675b45..aa9308d3 100644 --- a/Samples/vectorAdd_nvrtc/README.md +++ b/Samples/vectorAdd_nvrtc/README.md @@ -10,7 +10,7 @@ CUDA Driver API, Vector Addition, Runtime Compilation ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 
](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.sln b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.sln deleted file mode 100644 index 5fe04847..00000000 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2010.sln +++ /dev/null @@ -1,20 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorAdd_nvrtc", "vectorAdd_nvrtc_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 - {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj index 5386c8ac..ae30b7a2 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj 
@@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj index f4cba895..99e6a833 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj index a8d3b4b9..3ba3e437 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj index e449811e..5caded61 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/Makefile b/Samples/warpAggregatedAtomicsCG/Makefile index 3041a71e..5353f6fe 100644 --- a/Samples/warpAggregatedAtomicsCG/Makefile +++ b/Samples/warpAggregatedAtomicsCG/Makefile @@ -1,31 +1,29 @@ ################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # @@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) export QNX_TARGET HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-g++ + HOST_COMPILER ?= aarch64-linux-android-clang++ endif else ifeq ($(TARGET_ARCH),ppc64le) HOST_COMPILER ?= powerpc64le-linux-gnu-g++ @@ -248,7 +246,7 @@ LIBRARIES := ################################################################################ # Gencode arguments -SMS ?= 30 35 37 50 52 60 61 70 +SMS ?= 30 35 37 50 52 60 61 70 75 ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml index d2c36a1d..49a896d3 100644 --- a/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml +++ b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml @@ -34,6 +34,7 @@ sm60 sm61 sm70 + sm75 x86_64 diff --git a/Samples/warpAggregatedAtomicsCG/README.md b/Samples/warpAggregatedAtomicsCG/README.md index 3939db26..dacffd13 100644 --- a/Samples/warpAggregatedAtomicsCG/README.md +++ b/Samples/warpAggregatedAtomicsCG/README.md @@ -10,7 +10,7 @@ Cooperative Groups, Atomic Intrinsics ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) 
[SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj index 2e57ab7c..4f0f3838 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/warpAggregatedAtomicsCG.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj index 4ad896cb..cc4187be 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -62,7 +62,7 @@ $(OutDir)/warpAggregatedAtomicsCG.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj index 6cc5401b..cc614a67 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ 
-62,7 +62,7 @@ $(OutDir)/warpAggregatedAtomicsCG.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -102,6 +102,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj index 01ddb390..a36350c5 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -63,7 +63,7 @@ $(OutDir)/warpAggregatedAtomicsCG.exe - compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70; + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; -Xcompiler "/wd 4819" %(AdditionalOptions) ./;../../Common WIN32 @@ -103,6 +103,6 @@ - +